In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import itertools
import pickle, gzip
import gc

In [2]:
train_series = pd.read_csv('training_set.csv')
metadata_train = pd.read_csv('training_set_metadata.csv')

simple_features = train_series.groupby(
    ['object_id', 'passband'])['flux'].agg(
    ['mean', 'median', 'max', 'min', 'std']).unstack('passband')


#construct time series using binned observations:
ts_mod = train_series[['object_id', 'mjd', 'passband', 'flux']].copy()
#bin by 5 days, reducing the size of data but still giving a time series
ts_mod['mjd_d5'] = (ts_mod['mjd'] / 5).astype(int)
ts_mod = ts_mod.groupby(['object_id', 'mjd_d5', 'passband'])['flux'].mean().reset_index()

#pivotting
ts_piv = pd.pivot_table(ts_mod, 
                        index='object_id', 
                        columns=['mjd_d5', 'passband'], 
                        values=['flux'],
                        dropna=False)

gc.enable()

In [3]:
del metadata_train['ra'],metadata_train['decl'],metadata_train['gal_l'], metadata_train['gal_b'],metadata_train['hostgal_photoz'],metadata_train['hostgal_photoz_err'], metadata_train['distmod'], metadata_train['mwebv']
#Bin into ddf and non-ddf training
ddf = metadata_train[(metadata_train['ddf'] == 1)]
del ddf['ddf']

ddf_far_away= (ddf[(ddf['hostgal_specz'] > 0)])
ddf_far_away.set_index('object_id', inplace=True)
ddf_nearby= ddf[(ddf['hostgal_specz'] <=0)]
ddf_nearby.set_index('object_id', inplace=True)
non_ddf = metadata_train[(metadata_train['ddf'] == 0)]
del non_ddf['ddf']

non_ddf_far_away= non_ddf[(non_ddf['hostgal_specz'] >0)]
non_ddf_far_away.set_index('object_id', inplace=True)
non_ddf_nearby= non_ddf[(non_ddf['hostgal_specz'] <=0 )]
non_ddf_nearby.set_index('object_id', inplace=True)
del ddf, non_ddf, ddf_far_away['hostgal_specz'], non_ddf_far_away['hostgal_specz'], ddf_nearby['hostgal_specz'], non_ddf_nearby['hostgal_specz']

gc.collect()

bins = [ddf_far_away, ddf_nearby, non_ddf_far_away, non_ddf_nearby]

In [4]:
def get_data_point(object_id, bin_name):
    x = torch.tensor(ts_piv.loc[object_id].values.reshape(-1, 1, 6), dtype = torch.float32)
    x[x != x] = 0
    y = torch.tensor([classes.index(bin_name.loc[object_id].values)])
    return x, y

def random_data_point(bin_name):
    object_id = bin_name.sample().index.values[0]
    return get_data_point(object_id, bin_name)

In [5]:
import numpy as np
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim

In [11]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, batch_size=1):
        super(RNN, self).__init__()

        self.hidden_dim = hidden_size
        self.batch_size = batch_size
        #TODO add dropout or something
        self.lstm = nn.LSTM(input_size, hidden_size) #,dropout=.2, num_layers=2)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.init_hidden()
    
    def forward(self, sequence):
        x = sequence.view(len(sequence), self.batch_size , -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        output  = self.hidden2out(lstm_out[-1])
        output = self.softmax(output)
        return output

    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_dim),
                torch.zeros(1, self.batch_size, self.hidden_dim))

In [12]:
classes = tuple(metadata_train.target.unique())

In [13]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return classes[category_i], category_i

In [14]:
model = RNN(6, 32, 14)
criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [15]:
with torch.no_grad():
    x, y = random_data_point(non_ddf_far_away)
    y_hat = model(x)
    print("Predicted: {}, Actual: {}".format(categoryFromOutput(y_hat), categoryFromOutput(y)))

    
print_every = 100
all_data_points = len(non_ddf_far_away.index)
current_loss = 0

for epoch in range(1):  # again, normally you would NOT do 300 epochs, it is toy data
    for i, obj_id in enumerate(non_ddf_far_away.index):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

        # Also, we need to clear out the hidden state of the LSTM,
        # detaching it from its history on the last instance.
        model.hidden = model.init_hidden()

        # Step 2. Get our inputs ready for the network, that is, turn them into
        # Tensors of word indices.
        x_train, y_train = get_data_point(obj_id, non_ddf_far_away)

        # Step 3. Run our forward pass.
        y_hat = model(x_train)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = criterion(y_hat, y_train)
        current_loss += loss
        loss.backward()
        optimizer.step()
        #print(f"Epoch {epoch+1} {i/all_data_points*100}%")
        
        if (i % print_every == 0):
            
            print(f"Epoch {epoch+1} {i/all_data_points*100}% ===> Avg Loss: {current_loss/1000}")
            current_loss=0
        if i == 50:
            break

Predicted: (62, 8), Actual: (92, 0)
Epoch 1 0.0% ===> Avg Loss: 0.0024833115749061108


In [16]:
import random
correct = 0
sample = 100
for i in range(sample):
    obj = random.choice(ddf_far_away.index)
    x, y = get_data_point(obj, ddf_far_away)
    y_hat = model(x)
    label = categoryFromOutput(y_hat)[0]
    if classes[y]==label:
        correct += 1

print(f"Accuracy: {correct/sample*100}")

Accuracy: 54.0


In [17]:
ddf_nearby["target"].value_counts(normalize=True) * 100

65    57.222222
16    30.000000
92    10.740741
6      1.296296
53     0.740741
Name: target, dtype: float64