In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import itertools
import pickle, gzip
import gc

In [2]:
train_series = pd.read_csv('training_set.csv')
metadata_train = pd.read_csv('training_set_metadata.csv')

simple_features = train_series.groupby(
    ['object_id', 'passband'])['flux'].agg(
    ['mean', 'median', 'max', 'min', 'std']).unstack('passband')


#construct time series using binned observations:
ts_mod = train_series[['object_id', 'mjd', 'passband', 'flux']].copy()
#bin by 5 days, reducing the size of data but still giving a time series
ts_mod['mjd_d5'] = (ts_mod['mjd'] / 5).astype(int)
ts_mod = ts_mod.groupby(['object_id', 'mjd_d5', 'passband'])['flux'].mean().reset_index()

#pivotting
ts_piv = pd.pivot_table(ts_mod, 
                        index='object_id', 
                        columns=['mjd_d5', 'passband'], 
                        values=['flux'],
                        dropna=False)

gc.enable()

In [3]:
del metadata_train['ra'],metadata_train['decl'],metadata_train['gal_l'], metadata_train['gal_b'],metadata_train['hostgal_photoz'],metadata_train['hostgal_photoz_err'], metadata_train['distmod'], metadata_train['mwebv']
#Bin into ddf and non-ddf training
ddf = metadata_train[(metadata_train['ddf'] == 1)]
del ddf['ddf']

ddf_far_away= (ddf[(ddf['hostgal_specz'] > 0)])
ddf_far_away.set_index('object_id', inplace=True)
ddf_nearby= ddf[(ddf['hostgal_specz'] <=0)]
ddf_nearby.set_index('object_id', inplace=True)
non_ddf = metadata_train[(metadata_train['ddf'] == 0)]
del non_ddf['ddf']

non_ddf_far_away= non_ddf[(non_ddf['hostgal_specz'] >0)]
non_ddf_far_away.set_index('object_id', inplace=True)
non_ddf_nearby= non_ddf[(non_ddf['hostgal_specz'] <=0 )]
non_ddf_nearby.set_index('object_id', inplace=True)
del ddf, non_ddf, ddf_far_away['hostgal_specz'], non_ddf_far_away['hostgal_specz'], ddf_nearby['hostgal_specz'], non_ddf_nearby['hostgal_specz']

gc.collect()

bins = [ddf_far_away, ddf_nearby, non_ddf_far_away, non_ddf_nearby]

In [4]:
def get_data_point(object_id, bin_name):
    x = torch.tensor(ts_piv.loc[object_id].values.reshape(-1, 1, 6), dtype = torch.float32)
    x[x != x] = 0
    y = torch.tensor([classes.index(bin_name.loc[object_id].target)])
    return x, y

def random_data_point(bin_name):
    object_id = bin_name.sample().index.values[0]
    return get_data_point(object_id, bin_name)

In [5]:
import torch
import torch.nn as nn
import torch.autograd as autograd
import torch.optim as optim

In [6]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, batch_size=1):
        super(RNN, self).__init__()

        self.hidden_dim = hidden_size
        self.batch_size = batch_size
        #TODO add dropout or something
        self.lstm = nn.LSTM(input_size, hidden_size) #,dropout=.2, num_layers=2)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.init_hidden()
    
    def forward(self, sequence):
        x = sequence.view(len(sequence), self.batch_size , -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        output  = self.hidden2out(lstm_out[-1])
        output = self.softmax(output)
        return output

    def init_hidden(self):
        return (torch.zeros(1, self.batch_size, self.hidden_dim),
                torch.zeros(1, self.batch_size, self.hidden_dim))

In [39]:
classes = tuple(metadata_train.target.unique())
vc = non_ddf_far_away.target.value_counts(normalize=True)
weights = torch.tensor([1 - vc.loc[i] if i in vc.index else 0 for i in classes])
weights

tensor([0.0000, 0.9364, 0.7748, 0.6260, 0.0000, 0.0000, 0.9640, 0.9661, 0.9101,
        0.8766, 0.9714, 0.0000, 0.9747, 0.0000])

In [8]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return classes[category_i], category_i

In [81]:
def train(bin_name, epochs=1):
    # Calculate weights of each class to balance
    vc = bin_name.target.value_counts(normalize=True)
    weights = torch.tensor([1 - vc.loc[i] if i in vc.index else 0 for i in classes])
    
    # Initialize Model
    model = RNN(6, 32, 14)
    criterion = nn.NLLLoss(weight = weights)
    optimizer = optim.SGD(model.parameters(), lr=0.05, nesterov=True)
    
    n = len(bin_name)
    print_every = 100
    current_loss = 0
    
    start_t = time.time()
    
    for epoch in range(epochs):
        randomized_bin = bin_name.sample(frac=1)
        for i, obj_id in enumerate(randomized_bin.index):
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Also, we need to clear out the hidden state of the LSTM,
            # detaching it from its history on the last instance.
            model.hidden = model.init_hidden()

            # Step 2. Get our inputs ready for the network, that is, turn them into
            # Tensors of light curve inputs.
            x_train, y_train = get_data_point(obj_id, randomized_bin)

            # Step 3. Run our forward pass.
            y_hat = model(x_train)

            # Step 4. Compute the loss, gradients, and update the parameters by
            #  calling optimizer.step()
            loss = criterion(y_hat, y_train)
            current_loss += loss
            loss.backward()
            optimizer.step()

            if (i % print_every == 0):
                print("Epoch {} {:.0f}% ===> Avg Loss: {:.3f}".format(epoch+1, (i/n*100), current_loss/print_every))
                current_loss=0
        predict(model, bin_name, sample=.5)
    print("Finished")
    return model

In [79]:
def predict(model, bin_name, sample=1):
    labels = []
    right = 0
    s = bin_name.sample(frac=sample)
    with torch.no_grad():
        for obj_id in s.index:
            x, y = get_data_point(obj_id, s)
            y_hat = model(x)
            label = categoryFromOutput(y_hat)[0]
            labels.append([obj_id, label])
            if classes[y]==label:
                right += 1
    print(f"Accuracy: {right/len(s)*100}")
    return labels

In [75]:
model = train(ddf_far_away, epochs=5)

Epoch 1 0% ===> Avg Loss: 0.026
Epoch 1 6% ===> Avg Loss: 1.928
Epoch 1 13% ===> Avg Loss: 1.616
Epoch 1 19% ===> Avg Loss: 1.452
Epoch 1 25% ===> Avg Loss: 1.427
Epoch 1 32% ===> Avg Loss: 1.382
Epoch 1 38% ===> Avg Loss: 1.663
Epoch 1 44% ===> Avg Loss: 1.474
Epoch 1 51% ===> Avg Loss: 1.667
Epoch 1 57% ===> Avg Loss: 1.527
Epoch 1 63% ===> Avg Loss: 1.466
Epoch 1 70% ===> Avg Loss: 1.477
Epoch 1 76% ===> Avg Loss: 1.539
Epoch 1 82% ===> Avg Loss: 1.481
Epoch 1 89% ===> Avg Loss: 1.547
Epoch 1 95% ===> Avg Loss: 1.585
Accuracy: 21.5100076007094
Epoch 2 0% ===> Avg Loss: 1.090
Epoch 2 6% ===> Avg Loss: 1.712
Epoch 2 13% ===> Avg Loss: 1.453
Epoch 2 19% ===> Avg Loss: 1.430
Epoch 2 25% ===> Avg Loss: 1.376
Epoch 2 32% ===> Avg Loss: 1.540
Epoch 2 38% ===> Avg Loss: 1.491
Epoch 2 44% ===> Avg Loss: 1.522
Epoch 2 51% ===> Avg Loss: 1.639
Epoch 2 57% ===> Avg Loss: 1.266
Epoch 2 63% ===> Avg Loss: 1.397
Epoch 2 70% ===> Avg Loss: 1.465
Epoch 2 76% ===> Avg Loss: 1.339
Epoch 2 82% ===> Avg

In [80]:
y_hat = predict(model, ddf_far_away)

Accuracy: 59.83502538071066
