In [8]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import itertools
import pickle, gzip
import gc
from sklearn.model_selection import train_test_split

In [9]:
train_series = pd.read_csv('Data/training_set.csv')
metadata_train = pd.read_csv('Data/training_set_metadata.csv')

simple_features = train_series.groupby(
    ['object_id', 'passband'])['flux'].agg(
    ['mean', 'median', 'max', 'min', 'std']).unstack('passband')


#construct time series using binned observations:
ts_mod = train_series[['object_id', 'mjd', 'passband', 'flux']].copy()
#bin by 5 days, reducing the size of data but still giving a time series
ts_mod['mjd_d5'] = (ts_mod['mjd'] / 5).astype(int)
ts_mod = ts_mod.groupby(['object_id', 'mjd_d5', 'passband'])['flux'].mean().reset_index()

#pivotting
ts_piv = pd.pivot_table(ts_mod, 
                        index='object_id', 
                        columns=['mjd_d5', 'passband'], 
                        values=['flux'],
                        dropna=False)

gc.enable()
ts_piv.head()

Unnamed: 0_level_0,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux
mjd_d5,11916,11916,11916,11916,11916,11916,11917,11917,11917,11917,...,12133,12133,12133,12133,12134,12134,12134,12134,12134,12134
passband,0,1,2,3,4,5,0,1,2,3,...,2,3,4,5,0,1,2,3,4,5
object_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
615,,,,,,,,,,,...,,,,,,,,,,
713,,,,,,,,,,,...,-6.882203,-8.101076,-5.94797,-11.432619,-10.452725,,,,,
730,,,,,,,,,,,...,,,,,,,,,,
745,,,,,,,,,,,...,,,,,,,,,,
1124,,,,,,,,,,,...,,,,,,,,,,


In [10]:
del metadata_train['ra'],metadata_train['decl'],metadata_train['gal_l'], metadata_train['gal_b'],metadata_train['hostgal_photoz'],metadata_train['hostgal_photoz_err'], metadata_train['distmod'], metadata_train['mwebv']
#Bin into ddf and non-ddf training
ddf = metadata_train[(metadata_train['ddf'] == 1)]
del ddf['ddf']

ddf_far_away= (ddf[(ddf['hostgal_specz'] > 0)])
ddf_far_away.set_index('object_id', inplace=True)
ddf_nearby= ddf[(ddf['hostgal_specz'] <=0)]
ddf_nearby.set_index('object_id', inplace=True)
non_ddf = metadata_train[(metadata_train['ddf'] == 0)]
del non_ddf['ddf']

non_ddf_far_away= non_ddf[(non_ddf['hostgal_specz'] >0)]
non_ddf_far_away.set_index('object_id', inplace=True)
non_ddf_nearby= non_ddf[(non_ddf['hostgal_specz'] <=0 )]
non_ddf_nearby.set_index('object_id', inplace=True)
del ddf, non_ddf, ddf_far_away['hostgal_specz'], non_ddf_far_away['hostgal_specz'], ddf_nearby['hostgal_specz'], non_ddf_nearby['hostgal_specz']

gc.collect()

bins = [ddf_far_away, ddf_nearby, non_ddf_far_away, non_ddf_nearby]

In [11]:
#Split into validation and training. \n,
ddf_far_away_train, ddf_far_away_validation = train_test_split(ddf_far_away, test_size=0.1)
ddf_nearby_train, ddf_nearby_validation = train_test_split(ddf_nearby, test_size=0.1)
    
non_ddf_far_away_train, non_ddf_far_away_validation = train_test_split(non_ddf_far_away, test_size=0.1)
non_ddf_nearby_train, non_ddf_nearby_validation = train_test_split(non_ddf_nearby, test_size=0.1)
    

In [12]:
non_ddf_nearby.head(10)

Unnamed: 0_level_0,target
object_id,Unnamed: 1_level_1
1153371,65
1215304,16
1288208,16
1415987,16
1497514,6
1516088,6
1597464,65
1854631,6
1904697,16
1919887,65


In [13]:
import numpy as np
import torch
import torch.nn as nn
import torch.autograd as autograd

In [24]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, batch_size=1):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.batch_size = batch_size
        
        self.lstm = nn.LSTM(input_size, hidden_size)
        self.hidden2out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.initHidden()

    """def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden"""
    
    def forward(self, sequence):
        x = sequence.view(len(sequence), self.batch_size , -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y  = self.hidden2out(lstm_out[-1])
        output = self.softmax(y)
        return output

    def initHidden(self):
        return (autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_size)),
                autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_size)))
test_rnn = RNN(6, 32, 14)
x, y = random_data_point(ddf_far_away)
train(y, x, test_rnn)

(tensor([[-2.5791, -2.5396, -2.7720, -2.6920, -2.4992, -2.6784, -2.6655, -2.6473,
          -2.6109, -2.4752, -2.7240, -2.7767, -2.6928, -2.6509]],
        grad_fn=<LogSoftmaxBackward>), 2.692038059234619)

In [23]:
learning_rate = 0.001
criterion = nn.NLLLoss()

def train(y_train, x_train, rnn):
    hidden = rnn.initHidden()

    rnn.zero_grad()

    output = rnn(x_train)

    loss = criterion(output, y_train)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        gd = p.grad.data
        gd[gd != gd] = 0
        p.data.add_(-learning_rate, gd)

    return output, loss.item()

In [15]:
classes = tuple(metadata_train.target.unique())

In [16]:
def categoryFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return classes[category_i], category_i


In [17]:
def get_data_point(object_id, bin_name):
    x = torch.tensor(ts_piv.loc[object_id].values.reshape(-1, 1, 6), dtype = torch.float32)
    x[x != x] = 0
    y = torch.tensor([classes.index(bin_name.loc[object_id].values)])
    return x, y

In [18]:
def random_data_point(bin_name):
    object_id = bin_name.sample().index.values[0]
    return get_data_point(object_id, bin_name)

In [14]:
rnn_ddf_far_away = RNN(6, 8, 14)

In [17]:
import time
import math

n_iters = 10000
print_every = 500
plot_every = 100



# Keep track of losses for plotting
current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for iter in range(1, n_iters + 1):
    x_train, y_train = random_data_point(ddf_far_away)
    output, loss = train(y_train, x_train, rnn_ddf_far_away)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        print(current_loss)

    # Add current loss avg to list of losses
    if iter % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0

240.17262196540833
461.7537053823471
439.93983685970306
285.1291801929474
209.43176209926605
202.8104727268219
221.295130610466
198.92257511615753
210.00329864025116
189.03089725971222
251.8365182876587
318.70534443855286
163.02913761138916
324.42315089702606
210.76154720783234
215.37013685703278
220.11072897911072
174.0618976354599
187.43395400047302
155.9377360343933
