In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import itertools
import pickle, gzip
import gc

In [26]:
# Read the per-observation series and the per-object metadata in one pass;
# the two files are unpacked in the order they are listed.
train_series, metadata_train = (
    pd.read_csv(csv_name)
    for csv_name in ('training_sample_set.csv', 'training_sample_set_metadata.csv')
)
metadata_train.head()

Unnamed: 0,object_id,ra,decl,gal_l,gal_b,ddf,hostgal_specz,hostgal_photoz,hostgal_photoz_err,distmod,mwebv,target
0,615,349.046051,-61.943836,320.79653,-51.753706,1,0.0,0.0,0.0,,0.017,92
1,713,53.085938,-27.784405,223.525509,-54.460748,1,1.8181,1.6267,0.2552,45.4063,0.007,88
2,730,33.574219,-6.579593,170.455585,-61.548219,1,0.232,0.2262,0.0157,40.2561,0.021,42
3,745,0.189873,-45.586655,328.254458,-68.969298,1,0.3037,0.2813,1.1523,40.7951,0.007,90
4,1124,352.711273,-63.823658,316.922299,-51.059403,1,0.1934,0.2415,0.0176,40.4166,0.024,90


In [31]:

# Baseline design matrix: independent copies of the metadata feature columns,
# so later edits to metadata_train do not leak into the baseline frames.
baseline_Xtrain = metadata_train.loc[:, [
    'ra', 'decl', 'gal_l', 'gal_b', 'ddf',
    'hostgal_specz', 'hostgal_photoz', 'hostgal_photoz_err',
    'distmod', 'mwebv',
]].copy()
# Labels are kept as a one-column DataFrame (not a Series).
baseline_Ytrain = metadata_train.loc[:, ['target']].copy()
baseline_Ytrain.head(10)

Unnamed: 0,target
0,92
1,88
2,42
3,90
4,90
5,65
6,90
7,42
8,90
9,65


In [30]:


# Summary statistics of flux and flux_err for every (object_id, passband) pair,
# with passband moved into the columns so there is one row per object.
# The columns are selected with a *list*: the tuple form
# (groupby(...)['flux', 'flux_err']) was deprecated in pandas 1.0 and raises
# in pandas 2.0.
simple_features = train_series.groupby(
    ['object_id', 'passband'])[['flux', 'flux_err']].agg(
    ['mean', 'median', 'max', 'min', 'std']).unstack('passband')


#construct time series using binned observations:
ts_mod = train_series[['object_id', 'mjd', 'passband', 'flux', 'flux_err']].copy()
#bin by 5 days, reducing the size of data but still giving a time series
ts_mod['mjd_d5'] = (ts_mod['mjd'] / 5).astype(int)
# Average all observations that fall into the same (object, 5-day bin, passband) cell.
ts_mod = ts_mod.groupby(['object_id', 'mjd_d5', 'passband'])[['flux', 'flux_err']].mean().reset_index()

#pivotting
# One row per object, one column per (mjd_d5, passband) pair; dropna=False
# keeps columns that are entirely NaN instead of discarding them.
ts_piv = pd.pivot_table(ts_mod, 
                        index='object_id', 
                        columns=['mjd_d5', 'passband'], 
                        values=['flux'],
                        dropna=False)

# NOTE(review): gc.enable() only switches automatic collection on, which is
# already Python's default; if the aim was to free memory here, gc.collect()
# is probably what was meant -- confirm.
gc.enable()
ts_piv.head()

Unnamed: 0_level_0,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux,flux
mjd_d5,11950,11950,11950,11950,11950,11950,11953,11953,11953,11953,...,12123,12123,12123,12123,12124,12124,12124,12124,12124,12124
passband,0,1,2,3,4,5,0,1,2,3,...,2,3,4,5,0,1,2,3,4,5
object_id,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3,Unnamed: 17_level_3,Unnamed: 18_level_3,Unnamed: 19_level_3,Unnamed: 20_level_3,Unnamed: 21_level_3
615,,-938.945678,-613.334595,-498.17006,-391.232605,-355.88678,,-815.188599,-548.01355,-475.516052,...,,,,,,0.003448,304.355044,252.858406,266.557327,244.999603
713,,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Split the labelled objects into four groups by `ddf` flag (1 / 0) and by the
# sign of `hostgal_specz` (> 0 -> "far_away", <= 0 -> "nearby").  Each result
# is a one-column frame (`target`) indexed by `object_id`.
#
# The groups are built from boolean masks instead of deleting columns from
# metadata_train in place: the in-place version could only be run once per
# load (a second run raised KeyError on the already-deleted columns) and it
# broke every cell that still needed the removed columns.
def _targets_by_object(selection):
    """Return the `target` column of the selected metadata rows, indexed by object_id."""
    return metadata_train.loc[selection, ['object_id', 'target']].set_index('object_id')


_in_ddf = metadata_train['ddf'] == 1
_not_in_ddf = metadata_train['ddf'] == 0
# A NaN hostgal_specz satisfies neither comparison, so such rows end up in no group.
_positive_specz = metadata_train['hostgal_specz'] > 0
_non_positive_specz = metadata_train['hostgal_specz'] <= 0

ddf_far_away = _targets_by_object(_in_ddf & _positive_specz)
ddf_nearby = _targets_by_object(_in_ddf & _non_positive_specz)
non_ddf_far_away = _targets_by_object(_not_in_ddf & _positive_specz)
non_ddf_nearby = _targets_by_object(_not_in_ddf & _non_positive_specz)

# The masks are only needed to build the four frames above.
del _in_ddf, _not_in_ddf, _positive_specz, _non_positive_specz
gc.collect()
ddf_far_away.head(10)


Unnamed: 0_level_0,target
object_id,Unnamed: 1_level_1
713,88
730,42
745,90
1124,90
1598,90
1632,42
1920,90
2072,90
2103,42
2300,42


In [33]:
import numpy as np
import torch
import torch.nn as nn

In [36]:
class RNN(nn.Module):
    """Minimal Elman-style recurrent classifier.

    At each time step the new hidden state is
    ``tanh(i2h(input) + h2h(hidden))`` and the class log-probabilities are
    ``log_softmax(i2o(new_hidden))``.

    Args:
        input_size: number of features in one time step.
        hidden_size: width of the recurrent state.
        output_size: number of classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        self.i2h = nn.Linear(input_size, hidden_size)
        # Recurrent connection: without it the previous hidden state never
        # influences the result and the model only sees the last time step.
        self.h2h = nn.Linear(hidden_size, hidden_size)
        self.i2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Advance the network by one time step.

        Args:
            input: tensor of shape (batch, input_size).
            hidden: tensor of shape (batch, hidden_size) from the previous step.

        Returns:
            ``(output, hidden)``: log-probabilities of shape
            (batch, output_size) and the updated hidden state.
        """
        hidden = torch.tanh(self.i2h(input) + self.h2h(hidden))
        output = self.softmax(self.i2o(hidden))
        # Return the state as well so the caller can thread it through the sequence.
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state for a batch of one."""
        return torch.zeros(1, self.hidden_size)

# 6 input features per step, 32 hidden units, 13 output classes.
rnn = RNN(6, 32, 13)

In [35]:
learning_rate = 0.005
criterion = nn.NLLLoss()

def train(y_train, x_train):
    """Run one plain-SGD step of the global ``rnn`` on a single sequence.

    Args:
        y_train: class-index tensor accepted by ``criterion`` for the final
            output of the sequence.
        x_train: tensor whose first dimension is time; ``x_train[i]`` is fed
            to ``rnn`` as step ``i``.

    Returns:
        ``(output, loss)``: the network output after the last step and the
        loss as a Python float.

    Raises:
        ValueError: if ``x_train`` has no time steps.
    """
    n_steps = x_train.size()[0]
    if n_steps == 0:
        # Without this guard `output` would be unbound further down.
        raise ValueError('x_train must contain at least one time step')

    hidden = rnn.initHidden()

    rnn.zero_grad()

    for i in range(n_steps):
        output, hidden = rnn(x_train[i], hidden)

    loss = criterion(output, y_train)
    loss.backward()

    # Manual SGD update; done under no_grad so the update itself is not traced.
    with torch.no_grad():
        for p in rnn.parameters():
            if p.grad is None:
                # Parameter did not take part in this forward pass.
                continue
            gd = p.grad
            # Zero out NaN gradient entries so they cannot poison the weights.
            gd[torch.isnan(gd)] = 0
            # Keyword `alpha` replaces the removed add_(scalar, tensor) overload.
            p.add_(gd, alpha=-learning_rate)

    return output, loss.item()

In [70]:
# Distinct target labels in order of first appearance in the metadata;
# a label's position in this tuple is used as its class index.
classes = tuple(metadata_train['target'].unique())
print(classes)

(92, 88, 42, 90, 65, 16, 67, 95, 62, 15, 52, 6, 64)


In [67]:
# Single-object sample (object_id 713) for a quick overfitting check.
# reshape(-1, 1, 6) yields (steps, batch=1, 6 values per step); this assumes
# every time bin contributes exactly six pivot columns -- TODO confirm.
test_x = torch.tensor(
    ts_piv.loc[713].values.reshape(-1, 1, 6),
    dtype=torch.float32,
)
# Raw target label of the object, then its position in `classes`.
y = ddf_far_away.loc[713].values[0]
test_y = torch.tensor([classes.index(y)])

In [68]:
# Replace missing (NaN) entries with 0 in place.
test_x[torch.isnan(test_x)] = 0

In [69]:
def categoryFromOutput(output):
    """Map a network output to its highest-scoring class.

    Returns a ``(label, index)`` pair: the entry of ``classes`` at the
    arg-max position of the first row of ``output``, and that position.
    """
    # topk(1) returns (values, indices); take the index of the first row.
    best_index = output.topk(1)[1][0].item()
    return classes[best_index], best_index


(88, 1)


In [65]:
import time
import math

# Schedule: total training steps, how often to print progress, and how many
# steps are averaged into one point of the loss curve.
n_iters, print_every, plot_every = 100000, 5000, 1000

# Loss summed over the current plotting window, and the per-window averages.
current_loss, all_losses = 0, []

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as '<minutes>m <seconds>s'.

    ``since`` is a timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

start = time.time()

# Repeatedly fit the single sample (test_x, test_y).  The counter is named
# `step` so the builtin `iter` is not shadowed for the rest of the notebook.
for step in range(1, n_iters + 1):
    output, loss = train(test_y, test_x)
    current_loss += loss

    if step % print_every == 0:
        # Predicted label vs. the true label of the sample being fitted.
        guess, _ = categoryFromOutput(output)
        category = classes[test_y.item()]
        correct = '✓' if guess == category else '✗ (%s)' % category
        # One placeholder per argument: step, percent done, elapsed, loss, guess, verdict.
        print('%d %d%% (%s) %.4f %s %s' % (step, step / n_iters * 100, timeSince(start), loss, guess, correct))
    # Add current loss avg to list of losses
    if step % plot_every == 0:
        all_losses.append(current_loss / plot_every)
        current_loss = 0
        
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Loss curve: one point per `plot_every` training steps.
plt.figure()
plt.plot(all_losses)

KeyboardInterrupt: 

In [72]:
# The network emits log-probabilities (LogSoftmax); exponentiate to view probabilities.
output.exp()

tensor([[4.7035e-08, 1.0000e+00, 1.0862e-08, 3.5450e-08, 2.9733e-07, 9.1874e-08,
         6.2914e-08, 1.5906e-07, 1.8016e-07, 4.6697e-08, 3.0166e-08, 9.4686e-08,
         5.6874e-10]], grad_fn=<ExpBackward>)