In [1]:
from torch import from_numpy, Tensor
from torch import (
    nn,
    randn,
    optim,
    zeros,
)
from os.path import join
from os import getcwd
from sklearn.model_selection import train_test_split
from numpy import array
from torch.cuda import is_available

import pandas as pd
import numpy as np

import torch

In [2]:
pwd = getcwd()
datadir = join(pwd, 'data/')
file_train = join(datadir, 'train.csv')
file_test = join(datadir, 'test.csv')
file_challenge = join(datadir, 'challenge_data.csv')

In [3]:
df_train = pd.read_csv(file_train)
df_train['label'] = 'train'
df_train = df_train.sort_values(by=['user_id', 'challenge_sequence'])
df_test = pd.read_csv(file_test)
df_test['label'] = 'test'
df_test = df_test.sort_values(by=['user_id', 'challenge_sequence'])
df = pd.concat([df_train, df_test])

In [4]:
df.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge,label
0,4576_1,4576,1,CI23714,train
1,4576_2,4576,2,CI23855,train
2,4576_3,4576,3,CI24917,train
3,4576_4,4576,4,CI23663,train
4,4576_5,4576,5,CI23933,train


In [5]:
df.tail()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge,label
397315,113838_6,113838,6,CI23691,test
397316,113838_7,113838,7,CI24138,test
397317,113838_8,113838,8,CI23714,test
397318,113838_9,113838,9,CI24530,test
397319,113838_10,113838,10,CI23612,test


In [6]:
challenges = pd.read_csv(file_challenge)

In [7]:
challenges.shape

(5606, 9)

In [8]:
challenges.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [9]:
# challenges = challenges.drop('challenge_ID', axis=1)
# challenges = challenges.drop('author_gender', axis=1)
# challenges = challenges.drop('author_org_ID', axis=1)
# challenges['category_id'] = challenges['category_id'].fillna(0)
# challenges['category_id'] = challenges['category_id'].astype(int)
# # challenges['category_id'].value_counts()
# challenges = challenges.drop('publish_date', axis=1)

In [10]:
challenges.tail()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
5601,CI29079,1,SI2864,,17-06-2010,AI567059,M,AOI101717,29.0
5602,CI29080,1,SI2865,,25-06-2010,AI567060,F,AOI101718,29.0
5603,CI29081,1,SI2865,,25-06-2010,AI566257,M,AOI100108,29.0
5604,CI29082,1,SI2865,,25-06-2010,AI563777,M,AOI100108,29.0
5605,CI29083,1,SI2865,,25-06-2010,AI564006,F,AOI100022,29.0


In [11]:
df = df.groupby(
    by=['user_id'],
    as_index=False,
)[['challenge']].agg(lambda x: list(x))
df['count'] = df['challenge'].apply(lambda x: len(x))

In [12]:
df.head()

Unnamed: 0,user_id,challenge,count
0,4576,"[CI23714, CI23855, CI24917, CI23663, CI23933, ...",13
1,4577,"[CI23855, CI23933, CI24917, CI24915, CI23714, ...",10
2,4578,"[CI23663, CI23855, CI24917, CI23933, CI23975, ...",10
3,4579,"[CI26939, CI26940, CI26941, CI26942, CI26943, ...",10
4,4580,"[CI23663, CI23855, CI23933, CI23975, CI24530, ...",13


In [13]:
drop_cols = [
    'publish_date',
    'author_gender',
    'author_org_ID',
    'author_ID',
    'challenge_series_ID',
]
for col in drop_cols:
    challenges = challenges.drop(col, axis=1)
challenges = challenges.fillna(0)
challenges['total_submissions'] = challenges['total_submissions'].astype(int)
challenges['category_id'] = challenges['category_id'].astype(int)
challenges = challenges.sort_values(by=['challenge_ID'])

In [14]:
challenges.head()

Unnamed: 0,challenge_ID,programming_language,total_submissions,category_id
0,CI23478,2,37,0
1,CI23479,2,48,32
2,CI23480,1,15,0
3,CI23481,1,236,70
4,CI23482,2,137,0


In [15]:
challenges = pd.get_dummies(
    challenges, 
    columns=['programming_language',],
)

In [16]:
challenges.head()

Unnamed: 0,challenge_ID,total_submissions,category_id,programming_language_1,programming_language_2,programming_language_3
0,CI23478,37,0,0,1,0
1,CI23479,48,32,0,1,0
2,CI23480,15,0,1,0,0
3,CI23481,236,70,1,0,0
4,CI23482,137,0,0,1,0


In [17]:
rows = challenges.to_numpy().tolist()
challenges_ = {}
for row in rows:
    challenges_[row[0]] = row[1:]

In [18]:
is_cuda = is_available()
device = torch.device('cpu')
if is_cuda:
    device = torch.device('cuda')

In [19]:
rows = df.to_numpy().tolist()
outputs = randn(
    len(df), 
    65,
    device = device,
)
outputs.shape

torch.Size([109264, 65])

In [20]:
sequences = []
outputs = np.zeros((len(df), 65), dtype=np.float32)
for i, row in enumerate(rows):
    sequence = []
    j = 0
    for challenge in row[1]:
        #print(challenges_[challenge])
        for x in challenges_[challenge]:
            sequence.append(x)
            outputs[i][j] = x
            j += 1
    #sequence = Tensor(sequence)
    sequences.append(sequence)
#print(sequences)
outputs = from_numpy(outputs)

In [21]:
df['sequences'] = sequences

In [22]:
outputs.shape

torch.Size([109264, 65])

In [23]:
df.head()

Unnamed: 0,user_id,challenge,count,sequences
0,4576,"[CI23714, CI23855, CI24917, CI23663, CI23933, ...",13,"[14723, 29, 1, 0, 0, 20993, 29, 1, 0, 0, 43409..."
1,4577,"[CI23855, CI23933, CI24917, CI24915, CI23714, ...",10,"[20993, 29, 1, 0, 0, 15086, 31, 1, 0, 0, 43409..."
2,4578,"[CI23663, CI23855, CI24917, CI23933, CI23975, ...",10,"[8897, 45, 1, 0, 0, 20993, 29, 1, 0, 0, 43409,..."
3,4579,"[CI26939, CI26940, CI26941, CI26942, CI26943, ...",10,"[5911, 28, 1, 0, 0, 5186, 28, 1, 0, 0, 2613, 2..."
4,4580,"[CI23663, CI23855, CI23933, CI23975, CI24530, ...",13,"[8897, 45, 1, 0, 0, 20993, 29, 1, 0, 0, 15086,..."


In [24]:
train = df[df['count'] == 13]
test = df[df['count'] == 10]
train = train.reset_index(drop=True)
test= test.reset_index(drop=True)

In [25]:
train.head()

Unnamed: 0,user_id,challenge,count,sequences
0,4576,"[CI23714, CI23855, CI24917, CI23663, CI23933, ...",13,"[14723, 29, 1, 0, 0, 20993, 29, 1, 0, 0, 43409..."
1,4580,"[CI23663, CI23855, CI23933, CI23975, CI24530, ...",13,"[8897, 45, 1, 0, 0, 20993, 29, 1, 0, 0, 15086,..."
2,4581,"[CI26155, CI26156, CI26157, CI26158, CI26159, ...",13,"[6968, 69, 1, 0, 0, 4566, 69, 1, 0, 0, 3962, 6..."
3,4582,"[CI23855, CI24915, CI24917, CI23933, CI23663, ...",13,"[20993, 29, 1, 0, 0, 7389, 29, 1, 0, 0, 43409,..."
4,4585,"[CI23855, CI23975, CI24917, CI25135, CI23848, ...",13,"[20993, 29, 1, 0, 0, 9204, 61, 1, 0, 0, 43409,..."


In [26]:
test.head()

Unnamed: 0,user_id,challenge,count,sequences
0,4577,"[CI23855, CI23933, CI24917, CI24915, CI23714, ...",10,"[20993, 29, 1, 0, 0, 15086, 31, 1, 0, 0, 43409..."
1,4578,"[CI23663, CI23855, CI24917, CI23933, CI23975, ...",10,"[8897, 45, 1, 0, 0, 20993, 29, 1, 0, 0, 43409,..."
2,4579,"[CI26939, CI26940, CI26941, CI26942, CI26943, ...",10,"[5911, 28, 1, 0, 0, 5186, 28, 1, 0, 0, 2613, 2..."
3,4583,"[CI23663, CI23855, CI23975, CI23714, CI23848, ...",10,"[8897, 45, 1, 0, 0, 20993, 29, 1, 0, 0, 9204, ..."
4,4584,"[CI23855, CI23975, CI25135, CI23848, CI23714, ...",10,"[20993, 29, 1, 0, 0, 9204, 61, 1, 0, 0, 5446, ..."


In [27]:
inputs = outputs[:, :50]

In [28]:
inputs.shape

torch.Size([109264, 50])

In [29]:
train_train, train_test = train_test_split(
    train,
    test_size=.3,
    random_state=1,
)
train_train = train_train.reset_index(drop=True)
train_test = train_test.reset_index(drop=True)

In [30]:
train_train.head()

Unnamed: 0,user_id,challenge,count,sequences
0,68688,"[CI27778, CI27781, CI27792, CI27794, CI27793, ...",13,"[353, 70, 1, 0, 0, 880, 70, 1, 0, 0, 760, 70, ..."
1,97384,"[CI26160, CI26164, CI26920, CI26925, CI26929, ...",13,"[2550, 69, 1, 0, 0, 3773, 69, 1, 0, 0, 935, 69..."
2,29179,"[CI24917, CI25022, CI26052, CI23933, CI23714, ...",13,"[43409, 66, 1, 0, 0, 1716, 29, 1, 0, 0, 10560,..."
3,93064,"[CI23648, CI24871, CI24958, CI25056, CI24876, ...",13,"[2186, 30, 1, 0, 0, 1134, 37, 1, 0, 0, 6842, 4..."
4,87851,"[CI23714, CI24530, CI25074, CI24440, CI25141, ...",13,"[14723, 29, 1, 0, 0, 6814, 29, 1, 0, 0, 372, 8..."


In [31]:
train_test.head()

Unnamed: 0,user_id,challenge,count,sequences
0,49166,"[CI23933, CI24141, CI24187, CI23769, CI23887, ...",13,"[15086, 31, 1, 0, 0, 773, 31, 1, 0, 0, 2836, 4..."
1,30196,"[CI26051, CI26250, CI26251, CI23836, CI25141, ...",13,"[4787, 30, 1, 0, 0, 312, 29, 1, 0, 0, 287, 29,..."
2,82271,"[CI23855, CI23875, CI24117, CI24111, CI25135, ...",13,"[20993, 29, 1, 0, 0, 538, 114, 1, 0, 0, 3010, ..."
3,71460,"[CI23662, CI23848, CI24525, CI24189, CI24532, ...",13,"[2142, 49, 1, 0, 0, 3395, 61, 1, 0, 0, 4728, 8..."
4,67247,"[CI23648, CI25090, CI25135, CI24866, CI24876, ...",13,"[2186, 30, 1, 0, 0, 2359, 41, 1, 0, 0, 5446, 6..."


In [32]:
class Model(nn.Module):
    def __init__(self, input_size, output_size, hidden_dim, n_layers):
        super(Model, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers

        self.rnn = nn.RNN(input_size, hidden_dim, n_layers, batch_first=True)   
        self.fc = nn.Linear(hidden_dim, output_size)
    
    def forward(self, x):        
        batch_size = x.size(0)
        hidden = self.init_hidden(batch_size)
        out, hidden = self.rnn(x, hidden)
        out = out.contiguous().view(-1, self.hidden_dim)
        out = self.fc(out)        
        return out, hidden
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.n_layers, batch_size, self.hidden_dim).to(device)
        return hidden

In [33]:
output_train = from_numpy(array(train_train['sequences'].values.tolist()))
input_train = output_train[:, :50]

In [34]:
input_train[:5]

tensor([[  353,    70,     1,     0,     0,   880,    70,     1,     0,     0,
           760,    70,     1,     0,     0,   472,    70,     1,     0,     0,
           564,    70,     1,     0,     0,   902,    70,     1,     0,     0,
           812,    70,     1,     0,     0,   385,    70,     1,     0,     0,
           536,    70,     1,     0,     0,   349,    70,     1,     0,     0],
        [ 2550,    69,     1,     0,     0,  3773,    69,     1,     0,     0,
           935,    69,     1,     0,     0,   268,    69,     1,     0,     0,
           523,    69,     1,     0,     0,  1041,    69,     1,     0,     0,
           549,    69,     1,     0,     0,   407,    69,     1,     0,     0,
           645,    69,     1,     0,     0,   731,    69,     1,     0,     0],
        [43409,    66,     1,     0,     0,  1716,    29,     1,     0,     0,
         10560,    29,     1,     0,     0, 15086,    31,     1,     0,     0,
         14723,    29,     1,     0,     0,  8897,

In [35]:
model = Model(
    input_size=50,
    output_size=65,
    hidden_dim=12,
    n_layers=100,
)

In [36]:
model = model.to(device)

In [37]:
n_epochs = 200
lr = .01
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr=lr)

In [38]:
input_train = input_train.to(device)
output_train = output_train.to(device)
for epoch in range(1, n_epochs+1):
    optimizer.zero_grad()
    output, hidden = model(input_train)
    output = output.to(device)
    loss = criterion(output, output_train.view(1, -1).long())
    loss.backward()
    optimizer.step()
    if epoch%10 == 0:
        print('Epoch '+str(epoch)+'/'+str(n_epochs)+' loss: '+loss.item())

RuntimeError: input must have 3 dimensions, got 2