In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense,Embedding,LSTM,Dropout,Bidirectional,Input,BatchNormalization
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping,ModelCheckpoint

Using TensorFlow backend.


In [4]:
# Load the training interactions from train.csv and preview the first rows.
train = pd.read_csv("train.csv")
train.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933


In [5]:
# Load the test interactions from test.csv (same columns as train) and preview them.
test = pd.read_csv("test.csv")
test.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4577_1,4577,1,CI23855
1,4577_2,4577,2,CI23933
2,4577_3,4577,3,CI24917
3,4577_4,4577,4,CI24915
4,4577_5,4577,5,CI23714


In [6]:
# Targets: every challenge a user took after their first 10 becomes one
# (user_id, label) row, so a user can contribute several label rows.
label = (
    train.loc[train.challenge_sequence > 10, ['user_id', 'challenge']]
    .rename(columns={'challenge': 'label'})
)
label.head()

Unnamed: 0,user_id,label
10,4576,CI24958
11,4576,CI23667
12,4576,CI23691
23,4580,CI24915
24,4580,CI25727


In [7]:
# Model input: for each user, join the challenge IDs of their first 10 steps
# into one space-separated string (rows are joined in the order they appear
# in `train` — presumably already sorted by challenge_sequence; verify).
df = (
    train[train.challenge_sequence <= 10]
    .groupby('user_id')['challenge']
    .agg(lambda challenge_ids: ' '.join(challenge_ids))
    .reset_index()
)
df.head()

Unnamed: 0,user_id,challenge
0,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...
1,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...
2,4581,CI26155 CI26156 CI26157 CI26158 CI26159 CI2616...
3,4582,CI23855 CI24915 CI24917 CI23933 CI23663 CI2495...
4,4585,CI23855 CI23975 CI24917 CI25135 CI23848 CI2371...


In [15]:
# Attach the targets to the input strings. With no `on=` given, pandas
# inner-joins on the columns the two frames share, so each user's history
# string is repeated once per label row.
df = pd.merge(df, label)
df.head()

Unnamed: 0,user_id,challenge,label
0,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI24958
1,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI23667
2,4576,CI23714 CI23855 CI24917 CI23663 CI23933 CI2513...,CI23691
3,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...,CI24915
4,4580,CI23663 CI23855 CI23933 CI23975 CI24530 CI2371...,CI25727


In [16]:
# Validation split for early stopping
# Hold out 5% of the rows. train_test_split already shuffles by default, so
# the explicit df.sample(frac=1) pre-shuffle is redundant; it is kept so the
# resulting split stays exactly the same.
# NOTE(review): each user appears in several rows with the same input string
# (one row per label), so a row-level split can place the same user in both
# sets — consider a group-wise split on user_id if val_loss looks optimistic.
df_train, df_validation = train_test_split(
    df.sample(frac=1, random_state=123),
    test_size=0.05,
    random_state=123,
)

In [17]:
# Preview the training split; the index shows the original (shuffled) row positions.
df_train.head()

Unnamed: 0,user_id,challenge,label
43773,27556,CI23836 CI24024 CI24958 CI23791 CI24157 CI2418...,CI26755
80313,46677,CI23887 CI23933 CI24138 CI24871 CI23769 CI2495...,CI24030
38680,24951,CI26938 CI27040 CI27044 CI27045 CI27047 CI2704...,CI26177
142050,79110,CI26894 CI26898 CI26899 CI26901 CI26900 CI2689...,CI26221
106266,60263,CI27028 CI27035 CI27041 CI27043 CI27044 CI2704...,CI27050


In [18]:
# Load all the challenges
# The challenge catalogue is used below to fit the label encoder
# (challenge_ID column) and to size the model's output layer (row count).
challenges = pd.read_csv('challenge_data.csv')
challenges.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32.0
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70.0
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,


In [19]:
# Re-display the training split before the encoded label column is added.
df_train.head()

Unnamed: 0,user_id,challenge,label
43773,27556,CI23836 CI24024 CI24958 CI23791 CI24157 CI2418...,CI26755
80313,46677,CI23887 CI23933 CI24138 CI24871 CI23769 CI2495...,CI24030
38680,24951,CI26938 CI27040 CI27044 CI27045 CI27047 CI2704...,CI26177
142050,79110,CI26894 CI26898 CI26899 CI26901 CI26900 CI2689...,CI26221
106266,60263,CI27028 CI27035 CI27041 CI27043 CI27044 CI2704...,CI27050


In [20]:
# Encode challenges
# Fit on the full challenge catalogue (not just the labels seen in training)
# so every challenge ID maps to a stable integer class id.
encoder = LabelEncoder().fit(challenges['challenge_ID'])
# df_train / df_validation come out of train_test_split as slices of `df`;
# assigning a column onto them in place can raise SettingWithCopyWarning.
# .assign returns fresh frames instead and keeps this cell safe to re-run.
# encoder.transform raises ValueError if a label is missing from the catalogue.
df_train = df_train.assign(brand_id_encoded=encoder.transform(df_train.label))
df_validation = df_validation.assign(brand_id_encoded=encoder.transform(df_validation.label))
df_train.head()

Unnamed: 0,user_id,challenge,label,brand_id_encoded
43773,27556,CI23836 CI24024 CI24958 CI23791 CI24157 CI2418...,CI26755,3277
80313,46677,CI23887 CI23933 CI24138 CI24871 CI23769 CI2495...,CI24030,552
38680,24951,CI26938 CI27040 CI27044 CI27045 CI27047 CI2704...,CI26177,2699
142050,79110,CI26894 CI26898 CI26899 CI26901 CI26900 CI2689...,CI26221,2743
106266,60263,CI27028 CI27035 CI27041 CI27043 CI27044 CI2704...,CI27050,3572


In [21]:
# Tokenize text
# Each space-separated challenge ID in the history string becomes one token.
# The vocabulary is learned from the training split only; because no
# oov_token is configured, IDs that appear only in validation/test are
# dropped by texts_to_sequences rather than mapped to a placeholder.
# Token ids start at 1 — 0 is left free and is what pad_sequences pads with.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['challenge'])

In [22]:
# Create sequences
# Map each history string to a list of integer token ids, using the same
# fitted tokenizer for both splits.
sequences_train, sequences_validation = (
    tokenizer.texts_to_sequences(split['challenge'])
    for split in (df_train, df_validation)
)

In [29]:
# Sanity check: one token sequence per training row.
len(sequences_train)

198166

In [24]:
# Constants
# Input length: histories are built from the first 10 challenges per user.
MAX_SEQUENCE_LENGTH = 10
# Number of distinct tokens the tokenizer learned. Token ids run from 1 to
# NB_WORDS inclusive (0 is the padding id), so NB_WORDS is also the largest id.
NB_WORDS = len(tokenizer.word_index)
# One output class per row of the challenge catalogue.
N_CATEGORIES = len(challenges)

In [25]:
# Inspect the vocabulary size learned by the tokenizer.
NB_WORDS
# N_CATEGORIES  (uncomment to inspect the number of output classes instead)

4961

In [31]:
# Pad sequences
# Force every sequence to exactly MAX_SEQUENCE_LENGTH ids: shorter ones get
# trailing zeros, longer ones lose their tail.
x_train, x_validation = (
    pad_sequences(token_ids, maxlen=MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    for token_ids in (sequences_train, sequences_validation)
)
x_train

array([[ 62, 377,   5, ..., 733, 746, 705],
       [ 51,   6,  21, ...,   4, 128,  25],
       [301, 261, 136, ..., 161, 203, 267],
       ...,
       [561, 477, 359, ..., 347, 143, 136],
       [182, 221, 111, ...,  75,  94, 121],
       [ 36,  10, 135, ..., 109,  41,  34]])

In [32]:
# Set Labels
# Integer class ids (from the label encoder) as plain NumPy arrays, the form
# expected by the sparse categorical cross-entropy loss.
y_train = df_train.brand_id_encoded.values
y_validation = df_validation.brand_id_encoded.values
y_train

array([3277,  552, 2699, ..., 3565, 3450, 1212])

In [33]:
    # NN architecture
    def get_model(path='',lr=0.001):
        """Build and compile the bidirectional-LSTM challenge classifier.

        Architecture: Embedding(256) -> BatchNormalization ->
        Bidirectional(LSTM(128)) -> Dropout(0.4) -> Dense softmax over
        N_CATEGORIES classes. The model is compiled with sparse categorical
        cross-entropy, so targets must be integer class ids.

        Parameters
        ----------
        path : str, optional
            Path to a weights file to load before compiling. An empty value
            (the default) leaves the freshly initialised weights in place.
        lr : float, optional
            Learning rate for the Adam optimizer.

        Returns
        -------
        keras.models.Model
            The compiled model, reading the module-level MAX_SEQUENCE_LENGTH,
            NB_WORDS and N_CATEGORIES constants.
        """
        adam = Adam(lr=lr)
        inp = Input(shape=(MAX_SEQUENCE_LENGTH, ))
        # Tokenizer ids run from 1 to NB_WORDS and 0 is the padding id, so the
        # embedding table needs NB_WORDS + 1 rows. With only NB_WORDS rows the
        # largest token id is out of range. Weight files saved with the old,
        # smaller table will not load into this layer.
        x = Embedding(NB_WORDS + 1,256)(inp)
        x = BatchNormalization()(x)
        x = Bidirectional(LSTM(128, dropout=0.1, recurrent_dropout=0.1))(x)
        x = Dropout(0.4)(x)
        x = Dense(N_CATEGORIES, activation="softmax")(x)
        model = Model(inputs=inp, outputs=x)
        # Treat both '' and None as "no checkpoint to restore".
        if path:
            model.load_weights(path)
        model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
        return model

In [34]:
# Initialize the model
# No weights path is passed, so the network starts from freshly initialised
# weights with the default learning rate (0.001).
model = get_model()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [36]:
# Model callbacks
path = 'best_model_weights'
# Stop training once val_loss has gone 5 epochs without improving.
es_callback = EarlyStopping(monitor="val_loss", patience=5)
# After each epoch, overwrite the weights file only when val_loss improved.
mc_callback = ModelCheckpoint(
    '{}.hdf5'.format(path),
    monitor='val_loss',
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
    mode='auto',
    period=1,
)
callbacks = [es_callback, mc_callback]

In [35]:
# Confirm inputs and labels line up: (n_rows, MAX_SEQUENCE_LENGTH) and (n_rows,).
print(x_train.shape, y_train.shape, sep='\n')

(198166, 10)
(198166,)


In [37]:
# Fit the model
# Train for at most 10 epochs; the callbacks stop early when val_loss stalls
# and keep the best weights on disk.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=1024,
    epochs=10,
    validation_data=(x_validation, y_validation),
    callbacks=callbacks,
)

Instructions for updating:
Use tf.cast instead.
Train on 198166 samples, validate on 10430 samples
Epoch 1/10


KeyboardInterrupt: 

In [None]:
# Load best weights
# Rebuild the network and restore the checkpoint written by ModelCheckpoint.
# The file only exists once at least one training epoch has completed.
best_weights_file = '{}.hdf5'.format(path)
model = get_model(path=best_weights_file)

In [None]:
# Test preprocessing
def padding(text):
    """Convert history strings to fixed-length arrays of token ids.

    `text` is an iterable of space-separated challenge-ID strings. Each one is
    tokenized with the fitted module-level `tokenizer`, then padded or
    truncated at the end to MAX_SEQUENCE_LENGTH ids, the same way as the
    training inputs.
    """
    token_ids = tokenizer.texts_to_sequences(text)
    return pad_sequences(
        token_ids,
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
        truncating='post',
    )

# Build one history string per test user from their first 10 challenges,
# mirroring how the training inputs were assembled.
test_text = (
    test[test.challenge_sequence <= 10]
    .groupby('user_id')['challenge']
    .agg(lambda challenge_ids: ' '.join(challenge_ids))
    .reset_index()
)
x_test = padding(test_text.challenge)

In [None]:
# Get top 3 predictions for each user
# predict returns one row of class probabilities per user.
pred = model.predict(x_test, batch_size=2048)
# argsort is ascending, so walking each row backwards from the end and
# stopping after three columns yields the class ids from most to least likely.
pred = np.argsort(pred, axis=1)[:, :-4:-1]

In [None]:
# Write Predictions
# For each of the three ranked predictions, emit one row per user keyed as
# "<user_id>_<11|12|13>", i.e. the 11th-13th step of that user's sequence.
df_list = []
for i in range(3):
    # test_text[['user_id']] is a slice of test_text; assigning columns onto
    # it in place can raise SettingWithCopyWarning. .assign builds a new
    # frame instead.
    test_11 = test_text[['user_id']].assign(
        user_sequence=test_text.user_id.astype(str) + '_' + str(i + 11),
        # Map predicted class ids back to challenge IDs.
        challenge=encoder.inverse_transform(pred[:, i]),
    )
    # Explicit column selection fixes the output column order.
    df_list.append(test_11[['user_sequence', 'challenge']])
pd.concat(df_list).to_csv('bes_submission.csv', index=False)