In [1]:
import numpy as np
import pandas as pd

from keras.models import Sequential
from keras.layers import Dense, Input, Dropout,LSTM, Activation
from keras.utils import np_utils
from nltk.tokenize import RegexpTokenizer

from keras.callbacks import Callback
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv('./dataset/hm_train.csv')
test = pd.read_csv('./dataset/hm_test.csv')

In [3]:
train.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


In [4]:
test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
0,88305,3m,I spent the weekend in Chicago with my friends.,1
1,88306,3m,We moved back into our house after a remodel. ...,2
2,88307,3m,My fiance proposed to me in front of my family...,1
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1
4,88309,3m,I went out to a nice restaurant on a date with...,5


In [5]:
print(train.shape)
print(test.shape)

(60321, 5)
(40213, 4)


In [6]:
labels = train['predicted_category']
print(np.unique(labels,return_counts=True))

(array(['achievement', 'affection', 'bonding', 'enjoy_the_moment',
       'exercise', 'leisure', 'nature'], dtype=object), array([20274, 20880,  6561,  6508,   729,  4242,  1127]))


In [7]:
label_dict = { 0 : 'achievement', 1 : 'affection', 2 : 'bonding', 3 : 'enjoy_the_moment', 4 : 'exercise', 5 : 'leisure', 6 : 'nature'}

In [8]:
rev_mapping = { 'achievement' : 0, 'affection' : 1, 'bonding' : 2, 'enjoy_the_moment' : 3, 'exercise' : 4, 'leisure' : 5, 'nature' : 6}

In [9]:
train['predicted_category'].replace(rev_mapping, inplace=True)

In [10]:
train.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,1
1,27674,24h,I was happy when my son got 90% marks in his e...,1,1
2,27675,24h,I went to the gym this morning and did yoga.,1,4
3,27676,24h,We had a serious talk with some friends of our...,2,2
4,27677,24h,I went with grandchildren to butterfly display...,1,1


In [11]:
data = train.values

In [12]:
X_train = data[:40000,2]
Y_train = data[:40000,4]

X_test = data[40000:60000,2]
Y_test = data[40000:60000,4]

print (X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)
print ("-------------------------")
print (X_test[0], Y_test[0])
print (X_train[0], Y_train[0])

(40000,) (40000,) (20000,) (20000,)
-------------------------
I received the promotion at work I had hoped for after working for the same company for 12 years. 0
I went on a successful date with someone I felt sympathy and connection with. 1


In [13]:
tokenizer = RegexpTokenizer("[a-zA-Z]+")

In [14]:
for ix in range(X_train.shape[0]):
    X_train[ix] = tokenizer.tokenize(X_train[ix])
    
for ix in range(X_test.shape[0]):
    X_test[ix] = tokenizer.tokenize(X_test[ix])
    
Y_train = np_utils.to_categorical(Y_train)

In [15]:
Y_test = np_utils.to_categorical(Y_test)

In [16]:
print(X_train[0],Y_train[0])

['I', 'went', 'on', 'a', 'successful', 'date', 'with', 'someone', 'I', 'felt', 'sympathy', 'and', 'connection', 'with'] [0. 1. 0. 0. 0. 0. 0.]


In [17]:
np.unique(np.array([len(ix) for ix in X_train]) , return_counts=True)

(array([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
          12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   22,
          23,   24,   25,   26,   27,   28,   29,   30,   31,   32,   33,
          34,   35,   36,   37,   38,   39,   40,   41,   42,   43,   44,
          45,   46,   47,   48,   49,   50,   51,   52,   53,   54,   55,
          56,   57,   58,   59,   60,   61,   62,   63,   64,   65,   66,
          67,   68,   69,   70,   71,   72,   73,   74,   75,   76,   77,
          78,   79,   80,   81,   82,   83,   84,   85,   86,   87,   88,
          89,   90,   91,   92,   93,   94,   95,   96,   97,   98,   99,
         100,  101,  102,  103,  104,  105,  106,  107,  108,  109,  110,
         111,  112,  113,  114,  115,  116,  117,  118,  119,  120,  121,
         122,  123,  124,  126,  127,  128,  129,  130,  131,  133,  134,
         135,  136,  137,  138,  139,  140,  142,  143,  144,  145,  146,
         147,  148,  149,  150,  151, 

In [18]:
np.unique(np.array([len(ix) for ix in X_test]) , return_counts=True)

(array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
         14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
         27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
         53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
         92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
        105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
        119, 120, 121, 123, 124, 125, 127, 128, 129, 130, 131, 132, 133,
        134, 135, 136, 137, 138, 139, 140, 141, 144, 145, 146, 147, 148,
        150, 151, 153, 154, 157, 158, 159, 161, 164, 166, 168, 169, 172,
        173, 176, 183, 186, 191, 192, 195, 205, 206, 209, 213, 224, 229,
        235, 236, 238, 247, 250, 252, 255, 268, 269

In [19]:
embeddings_index = {}

f = open('glove.6B.50d.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

In [21]:
embedding_matrix_train = np.zeros((X_train.shape[0], 40, 50))
embedding_matrix_test = np.zeros((X_test.shape[0], 40, 50))

for ix in range(X_train.shape[0]):
#     print(len(X_train[ix]))
    for ij in range(min(len(X_train[ix]),40)):
#         print(ix,ij)
        if X_train[ix][ij].lower() in embeddings_index.keys() and embeddings_index[X_train[ix][ij].lower()].shape[0]==50:
            embedding_matrix_train[ix][ij] = embeddings_index[X_train[ix][ij].lower()]

In [22]:
for ix in range(X_test.shape[0]):
#     print(len(X_test[ix]))
    for ij in range(min(len(X_test[ix]),40)):
#         print(ix,ij)
        if X_test[ix][ij].lower() in embeddings_index.keys() and embeddings_index[X_test[ix][ij].lower()].shape[0]==50:
            embedding_matrix_test[ix][ij] = embeddings_index[X_test[ix][ij].lower()]

In [23]:
print(embedding_matrix_train.shape, embedding_matrix_test.shape)

(40000, 40, 50) (20000, 40, 50)


In [24]:
model = Sequential()
model.add(LSTM(128, input_shape=(40,50), return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(7))
model.add(Activation('softmax'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_1 (LSTM)                (None, 40, 128)           91648     
_________________________________________________________________
dropout_1 (Dropout)          (None, 40, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 7)                 903       
_________________________________________________________________
activation_1 (Activation)    (None, 7)                 0         
Total params: 224,135
Trainable params: 224,135
Non-trainable params: 0
_________________________________________________________________


In [25]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [26]:
hist = model.fit(embedding_matrix_train,Y_train,validation_data=(embedding_matrix_test, Y_test),
                epochs = 20, batch_size=32,shuffle=True)

Train on 40000 samples, validate on 20000 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
 6784/40000 [====>.........................] - ETA: 1:51 - loss: 0.2597 - acc: 0.9158

KeyboardInterrupt: 

In [28]:
model.save("model.h5")

In [34]:
data_test = test.values[:,2]
print(data_test.shape)

(40213,)


In [35]:
print(data_test[0])

I spent the weekend in Chicago with my friends.


In [36]:
for ix in range(data_test.shape[0]):
    data_test[ix] = tokenizer.tokenize(data_test[ix])

In [37]:
embedding_matrix_ans = np.zeros((data_test.shape[0], 40, 50))

for ix in range(data_test.shape[0]):
#     print(len(X_train[ix]))
    for ij in range(min(len(data_test[ix]),40)):
#         print(ix,ij)
        if data_test[ix][ij].lower() in embeddings_index.keys() and embeddings_index[data_test[ix][ij].lower()].shape[0]==50:
            embedding_matrix_ans[ix][ij] = embeddings_index[data_test[ix][ij].lower()]

In [38]:
pred = model.predict_classes(embedding_matrix_ans)

In [39]:
test['predicted_category'] = pred

In [40]:
test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,88305,3m,I spent the weekend in Chicago with my friends.,1,2
1,88306,3m,We moved back into our house after a remodel. ...,2,0
2,88307,3m,My fiance proposed to me in front of my family...,1,1
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1,2
4,88309,3m,I went out to a nice restaurant on a date with...,5,1


In [41]:
test['predicted_category'].replace(label_dict, inplace=True)

In [42]:
test.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,88305,3m,I spent the weekend in Chicago with my friends.,1,bonding
1,88306,3m,We moved back into our house after a remodel. ...,2,achievement
2,88307,3m,My fiance proposed to me in front of my family...,1,affection
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1,bonding
4,88309,3m,I went out to a nice restaurant on a date with...,5,affection


In [44]:
ans_df = pd.DataFrame()

In [45]:
ans_df['hmid'] = test['hmid']

In [46]:
ans_df['predicted_category'] = test['predicted_category']

In [47]:
ans_df.head()

Unnamed: 0,hmid,predicted_category
0,88305,bonding
1,88306,achievement
2,88307,affection
3,88308,bonding
4,88309,affection


In [51]:
ans_df.to_csv('ans.csv', sep=',',index=False)