In [None]:
#Import libraries
import pandas as pd
import numpy as np
import os
import pickle
import warnings
warnings.filterwarnings("ignore")


In [None]:
# Load the dataset from happy.csv (path is relative to the notebook's working
# directory) and preview the first rows.
happy_data = pd.read_csv('happy.csv')
happy_data.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,affection
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,affection
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,exercise
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,bonding
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,affection


In [None]:
# Check the shape of the dataset as (rows, columns)
happy_data.shape

(100535, 9)

In [None]:
# Count the rows in each predicted_category class to see how balanced the
# classes are.
happy_data['predicted_category'].value_counts()

affection           34168
achievement         33993
enjoy_the_moment    11144
bonding             10727
leisure              7458
nature               1843
exercise             1202
Name: predicted_category, dtype: int64

In [None]:
# Count how many rows there are for each value of num_sentence
happy_data['num_sentence'].value_counts()

1     83711
2      9542
3      3847
4      1624
5       821
6       336
7       183
8       107
10       68
9        61
11       35
13       26
12       21
16       17
18       17
14       14
17       14
19       12
21       10
25        7
15        7
23        7
24        5
26        5
22        4
29        3
31        3
30        3
20        3
27        2
32        2
37        2
40        2
56        1
46        1
53        1
51        1
48        1
69        1
35        1
45        1
44        1
42        1
58        1
34        1
28        1
60        1
Name: num_sentence, dtype: int64

In [None]:
# Keep only the rows with 10 or fewer sentences; larger sentence counts are rare.
# .copy() makes new_data an independent frame, so adding or overwriting columns
# on it later does not trigger pandas' SettingWithCopyWarning.
new_data = happy_data[happy_data['num_sentence'] <= 10].copy()
new_data['num_sentence'].value_counts()

1     83711
2      9542
3      3847
4      1624
5       821
6       336
7       183
8       107
10       68
9        61
Name: num_sentence, dtype: int64

In [None]:
# Encode the response variable: every category name gets its own integer id.
class_map = dict(
    affection=0,
    achievement=1,
    bonding=2,
    enjoy_the_moment=3,
    leisure=4,
    nature=5,
    exercise=6,
)

In [None]:
# Encode the labels from the untouched raw frame (aligned on the row index)
# rather than from new_data itself, so re-running this cell does not map
# already-encoded integers to NaN.
encoded_category = happy_data.loc[new_data.index, 'predicted_category'].map(class_map)

# A category missing from class_map would become NaN and break the integer
# label handling further down, so fail here and name the offending values.
if encoded_category.isna().any():
  unmapped_rows = encoded_category.index[encoded_category.isna()]
  unknown = happy_data.loc[unmapped_rows, 'predicted_category'].unique()
  raise ValueError('Categories missing from class_map: {}'.format(list(unknown)))

# assign() returns a new frame instead of writing into a filtered slice.
new_data = new_data.assign(predicted_category=encoded_category)

In [None]:
# Preview the frame; predicted_category now holds the integer codes from class_map
new_data.head()

Unnamed: 0,hmid,wid,reflection_period,original_hm,cleaned_hm,modified,num_sentence,ground_truth_category,predicted_category
0,27673,2053,24h,I went on a successful date with someone I fel...,I went on a successful date with someone I fel...,True,1,,0
1,27674,2,24h,I was happy when my son got 90% marks in his e...,I was happy when my son got 90% marks in his e...,True,1,,0
2,27675,1936,24h,I went to the gym this morning and did yoga.,I went to the gym this morning and did yoga.,True,1,,6
3,27676,206,24h,We had a serious talk with some friends of our...,We had a serious talk with some friends of our...,True,2,bonding,2
4,27677,6227,24h,I went with grandchildren to butterfly display...,I went with grandchildren to butterfly display...,True,1,,0


**Text Preprocessing**

In [None]:
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# Fetch the NLTK stopword corpus used by the preprocessing loop; nothing is
# re-downloaded when the package is already up to date locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Turn every cleaned happy moment into a list of lowercase, stopword-free tokens.

# Build the stopword lookup once. Calling stopwords.words('english') inside the
# loop re-reads the corpus for every single word, and a set gives constant-time
# membership tests instead of a list scan.
english_stopwords = set(stopwords.words('english'))

# Anything that is not an ASCII letter (punctuation, digits, symbols) becomes a space.
non_letters = re.compile('[^a-zA-Z]')

preprocessed_text = []
happy_texts = new_data['cleaned_hm'].values.tolist()
for item in happy_texts:

  # Strip non-letters, lower-case, then split on whitespace
  tokens = non_letters.sub(' ',item).lower().split()

  # Remove stopwords
  preprocessed_text.append([word for word in tokens if word not in english_stopwords])
  

In [None]:
#Save the preprocessed text
file = 'preprocessed_text.pkl'
# The with-block closes the handle (flushing the pickle to disk) even if dump() raises.
with open(file,'wb') as pkl_out:
  pickle.dump(preprocessed_text,pkl_out)

In [None]:
#Load the preprocessed text back from disk
# NOTE: pickle.load can execute arbitrary code, so only load files that this
# notebook wrote itself.
with open("preprocessed_text.pkl","rb") as pkl_in:
  preprocessed_text = pickle.load(pkl_in)

In [None]:
# Peek at the first two tokenized documents
preprocessed_text[:2]

[['went', 'successful', 'date', 'someone', 'felt', 'sympathy', 'connection'],
 ['happy', 'son', 'got', 'marks', 'examination']]

In [None]:
# Number of preprocessed documents
len(preprocessed_text)

100300

In [None]:
# Find the largest token count over all preprocessed documents (never below 1)
token_counts = [len(tokens) for tokens in preprocessed_text]
max_words = max([1] + token_counts)

print('Maxm no: of words in a sentence: {}'.format(max_words))

Maxm no: of words in a sentence: 129


In [None]:
# Number of tokens in each preprocessed document, in document order
num_words = list(map(len, preprocessed_text))

In [None]:
from collections import Counter
# (document length, number of documents) pairs, most frequent length first
Counter(num_words).most_common()


[(5, 12571),
 (6, 12125),
 (4, 11403),
 (7, 10025),
 (3, 8446),
 (8, 8015),
 (9, 6403),
 (10, 4973),
 (11, 3797),
 (2, 3316),
 (12, 2937),
 (13, 2452),
 (14, 1941),
 (15, 1685),
 (16, 1354),
 (17, 1105),
 (18, 822),
 (19, 756),
 (20, 664),
 (21, 553),
 (22, 503),
 (24, 448),
 (23, 443),
 (25, 313),
 (28, 265),
 (26, 263),
 (27, 225),
 (29, 208),
 (30, 190),
 (31, 157),
 (1, 147),
 (32, 129),
 (34, 126),
 (33, 116),
 (35, 114),
 (37, 88),
 (36, 80),
 (38, 79),
 (40, 78),
 (51, 77),
 (41, 65),
 (42, 62),
 (39, 62),
 (47, 61),
 (53, 53),
 (43, 45),
 (46, 40),
 (44, 39),
 (45, 38),
 (49, 38),
 (48, 31),
 (61, 26),
 (50, 26),
 (63, 23),
 (52, 20),
 (54, 20),
 (62, 20),
 (59, 17),
 (55, 16),
 (64, 16),
 (57, 15),
 (56, 15),
 (66, 14),
 (60, 13),
 (65, 12),
 (58, 8),
 (71, 8),
 (79, 8),
 (129, 7),
 (125, 7),
 (80, 6),
 (69, 6),
 (72, 6),
 (75, 6),
 (87, 6),
 (68, 5),
 (73, 5),
 (78, 5),
 (67, 4),
 (74, 4),
 (124, 4),
 (95, 3),
 (70, 3),
 (89, 3),
 (77, 2),
 (96, 2),
 (76, 2),
 (97, 1),
 (84, 

**Applying Word2Vec**

In [None]:
import gensim

In [None]:
# Number of dimensions of each word vector; used for Word2Vec training and for
# the width of the embedding matrix and Embedding layer.
embedding_dim = 100

In [None]:
# Train Word2Vec on the tokenized documents. min_count=1 keeps every word,
# including words that occur only once.
# NOTE(review): `size=` and `wv.vocab` are the gensim 3.x spellings; gensim 4
# renamed them to `vector_size=` and `wv.key_to_index` - confirm the pinned version.
# NOTE(review): no seed is passed and workers=4, so the learned vectors will
# presumably differ from run to run - confirm whether that matters here.
w2v_model = gensim.models.Word2Vec(sentences=preprocessed_text,
                                   size=embedding_dim,
                                   window=5,
                                   workers=4,
                                   min_count=1)
# Words known to the trained model, as a plain list
word_vocab = list(w2v_model.wv.vocab)

In [None]:
# Vocabulary size of the trained Word2Vec model
len(word_vocab)

23628

**Save the word vectors for later use**

In [None]:
# Write the trained vectors to happy_w2v.txt; the file is read back as text
# when the embedding dictionary is built.
w2v_model.wv.save_word2vec_format("happy_w2v.txt")

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

In [None]:
# Create a tokenizer object and learn the vocabulary from the tokenized documents
tokenizer = Tokenizer()
tokenizer.fit_on_texts(preprocessed_text)

#Get index position of each unique word in preprocessed_text
word_index = tokenizer.word_index

#Convert the words in each sentence into its corresponding index position
#each sentence will be encoded like [4,611,205,41,9566]
sequences = tokenizer.texts_to_sequences(preprocessed_text)

#Pad the sequences to be the same length.
# NOTE(review): max_length (54) is below the longest document found earlier
# (129 tokens), so longer documents are cut. pad_sequences truncates from the
# front by default (truncating='pre') while padding='post' pads at the end -
# confirm that dropping the start of long documents is intended.
max_length = 54
padded_data = pad_sequences(sequences=sequences,maxlen=max_length,padding='post')
# Integer class labels taken from new_data, the frame the texts were built from
labels = new_data['predicted_category'].values




**Split the data into a training set and a validation set**

In [None]:
# Shuffle samples and labels with one shared permutation so that every row
# keeps its own label.
indices = np.arange(padded_data.shape[0])

#Assign a seed for reproducibility
np.random.seed(99)
np.random.shuffle(indices)
# NOTE: both names are rebound to the shuffled arrays, so re-running this cell
# shuffles the already-shuffled data again.
padded_data = padded_data[indices]
labels = labels[indices]

In [None]:
# One-hot encode the integer labels: row k of the identity matrix is the one-hot
# vector for class k, so indexing np.eye with the label array yields one row per
# sample and one column per class.
# NOTE: `labels` is rebound to the one-hot array, so this cell is meant to run
# once per kernel session; a re-run would index with the one-hot array itself.
max_label_no = np.max(labels) + 1
labels = np.eye(max_label_no)[labels]

In [None]:
#Do 80-20 split
validation_ratio = 0.2
num_validation_samples = int(validation_ratio*padded_data.shape[0])

# Slice at an explicit non-negative index. The negative form [:-n] / [-n:]
# flips when n is 0 ([:-0] is empty and [-0:] is everything), which would
# silently put every sample in the validation set for a tiny dataset or a
# ratio of 0. The data was shuffled beforehand, so taking the tail is a random split.
split_at = padded_data.shape[0] - num_validation_samples

x_train = padded_data[:split_at]
y_train = labels[:split_at]
x_val = padded_data[split_at:]
y_val = labels[split_at:]

print('Shape of x_train:', x_train.shape)
print('Shape of y_train:', y_train.shape)

print('Shape of x_val:', x_val.shape)
print('Shape of y_val:', y_val.shape)

Shape of x_train: (80240, 54)
Shape of y_train: (80240, 7)
Shape of x_val: (20060, 54)
Shape of y_val: (20060, 7)


**Create a weight matrix for words in training docs**

In [None]:
#Load the embedded file into memory as a dictionary of word to embedding array.
embeddings_index = {}
# The with-block closes the file even if a line fails to parse.
with open(os.path.join('', 'happy_w2v.txt'),  encoding = "utf-8") as f:
  # The first line of a word2vec text file is a "<vocab_size> <dimensions>"
  # header, not a word vector, so skip it instead of storing it as a fake word.
  next(f, None)
  for line in f:
    values = line.split()
    # Ignore blank lines (e.g. a trailing newline at the end of the file)
    if not values:
      continue
    word = values[0]
    # Parse the coefficients as numbers; without a dtype they stay as strings
    vec = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = vec

In [None]:
#Use embedding_index dictionary and word_index to compute the embedding matrix:
# One row per word index plus row 0, which no word in word_index maps to
# (the loop below never writes it, so it stays all-zero).
input_dim = len(word_index)+1
embedding_matrix = np.zeros((input_dim,embedding_dim))

# Every index in word_index is below input_dim, so no bounds check is needed
# (the former `i > len(word_index)+1` guard could never be true).
for word,i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  # Words without a trained vector keep their all-zero row.
  if embedding_vector is not None:
    embedding_matrix[i] = embedding_vector

**Build the Model**

In [None]:
# Classifier: pretrained Word2Vec embeddings -> LSTM -> softmax over the classes
model = Sequential()
# The Embedding layer is initialized from embedding_matrix and kept fixed
# (trainable=False), so only the LSTM and Dense weights are learned.
embedding_layer = Embedding(input_dim=input_dim,
                            output_dim=embedding_dim,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length = max_length,
                            trainable=False
                            )
model.add(embedding_layer)
# NOTE(review): the summary saved under this cell reports an LSTM output of 96
# units, not 80 - the recorded output looks stale; re-run to refresh it.
model.add(LSTM(units=80,dropout=0.2,recurrent_dropout=0.2))
# One softmax output per class (class_map has 7 entries)
model.add(Dense(units=7,activation='softmax'))

# categorical_crossentropy matches the one-hot encoded labels
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 54, 100)           2362900   
_________________________________________________________________
lstm (LSTM)                  (None, 96)                75648     
_________________________________________________________________
dense (Dense)                (None, 7)                 679       
Total params: 2,439,227
Trainable params: 76,327
Non-trainable params: 2,362,900
_________________________________________________________________


**Save the best model**


In [None]:
from keras.callbacks import ModelCheckpoint
# File name template: the epoch number and that epoch's val_accuracy are filled in
filepath = 'lstm-model-weights-{epoch:03d}-{val_accuracy:03f}.h5'
# save_best_only=True writes a file only when val_accuracy improves on the best so far.
# NOTE(review): the recorded training log shows files named 'lstm96-model-weights-...',
# which does not match this template - the log looks like it came from an earlier run.
checkpoint = ModelCheckpoint(filepath, verbose=1, monitor='val_accuracy',save_best_only=True, mode='auto')
callbacks_list = [checkpoint]

**Train the model**

In [None]:
# Train for 25 epochs in mini-batches of 128, evaluating on the validation split
# after every epoch; the checkpoint callback saves the model whenever
# val_accuracy improves.
model.fit(x_train, y_train, batch_size=128, epochs=25, validation_data=(x_val, y_val), callbacks=callbacks_list)


Epoch 1/25
Epoch 00001: val_accuracy improved from -inf to 0.78923, saving model to lstm96-model-weights-001-0.789232.h5
Epoch 2/25
Epoch 00002: val_accuracy improved from 0.78923 to 0.80718, saving model to lstm96-model-weights-002-0.807178.h5
Epoch 3/25
Epoch 00003: val_accuracy improved from 0.80718 to 0.82871, saving model to lstm96-model-weights-003-0.828714.h5
Epoch 4/25
Epoch 00004: val_accuracy did not improve from 0.82871
Epoch 5/25
Epoch 00005: val_accuracy improved from 0.82871 to 0.83978, saving model to lstm96-model-weights-005-0.839781.h5
Epoch 6/25
Epoch 00006: val_accuracy did not improve from 0.83978
Epoch 7/25
Epoch 00007: val_accuracy improved from 0.83978 to 0.84920, saving model to lstm96-model-weights-007-0.849202.h5
Epoch 8/25
Epoch 00008: val_accuracy improved from 0.84920 to 0.86002, saving model to lstm96-model-weights-008-0.860020.h5
Epoch 9/25
Epoch 00009: val_accuracy did not improve from 0.86002
Epoch 10/25
Epoch 00010: val_accuracy improved from 0.86002 t

<tensorflow.python.keras.callbacks.History at 0x7fe4e44da668>

**Load the weights of the best model**

In [None]:
# Reload a saved checkpoint with load_model and show its layer summary.
# NOTE(review): the file name is hard-coded to one particular run's checkpoint
# (epoch 022, val_accuracy 0.875972); a fresh training run will write different
# file names, so update this path after retraining.
filepath = 'lstm-model-weights-022-0.875972.h5'
model = load_model(filepath=filepath)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 54, 100)           2362900   
_________________________________________________________________
lstm (LSTM)                  (None, 80)                57920     
_________________________________________________________________
dense (Dense)                (None, 7)                 567       
Total params: 2,421,387
Trainable params: 58,487
Non-trainable params: 2,362,900
_________________________________________________________________


**Prediction**

In [None]:
#Prediction for a random single value
index_position = np.random.randint(len(y_val))

# Slices (not plain indexing) keep the leading batch dimension of size 1
sample = x_val[index_position:index_position+1]
actual_one_hot = y_val[index_position:index_position+1]

# Most probable class id for the sample, and its one-hot form for display
prediction = np.argmax(model.predict(sample),axis=-1)
predicted_matrix = np.eye(max_label_no)[prediction]
print('Actual Label:-',actual_one_hot)
print('Predicted Label:-',predicted_matrix)
print("----------")

# Invert class_map once (id -> category name) instead of scanning it. The scan
# printed the two lines in class-id order, so "Predicted" could come before
# "Actual" whenever the predicted id was the smaller one.
index_to_class = {class_id: class_name for class_name, class_id in class_map.items()}
actual_class = index_to_class[int(np.argmax(actual_one_hot))]
predicted_class = index_to_class[int(prediction[0])]
print("Actual reason for happiness is {}".format(actual_class))
print("Predicted reason for happiness is {}".format(predicted_class))


Actual Label:- [[0. 0. 0. 1. 0. 0. 0.]]
Predicted Label:- [[0. 0. 0. 1. 0. 0. 0.]]
----------
Acutal reason for happiness is enjoy_the_moment
Predicted reason for happiness is enjoy_the_moment
