In [1]:
import numpy as np
import pandas as pd
import nltk
import tensorflow
from tensorflow.keras.utils import pad_sequences
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.layers import LSTM,Dense , SimpleRNN , Embedding , Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [2]:
train_data = open(r"chat_recognition\train.txt").read()
test_data = open(r"chat_recognition\test.txt").read()
val_data = open(r"chat_recognition\val.txt").read()

In [3]:
train_data = train_data.split('\n')[:-1]
test_data = test_data.split('\n')[:-1]
val_data = val_data.split('\n')[:-1]

In [4]:
len(train_data)

16000

In [5]:
len(test_data)

2000

In [6]:
len(val_data)

2000

In [7]:
total_message = train_data + test_data + val_data

In [8]:
x = []
y = []
for i in total_message:
    message = i.split(';')
    x.append(message[0])
    y.append(message[1])

In [9]:
import numpy as np
np.unique(y)

array(['anger', 'fear', 'joy', 'love', 'sadness', 'surprise'], dtype='<U8')

In [10]:
import pandas as pd
pd.DataFrame(y).value_counts()

joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
dtype: int64

In [11]:
x[:5] , y[:5]

(['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy'],
 ['sadness', 'sadness', 'anger', 'love', 'anger'])

In [12]:
stemming = PorterStemmer()
def text_cleaning(ls_sentence):
    clean_message_ls = []
    for sentence in ls_sentence:
        lower_sentence = sentence.lower()
        ls_of_words = nltk.word_tokenize(lower_sentence)
        ls_of_removed_stopwords = [i for i in ls_of_words if i not in stopwords.words('english')]
        stemmed_word_ls = [stemming.stem(word) for word in ls_of_removed_stopwords]
        cleaned_message = " ".join(stemmed_word_ls)
        clean_message_ls.append(cleaned_message)
    return clean_message_ls

cleaned_message_ls = text_cleaning(x)

In [13]:
len(cleaned_message_ls) , len(x)

(20000, 20000)

In [14]:
tokenizer = Tokenizer(oov_token = "<nothing>") # out of word vocabulary

In [15]:
tokenizer.fit_on_texts(cleaned_message_ls)

In [16]:
tokenizer.document_count

20000

In [17]:
# tokenizer.word_counts

In [18]:
min((tokenizer.word_counts).values())

1

In [19]:
len(tokenizer.word_index.keys())

11597

In [20]:
sequence = tokenizer.texts_to_sequences(cleaned_message_ls)

In [21]:
sequence[0]

[61, 2, 522]

In [22]:
max(list(map(len, sequence)))

35

In [23]:
sequences = pad_sequences(sequences = sequence , padding='post' , maxlen = 35)

Label Encoding

In [24]:
dt = {i:n for n,i in enumerate(np.unique(y))}

In [25]:
label = np.array([dt[i] for i in y])

In [26]:
label

array([4, 4, 0, ..., 2, 2, 2])

One Hot Encoding

In [27]:
from tensorflow.keras.utils import to_categorical

In [28]:
y2 = to_categorical(label)

In [29]:
y2

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [30]:
x_train , x_test , y_train , y_test = train_test_split(sequences , label , test_size=0.2)

In [31]:
x_train.shape , y_train.shape

((16000, 35), (16000,))

In [32]:
x_test.shape , y_test.shape

((4000, 35), (4000,))

In [33]:
model = Sequential()
model.add(SimpleRNN(32 , input_shape = (35,1) , return_sequences = False)) # If we are adding no more RNN layers we write return_sequence= Flase else True
model.add(Dense(len(set(y_train)), activation = 'softmax'))

model.compile(optimizer = 'adam' , loss ='sparse_categorical_crossentropy' , metrics = ['accuracy'])
model.summary()

  super().__init__(**kwargs)


    FOR LABEL ENCODING : SPARSE CATEGORICAL CROSS ENTROPY
    FOR ONE HOT ENCODING : CATEGORICAL CROSS ENTROPY

In [34]:
y_train.shape

(16000,)

In [35]:
x_train[0].shape

(35,)

In [109]:
history = model.fit(x_train , y_train , epochs = 5 , validation_data = (x_test , y_test))

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.3172 - loss: 1.6134 - val_accuracy: 0.3300 - val_loss: 1.5865
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.3345 - loss: 1.5806 - val_accuracy: 0.3313 - val_loss: 1.5748
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.3383 - loss: 1.5747 - val_accuracy: 0.3300 - val_loss: 1.5756
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.3300 - loss: 1.5718 - val_accuracy: 0.3335 - val_loss: 1.5762
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.3452 - loss: 1.5754 - val_accuracy: 0.3320 - val_loss: 1.5773


In [41]:
pd.Series(y).value_counts()

joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
dtype: int64

In [67]:
model2 = Sequential([
    LSTM(units = 72, input_shape = (35,1), return_sequences = True),
    LSTM(units = 64 , dropout = 0.4),
    Dense(units = 6, activation = 'softmax')
])

model2.compile(optimizer = 'adam' , loss='sparse_categorical_crossentropy' , metrics = ['accuracy'])
model2.summary()

In [68]:
history = model2.fit(x_train , y_train , epochs = 20 , validation_data = (x_test , y_test))

Epoch 1/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 24ms/step - accuracy: 0.3320 - loss: 1.6061 - val_accuracy: 0.3430 - val_loss: 1.5647
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 23ms/step - accuracy: 0.3412 - loss: 1.5766 - val_accuracy: 0.3455 - val_loss: 1.5580
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - accuracy: 0.3355 - loss: 1.5793 - val_accuracy: 0.3433 - val_loss: 1.5590
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 22ms/step - accuracy: 0.3370 - loss: 1.5797 - val_accuracy: 0.3430 - val_loss: 1.5600
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - accuracy: 0.3351 - loss: 1.5791 - val_accuracy: 0.3495 - val_loss: 1.5583
Epoch 6/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 23ms/step - accuracy: 0.3392 - loss: 1.5753 - val_accuracy: 0.3433 - val_loss: 1.5574
Epoch 7/20
[1m5

In [69]:
predictions = model2.predict(x_test)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step


In [75]:
np.argmax(predictions , axis = 1)

array([2, 2, 2, ..., 2, 2, 2], dtype=int64)

In [76]:
y_test

array([4, 2, 2, ..., 4, 4, 2])

In [79]:
pd.Series(y_train).value_counts()

2    5389
4    4610
0    2175
1    1934
3    1300
5     592
dtype: int64

In [80]:
predictions[0]

array([0.13819131, 0.1078889 , 0.365781  , 0.09979309, 0.24518138,
       0.04316438], dtype=float32)

In [83]:
pd.Series(y_train).value_counts()

2    5389
4    4610
0    2175
1    1934
3    1300
5     592
dtype: int64

In [87]:
dt

{'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}