In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from tensorflow.keras.models import Sequential
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import string
from textblob import TextBlob
import joblib


In [None]:
# Count the lines in the test split. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open('data/test.txt', 'r') as test_file:
    n_test_lines = sum(1 for _ in test_file)
n_test_lines

In [None]:
# Count the lines in the train split. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open('data/train.txt', 'r') as train_file:
    n_train_lines = sum(1 for _ in train_file)
n_train_lines

In [None]:
# Count the lines in the validation split. The context manager closes the file
# handle deterministically instead of leaving it open until garbage collection.
with open('data/val.txt', 'r') as val_file:
    n_val_lines = sum(1 for _ in val_file)
n_val_lines

In [None]:
# Read the test split as a list of raw lines (newline terminators are kept).
# `with` guarantees the file handle is closed once the lines are loaded.
with open('data/test.txt', 'r') as test_file:
    test_data = test_file.readlines()
# Preview a few lines instead of echoing the entire list into the notebook.
test_data[:5]

In [None]:
# Read the train split as a list of raw lines (newline terminators are kept).
# `with` guarantees the file handle is closed once the lines are loaded.
with open('data/train.txt', 'r') as train_file:
    train_data = train_file.readlines()
# Preview a few lines instead of echoing the entire list into the notebook.
train_data[:5]

In [None]:
# Read the validation split as a list of raw lines (newline terminators are kept).
# `with` guarantees the file handle is closed once the lines are loaded.
with open('data/val.txt', 'r') as val_file:
    val_data = val_file.readlines()
# Preview a few lines instead of echoing the entire list into the notebook.
val_data[:5]

In [None]:
# Concatenate the three line lists into one list, in the order
# test, then train, then val.
total_data = test_data + train_data + val_data

In [None]:
# Total number of raw lines across the three files.
len(total_data)

In [None]:
# Split every "text;label" line into the sentence (x) and its label (y).
x = []
y = []

for item in total_data:
    if not item.strip():
        # Skip blank lines (e.g. a trailing empty line at the end of a file)
        # instead of failing to unpack them.
        continue
    # Split on the LAST ';' only, so a ';' inside the sentence itself no
    # longer raises "too many values to unpack".
    text, label = item.rsplit(';', 1)
    x.append(text)
    # strip() removes the line terminator ('\n' or '\r\n') and stray spaces.
    y.append(label.strip())


In [None]:
# Preview the first few sentences rather than dumping the whole list.
x[:5]

In [None]:
# Preview the first few labels rather than dumping the whole list.
y[:5]

#### Text Cleaning Steps


In [None]:
# Shared Porter stemmer instance; text_cleaning() reads it as a module-level global.
stem = PorterStemmer()

In [None]:
def text_cleaning(sentence):
    """Normalise a collection of sentences for tokenisation.

    Each sentence is lower-cased, split with ``word_tokenize``, filtered of
    English stop words, stemmed with the module-level ``stem`` object, and
    re-joined with single spaces.

    Parameters
    ----------
    sentence : iterable of str
        The sentences to clean. Despite the singular name this must be a
        collection: a bare string would be iterated character by character.

    Returns
    -------
    list of str
        One cleaned string per input sentence, in the same order.
    """
    # Build the stop-word set once. Calling stopwords.words('english') per
    # token (as a list membership test) repeats the same work for every word.
    stop_words = set(stopwords.words('english'))

    clean_text = []
    for sent in sentence:
        tokens = word_tokenize(sent.lower())
        # Stop words are removed before stemming, on the lower-cased tokens.
        stemmed = [stem.stem(tok) for tok in tokens if tok not in stop_words]
        clean_text.append(" ".join(stemmed))
    return clean_text
        

In [None]:
# Clean every raw sentence in x; the result is a list of cleaned strings
# in the same order as x.
clean_Data = text_cleaning(x)

In [None]:
# Preview the first few cleaned sentences rather than dumping the whole list.
clean_Data[:5]


In [41]:
# Keras tokenizer with an explicit out-of-vocabulary token, so words not seen
# while fitting map to '<nothing>' rather than being dropped.
tokenizer = Tokenizer(oov_token='<nothing>')

In [42]:
# Build the vocabulary from all cleaned sentences.
# NOTE(review): this fits on the full corpus before the train/test split
# further down, so the vocabulary includes words from rows that later become
# test data — confirm this is acceptable.
tokenizer.fit_on_texts(clean_Data)

In [43]:
# Inspect the learned word -> integer index mapping (the OOV token is index 1).
tokenizer.word_index

{'<nothing>': 1,
 'feel': 2,
 'like': 3,
 'im': 4,
 'get': 5,
 'time': 6,
 'know': 7,
 'realli': 8,
 'make': 9,
 'go': 10,
 'want': 11,
 'love': 12,
 'littl': 13,
 'think': 14,
 'peopl': 15,
 'day': 16,
 'thing': 17,
 'one': 18,
 'would': 19,
 'even': 20,
 'still': 21,
 'ive': 22,
 'life': 23,
 'bit': 24,
 'way': 25,
 'need': 26,
 'someth': 27,
 'much': 28,
 'dont': 29,
 'work': 30,
 'start': 31,
 'could': 32,
 'say': 33,
 'look': 34,
 'see': 35,
 'tri': 36,
 'back': 37,
 'good': 38,
 'pretti': 39,
 'come': 40,
 'right': 41,
 'alway': 42,
 'help': 43,
 'also': 44,
 'today': 45,
 'year': 46,
 'take': 47,
 'friend': 48,
 'use': 49,
 'around': 50,
 'cant': 51,
 'person': 52,
 'made': 53,
 'though': 54,
 'hate': 55,
 'well': 56,
 'got': 57,
 'happi': 58,
 'thought': 59,
 'someon': 60,
 'didnt': 61,
 'never': 62,
 'felt': 63,
 'find': 64,
 'write': 65,
 'lot': 66,
 'hope': 67,
 'quit': 68,
 'live': 69,
 'week': 70,
 'everi': 71,
 'sure': 72,
 'less': 73,
 'read': 74,
 'enough': 75,
 'give':

In [44]:
# Number of texts the tokenizer was fitted on.
tokenizer.document_count

20000

In [53]:
# Convert each cleaned sentence into a list of integer word indices.
sequences = tokenizer.texts_to_sequences(clean_Data)

In [62]:
# Zero-pad at the end ('post') so every row has a fixed length of 35 tokens.
# NOTE(review): rows longer than 35 are truncated; `truncating` is left at its
# default, which presumably drops tokens from the start — confirm that is intended.
sequences= pad_sequences(sequences, maxlen=35, padding='post')

In [71]:
# Sanity check: (number of sentences, padded length).
sequences.shape

(20000, 35)

In [63]:
# Preview the first five padded index rows.
sequences[0:5]

array([[   4,    2,  123,  625,    4, 3696,   41,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   4, 1451,  108,    2,  387,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [  62,    9, 1269,   92,   11,    2,    3,  333,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [  85, 6002,  822, 3184, 4452,  600,    2,  269,  547,  976,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0],
       [   2,   13,  472,   

In [64]:
# Distinct label strings found in y.
unique_labels = pd.Series(np.array(y)).unique()

In [65]:
# Display the distinct labels.
unique_labels

array(['sadness', 'joy', 'fear', 'anger', 'love', 'surprise'],
      dtype=object)

In [66]:
# Number of distinct classes.
len(unique_labels)

6

In [67]:
# Fixed label-name -> integer class id mapping; the position of each name in
# the tuple below is its class id.
labels_dict = {
    emotion: class_id
    for class_id, emotion in enumerate(
        ('sadness', 'love', 'anger', 'surprise', 'fear', 'joy')
    )
}

In [76]:
def label_encoder(labels, mapping=None):
    """Encode label names as an integer NumPy array.

    Parameters
    ----------
    labels : iterable of str
        Label names to encode.
    mapping : dict, optional
        Label name -> integer class id. When omitted, the module-level
        ``labels_dict`` is used, which matches the previous behaviour.

    Returns
    -------
    numpy.ndarray
        1-D integer array with one class id per input label, in input order.
        An empty input yields an empty integer array (previously float64).

    Raises
    ------
    KeyError
        If a label is not present in the mapping.
    """
    if mapping is None:
        mapping = labels_dict
    # dtype=int keeps the result integer even when `labels` is empty.
    return np.array([mapping[name] for name in labels], dtype=int)

In [79]:
# Integer-encode every label string in y via labels_dict.
label = label_encoder(labels=y)

In [80]:
# Preview the first five encoded labels.
label[0:5]

array([0, 0, 0, 5, 0])

In [72]:
# Feature matrix shape, shown next to label.shape to confirm the row counts match.
sequences.shape

(20000, 35)

In [81]:
# One encoded label per row of `sequences`.
label.shape

(20000,)

In [82]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible.
# NOTE(review): the split is not stratified; if the classes are imbalanced,
# consider passing stratify=label — confirm against the label distribution.
x_train, x_test, y_train, y_test = train_test_split(sequences, label,test_size=0.2, random_state=42)

In [83]:
# Confirm the sizes of the train and test partitions.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((16000, 35), (4000, 35), (16000,), (4000,))