### Import the dependencies

In [1]:
import nltk
import string
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.layers import Dense, Dropout, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import MeanSquaredError, CategoricalCrossentropy
from tensorflow.sparse import  SparseTensor, to_dense

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder

import gc

In [2]:
gc.collect()

20

In [3]:
# Set display column width to 500

pd.set_option('display.max_colwidth', 500)

### Read the training data

In [4]:
data_train = pd.read_csv("train.txt", sep=';', header=None)
data_val = pd.read_csv("val.txt", sep=';', header=None)
data_test = pd.read_csv("test.txt", sep=';', header=None)
data_train.columns = ['text', 'label']
data_val.columns = ['text', 'label']
data_test.columns = ['text', 'label']

data = pd.concat((data_train,data_val,data_test),axis=0)

data.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love
4,i am feeling grouchy,anger


In [5]:
data

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love
4,i am feeling grouchy,anger
...,...,...
1995,i just keep feeling like someone is being unkind to me and doing me wrong and then all i can think of doing is to get back at them and the people they are close to,anger
1996,im feeling a little cranky negative after this doctors appointment,anger
1997,i feel that i am useful to my people and that gives me a great feeling of achievement,joy
1998,im feeling more comfortable with derby i feel as though i can start to step out my shell,joy


### Remove punctuation

In [6]:
data["punct_count"] = len([char for char in data["text"] if char in string.punctuation])

data["punct_count"].value_counts()

0    20000
Name: punct_count, dtype: int64

### Tokenization

In [7]:
from nltk.tokenize import word_tokenize

data['text_tokenized'] = data["text"].apply(lambda x: word_tokenize(x))


In [8]:
data.head()

Unnamed: 0,text,label,punct_count,text_tokenized
0,i didnt feel humiliated,sadness,0,"[i, didnt, feel, humiliated]"
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness,0,"[i, can, go, from, feeling, so, hopeless, to, so, damned, hopeful, just, from, being, around, someone, who, cares, and, is, awake]"
2,im grabbing a minute to post i feel greedy wrong,anger,0,"[im, grabbing, a, minute, to, post, i, feel, greedy, wrong]"
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love,0,"[i, am, ever, feeling, nostalgic, about, the, fireplace, i, will, know, that, it, is, still, on, the, property]"
4,i am feeling grouchy,anger,0,"[i, am, feeling, grouchy]"


In [9]:
data["label"].value_counts()

joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
Name: label, dtype: int64

### Remove stopwords

In [10]:
# nltk stopwords
stopwords = nltk.corpus.stopwords.words('english')

# Additional Stop words

stop_words = ["arent", "cant", "couldnt", "didnt", "doesnt", "dont", "hadnt", "hasnt", "havent", "hed", "hell", "hes", "Id", "Ill", "Im", "Ive", "isnt", "lets", "mightnt", "mustnt", "shant", "shed", "shell", "shes", "shouldnt", "thats", "theres", "theyd", "theyll", "theyre", "theyve", "wed", "were", "weve", "werent", "whatll", "whatre", "whats", "whatve", "wheres", "whos", "wholl", "whore", "whos", "whove", "wont", "wouldnt", "youd", "youll", "youre", "youve"]

In [11]:
def remove_stopwords(word_list):
    return " ".join([WordNetLemmatizer().lemmatize(word) for word in word_list if word not in stopwords and word not in stop_words])

data["text_tokenized_nostop"] = data["text_tokenized"].apply(lambda x: remove_stopwords(x))
data.head()

Unnamed: 0,text,label,punct_count,text_tokenized,text_tokenized_nostop
0,i didnt feel humiliated,sadness,0,"[i, didnt, feel, humiliated]",feel humiliated
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness,0,"[i, can, go, from, feeling, so, hopeless, to, so, damned, hopeful, just, from, being, around, someone, who, cares, and, is, awake]",go feeling hopeless damned hopeful around someone care awake
2,im grabbing a minute to post i feel greedy wrong,anger,0,"[im, grabbing, a, minute, to, post, i, feel, greedy, wrong]",im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love,0,"[i, am, ever, feeling, nostalgic, about, the, fireplace, i, will, know, that, it, is, still, on, the, property]",ever feeling nostalgic fireplace know still property
4,i am feeling grouchy,anger,0,"[i, am, feeling, grouchy]",feeling grouchy


### Vectorize

In [13]:
gc.collect()

40

In [14]:
tfidf_vect = TfidfVectorizer()
X_tfidf_vect = tfidf_vect.fit_transform(data.iloc[:data.shape[0],:]["text_tokenized_nostop"])

X_features = pd.DataFrame(X_tfidf_vect.toarray())
X_scaler = StandardScaler()
X_features_scaled = X_scaler.fit_transform(X_features)
X_features_scaled


array([[-0.01212179, -0.00707124, -0.00707124, ..., -0.00707124,
        -0.01217736, -0.00707124],
       [-0.01212179, -0.00707124, -0.00707124, ..., -0.00707124,
        -0.01217736, -0.00707124],
       [-0.01212179, -0.00707124, -0.00707124, ..., -0.00707124,
        -0.01217736, -0.00707124],
       ...,
       [-0.01212179, -0.00707124, -0.00707124, ..., -0.00707124,
        -0.01217736, -0.00707124],
       [-0.01212179, -0.00707124, -0.00707124, ..., -0.00707124,
        -0.01217736, -0.00707124],
       [-0.01212179, -0.00707124, -0.00707124, ..., -0.00707124,
        -0.01217736, -0.00707124]])

In [15]:
gc.collect()

20

In [16]:
X = X_features.to_numpy()
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
encoder = OneHotEncoder()
y = encoder.fit_transform(data[["label"]])
pd.DataFrame(y.toarray())

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
19995,1.0,0.0,0.0,0.0,0.0,0.0
19996,1.0,0.0,0.0,0.0,0.0,0.0
19997,0.0,0.0,1.0,0.0,0.0,0.0
19998,0.0,0.0,1.0,0.0,0.0,0.0


In [18]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return SparseTensor(indices, coo.data, coo.shape)

In [19]:
type(y)

scipy.sparse.csr.csr_matrix

In [20]:
from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
X_train = X[:data_train.shape[0],:]
y_train = y[:data_train.shape[0]]
X_val = X[data_train.shape[0]:data_train.shape[0] + data_val.shape[0],:]
y_val = y[data_train.shape[0]:data_train.shape[0] + data_val.shape[0]]
X_test = X[data_train.shape[0] + data_val.shape[0]:data_train.shape[0]
              + data_val.shape[0] 
              + data_test.shape[0],:]
y_test = y[data_train.shape[0] + data_val.shape[0]:data_train.shape[0]
              + data_val.shape[0] 
              + data_test.shape[0]:]

# y = to_dense(convert_sparse_matrix_to_sparse_tensor(y))


In [21]:
y_train

<16000x6 sparse matrix of type '<class 'numpy.float64'>'
	with 16000 stored elements in Compressed Sparse Row format>

In [22]:
y_train = to_dense(convert_sparse_matrix_to_sparse_tensor(y_train))
y_val = to_dense(convert_sparse_matrix_to_sparse_tensor(y_val))
y_test = to_dense(convert_sparse_matrix_to_sparse_tensor(y_test))

In [23]:
seq = Sequential()
seq.add(Dense(units=1000,activation="relu",input_dim=X_train.shape[1]))
seq.add(Dropout(0.2))
seq.add(Dense(units=50,activation="relu"))
seq.add(Dropout(0.2))
seq.add(Dense(units=6,activation="softmax"))

seq.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

In [24]:
gc.collect()

20

In [25]:
# Train the model
# fit_model = seq.fit(X_train,y_train,epochs=20)
fit_model = seq.fit(X_train,y_train,epochs=20, validation_data=(X_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [26]:
gc.collect()

1502

In [27]:
y_test_pred = seq.predict(X_test,use_multiprocessing=True)

In [28]:
y_test_nparray = y[data_train.shape[0] + data_val.shape[0]:data_train.shape[0]
              + data_val.shape[0] 
              + data_test.shape[0]]

In [29]:
def float_to_int(fl):
    int_2d = []
    for row in fl:
        i_row = []
        for x in row:
            i_row.append(int(x+0.5))
        int_2d.append(i_row)
    
    return np.array(int_2d)
    

In [30]:
y_test_pred_int = float_to_int(y_test_pred)

In [31]:
y_test_pred = float_to_int(y_test_pred)

In [32]:
y_test_pred

array([[0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0],
       ...,
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 1]])

In [33]:
y[data_train.shape[0] + data_val.shape[0]:data_train.shape[0]\
              + data_val.shape[0] 
              + data_test.shape[0]:].toarray()

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [34]:
(y_test_pred == y_test_nparray).sum()/(y_test_pred.shape[0] * y_test_pred.shape[1])

0.96175