### Import the dependencies

In [1]:
import nltk
import string
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.layers import Dense, Dropout, Lambda
from tensorflow.keras.models import Sequential
from tensorflow.keras.losses import MeanSquaredError, CategoricalCrossentropy
from tensorflow.sparse import  SparseTensor, to_dense

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.preprocessing import StandardScaler,LabelEncoder, OneHotEncoder

import gc

### Run garbage collection to free up memory

In [2]:
gc.collect()

58

### Increase the size of the columns to be able to view more data

In [3]:
# Set display column width to 500

pd.set_option('display.max_colwidth', 500)

### Read the training data

In [4]:
# Read the csv files

data_train = pd.read_csv("train.txt", sep=';', header=None)
data_val = pd.read_csv("val.txt", sep=';', header=None)
data_test = pd.read_csv("test.txt", sep=';', header=None)

# Set the column headers of the dataframes

data_train.columns = ['text', 'label']
data_val.columns = ['text', 'label']
data_test.columns = ['text', 'label']

# Combine all three dataframes into one

data = pd.concat((data_train,data_val,data_test),axis=0)

data.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love
4,i am feeling grouchy,anger


### Remove punctuation

In [5]:
### Create a punct_count column to contain the number of punctuation characters in each line of data
data["punct_count"] = len([char for char in data["text"] if char in string.punctuation])

### Get the frequency of the number of punctuation counts 
data["punct_count"].value_counts()

0    20000
Name: punct_count, dtype: int64

### Tokenization

In [6]:
from nltk.tokenize import word_tokenize

data['text_tokenized'] = data["text"].apply(lambda x: word_tokenize(x))

In [7]:
data.head()

Unnamed: 0,text,label,punct_count,text_tokenized
0,i didnt feel humiliated,sadness,0,"[i, didnt, feel, humiliated]"
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness,0,"[i, can, go, from, feeling, so, hopeless, to, so, damned, hopeful, just, from, being, around, someone, who, cares, and, is, awake]"
2,im grabbing a minute to post i feel greedy wrong,anger,0,"[im, grabbing, a, minute, to, post, i, feel, greedy, wrong]"
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love,0,"[i, am, ever, feeling, nostalgic, about, the, fireplace, i, will, know, that, it, is, still, on, the, property]"
4,i am feeling grouchy,anger,0,"[i, am, feeling, grouchy]"


In [8]:
data["label"].value_counts()

joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
Name: label, dtype: int64

### Remove stopwords

In [9]:
# nltk stopwords
stopwords = nltk.corpus.stopwords.words('english')

# Additional Stop words
stop_words = ["arent", "cant", "couldnt", "didnt", "doesnt", "dont", "hadnt", "hasnt", "havent", "hed", "hell", "hes", "Id", "Ill", "Im", "Ive", "isnt", "lets", "mightnt", "mustnt", "shant", "shed", "shell", "shes", "shouldnt", "thats", "theres", "theyd", "theyll", "theyre", "theyve", "wed", "were", "weve", "werent", "whatll", "whatre", "whats", "whatve", "wheres", "whos", "wholl", "whore", "whos", "whove", "wont", "wouldnt", "youd", "youll", "youre", "youve"]

In [10]:
def remove_stopwords(word_list):
    return " ".join([WordNetLemmatizer().lemmatize(word) for word in word_list if word not in stopwords and word not in stop_words])

data["text_tokenized_nostop"] = data["text_tokenized"].apply(lambda x: remove_stopwords(x))
data.head()

Unnamed: 0,text,label,punct_count,text_tokenized,text_tokenized_nostop
0,i didnt feel humiliated,sadness,0,"[i, didnt, feel, humiliated]",feel humiliated
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,sadness,0,"[i, can, go, from, feeling, so, hopeless, to, so, damned, hopeful, just, from, being, around, someone, who, cares, and, is, awake]",go feeling hopeless damned hopeful around someone care awake
2,im grabbing a minute to post i feel greedy wrong,anger,0,"[im, grabbing, a, minute, to, post, i, feel, greedy, wrong]",im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,love,0,"[i, am, ever, feeling, nostalgic, about, the, fireplace, i, will, know, that, it, is, still, on, the, property]",ever feeling nostalgic fireplace know still property
4,i am feeling grouchy,anger,0,"[i, am, feeling, grouchy]",feeling grouchy


### Vectorize

### Run garbage collection to free up memory

In [11]:
gc.collect()

27

### Vectorize the text using the TfidfVectorizer

In [12]:
# Create the TfidfVectorizer and fit it to the entire text corpus (training, validation, test)
tfidf_vect = TfidfVectorizer()
X_tfidf_vect = tfidf_vect.fit_transform(data.iloc[:data.shape[0],:]["text_tokenized_nostop"])

# Get the features (tokens) that were identified by the vectorizer
X_features = pd.DataFrame(X_tfidf_vect.toarray())

# Scale the features 
X_scaler = StandardScaler()
X_features_scaled = X_scaler.fit_transform(X_features)
X = X_features_scaled

### Run garbage collection to free up memory

In [13]:
gc.collect()

20

In [14]:
encoder = OneHotEncoder()
y = encoder.fit_transform(data[["label"]])
pd.DataFrame(y.toarray())

Unnamed: 0,0,1,2,3,4,5
0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
19995,1.0,0.0,0.0,0.0,0.0,0.0
19996,1.0,0.0,0.0,0.0,0.0,0.0
19997,0.0,0.0,1.0,0.0,0.0,0.0
19998,0.0,0.0,1.0,0.0,0.0,0.0


### Convert sparse matrix to SparseTensor

In [15]:
def convert_sparse_matrix_to_sparse_tensor(X):
    coo = X.tocoo()
    indices = np.mat([coo.row, coo.col]).transpose()
    return SparseTensor(indices, coo.data, coo.shape)

In [16]:
type(y)

scipy.sparse.csr.csr_matrix

### Retrieve the training, validation, and test datasets from the larger dataset

In [17]:
# Train
X_train = X[:data_train.shape[0],:]
y_train = y[:data_train.shape[0]]

# Validation
X_val = X[data_train.shape[0]:data_train.shape[0] + data_val.shape[0],:]
y_val = y[data_train.shape[0]:data_train.shape[0] + data_val.shape[0]]

# Test
X_test = X[data_train.shape[0] + data_val.shape[0]:data_train.shape[0]
              + data_val.shape[0] 
              + data_test.shape[0],:]
y_test = y[data_train.shape[0] + data_val.shape[0]:data_train.shape[0]
              + data_val.shape[0] 
              + data_test.shape[0]]

### Convert target dataframes to dense tensors

In [18]:
y_train = to_dense(convert_sparse_matrix_to_sparse_tensor(y_train))
y_val = to_dense(convert_sparse_matrix_to_sparse_tensor(y_val))
y_test = to_dense(convert_sparse_matrix_to_sparse_tensor(y_test))

### Create the neural network

In [19]:
seq = Sequential()
seq.add(Dense(units=1000,activation="relu",input_dim=X_train.shape[1]))
seq.add(Dropout(0.2))
seq.add(Dense(units=50,activation="relu"))
seq.add(Dropout(0.2))
seq.add(Dense(units=6,activation="softmax"))

seq.compile(loss="categorical_crossentropy", optimizer="rmsprop", metrics=["accuracy"])

### Run garbage collection to free up memory

In [20]:
gc.collect()

20

### Fit the sequential model to the training, validating with the validation data

In [21]:
# Train the model
# fit_model = seq.fit(X_train,y_train,epochs=20)
fit_model = seq.fit(X_train,y_train,epochs=20, validation_data=(X_val, y_val))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


### Run garbage collection to free up memory

In [22]:
gc.collect()

1502

### Predict the labels of the test dataset

In [23]:
y_test_pred = seq.predict(X_test,use_multiprocessing=True)

### Display the raw predictions

In [24]:
y_test_pred

array([[5.24492789e-05, 8.82536533e-09, 5.33850698e-06, 4.33755115e-10,
        9.99942183e-01, 4.38483222e-10],
       [7.29042540e-06, 4.23976021e-10, 1.02398783e-08, 3.59674821e-14,
        9.99992728e-01, 7.03856935e-15],
       [3.11584372e-05, 1.61219832e-05, 3.02527472e-03, 2.68819256e-09,
        9.96927440e-01, 3.05058667e-09],
       ...,
       [2.66448613e-22, 1.20579375e-14, 1.00000000e+00, 1.24946702e-08,
        1.14732930e-13, 2.68466977e-20],
       [7.73647257e-11, 4.24790613e-07, 9.99984264e-01, 1.51705044e-05,
        1.28426521e-07, 1.29354183e-09],
       [2.16530892e-03, 5.52239120e-01, 9.49000847e-03, 8.11773643e-05,
        1.72783539e-01, 2.63240933e-01]], dtype=float32)

### Function to convert lists of floats into lists of ints

In [25]:
def float_to_int(fl):
    int_2d = []
    for row in fl:
        i_row = []
        for x in row:
            i_row.append(int(x+0.5))
        int_2d.append(i_row)
    
    return np.array(int_2d)
    

### Convert the elements of the prediction dataframe to int

In [26]:
y_test_pred = float_to_int(y_test_pred)

In [27]:
y_test_pred

array([[0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 1, 0],
       ...,
       [0, 0, 1, 0, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0]])

### Retrieve the test portion of the larger dataframe 

In [28]:
y_test_nparray = y[data_train.shape[0] + data_val.shape[0]:data_train.shape[0]
              + data_val.shape[0] 
              + data_test.shape[0]]

In [29]:
y[data_train.shape[0] + data_val.shape[0]:data_train.shape[0]\
              + data_val.shape[0] 
              + data_test.shape[0]:].toarray()

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.]])

In [30]:
y_test_nparray

<2000x6 sparse matrix of type '<class 'numpy.float64'>'
	with 2000 stored elements in Compressed Sparse Row format>

### Determine the accuracy of the predicted results

In [31]:
(y_test_pred == y_test_nparray).sum()/(y_test_pred.shape[0] * y_test_pred.shape[1])

0.9255833333333333