# CS4662 
## Group Project: Twitter Emotion Identification
### Instructor: Dr. Mohammad Porhomayoun

### Ponaroth Eab
### Using Artificial Neural Network

Spring 2020

In [1]:
# import modules
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import Normalizer


In [2]:

json_list = ['anger', 'fear', 'greed', 'hateful', 'joy', 'sadness']

all_list = []
limit = 10000

for name in json_list:
    # load json data as a list of strings
    with open('raw_data/'+ name + '.json') as my_file:
        myfile = json.load(my_file)
        count = 0
        
        # turn list to a list of tuples and append to all_list (only 10,000 from each list) 
        for i in myfile:
            if count < limit:
                all_list.append((i, name))
                count = count + 1
                
            else:
                break
                

In [3]:
all_list[-1]

('Sad thing is sheep will buy into this POS https://t.co/hpeo1CSC70',
 'sadness')

In [4]:
# df = pd.DataFrame(np.array(list).reshape(-1,2), columns = ["comment", "emotion"])
df = pd.DataFrame(all_list, columns=['comment', 'label'])

In [5]:
# randomize the items inside dataframe
df = df.sample(frac=1).reset_index(drop=True)


In [6]:
df

Unnamed: 0,comment,label
0,Pump and Dump.\n\nRun by fake Tether.,fear
1,115% Profit on #TRX - BitMEX Binance Free Cryp...,greed
2,RT @MikesBrideKatie: Meanwhile video clips of ...,hateful
3,Earning #cryptocurrency for selling my stuff o...,fear
4,SHIT ON IT,hateful
...,...,...
53356,"Governments, especially totalitarian ones, hat...",hateful
53357,Earning #cryptocurrency for selling my stuff o...,fear
53358,"The project is just a bomb, the guys are worki...",joy
53359,Adam Hession is the lad in school that is anno...,anger


In [7]:
df.label.value_counts()

greed      10000
hateful    10000
fear       10000
joy        10000
sadness     9765
anger       3596
Name: label, dtype: int64

In [8]:
# define X and y
X = df.comment
y = df.label

In [9]:
# Splitting the dataset into testing and training:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

In [10]:
# examine the object shapes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(42688,)
(10673,)
(42688,)
(10673,)


In [11]:
# 1 Hidden Layer with 3 neurons:
my_ANN = MLPClassifier(hidden_layer_sizes=(3,), activation= 'logistic', 
                       solver='adam', alpha=1e-5, random_state=1, 
                       learning_rate_init = 0.1, verbose=True, tol=0.0001)


In [12]:
vect = CountVectorizer()

In [13]:
# fit and transform X_train into X_train_dtm
X_train_dtm = vect.fit_transform(X_train)

X_train_dtm.shape

(42688, 81332)

In [14]:
# transform X_test into X_test_dtm
X_test_dtm = vect.transform(X_test)
X_test_dtm.shape

(10673, 81332)

In [15]:
# Training on the training set:
my_ANN.fit(X_train_dtm, y_train)

Iteration 1, loss = 0.58981224
Iteration 2, loss = 0.24891376
Iteration 3, loss = 0.22001534
Iteration 4, loss = 0.21053045
Iteration 5, loss = 0.20522883
Iteration 6, loss = 0.20112858
Iteration 7, loss = 0.20099487
Iteration 8, loss = 0.20716413
Iteration 9, loss = 0.19417642
Iteration 10, loss = 0.18654148
Iteration 11, loss = 0.18428149
Iteration 12, loss = 0.18623239
Iteration 13, loss = 0.20206095
Iteration 14, loss = 0.17941599
Iteration 15, loss = 0.18242487
Iteration 16, loss = 0.18321286
Iteration 17, loss = 0.18479406
Iteration 18, loss = 0.18157932
Iteration 19, loss = 0.18303848
Iteration 20, loss = 0.18667198
Iteration 21, loss = 0.17862962
Iteration 22, loss = 0.17393844
Iteration 23, loss = 0.17375750
Iteration 24, loss = 0.18749656
Iteration 25, loss = 0.17904399
Iteration 26, loss = 0.18093449
Iteration 27, loss = 0.19934276
Iteration 28, loss = 0.19163642
Iteration 29, loss = 0.18681951
Iteration 30, loss = 0.18615587
Iteration 31, loss = 0.18362854
Iteration 32, los

MLPClassifier(activation='logistic', alpha=1e-05, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(3,), learning_rate='constant',
              learning_rate_init=0.1, max_fun=15000, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=1, shuffle=True, solver='adam', tol=0.0001,
              validation_fraction=0.1, verbose=True, warm_start=False)

In [16]:
# Testing on the testing set:
y_predict_ann = my_ANN.predict(X_test_dtm)
print(y_predict_ann)

['anger' 'anger' 'hateful' ... 'fear' 'joy' 'sadness']


In [17]:
from sklearn.metrics import accuracy_score

# We can now compare the "predicted labels" for the Testing Set with its "actual labels" to evaluate the accuracy 
score_ann = accuracy_score(y_test, y_predict_ann)
print('\n','accuracy: ', score_ann)
# tried preprocessing normalize and scale, but didn't help with accuracy


 accuracy:  0.9071488803522908


In [18]:
from sklearn.metrics import confusion_matrix, classification_report

# print classification report
print(classification_report(y_test, y_predict_ann))
cm = confusion_matrix(y_test, y_predict_ann)
print(cm)

              precision    recall  f1-score   support

       anger       0.62      0.93      0.74       773
        fear       0.97      0.92      0.94      2009
       greed       0.90      0.93      0.92      1975
     hateful       0.96      0.77      0.86      2041
         joy       0.95      0.95      0.95      1938
     sadness       0.94      0.96      0.95      1937

    accuracy                           0.91     10673
   macro avg       0.89      0.91      0.89     10673
weighted avg       0.92      0.91      0.91     10673

[[ 720    1    5   29    5   13]
 [   7 1848  110    9   21   14]
 [   3   32 1839    7   58   36]
 [ 388   12   10 1574    7   50]
 [   9   13   52    7 1844   13]
 [  39    3   20    9    9 1857]]


In [19]:
# define a range for the "number of neurons" in the hidden layer for a network with 1 hidden layer
# in this case neuron number is 1-9
neuron_number = [(i,) for i in range(3,23,3)]

# create a dictionary for grid parameter:
param_grid = dict(hidden_layer_sizes = neuron_number)

# creat the grid, and define the metric for evaluating the model: 
grid = GridSearchCV(my_ANN, param_grid, cv=10, scoring='accuracy', verbose=True)

grid

GridSearchCV(cv=10, error_score=nan,
             estimator=MLPClassifier(activation='logistic', alpha=1e-05,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(3,),
                                     learning_rate='constant',
                                     learning_rate_init=0.1, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=1, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=True,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'hi

In [20]:
# fit the grid (start the grid search):
grid.fit(X_test_dtm, y_test)

Fitting 10 folds for each of 7 candidates, totalling 70 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Iteration 1, loss = 1.22517596
Iteration 2, loss = 0.37150850
Iteration 3, loss = 0.19368216
Iteration 4, loss = 0.16253164
Iteration 5, loss = 0.12901140
Iteration 6, loss = 0.12836144
Iteration 7, loss = 0.10955305
Iteration 8, loss = 0.10933124
Iteration 9, loss = 0.12180485
Iteration 10, loss = 0.11359013
Iteration 11, loss = 0.09474571
Iteration 12, loss = 0.09397957
Iteration 13, loss = 0.08596055
Iteration 14, loss = 0.08013062
Iteration 15, loss = 0.08246686
Iteration 16, loss = 0.08122290
Iteration 17, loss = 0.07732008
Iteration 18, loss = 0.07439696
Iteration 19, loss = 0.08002192
Iteration 20, loss = 0.07851521
Iteration 21, loss = 0.08003967
Iteration 22, loss = 0.07330170
Iteration 23, loss = 0.07380136
Iteration 24, loss = 0.09889719
Iteration 25, loss = 0.11105544
Iteration 26, loss = 0.08164519
Iteration 27, loss = 0.07618557
Iteration 28, loss = 0.07412820
Iteration 29, loss = 0.07722438
Iteration 30, loss = 0.11874844
Iteration 31, loss = 0.09494354
Iteration 32, los

Iteration 34, loss = 0.07586827
Iteration 35, loss = 0.07373956
Iteration 36, loss = 0.07450073
Iteration 37, loss = 0.07440877
Iteration 38, loss = 0.07389413
Iteration 39, loss = 0.07287538
Iteration 40, loss = 0.07302174
Iteration 41, loss = 0.07322761
Iteration 42, loss = 0.07349314
Iteration 43, loss = 0.07316506
Iteration 44, loss = 0.07182438
Iteration 45, loss = 0.07309639
Iteration 46, loss = 0.07729228
Iteration 47, loss = 0.07331825
Iteration 48, loss = 0.07271158
Iteration 49, loss = 0.07353461
Iteration 50, loss = 0.07339589
Iteration 51, loss = 0.07244493
Iteration 52, loss = 0.07421627
Iteration 53, loss = 0.07224033
Iteration 54, loss = 0.07306058
Iteration 55, loss = 0.08692068
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 1.24838732
Iteration 2, loss = 0.44498139
Iteration 3, loss = 0.27433990
Iteration 4, loss = 0.21798374
Iteration 5, loss = 0.17535327
Iteration 6, loss = 0.15417648
Iteration 7, loss = 

Iteration 53, loss = 0.11648278
Iteration 54, loss = 0.11504563
Iteration 55, loss = 0.09204115
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.91473892
Iteration 2, loss = 0.15834155
Iteration 3, loss = 0.09202742
Iteration 4, loss = 0.07415667
Iteration 5, loss = 0.06974468
Iteration 6, loss = 0.05782625
Iteration 7, loss = 0.05709829
Iteration 8, loss = 0.04930138
Iteration 9, loss = 0.06512752
Iteration 10, loss = 0.05872670
Iteration 11, loss = 0.04984311
Iteration 12, loss = 0.04640000
Iteration 13, loss = 0.04309353
Iteration 14, loss = 0.04262891
Iteration 15, loss = 0.04466264
Iteration 16, loss = 0.05848669
Iteration 17, loss = 0.04688952
Iteration 18, loss = 0.04438132
Iteration 19, loss = 0.04741634
Iteration 20, loss = 0.04188412
Iteration 21, loss = 0.04091852
Iteration 22, loss = 0.03854647
Iteration 23, loss = 0.03796741
Iteration 24, loss = 0.03650157
Iteration 25, loss = 0.03937467
Iteration 26, loss = 0.

Iteration 12, loss = 0.05160623
Iteration 13, loss = 0.04442408
Iteration 14, loss = 0.06489635
Iteration 15, loss = 0.06897609
Iteration 16, loss = 0.06359454
Iteration 17, loss = 0.05630063
Iteration 18, loss = 0.05729456
Iteration 19, loss = 0.04738473
Iteration 20, loss = 0.04969624
Iteration 21, loss = 0.04905646
Iteration 22, loss = 0.04834780
Iteration 23, loss = 0.04435482
Iteration 24, loss = 0.04198128
Iteration 25, loss = 0.06256937
Iteration 26, loss = 0.06000882
Iteration 27, loss = 0.05309616
Iteration 28, loss = 0.05111301
Iteration 29, loss = 0.05097258
Iteration 30, loss = 0.05125244
Iteration 31, loss = 0.04981795
Iteration 32, loss = 0.04620977
Iteration 33, loss = 0.04509723
Iteration 34, loss = 0.04717613
Iteration 35, loss = 0.05178338
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.83362918
Iteration 2, loss = 0.14692378
Iteration 3, loss = 0.09953879
Iteration 4, loss = 0.08111109
Iteration 5, loss 

Iteration 20, loss = 0.03888845
Iteration 21, loss = 0.03713982
Iteration 22, loss = 0.03360151
Iteration 23, loss = 0.03281994
Iteration 24, loss = 0.03361027
Iteration 25, loss = 0.04341278
Iteration 26, loss = 0.03907421
Iteration 27, loss = 0.03670784
Iteration 28, loss = 0.03744193
Iteration 29, loss = 0.04115090
Iteration 30, loss = 0.03785176
Iteration 31, loss = 0.04508030
Iteration 32, loss = 0.04048371
Iteration 33, loss = 0.03784291
Iteration 34, loss = 0.04403309
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.76333794
Iteration 2, loss = 0.13457571
Iteration 3, loss = 0.08492090
Iteration 4, loss = 0.06869702
Iteration 5, loss = 0.06021189
Iteration 6, loss = 0.06429009
Iteration 7, loss = 0.05205919
Iteration 8, loss = 0.05489749
Iteration 9, loss = 0.04831886
Iteration 10, loss = 0.05146905
Iteration 11, loss = 0.04913243
Iteration 12, loss = 0.04971305
Iteration 13, loss = 0.04596363
Iteration 14, loss = 0.

Iteration 8, loss = 0.04436293
Iteration 9, loss = 0.04131382
Iteration 10, loss = 0.05892366
Iteration 11, loss = 0.04258594
Iteration 12, loss = 0.03884751
Iteration 13, loss = 0.03993439
Iteration 14, loss = 0.04608159
Iteration 15, loss = 0.04470529
Iteration 16, loss = 0.04800165
Iteration 17, loss = 0.04305810
Iteration 18, loss = 0.04026201
Iteration 19, loss = 0.04752954
Iteration 20, loss = 0.04608738
Iteration 21, loss = 0.04431497
Iteration 22, loss = 0.03900029
Iteration 23, loss = 0.06242154
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.72605527
Iteration 2, loss = 0.11671198
Iteration 3, loss = 0.07547079
Iteration 4, loss = 0.06184577
Iteration 5, loss = 0.05407842
Iteration 6, loss = 0.05588212
Iteration 7, loss = 0.05056861
Iteration 8, loss = 0.04777918
Iteration 9, loss = 0.04857382
Iteration 10, loss = 0.04636724
Iteration 11, loss = 0.04115275
Iteration 12, loss = 0.03577167
Iteration 13, loss = 0.03

Iteration 16, loss = 0.05413379
Iteration 17, loss = 0.06525587
Iteration 18, loss = 0.05853246
Iteration 19, loss = 0.05709870
Iteration 20, loss = 0.06670781
Iteration 21, loss = 0.07485096
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.73061058
Iteration 2, loss = 0.11653193
Iteration 3, loss = 0.07320098
Iteration 4, loss = 0.06151838
Iteration 5, loss = 0.05655710
Iteration 6, loss = 0.04859338
Iteration 7, loss = 0.04336973
Iteration 8, loss = 0.03981431
Iteration 9, loss = 0.04086843
Iteration 10, loss = 0.03937378
Iteration 11, loss = 0.03773232
Iteration 12, loss = 0.04243816
Iteration 13, loss = 0.03953486
Iteration 14, loss = 0.03513705
Iteration 15, loss = 0.03392772
Iteration 16, loss = 0.03362581
Iteration 17, loss = 0.07571253
Iteration 18, loss = 0.08612544
Iteration 19, loss = 0.06831010
Iteration 20, loss = 0.08091791
Iteration 21, loss = 0.06488180
Iteration 22, loss = 0.07682223
Iteration 23, loss = 0.

Iteration 8, loss = 0.04377539
Iteration 9, loss = 0.04389786
Iteration 10, loss = 0.04005574
Iteration 11, loss = 0.03849996
Iteration 12, loss = 0.03576349
Iteration 13, loss = 0.03716584
Iteration 14, loss = 0.03659748
Iteration 15, loss = 0.03523899
Iteration 16, loss = 0.03669819
Iteration 17, loss = 0.04522400
Iteration 18, loss = 0.04545284
Iteration 19, loss = 0.05897827
Iteration 20, loss = 0.06320097
Iteration 21, loss = 0.10881416
Iteration 22, loss = 0.07957914
Iteration 23, loss = 0.09360563
Iteration 24, loss = 0.07435930
Iteration 25, loss = 0.06263744
Iteration 26, loss = 0.05754365
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.69175156
Iteration 2, loss = 0.11694766
Iteration 3, loss = 0.07583405
Iteration 4, loss = 0.05991185
Iteration 5, loss = 0.05459550
Iteration 6, loss = 0.04811814
Iteration 7, loss = 0.04603221
Iteration 8, loss = 0.04718044
Iteration 9, loss = 0.04573868
Iteration 10, loss = 0.03

Iteration 19, loss = 0.05031422
Iteration 20, loss = 0.04328346
Iteration 21, loss = 0.04956470
Iteration 22, loss = 0.06877791
Iteration 23, loss = 0.05643571
Iteration 24, loss = 0.05323350
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.70839090
Iteration 2, loss = 0.11713882
Iteration 3, loss = 0.06893674
Iteration 4, loss = 0.05489187
Iteration 5, loss = 0.05379828
Iteration 6, loss = 0.04375576
Iteration 7, loss = 0.04396759
Iteration 8, loss = 0.04477682
Iteration 9, loss = 0.04368534
Iteration 10, loss = 0.04089750
Iteration 11, loss = 0.03640162
Iteration 12, loss = 0.03496709
Iteration 13, loss = 0.04143342
Iteration 14, loss = 0.03783835
Iteration 15, loss = 0.03718044
Iteration 16, loss = 0.04267609
Iteration 17, loss = 0.04365723
Iteration 18, loss = 0.04230891
Iteration 19, loss = 0.04774177
Iteration 20, loss = 0.04837961
Iteration 21, loss = 0.05191288
Iteration 22, loss = 0.05278862
Iteration 23, loss = 0.

Iteration 18, loss = 0.07388853
Iteration 19, loss = 0.07433662
Iteration 20, loss = 0.05984040
Iteration 21, loss = 0.05190926
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
Iteration 1, loss = 0.71525835
Iteration 2, loss = 0.11313608
Iteration 3, loss = 0.07142911
Iteration 4, loss = 0.05390368
Iteration 5, loss = 0.04591341
Iteration 6, loss = 0.04394883
Iteration 7, loss = 0.05319521
Iteration 8, loss = 0.04666650
Iteration 9, loss = 0.04114685
Iteration 10, loss = 0.03630954
Iteration 11, loss = 0.03694703
Iteration 12, loss = 0.03302328
Iteration 13, loss = 0.03634113
Iteration 14, loss = 0.03298703
Iteration 15, loss = 0.03349769
Iteration 16, loss = 0.04047851
Iteration 17, loss = 0.04282269
Iteration 18, loss = 0.05083626
Iteration 19, loss = 0.05276155
Iteration 20, loss = 0.04939899
Iteration 21, loss = 0.04433957
Iteration 22, loss = 0.04687019
Iteration 23, loss = 0.04112107
Training loss did not improve more than tol=0.000100 fo

[Parallel(n_jobs=1)]: Done  70 out of  70 | elapsed: 63.6min finished


Iteration 1, loss = 0.68741033
Iteration 2, loss = 0.12452603
Iteration 3, loss = 0.07360807
Iteration 4, loss = 0.06243809
Iteration 5, loss = 0.05084584
Iteration 6, loss = 0.04665181
Iteration 7, loss = 0.04717662
Iteration 8, loss = 0.04026544
Iteration 9, loss = 0.04645368
Iteration 10, loss = 0.04082447
Iteration 11, loss = 0.04022806
Iteration 12, loss = 0.03999551
Iteration 13, loss = 0.04143804
Iteration 14, loss = 0.03697980
Iteration 15, loss = 0.03967916
Iteration 16, loss = 0.04655990
Iteration 17, loss = 0.05266580
Iteration 18, loss = 0.05878768
Iteration 19, loss = 0.05576820
Iteration 20, loss = 0.05747663
Iteration 21, loss = 0.05962791
Iteration 22, loss = 0.05915528
Iteration 23, loss = 0.06095213
Iteration 24, loss = 0.06365177
Iteration 25, loss = 0.06663278
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


GridSearchCV(cv=10, error_score=nan,
             estimator=MLPClassifier(activation='logistic', alpha=1e-05,
                                     batch_size='auto', beta_1=0.9,
                                     beta_2=0.999, early_stopping=False,
                                     epsilon=1e-08, hidden_layer_sizes=(3,),
                                     learning_rate='constant',
                                     learning_rate_init=0.1, max_fun=15000,
                                     max_iter=200, momentum=0.9,
                                     n_iter_no_change=10,
                                     nesterovs_momentum=True, power_t=0.5,
                                     random_state=1, shuffle=True,
                                     solver='adam', tol=0.0001,
                                     validation_fraction=0.1, verbose=True,
                                     warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'hi

In [21]:
print("best score is: ", grid.best_score_)
print("best parameter is: ", grid.best_params_)

best score is:  0.8821319882480546
best parameter is:  {'hidden_layer_sizes': (21,)}
