# CS4662 
## Group Project: Twitter Emotion Identification
### Instructor: Dr. Mohammad Pourhomayoun

### Julie Kasparian

Spring 2020

In [104]:
# import modules
import json
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

In [105]:
json_list = ['anger', 'fear', 'greed', 'hateful', 'joy', 'sadness']

all_list = []
limit = 10000

for name in json_list:
    # load json data as a list of strings
    with open('/Users/juliekasparian/Desktop/raw_data/'+ name + '.json') as my_file:
        myfile = json.load(my_file)
        count = 0
        
        # turn list to a list of tuples and append to all_list (only 10,000 from each list) 
        for i in myfile:
            if count < limit:
                all_list.append((i, name))
                count = count + 1
                
            else:
                break
            
all_list[-1]

('Sad thing is sheep will buy into this POS https://t.co/hpeo1CSC70',
 'sadness')

In [106]:
# df = pd.DataFrame(np.array(list).reshape(-1,2), columns = ["comment", "emotion"])
df = pd.DataFrame(all_list, columns=['comment', 'label'])

# randomize the items inside dataframe
df = df.sample(frac=1).reset_index(drop=True)

df

Unnamed: 0,comment,label
0,Welcome! We have opened. Short Links Service\n...,fear
1,@LloydLlewJ Well my heart says you are a Scumb...,anger
2,@CryptoCobain Saw this coming! Sucks you gotta...,anger
3,5 years and Adam silver aint do shit,hateful
4,"Playboi Carti even on da list, get dis shit ou...",hateful
...,...,...
53356,RT @zackvoell: Bitcoin Week is a great idea gi...,joy
53357,Ripple is working to make XRP compliant with a...,anger
53358,RT @imoforever03: Hats off to every one at #in...,joy
53359,US Global Investors CEO: Bitcoin is Great But ...,joy


In [107]:
df.label.value_counts()

hateful    10000
greed      10000
joy        10000
fear       10000
sadness     9765
anger       3596
Name: label, dtype: int64

In [108]:
# define X and y
X = df.comment
y = df.label

In [109]:
# splitting the dataset into testing and training:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=3)

In [110]:
# examine the object shapes
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(34684,)
(18677,)
(34684,)
(18677,)


In [111]:
my_RandomForest = RandomForestClassifier(n_estimators = 90, bootstrap = True, random_state=3)

In [112]:
vect = CountVectorizer()

In [113]:
# fit and transform X_train into X_train_dtm
X_train_dtm = vect.fit_transform(X_train)
X_train_dtm.shape

(34684, 70311)

In [114]:
# transform X_test into X_test_dtm
X_test_dtm = vect.transform(X_test)
X_test_dtm.shape

(18677, 70311)

In [115]:
# training on the training set:
my_RandomForest.fit(X_train_dtm, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=90,
                       n_jobs=None, oob_score=False, random_state=3, verbose=0,
                       warm_start=False)

In [116]:
# testing on the testing set:
y_predict_rf = my_RandomForest.predict(X_test_dtm)

print(y_predict_rf)

['greed' 'fear' 'hateful' ... 'greed' 'greed' 'fear']


In [117]:
# we can now compare the "predicted labels" for the Testing Set with its "actual labels" to evaluate the accuracy 
score_rf = accuracy_score(y_test, y_predict_rf)

print('Random Forest Accuracy:', score_rf)

Random Forest Accuracy: 0.8876157841195053


In [118]:
# estimating the probability (likelihood) of Each Label: 
y_predict_prob_rf = my_RandomForest.predict_proba(X_test_dtm)

# this line prints the "predicted label" for the testing set:
print(y_predict_rf)

# this line prints the "estimated likelihood of label=1" for the testing set:
print(y_predict_prob_rf[:,1])

['greed' 'fear' 'hateful' ... 'greed' 'greed' 'fear']
[0.         1.         0.08888889 ... 0.0037037  0.06666667 0.58888889]


In [119]:
# print classification report
print(classification_report(y_test, y_predict_rf))
cm = confusion_matrix(y_test, y_predict_rf)
print(cm)

              precision    recall  f1-score   support

       anger       0.57      0.47      0.51      1294
        fear       0.96      0.92      0.94      3464
       greed       0.92      0.93      0.92      3485
     hateful       0.78      0.84      0.81      3496
         joy       0.94      0.96      0.95      3488
     sadness       0.95      0.95      0.95      3450

    accuracy                           0.89     18677
   macro avg       0.85      0.84      0.85     18677
weighted avg       0.89      0.89      0.89     18677

[[ 608    4   28  628   18    8]
 [   4 3184  122   81   38   35]
 [   5   77 3230   38   85   50]
 [ 440   17   46 2934   23   36]
 [   6   18   57   25 3357   25]
 [   9   24   41   76   35 3265]]
