# Preprocessing Training Data


In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv("emotion-labels-train.csv",encoding = "ISO-8859-1")

In [3]:
data.head(8)

Unnamed: 0,text,label
0,Just got back from seeing @GaryDelaney in Burs...,joy
1,Oh dear an evening of absolute hilarity I don'...,joy
2,Been waiting all week for this game â¤ï¸â¤ï...,joy
3,"@gardiner_love : Thank you so much, Gloria! Yo...",joy
4,I feel so blessed to work with the family that...,joy
5,"Today I reached 1000 subscribers on YT!! , #go...",joy
6,"@Singaholic121 Good morning, love! Happy first...",joy
7,#BridgetJonesBaby is the best thing I've seen ...,joy


In [4]:
data["label"].value_counts()

fear       1147
anger       857
joy         823
sadness     786
Name: label, dtype: int64

In [5]:
df_train_dic = { 'fear'    :   3,
                 'anger'   :   2,
                 'joy'     :   1,
                 'sadness' :   0
               }

In [6]:
data["label"] = data["label"].map(df_train_dic)

In [14]:
X_train= data.iloc[:,0:1]
y_train = data.iloc[:,1:]

In [15]:
X_train.head()

Unnamed: 0,text
0,Just got back from seeing @GaryDelaney in Burs...
1,Oh dear an evening of absolute hilarity I don'...
2,Been waiting all week for this game â¤ï¸â¤ï...
3,"@gardiner_love : Thank you so much, Gloria! Yo..."
4,I feel so blessed to work with the family that...


In [16]:
y_train.head()

Unnamed: 0,label
0,1
1,1
2,1
3,1
4,1


In [17]:
X_train.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

In [18]:
# Convertng headlines to lower case
for index in X_train:
    X_train[index]=X_train[index].str.lower()
x_lower = X_train
x_lower

Unnamed: 0,text
0,just got back from seeing garydelaney in burs...
1,oh dear an evening of absolute hilarity i don ...
2,been waiting all week for this game ...
3,gardiner love thank you so much gloria yo...
4,i feel so blessed to work with the family that...
...,...
3608,vivienlloyd thank you so much just home st...
3609,just put the winter duvet on ...
3610,silkinside tommyjoeratliff that s so pretty ...
3611,bluesfestbyron second artist announcement loo...


In [19]:
import nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier



### TfidfVectorizer

In [20]:
Vectorizer = TfidfVectorizer()

In [24]:
x_train_v = Vectorizer.fit_transform(x_lower["text"])

In [62]:
import pickle
pickle.dump(Vectorizer,open("Vectorizer.pkl","wb"))

### CountVectorizer

In [25]:
cv = CountVectorizer()

In [26]:
x_train_c = cv.fit_transform(x_lower["text"])

## Preprocessing Test Dataset

In [27]:
df_test = pd.read_csv("emotion-labels-test.csv")

In [28]:
df_test.head()

Unnamed: 0,text,label
0,You must be knowing #blithe means (adj.) Happ...,joy
1,Old saying 'A #smile shared is one gained for ...,joy
2,Bridget Jones' Baby was bloody hilarious 😅 #Br...,joy
3,@Elaminova sparkling water makes your life spa...,joy
4,I'm tired of everybody telling me to chill out...,joy


In [29]:
df_test_dic = { 'fear'     :   3,
                 'anger'   :   2,
                 'joy'     :   1,
                 'sadness' :   0
               }

In [30]:
df_test["label"] = df_test["label"].map(df_test_dic)

In [31]:
X_text = df_test.iloc[:,0:1]
y_test = df_test.iloc[:,1:]

In [32]:
X_text.replace("[^a-zA-Z]"," ",regex=True, inplace=True)

In [33]:
# Convertng headlines to lower case
for index in X_text:
    X_text[index]=X_text[index].str.lower()
X_text_lower = X_text
X_text_lower

Unnamed: 0,text
0,you must be knowing blithe means adj happ...
1,old saying a smile shared is one gained for ...
2,bridget jones baby was bloody hilarious br...
3,elaminova sparkling water makes your life spa...
4,i m tired of everybody telling me to chill out...
...,...
3137,why does candice constantly pout gbbo
3138,redbus in unhappy with redbus cc when i ta...
3139,aceoperative no pull him afew weeks ago s...
3140,i m buying art supplies and i m debating how s...


In [34]:
test_dataset_v = Vectorizer.transform(X_text_lower['text']) # Vectorizer
test_dataset_c = cv.transform(X_text_lower['text']) # Countvectorizer

# RandomForestClassifier

In [44]:
rd = RandomForestClassifier()
rdw = rd.fit(x_tran_v,y)
predictions = rd.predict(test_dataset_v)

In [36]:
# ## Import library to check accuracy
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
matrix=confusion_matrix(y_test,predictions)
print(matrix)
print()
score=accuracy_score(y_test,predictions)
print(score)

report=classification_report(y_test,predictions)
print(report)

[[490  18  44 121]
 [ 15 587  12 100]
 [ 35  22 554 149]
 [ 75  22  39 859]]

0.792488860598345
              precision    recall  f1-score   support

           0       0.80      0.73      0.76       673
           1       0.90      0.82      0.86       714
           2       0.85      0.73      0.79       760
           3       0.70      0.86      0.77       995

    accuracy                           0.79      3142
   macro avg       0.81      0.79      0.80      3142
weighted avg       0.80      0.79      0.79      3142



Sentences labels is joy 
fear    =>  3, 
anger   =>  2,
joy    =>  1,
sadness =>  0

In [62]:
# @theclobra lol I thought maybe, couldn't decide if there was levity or not                                  => joy
# Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch  => joy
# Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day.  #challenge #kashmir #baloch           => joy
# @tomderivan73 ðŸ˜...I'll just people watch and enjoy a rare show of optimism                               => joy
# I love my family so much #lucky #grateful #smartassfamily  #love                                            => joy

In [37]:
rd.predict(Vectorizer.transform(["@theclobra lol I thought maybe, couldn't decide if there was levity or not","@tomderivan73 ðŸ˜...I'll just people watch and enjoy a rare show of optimism","I love my family so much #lucky #grateful #smartassfamily  #love","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day.  #challenge #kashmir #baloch","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch"]))

array([1, 1, 1, 3, 1], dtype=int64)

In RandomForestClassifier Without any hyperparameter tuning, the data is overfiting on "fear"

### Hyperparameter Tuning RandomForestClassifier

In [38]:
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 300, stop = 3000, num = 20)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(20, 1050,30)]
# Minimum number of samples required to split a node
min_samples_split = [3, 8,9 ,12,20]
# Minimum number of samples required at each leaf node
min_samples_leaf = [3, 5, 6,7,10]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
              'criterion':['entropy','gini']}
print(random_grid)

{'n_estimators': [300, 442, 584, 726, 868, 1010, 1152, 1294, 1436, 1578, 1721, 1863, 2005, 2147, 2289, 2431, 2573, 2715, 2857, 3000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [20, 55, 91, 126, 162, 197, 233, 268, 304, 339, 375, 410, 446, 481, 517, 552, 588, 623, 659, 694, 730, 765, 801, 836, 872, 907, 943, 978, 1014, 1050], 'min_samples_split': [3, 8, 9, 12, 20], 'min_samples_leaf': [3, 5, 6, 7, 10], 'criterion': ['entropy', 'gini']}


In [39]:
rf_randomcv=RandomizedSearchCV(estimator=rd,param_distributions=random_grid,n_iter=100,cv=3,verbose=2,
                               random_state=100,n_jobs=-1)
### fit the randomized model
rf_randomcv.fit(x_train_v,y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': [20, 55, 91, 126, 162, 197,
                                                      233, 268, 304, 339, 375,
                                                      410, 446, 481, 517, 552,
                                                      588, 623, 659, 694, 730,
                                                      765, 801, 836, 872, 907,
                                                      943, 978, 1014, 1050],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [3, 5, 6, 7, 10],
                                        'min_samples_split': [3, 8, 9, 12, 20],
                                        'n_estimator

In [60]:
from sklearn.metrics import accuracy_score
y_pred=rf_randomcv.predict(test_dataset_v)
matrix=confusion_matrix(y_test,predictions)
print(matrix)
print()
score=accuracy_score(y_test,predictions)
print(score)
report=classification_report(y_test,predictions)
print(report)

[[495  20  42 116]
 [ 13 595  18  88]
 [ 38  21 557 144]
 [ 68  29  42 856]]

0.7966263526416295
              precision    recall  f1-score   support

           0       0.81      0.74      0.77       673
           1       0.89      0.83      0.86       714
           2       0.85      0.73      0.79       760
           3       0.71      0.86      0.78       995

    accuracy                           0.80      3142
   macro avg       0.81      0.79      0.80      3142
weighted avg       0.81      0.80      0.80      3142



Sentences labels is joy 
fear    =>  3, 
anger   =>  2,
joy    =>   1,
sadness =>  0

In [70]:
# @theclobra lol I thought maybe, couldn't decide if there was levity or not                                  => joy
# Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch  => joy
# Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day.  #challenge #kashmir #baloch           => joy
# @tomderivan73 ðŸ˜...I'll just people watch and enjoy a rare show of optimism                               => joy
# I love my family so much #lucky #grateful #smartassfamily  #love                                            => joy

In [46]:
best_random_grid.predict(cv.transform(["@theclobra lol I thought maybe, couldn't decide if there was levity or not","@tomderivan73 ðŸ˜...I'll just people watch and enjoy a rare show of optimism","I love my family so much #lucky #grateful #smartassfamily  #love","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day.  #challenge #kashmir #baloch","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch"]))

array([1, 1, 1, 3, 1], dtype=int64)

In [None]:
# On @Varneyco/@FoxBusiness to talk latest on #Chelsea Bombing + #Ahmad_Khan_Rahami's trips to #Afghanistan/#Pakistan #tcot #terror                 =>    fear
# On @Varneyco/@FoxBusiness to talk latest on #Chelsea Bombing + #Ahmad_Khan_Rahami's trips to #Afghanistan/#Pakistan #tcot                         =>    fear
# âŠ° @FrameOfAnAngel âŠ± \n\n+ Of them. I'm here for answers, and if I scare her to death, there won't be answers for me. \n\nSo instead, I just + =>    fear
# But I was so intrigued by your style, boy.Always been a sucker for a wild boy #alarm -@AnneMarieIAm                                               =>    fear

In [49]:
best_random_grid.predict(cv.transform(["On @Varneyco/@FoxBusiness to talk latest on #Chelsea Bombing + #Ahmad_Khan_Rahami's trips to #Afghanistan/#Pakistan #tcot #terror","But I was so intrigued by your style, boy.Always been a sucker for a wild boy #alarm -@AnneMarieIAm "," âŠ° @FrameOfAnAngel âŠ± \n\n+ Of them. I'm here for answers, and if I scare her to death, there won't be answers for me. \n\nSo instead, I just +","On @Varneyco/@FoxBusiness to talk latest on #Chelsea Bombing + #Ahmad_Khan_Rahami's trips to #Afghanistan/#Pakistan #tcot"]))

array([3, 3, 3, 3], dtype=int64)

In [None]:
# @DailyMirror i love how theres no outrage that it's a white man but if it was a black man them BLM would be all over it regardless of reason =>  anger
# Me being on my dean really saving a lot of ppl, bc I don't snap nomore &amp; it take so much out of me.                                      =>  anger
# @TrueAggieFan oh so that's where Brian was! Where was my invite? #offended                                                                   =>  anger
# Sorry guys I have absolutely no idea what time i'll be on cam tomorrow but will keep you posted.                                             =>  anger


In [50]:
best_random_grid.predict(cv.transform(["@DailyMirror i love how theres no outrage that it's a white man but if it was a black man them BLM would be all over it regardless of reason","Me being on my dean really saving a lot of ppl, bc I don't snap nomore &amp; it take so much out of me.","@TrueAggieFan oh so that's where Brian was! Where was my invite? #offended","Sorry guys I have absolutely no idea what time i'll be on cam tomorrow but will keep you posted."]))

array([2, 2, 2, 2], dtype=int64)

In [None]:
# @trashcami this cured my depression                                                                                                            =>    sadness
# This world has some serious issues we should all go to therapy                                                                                 =>    sadness
# If I had a little bit of extra money I would blow the whole paycheck and go to one of the two of @KygoMusic's concerts in LA. #serious         =>    sadness
# @kempicepoland don't think he did, and he didn't have the rucksack or laptop in his possession, murky business, on that note I'm away to bed   =>    sadness


In [51]:
best_random_grid.predict(cv.transform(["@trashcami this cured my depression","@kempicepoland don't think he did, and he didn't have the rucksack or laptop in his possession, murky business, on that note I'm away to bed","If I had a little bit of extra money I would blow the whole paycheck and go to one of the two of @KygoMusic's concerts in LA. #serious","This world has some serious issues we should all go to therapy"]))

array([0, 0, 0, 0], dtype=int64)

In [52]:
best_random_grid.predict(cv.transform(["This world has some serious issues we should all go to therapy","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch","Me being on my dean really saving a lot of ppl, bc I don't snap nomore &amp; it take so much out of me.","But I was so intrigued by your style, boy.Always been a sucker for a wild boy #alarm -@AnneMarieIAm "]))

array([0, 1, 2, 3], dtype=int64)

In [54]:
best_random_grid.predict(cv.transform(["i am happy","i am happy"]))

array([1, 0], dtype=int64)

In [61]:
import pickle
fiel = "npl_model.pkl"
pickle.dump(rf_randomcv,open(fiel,"wb"))

In [69]:
nb_hyper_tuneing.predict(cv.transform(["On @Varneyco/@FoxBusiness to talk latest on #Chelsea Bombing + #Ahmad_Khan_Rahami's trips to #Afghanistan/#Pakistan #tcot #terror","But I was so intrigued by your style, boy.Always been a sucker for a wild boy #alarm -@AnneMarieIAm "," âŠ° @FrameOfAnAngel âŠ± \n\n+ Of them. I'm here for answers, and if I scare her to death, there won't be answers for me. \n\nSo instead, I just +","On @Varneyco/@FoxBusiness to talk latest on #Chelsea Bombing + #Ahmad_Khan_Rahami's trips to #Afghanistan/#Pakistan #tcot",]).toarray())

array([1, 2, 2, 1], dtype=int64)

# DecisionTreeClassifier

In [55]:
 from sklearn.tree import DecisionTreeClassifier

In [57]:
dt = DecisionTreeClassifier()
dt.fit(x_train_v,y_train)

DecisionTreeClassifier()

In [58]:
y_pred=dt.predict(test_dataset_v)
matrix=confusion_matrix(y_test,predictions)
print(matrix)
print()
score=accuracy_score(y_test,predictions)
print(score)
report=classification_report(y_test,predictions)
print(report)

[[495  20  42 116]
 [ 13 595  18  88]
 [ 38  21 557 144]
 [ 68  29  42 856]]

0.7966263526416295
              precision    recall  f1-score   support

           0       0.81      0.74      0.77       673
           1       0.89      0.83      0.86       714
           2       0.85      0.73      0.79       760
           3       0.71      0.86      0.78       995

    accuracy                           0.80      3142
   macro avg       0.81      0.79      0.80      3142
weighted avg       0.81      0.80      0.80      3142



In [75]:
# @theclobra lol I thought maybe, couldn't decide if there was levity or not                                  => joy
# Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch  => joy
# Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day.  #challenge #kashmir #baloch           => joy
# @tomderivan73 ðŸ˜...I'll just people watch and enjoy a rare show of optimism                               => joy
# I love my family so much #lucky #grateful #smartassfamily  #love                                            => joy

In [59]:
dt.predict(cv.transform(["@theclobra lol I thought maybe, couldn't decide if there was levity or not","@tomderivan73 ðŸ˜...I'll just people watch and enjoy a rare show of optimism","I love my family so much #lucky #grateful #smartassfamily  #love","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day.  #challenge #kashmir #baloch","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch"]))

array([1, 1, 2, 2, 1], dtype=int64)

DecisionTreeClassifier without any  Hyperparameter Tuning, the dataset is overfiting on "fear"

### Hyperparameter Tunin DecisionTreeClassifier

In [89]:
criterion = ["gini", "entropy"]
splitter = ["best", "random"]
max_depth = [int(x) for x in np.linspace(20, 40,60)]
min_samples_split = [4,8,12,16,21]
min_samples_leaf = [2,4,6,8,10]
min_weight_fraction_leaf = [.2,.4,.6,.8,.9]
max_features = ['auto', 'sqrt','log2']
Decision_grid = {'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'criterion': criterion,
                'splitter' : splitter}
print(Decision_grid)

{'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [20, 20, 20, 21, 21, 21, 22, 22, 22, 23, 23, 23, 24, 24, 24, 25, 25, 25, 26, 26, 26, 27, 27, 27, 28, 28, 28, 29, 29, 29, 30, 30, 30, 31, 31, 31, 32, 32, 32, 33, 33, 33, 34, 34, 34, 35, 35, 35, 36, 36, 36, 37, 37, 37, 38, 38, 38, 39, 39, 40], 'min_samples_split': [4, 8, 12, 16, 21], 'min_samples_leaf': [2, 4, 6, 8, 10], 'criterion': ['gini', 'entropy'], 'splitter': ['best', 'random']}


In [90]:
Decision_tune=RandomizedSearchCV(estimator=dt,param_distributions=Decision_grid,n_iter=100,cv=3,verbose=2,n_jobs=-1,random_state = 420)

Decision_tune.fit(traindataset,data_train_y)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


RandomizedSearchCV(cv=3, estimator=DecisionTreeClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'criterion': ['gini', 'entropy'],
                                        'max_depth': [20, 20, 20, 21, 21, 21,
                                                      22, 22, 22, 23, 23, 23,
                                                      24, 24, 24, 25, 25, 25,
                                                      26, 26, 26, 27, 27, 27,
                                                      28, 28, 28, 29, 29, 29, ...],
                                        'max_features': ['auto', 'sqrt',
                                                         'log2'],
                                        'min_samples_leaf': [2, 4, 6, 8, 10],
                                        'min_samples_split': [4, 8, 12, 16, 21],
                                        'splitter': ['best', 'random']},
                   random_state=420, verbose=2)

In [92]:
Decision_tune.best_params_


{'splitter': 'random',
 'min_samples_split': 4,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 38,
 'criterion': 'entropy'}

In [94]:
Decision_tuneing=Decision_tune.best_estimator_

In [95]:
from sklearn.metrics import accuracy_score
y_pred=Decision_tuneing.predict(test_dataset)
print(confusion_matrix(data_test_y,y_pred))
print("Accuracy Score {}".format(accuracy_score(data_test_y,y_pred)))
print("Classification report: {}".format(classification_report(data_test_y,y_pred)))

[[  47    7    0  732]
 [   0  147    0  676]
 [   5   11   48  793]
 [   2   10    0 1135]]
Accuracy Score 0.3811237199003598
Classification report:               precision    recall  f1-score   support

           0       0.87      0.06      0.11       786
           1       0.84      0.18      0.29       823
           2       1.00      0.06      0.11       857
           3       0.34      0.99      0.51      1147

    accuracy                           0.38      3613
   macro avg       0.76      0.32      0.25      3613
weighted avg       0.73      0.38      0.28      3613



In [96]:
Decision_tuneing.predict(cv.transform(["@theclobra lol I thought maybe, couldn't decide if there was levity or not","@tomderivan73 ðŸ˜...I'll just people watch and enjoy a rare show of optimism","I love my family so much #lucky #grateful #smartassfamily  #love","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day.  #challenge #kashmir #baloch","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch"]))

array([3, 3, 3, 3, 3], dtype=int64)

In [28]:
from sklearn import svm


In [29]:
modelsvm = svm.SVC(kernel = "linear")

In [32]:
modelsvm.fit(x_tran_v,y)

SVC(kernel='linear')

In [33]:
modelsvm.predict(test_dataset[0])

array([1], dtype=int64)

In [35]:
modelsvm.predict(Vectorizer.transform(["@theclobra lol I thought maybe, couldn't decide if there was levity or not","@tomderivan73 ðŸ˜...I'll just people watch and enjoy a rare show of optimism","I love my family so much #lucky #grateful #smartassfamily  #love","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day.  #challenge #kashmir #baloch","Nawaz Sharif is getting more funnier than @kapilsharmak9 day by day. #laughter #challenge #kashmir #baloch"]))

array([1, 1, 1, 3, 1], dtype=int64)

In [None]:
from sklearn.metrics import accuracy_score
y_pred=v.predict(test_dataset)
print(confusion_matrix(data_test_y,y_pred))
print("Accuracy Score {}".format(accuracy_score(data_test_y,y_pred)))
print("Classification report: {}".format(classification_report(data_test_y,y_pred)))