In [1]:
# Importing Libraries for ML

import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,roc_auc_score, roc_curve
from xgboost import XGBClassifier

import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import warnings
warnings.filterwarnings("ignore")

# Importing Data

df = pd.read_csv('sentiment.txt', delimiter = ";", )
df.head(3)

Unnamed: 0,i didnt feel humiliated,sadness
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger
2,i am ever feeling nostalgic about the fireplac...,love


In [2]:
df.shape

(15999, 2)

In [3]:
df = df.rename(columns={"i didnt feel humiliated":"sentence", "sadness":"emotion"})
df

Unnamed: 0,sentence,emotion
0,i can go from feeling so hopeless to so damned...,sadness
1,im grabbing a minute to post i feel greedy wrong,anger
2,i am ever feeling nostalgic about the fireplac...,love
3,i am feeling grouchy,anger
4,ive been feeling a little burdened lately wasn...,sadness
...,...,...
15994,i just had a very brief time in the beanbag an...,sadness
15995,i am now turning and i feel pathetic that i am...,sadness
15996,i feel strong and good overall,joy
15997,i feel like this was such a rude comment and i...,anger


In [4]:
df["emotion"].unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [5]:
df["emotion"] = df["emotion"].replace({'sadness':1, 'anger':2, 'love':3, 'surprise':4, 'fear':5, 'joy':6})
df

Unnamed: 0,sentence,emotion
0,i can go from feeling so hopeless to so damned...,1
1,im grabbing a minute to post i feel greedy wrong,2
2,i am ever feeling nostalgic about the fireplac...,3
3,i am feeling grouchy,2
4,ive been feeling a little burdened lately wasn...,1
...,...,...
15994,i just had a very brief time in the beanbag an...,1
15995,i am now turning and i feel pathetic that i am...,1
15996,i feel strong and good overall,6
15997,i feel like this was such a rude comment and i...,2


In [6]:
X = df["sentence"].tolist()
Y = df["emotion"]

In [7]:
lemmatizer = WordNetLemmatizer()

def preprocessing_text(string):
    pre1 = re.sub("[^a-zA-Z]"," ",string)
    pre2 = pre1.lower()
    pre3 = pre2.split()
    pre4 = set([lemmatizer.lemmatize(str(x)) for x in pre3])
    stop = set(stopwords.words("english"))
    pre5 = [w for w in pre4 if not w in stop] 
    
    preprocessed = " ".join(pre5)
    
    return preprocessed

In [8]:
X = [preprocessing_text(i) for i in X]

In [9]:
X

['someone care awake around hopeless go hopeful feeling damned',
 'grabbing minute im feel greedy wrong post',
 'ever feeling nostalgic know fireplace property still',
 'grouchy feeling',
 'little feeling ive sure wa lately wasnt burdened',
 'feel funny lot asleep time like milligram amount faster recommended also ive taking fallen',
 'feel year teenager old man confused life jaded',
 'petronas feel ha year profit huge performed made well',
 'romantic feel',
 'something seeing feel make suffering like mean',
 'feel expect encounter type spiritual running experience divine',
 'feel year think easiest dissatisfied time',
 'energy feel thirsty low',
 'feel precious proto let time sympathy find sign possible write corner writer contract trying alone publishing point life little immense general agent',
 'feel side anxiety reassured',
 'embarrassed didnt feel really',
 'feel pathetic pretty time',
 'barbie vintage started sixty began child collection doll feeling sentimental',
 'feel value w

In [10]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size=0.2, random_state=42, stratify=Y)

In [11]:
df["emotion"].value_counts()

6    5362
1    4665
2    2159
5    1937
3    1304
4     572
Name: emotion, dtype: int64

In [12]:
one = df[df["emotion"]==1]["emotion"].count()
two = df[df["emotion"]==2]["emotion"].count()
three = df[df["emotion"]==3]["emotion"].count()
four = df[df["emotion"]==4]["emotion"].count()
five = df[df["emotion"]==5]["emotion"].count()
six = df[df["emotion"]==6]["emotion"].count()
total = len(df["emotion"])

weight_one = (total-one)/total

weight_two = (total-two)/total

weight_three = (total-three)/total

weight_four = (total-four)/total

weight_five = (total-five)/total

weight_six= (total-six)/total


print(weight_one)
print(weight_two)
print(weight_three)
print(weight_four)
print(weight_five)
print(weight_six)

print(total)

0.7084192762047627
0.8650540658791175
0.9184949059316208
0.9642477654853429
0.87892993312082
0.6648540533783361
15999


In [13]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(x_train)

In [15]:
rf = RandomForestClassifier()

param ={ "n_estimators":[x for x in range(100,2000, 200)],
         "max_depth" : [x for x in range(1,6)],
         "min_samples_split" : [x for x in range(1,6)],
         "min_samples_leaf" : [x for x in range(1,6)],
       }

rf_cv = RandomizedSearchCV(rf, param_distributions=param, verbose=5)

rf_cv.fit(tfidf_matrix, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END max_depth=5, min_samples_leaf=5, min_samples_split=1, n_estimators=1900;, score=nan total time=   0.4s
[CV 2/5] END max_depth=5, min_samples_leaf=5, min_samples_split=1, n_estimators=1900;, score=nan total time=   0.4s
[CV 3/5] END max_depth=5, min_samples_leaf=5, min_samples_split=1, n_estimators=1900;, score=nan total time=   0.4s
[CV 4/5] END max_depth=5, min_samples_leaf=5, min_samples_split=1, n_estimators=1900;, score=nan total time=   0.5s
[CV 5/5] END max_depth=5, min_samples_leaf=5, min_samples_split=1, n_estimators=1900;, score=nan total time=   0.4s
[CV 1/5] END max_depth=1, min_samples_leaf=4, min_samples_split=1, n_estimators=1500;, score=nan total time=   0.3s
[CV 2/5] END max_depth=1, min_samples_leaf=4, min_samples_split=1, n_estimators=1500;, score=nan total time=   0.3s
[CV 3/5] END max_depth=1, min_samples_leaf=4, min_samples_split=1, n_estimators=1500;, score=nan total time=   0.4s
[CV 4/5] EN

RandomizedSearchCV(estimator=RandomForestClassifier(),
                   param_distributions={'max_depth': [1, 2, 3, 4, 5],
                                        'min_samples_leaf': [1, 2, 3, 4, 5],
                                        'min_samples_split': [1, 2, 3, 4, 5],
                                        'n_estimators': [100, 300, 500, 700,
                                                         900, 1100, 1300, 1500,
                                                         1700, 1900]},
                   verbose=5)

In [16]:
best_para = rf_cv.best_params_
best_para

{'n_estimators': 100,
 'min_samples_split': 3,
 'min_samples_leaf': 4,
 'max_depth': 3}

In [17]:
params ={'n_estimators': [x for x in range(best_para["n_estimators"]-50,  best_para["n_estimators"]+50, 10)],
         'min_samples_split': [best_para["min_samples_split"]+1, best_para["min_samples_split"], best_para["min_samples_split"]-1],
         'min_samples_leaf': [best_para["min_samples_leaf"]+1, best_para["min_samples_leaf"], best_para["min_samples_leaf"]+2],
         'max_depth': [best_para["max_depth"]]}

params

{'n_estimators': [50, 60, 70, 80, 90, 100, 110, 120, 130, 140],
 'min_samples_split': [4, 3, 2],
 'min_samples_leaf': [5, 4, 6],
 'max_depth': [3]}

In [18]:
rf = RandomForestClassifier()

gr_cv = GridSearchCV(rf, param_grid=params, verbose=2)

gr_cv.fit(tfidf_matrix, y_train)

Fitting 5 folds for each of 90 candidates, totalling 450 fits
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=50; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=60; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=60; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=60; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=4, n_estimators=60; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=5, min_s

[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=110; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=110; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=110; total time=   0.7s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=120; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=120; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=120; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=120; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=120; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=130; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=5, min_samples_split=3, n_estimators=130; total time=   0.7s


[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=80; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=80; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=80; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=80; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=80; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=90; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=90; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=90; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=90; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=4, n_estimators=90; total time=   0.4s
[CV] END m

[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=3, n_estimators=140; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=3, n_estimators=140; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=2, n_estimators=50; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=2, n_estimators=60; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=2, n_estimators=60; total time=   0.2s
[CV] END max_depth=3, min_samples_leaf=4, min_samples_split=2, n_estimators=60; total time=   0.2s
[CV] END

[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=110; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=110; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=110; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=110; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=120; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=120; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=120; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=120; total time=   0.5s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=120; total time=   0.6s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=4, n_estimators=130; total time=   0.6s


[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=70; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=80; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=80; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=80; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=80; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=80; total time=   0.3s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=90; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=90; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=90; total time=   0.4s
[CV] END max_depth=3, min_samples_leaf=6, min_samples_split=2, n_estimators=90; total time=   0.4s
[CV] END m

GridSearchCV(estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3], 'min_samples_leaf': [5, 4, 6],
                         'min_samples_split': [4, 3, 2],
                         'n_estimators': [50, 60, 70, 80, 90, 100, 110, 120,
                                          130, 140]},
             verbose=2)

In [21]:
gr_cv.best_params_

{'max_depth': 3,
 'min_samples_leaf': 4,
 'min_samples_split': 3,
 'n_estimators': 60}

In [18]:
pip = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("classifier", RandomForestClassifier( 
                                          #class_weight={1:weight_one,2:weight_two,3:weight_three,4:weight_four,5:weight_five,6:weight_six}
                                         ))])

In [19]:
pip.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer()),
                ('classifier', RandomForestClassifier())])

In [20]:
y_pred = pip.predict(x_test)

y_pred_prob = pip.predict_proba(x_test)

In [21]:
accuracy_score(y_test, y_pred)

0.8821875

In [47]:
q = preprocessing_text("how tasty")

pip.predict_proba([q])


array([[0.1 , 0.28, 0.  , 0.  , 0.29, 0.33]])

In [64]:
({'sadness':1, 'anger':2, 'love':3, 'surprise':4, 'fear':5, 'joy':6})

{'sadness': 1, 'anger': 2, 'love': 3, 'surprise': 4, 'fear': 5, 'joy': 6}