In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive/ so that files
# stored on the Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Load the ISEAR dataset from the mounted Drive. The column names are supplied
# explicitly through `names` rather than taken from the file.
columns = ["emotion", "text"]
df = pd.read_csv("drive/My Drive/Training_AI_Engineers/isear_dataset.csv", names=columns)

In [4]:
# Preview the first rows to check that the file loaded as expected.
df.head()


Unnamed: 0,emotion,text
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [5]:
from sklearn.preprocessing import LabelEncoder

In [6]:
# Encoder used to map each emotion label to an integer code.
le = LabelEncoder()

In [7]:
# Learn the set of distinct emotion labels present in the data.
le.fit(df["emotion"])


LabelEncoder()

In [8]:
# Labels learned by the encoder; a label's position in this array is the
# integer code it is mapped to.
le.classes_

array(['anger', 'disgust', 'fear', 'guilt', 'joy', 'sadness', 'shame'],
      dtype=object)

In [33]:
# Replace the string emotion labels with the integer codes learned by `le`.
# This cell overwrites its own input, so guard on the dtype: without the guard
# a second run would pass already-encoded integers to `le.transform`, which
# was fitted on the string labels only.
if not pd.api.types.is_integer_dtype(df["emotion"]):
    df["emotion"] = le.transform(df["emotion"])
df["emotion"]

0       4
1       2
2       0
3       5
4       1
       ..
7441    0
7442    5
7443    1
7444    6
7445    3
Name: emotion, Length: 7446, dtype: int64

In [10]:
# Number of samples per emotion class.
# NOTE(review): the saved output lists string labels, so this cell was last
# run before the label-encoding cell (In [33]); a top-to-bottom run would
# list integer codes instead.
df["emotion"].value_counts()

joy        1082
sadness    1074
anger      1069
fear       1063
disgust    1059
shame      1059
guilt      1040
Name: emotion, dtype: int64

In [11]:
# Summary statistics of the frame.
# NOTE(review): the saved output (top emotion = "joy") was produced while the
# emotion column still held strings, i.e. before the label-encoding cell
# (In [33]); a fresh top-to-bottom run will show a different summary.
df.describe()

Unnamed: 0,emotion,text
count,7446,7446
unique,7,7379
top,joy,When my grandfather died.
freq,1082,8


In [12]:
# feat = df["text"]
# targ = df["emotion"] 

In [13]:
import nltk

# Download the NLTK resources needed later: the "punkt" tokenizer models and
# the "stopwords" corpus.
nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import re

def process_text(text):
    """Normalize a raw text sample into a list of lowercase content tokens.

    The text is stripped of every character that is not an ASCII letter,
    digit or whitespace, lowercased, tokenized with ``word_tokenize`` and
    finally filtered against the English stop-word list.

    Args:
        text: The raw text. Any value that is not a ``str`` is treated as an
            empty string.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    text = text if isinstance(text, str) else ''

    # Keep only ASCII letters, digits and whitespace. The upper-case range
    # must be ``A-Z``: ``A-z`` also spans the ASCII characters [ \ ] ^ _ `
    # that sit between 'Z' and 'a', so those symbols were never removed.
    pattern = r'[^a-zA-Z0-9\s]'
    text = re.sub(pattern, '', text)

    # Lowercase so that tokens match the lowercase stop-word entries.
    text = text.lower()

    word_tokenized = word_tokenize(text)

    # A set makes each membership test constant-time instead of a scan of
    # the whole stop-word list for every token.
    stop_words = set(stopwords.words('english'))
    return [word for word in word_tokenized if word not in stop_words]

In [34]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df["text"],
    df["emotion"],
    test_size=0.2,
    random_state=53,
)

### Count Vectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [18]:
# Learn the vocabulary from the training texts only and encode them as a
# term-count matrix.
x_traincv = cv.fit_transform(x_train)


In [19]:
# Dense copy of the count matrix.
# NOTE(review): in the cells visible here only its length is used afterwards;
# x_traincv.shape[0] gives the same number without the dense allocation.
xtrain_arrCV = x_traincv.toarray()

In [20]:
# Number of training samples (rows of the dense count matrix).
print(xtrain_arrCV.shape[0])

5956


### TF-IDF Vectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# TF-IDF vectorizer that drops the built-in English stop words; min_df=1
# keeps every remaining term.
# NOTE(review): `cv` was created without stop-word removal, so the two feature
# sets differ in more than just the weighting scheme.
tv = TfidfVectorizer(min_df=1,stop_words='english')

In [26]:
# Fit the TF-IDF vocabulary and weights on the training texts and encode them.
x_traintv = tv.fit_transform(x_train)

In [27]:
# NOTE(review): this repeats the fit_transform from the preceding cell on the
# same data, so one of the two cells is redundant.
x_traintv = tv.fit_transform(x_train)

In [28]:
# Dense view of the TF-IDF matrix, displayed for inspection only (the result
# is not stored).
x_traintv.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Naive Bayes with TF-IDF Vectorizer

In [29]:
from sklearn.naive_bayes import MultinomialNB

In [30]:
# Multinomial Naive Bayes classifier with default hyperparameters.
mnb = MultinomialNB()

In [35]:
# Cast the training labels to an integer dtype before fitting.
y_train = y_train.astype(int)

In [37]:
# Train the classifier on the TF-IDF features.
mnb.fit(x_traintv,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [38]:
# Encode the test texts with the vectorizer fitted on the training set
# (transform only, no refit).
x_testtv = tv.transform(x_test)

In [39]:
# Inspect the frame again; the emotion column now holds the integer codes.
df.head()

Unnamed: 0,emotion,text
0,4,On days when I feel close to my partner and ot...
1,2,Every time I imagine that someone I love or I ...
2,0,When I had been obviously unjustly treated and...
3,5,When I think about the short time that we live...
4,1,At a gathering I found myself involuntarily si...


In [40]:
# Predicted class codes for the test set from the TF-IDF model.
pred = mnb.predict(x_testtv)

In [41]:
# Display the predicted class codes.
pred

array([2, 1, 6, ..., 1, 4, 4])

In [42]:
# Copy the true test labels into a NumPy array so they can be compared with
# `pred` position by position.
actual = np.array(y_test)

In [43]:
# Count the test samples whose predicted label equals the true label.
count = sum(1 for i in range(len(pred)) if pred[i] == actual[i])

In [44]:
# Accuracy of the TF-IDF model: correct predictions / number of test samples.
accuracy = count / pred.shape[0]
accuracy

0.5342281879194631

### Naive Bayes with Count Vectorizer

In [45]:
# Second Multinomial Naive Bayes model, to be trained on the raw term counts.
mnbcv =  MultinomialNB()

In [46]:
# Cast the training labels to an integer dtype before fitting.
y_train = y_train.astype(int)

In [47]:
# Train the classifier on the term-count features.
mnbcv.fit(x_traincv,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [48]:
# Encode the test texts with the vocabulary fitted on the training set
# (transform only, no refit).
x_testcv = cv.transform(x_test)

In [49]:
# Predicted class codes for the test set from the count-based model.
# NOTE(review): this rebinds `pred`, replacing the TF-IDF model's predictions.
pred = mnbcv.predict(x_testcv)

In [50]:
# Display the predictions for the test data.
pred

array([2, 1, 6, ..., 6, 1, 4])

In [51]:
# Copy the true test labels into a NumPy array so they can be compared with
# `pred` position by position.
actual = np.array(y_test)

In [52]:
# Display the true labels for comparison with the predictions.
actual

array([2, 1, 6, ..., 0, 6, 3])

In [53]:
# Count the test samples the count-based model classified correctly.
countcv = sum(1 for i in range(len(pred)) if pred[i] == actual[i])

In [54]:
# Number of correct predictions.
countcv

808

In [55]:
# Number of test samples that were scored.
len(pred)

1490

In [56]:
# Accuracy of the count-based model: correct predictions / number of test samples.
accuracy = countcv / pred.shape[0]
accuracy

0.5422818791946309