In [1]:
import re
import nltk

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from nltk.corpus import stopwords

nltk.download('wordnet')
nltk.download('stopwords')

from sklearn.model_selection import train_test_split, cross_val_score

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Mounting the Google Drive
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [3]:
columns = ["emotion", "text"]
df = pd.read_csv("drive/My Drive/Training_AI_Engineers/isear_dataset.csv", names=columns)

In [4]:
df.head()


Unnamed: 0,emotion,text
0,joy,On days when I feel close to my partner and ot...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [5]:
import nltk
nltk.download('wordnet')
from nltk.corpus import stopwords

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
tokenizer = nltk.tokenize.TreebankWordTokenizer()
stop_words = stopwords.words("english")
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def process_text(text):
    #convert text to lowercase
    text= text.apply(lambda x:x.lower())
    
    #remove multiple spaces from text
    text= text.apply(lambda x:re.sub(' +', ' ',x))
    
    # remove alpha numeric characeter from text using regex
    text= text.apply(lambda x:x.lower())
    text.str.replace(r"\d+", "")
    text.str.replace('[^\w\s]','')
    text.str.replace(r"[︰-＠]", "")
    text.str.replace(r"", "")
    
    #tokenize
    text = text.apply(tokenizer.tokenize)
    
    #remove english stop words form text
    text =text.apply(lambda x: [item for item in x if item not in stop_words])

    #stemming text
    # text =text.apply(lambda x: [stemmer.stem(e) for e in x])
    
    #stripping spaces from items of array of texts
    text=text.apply(lambda x: [e.strip() for e in x])
    
    #remove all the characters from array of texts if the length of item is 1
    text=text.apply( lambda x: [ y for y in x if len(y)>1 ])
    
    #remove all the digits from the text
    text=text.apply( lambda x: [ y for y in x if not y.isdigit()] )
    
    # lemattizing text
    # text = text.apply(lambda x: ' '.join(lemmatizer.lemmatize(token) for token in x))

    # joining texts
    text = text.apply(lambda x: ' '.join(token for token in x))

    return text

In [7]:
df = df.apply(lambda x: process_text(x) if x.name=='text' else x)

# Machine Learning Analysis (Naive Bayes)

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
le = LabelEncoder()

In [10]:
le.fit(df["emotion"])


LabelEncoder()

In [11]:
le.classes_

array(['anger', 'disgust', 'fear', 'guilt', 'joy', 'sadness', 'shame'],
      dtype=object)

In [12]:
df["emotion"] = le.transform(df["emotion"])
df["emotion"]

0       4
1       2
2       0
3       5
4       1
       ..
7441    0
7442    5
7443    1
7444    6
7445    3
Name: emotion, Length: 7446, dtype: int64

In [13]:
df["emotion"].value_counts()

4    1082
5    1074
0    1069
2    1063
6    1059
1    1059
3    1040
Name: emotion, dtype: int64

In [14]:
df.describe()

Unnamed: 0,emotion
count,7446.0
mean,3.002552
std,2.001643
min,0.0
25%,1.0
50%,3.0
75%,5.0
max,6.0


In [15]:
x_train, x_test, y_train, y_test = train_test_split(df['text'], df["emotion"], test_size=0.2, random_state=53)

### Count Vectorizer

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer()

In [18]:
x_traincv = cv.fit_transform(x_train)
x_traincv


<5956x7861 sparse matrix of type '<class 'numpy.int64'>'
	with 56116 stored elements in Compressed Sparse Row format>

In [19]:
xtrain_arrCV = x_traincv.toarray()
xtrain_arrCV

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [20]:
print(len(xtrain_arrCV))

5956


### Tfidf Vectorizer

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
tv = TfidfVectorizer(min_df=1,stop_words='english')

In [23]:
x_traintv = tv.fit_transform(x_train)

In [24]:
x_traintv = tv.fit_transform(x_train)

In [25]:
x_traintv.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### Naive Bayes Tfidf Vectorizer

In [26]:
from sklearn.naive_bayes import MultinomialNB

In [27]:
mnb = MultinomialNB()

In [28]:
#to make the training data integer
y_train=y_train.astype('int')

In [29]:
mnb.fit(x_traintv,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [30]:
x_testtv = tv.transform(x_test)

In [31]:
df.head()

Unnamed: 0,emotion,text
0,4,days feel close partner friends. feel peace al...
1,2,every time imagine someone love could contact ...
2,0,obviously unjustly treated possibility elucida...
3,5,think short time live relate periods life thin...
4,1,gathering found involuntarily sitting next two...


In [32]:
pred = mnb.predict(x_testtv)

In [33]:
pred

array([2, 1, 6, ..., 1, 4, 4])

In [34]:
#taking y_test to numpy array
actual = np.array(y_test)

In [35]:
count = 0
for i in range(len(pred)):
    if pred[i] == actual[i]:
        count = count+1

In [36]:
accuracy = count/len(pred)
accuracy

0.5335570469798657

### Naive-Bayes Count Vectorizer

In [37]:
mnbcv =  MultinomialNB()

In [38]:
#to make the training data integer
y_train = y_train.astype('int')
# y_train

In [39]:
mnbcv.fit(x_traincv,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [40]:
x_testcv = cv.transform(x_test)

In [41]:
pred = mnbcv.predict(x_testcv)

In [42]:
#prediction the test datas
pred

array([2, 1, 6, ..., 1, 0, 4])

In [43]:
#taking y_test to numpy array
actual = np.array(y_test)

In [44]:
actual

array([2, 1, 6, ..., 0, 6, 3])

In [45]:
countcv = 0
for i in range(len(pred)):
    if pred[i] == actual[i]:
        countcv = countcv + 1

In [46]:
countcv

804

In [47]:
len(pred)

1490

In [48]:
accuracy = countcv/len(pred)
accuracy

0.5395973154362416