<a href="https://colab.research.google.com/github/sronak/Data_Science/blob/main/Day47_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

In [5]:
data_train = pd.read_csv('mood_data.txt',names=['Text','Emotion'],sep=';')

In [6]:
data_train.shape

(16000, 2)

In [7]:
data_train.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [8]:
import string, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [9]:
def clean_text(mood):
    mood = word_tokenize(mood)
    mood = " ".join(mood)
    mood = [char for char in mood if char not in string.punctuation]
    mood = ''.join(mood)
    mood = [word for word in mood.split() if mood.lower() not in stopwords.words('english')]
    return " ".join(mood)

In [13]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
data_train['cleaned_text'] = data_train['Text'].apply(clean_text)
data_train.head()

Unnamed: 0,Text,Emotion,cleaned_text
0,i didnt feel humiliated,sadness,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing a minute to post i feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy,anger,i am feeling grouchy


In [15]:
data_train["cleaned_text"].head()

0                              i didnt feel humiliated
1    i can go from feeling so hopeless to so damned...
2     im grabbing a minute to post i feel greedy wrong
3    i am ever feeling nostalgic about the fireplac...
4                                 i am feeling grouchy
Name: cleaned_text, dtype: object

In [16]:
features = data_train['cleaned_text']
processed_features = []
for sentence in range(0, len(features)):
    processed_feature = re.sub(r'\W', ' ', str(features[sentence]))
    processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)
    processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 
    
    processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)
    processed_feature = processed_feature.lower()
    processed_features.append(processed_feature)

In [17]:
processed_features[:5]

['i didnt feel humiliated',
 'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
 'im grabbing minute to post feel greedy wrong',
 'i am ever feeling nostalgic about the fireplace will know that it is still on the property',
 'i am feeling grouchy']

In [18]:
data_train['processed_text'] = processed_features
data_train

Unnamed: 0,Text,Emotion,cleaned_text,processed_text
0,i didnt feel humiliated,sadness,i didnt feel humiliated,i didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,i can go from feeling so hopeless to so damned...,i can go from feeling so hopeless to so damned...
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing a minute to post i feel greedy wrong,im grabbing minute to post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,i am ever feeling nostalgic about the fireplac...,i am ever feeling nostalgic about the fireplac...
4,i am feeling grouchy,anger,i am feeling grouchy,i am feeling grouchy
...,...,...,...,...
15995,i just had a very brief time in the beanbag an...,sadness,i just had a very brief time in the beanbag an...,i just had very brief time in the beanbag and ...
15996,i am now turning and i feel pathetic that i am...,sadness,i am now turning and i feel pathetic that i am...,i am now turning and feel pathetic that am sti...
15997,i feel strong and good overall,joy,i feel strong and good overall,i feel strong and good overall
15998,i feel like this was such a rude comment and i...,anger,i feel like this was such a rude comment and i...,i feel like this was such rude comment and im ...


In [19]:
final_data = data_train[["processed_text","Emotion"]]
final_data

Unnamed: 0,processed_text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing minute to post feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
15995,i just had very brief time in the beanbag and ...,sadness
15996,i am now turning and feel pathetic that am sti...,sadness
15997,i feel strong and good overall,joy
15998,i feel like this was such rude comment and im ...,anger


In [20]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [51]:
def tokenize(text): 
    tk = TweetTokenizer()
    return tk.tokenize(text)

vectorizer = CountVectorizer(analyzer = 'word',tokenizer = tokenize,lowercase = True,ngram_range=(1, 1))

In [52]:
count= vectorizer.fit_transform(final_data['processed_text'])

In [53]:
count.shape

(16000, 15206)

In [54]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score, precision_score,recall_score

In [55]:
x = final_data['processed_text'].values
y = final_data['Emotion'].values

In [56]:
x_train,x_test,y_train,y_test = train_test_split(x,y,random_state=100,test_size=0.3)

In [62]:
vectorizer = TfidfVectorizer(max_features=1000)
x_train_idf = vectorizer.fit_transform(x_train)
x_test_idf = vectorizer.transform(x_test)

In [63]:
data_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(),columns=["idf_weights"])
data_idf.sort_values(by=['idf_weights'],ascending = False).head()



Unnamed: 0,idf_weights
blah,7.758809
chest,7.684701
pregnant,7.433387
computer,7.379319
dream,7.379319


In [29]:
mnb = MultinomialNB()
mnb.fit(x_train_idf, y_train)

MultinomialNB()

In [30]:
pred_mnb = mnb.predict(x_test_idf)

acc = accuracy_score(y_test, pred_mnb)

results = pd.DataFrame([['Multinomial Naive Bayes', acc]],
               columns = ['Model', 'Accuracy'])

print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.724792


In [32]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier()
clf_rf.fit(x_train_idf, y_train)

y_pred_rf = clf_rf.predict(x_test_idf)

acc = accuracy_score(y_test, y_pred_rf)

model_results = pd.DataFrame([['Random Forest(Gini)', acc]],
               columns = ['Model', 'Accuracy'])

results = results.append(model_results, ignore_index = True)
print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.724792
1      Random Forest(Gini)  0.862083


In [33]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(criterion='entropy')
clf_rf.fit(x_train_idf, y_train)

y_pred_rf = clf_rf.predict(x_test_idf)

acc = accuracy_score(y_test, y_pred_rf)

model_results = pd.DataFrame([['Random Forest(Entropy)', acc]],
               columns = ['Model', 'Accuracy'])

results = results.append(model_results, ignore_index = True)


In [35]:
from sklearn.svm import SVC
clf_svc = SVC()
clf_rf.fit(x_train_idf,y_train)

y_pred_rf =clf_rf.predict(x_test_idf)
acc = accuracy_score(y_test, y_pred_rf)

model_results = pd.DataFrame([['SVC by SVM ', acc]],
               columns = ['Model', 'Accuracy'])

results = results.append(model_results, ignore_index = True)
print(results)

                     Model  Accuracy
0  Multinomial Naive Bayes  0.724792
1      Random Forest(Gini)  0.862083
2   Random Forest(Entropy)  0.850417
3              SVC by SVM   0.848542


In [36]:
confusion_matrix(y_test,y_pred_rf)

array([[ 516,    7,   46,    5,   37,    0],
       [  34,  460,   52,    5,   42,   22],
       [  13,    7, 1494,   34,   47,    8],
       [   5,    3,   90,  270,    5,    0],
       [  30,   30,  142,    9, 1190,    6],
       [   0,   36,   10,    0,    2,  143]])

**Conclusion** : Random Forest Classifier has been Performed