<a href="https://colab.research.google.com/github/sbmshukla/SentimentAnalysis/blob/master/Emotions_Using_ML_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv('/content/drive/MyDrive/emotions/train.txt', sep=';', header=None, names= ['text', 'emotions'])

In [None]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [None]:
df.isna().sum()

Unnamed: 0,0
text,0
emotions,0


In [None]:
# Give every distinct emotion label an integer id, numbered in the order the
# labels first appear in the `emotions` column.
unique_emotions = df['emotions'].unique()
emotions_numbers = dict(zip(unique_emotions, range(len(unique_emotions))))

In [None]:
emotions_numbers

{'sadness': 0, 'anger': 1, 'love': 2, 'surprise': 3, 'fear': 4, 'joy': 5}

In [None]:
# Keep the integer ids in their own column instead of overwriting `emotions`.
# Overwriting made this cell non-idempotent (running it a second time maps the
# integers through a str-keyed dict and produces all-NaN) and disagrees with the
# string labels shown by every later output (df.head(), classification_report,
# y_pred). The models below are trained on the readable string labels.
df['emotion_id'] = df['emotions'].map(emotions_numbers)

In [None]:
df.head(5)

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [None]:
df['text'] = df['text'].apply(lambda x: x.lower())

In [None]:
import string

def remove_punctuation(text):
  """Return `text` with every character listed in `string.punctuation` removed."""
  punctuation_chars = set(string.punctuation)
  return ''.join(ch for ch in text if ch not in punctuation_chars)

In [None]:
df['text'] = df['text'].apply(remove_punctuation)

In [None]:
def remove_numbers(text):
  """Return `text` without any character for which `str.isdigit()` is true."""
  return ''.join(ch for ch in text if not ch.isdigit())

df['text'] = df['text'].apply(remove_numbers)

In [None]:
import re

def remove_url(text: str) -> str:
    """Remove URL-like tokens from `text`.

    A token is removed when it starts with "http" (which also covers "https")
    or "www" and is followed by at least one non-whitespace character.

    The leading word boundary matters: without it, ordinary words that merely
    contain "www" or "http" in the middle were truncated (e.g. "awwww" became
    "a"). Such words are now left intact.

    Args:
        text: Input string.

    Returns:
        `text` with the matching tokens deleted; surrounding whitespace is kept.
    """
    # `http\S+` already matches https URLs, so no separate alternative is needed.
    return re.sub(r'\b(?:http|www)\S+', '', text)

df['text'] = df['text'].apply(remove_url)

In [None]:
def remove_emojies(text):
    """Return `text` with only its ASCII characters kept.

    Note: every non-ASCII character is dropped, not just emoji (accented
    letters, for example, are removed as well).
    """
    return ''.join(filter(str.isascii, text))

df['text'] = df['text'].apply(remove_emojies)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
stop_words = set(stopwords.words('english'))

len(stop_words)

198

In [None]:
df.loc[1]['text']

'go feeling hopeless damned hopeful around someone cares awake'

In [None]:
def remove_stopword(text):
  """Tokenize `text` with `word_tokenize` and drop tokens present in `stop_words`.

  Returns the surviving tokens joined by single spaces.
  """
  kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
  return ' '.join(kept_tokens)

In [None]:
df['text'] = df['text'].apply(remove_stopword)

In [None]:
df.loc[1]['text']

'go feeling hopeless damned hopeful around someone cares awake'

In [None]:
df.head()

Unnamed: 0,text,emotions
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['emotions'], test_size=0.20, random_state=42)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
bow_vectorizer = CountVectorizer()

In [None]:
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [None]:
X_train_bow

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 200058 stored elements and shape (12800, 13501)>

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
nb = MultinomialNB()

In [None]:
nb.fit(X_train_bow, y_train)

In [None]:
y_pred = nb.predict(X_test_bow)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

In [None]:
accuracy_score(y_test, y_pred)

0.7390625

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.90      0.56      0.69       427
        fear       0.85      0.50      0.63       397
         joy       0.69      0.97      0.80      1021
        love       0.92      0.16      0.28       296
     sadness       0.74      0.94      0.83       946
    surprise       1.00      0.04      0.07       113

    accuracy                           0.74      3200
   macro avg       0.85      0.53      0.55      3200
weighted avg       0.78      0.74      0.70      3200



In [None]:
y_pred

array(['sadness', 'joy', 'sadness', ..., 'joy', 'joy', 'sadness'],
      dtype='<U8')

In [None]:
tfidf_vectorizer = TfidfVectorizer()

In [None]:
X_train_tfid = tfidf_vectorizer.fit_transform(X_train)
X_test_tfid = tfidf_vectorizer.transform(X_test)

In [None]:
nb = MultinomialNB()

In [None]:
nb.fit(X_train_tfid, y_train)

In [None]:
y_pred = nb.predict(X_test_tfid)

In [None]:
accuracy_score(y_test, y_pred)

0.6609375

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
lr.fit(X_train_tfid, y_train)

In [None]:
y_pred = lr.predict(X_test_tfid)

In [None]:
accuracy_score(y_test, y_pred)

0.8621875

In [None]:
from sklearn.svm import SVC

In [None]:
svc = SVC()

In [None]:
svc.fit(X_train_tfid, y_train)

In [None]:
y_pred = svc.predict(X_test_tfid)

In [None]:
accuracy_score(y_test, y_pred)

0.8521875

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.91      0.81      0.85       427
        fear       0.85      0.75      0.80       397
         joy       0.79      0.96      0.87      1021
        love       0.89      0.55      0.68       296
     sadness       0.90      0.93      0.92       946
    surprise       0.84      0.51      0.64       113

    accuracy                           0.85      3200
   macro avg       0.86      0.75      0.79      3200
weighted avg       0.86      0.85      0.85      3200



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=1000))
])

In [None]:
# Hyperparameters to tune
param_grid = {
    "tfidf__ngram_range": [(1,1), (1,2)],        # unigrams or unigrams+bigrams
    "tfidf__max_df": [0.9, 1.0],                 # ignore very frequent words
    "tfidf__min_df": [1, 2],                     # ignore rare words
    "clf__C": [0.01, 0.1, 1, 10],                # regularization strength
    "clf__penalty": ["l2"],                      # logistic regression penalty
    "clf__solver": ["lbfgs", "liblinear"]        # solvers
}

In [None]:
# Grid Search with cross-validation
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)

In [None]:
grid.fit(X_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [None]:
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)
print("Test Accuracy:", grid.score(X_test, y_test))

Best Parameters: {'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__max_df': 0.9, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 2)}
Best Cross-Validation Accuracy: 0.896484375
Test Accuracy: 0.9009375


In [None]:
y_pred = grid.predict(X_test)

In [None]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.91      0.86      0.89       427
        fear       0.87      0.85      0.86       397
         joy       0.89      0.96      0.92      1021
        love       0.86      0.75      0.80       296
     sadness       0.93      0.95      0.94       946
    surprise       0.87      0.69      0.77       113

    accuracy                           0.90      3200
   macro avg       0.89      0.84      0.86      3200
weighted avg       0.90      0.90      0.90      3200



In [None]:
y_pred

array(['sadness', 'joy', 'sadness', ..., 'joy', 'joy', 'sadness'],
      dtype=object)

In [None]:
# Final Model Building

In [None]:
def custom_preprocessor(text):
  """Lower-case `text`, then run the cleaning helpers over it in a fixed order.

  Order: punctuation removal, digit removal, URL removal, non-ASCII removal,
  stop-word removal. Returns the cleaned string.
  """
  cleaning_steps = (
      remove_punctuation,
      remove_numbers,
      remove_url,
      remove_emojies,
      remove_stopword,
  )

  cleaned = text.lower()
  for step in cleaning_steps:
    cleaned = step(cleaned)

  return cleaned

# Pipeline: TF-IDF features (raw text is cleaned by `custom_preprocessor` first)
# feeding a logistic-regression classifier.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(preprocessor=custom_preprocessor)),
    ("clf", LogisticRegression(max_iter=1000))
])

# Hyperparameters to tune
param_grid = {
    # Unigrams must stay in the vocabulary. The previous value, [(2,3)], kept only
    # bigrams/trigrams: cross-validation accuracy fell from ~0.90 to ~0.70 and short
    # inputs (which have few or no bigrams once stop words are removed) were all
    # predicted as 'joy'.
    "tfidf__ngram_range": [(1,1), (1,2)],        # unigrams or unigrams+bigrams
    "tfidf__max_df": [0.9, 1.0],                 # ignore very frequent words
    "tfidf__min_df": [1, 2],                     # ignore rare words
    "clf__C": [0.01, 0.1, 1, 10],                # regularization strength
    "clf__penalty": ["l2"],                      # logistic regression penalty
    "clf__solver": ["lbfgs", "liblinear"]        # solvers
}

# Grid Search with 5-fold cross-validation, scored by accuracy, using all CPU cores
grid = GridSearchCV(pipeline, param_grid, cv=5, scoring="accuracy", n_jobs=-1, verbose=1)

In [None]:
# Fit the search on the training split only. Fitting on the full `df` (which
# contains the X_test rows) leaks the test set into training and makes the
# "Test Accuracy" printed in the next cell meaningless as a held-out estimate.
grid.fit(X_train, y_train)

Fitting 5 folds for each of 32 candidates, totalling 160 fits


In [None]:
print("Best Parameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)
print("Test Accuracy:", grid.score(X_test, y_test))

Best Parameters: {'clf__C': 10, 'clf__penalty': 'l2', 'clf__solver': 'liblinear', 'tfidf__max_df': 0.9, 'tfidf__min_df': 1, 'tfidf__ngram_range': (2, 3)}
Best Cross-Validation Accuracy: 0.7010625
Test Accuracy: 0.9978125


In [None]:
# Get best model (pipeline with best params)
best_model = grid.best_estimator_

In [None]:
# Test samples
test_texts = [
    "Prediction Is Not That Much Good",
    "Good Man",
    "very bad situation"
]

# Predict
predictions = best_model.predict(test_texts)

print(predictions)

['joy' 'joy' 'joy']


In [None]:
import pickle

# Serialize the fitted GridSearchCV object (`grid`) to grid.pkl in the current
# working directory.
# NOTE(review): the pipeline inside `grid` uses the notebook-defined
# `custom_preprocessor`; pickle records functions by reference, so loading this
# file presumably requires that function to be importable under the same module
# name — confirm before relying on the file outside this notebook.
with open("grid.pkl", "wb") as f:
    pickle.dump(grid, f)

In [None]:
%%writefile preprocesser.py
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

nltk.download('stopwords')

nltk.download('punkt_tab')

stop_words = set(stopwords.words('english'))

def remove_punctuation(text):
  """Return `text` with every character listed in `string.punctuation` removed."""
  punctuation_chars = set(string.punctuation)
  return ''.join(ch for ch in text if ch not in punctuation_chars)

def remove_numbers(text):
  """Return `text` without any character for which `str.isdigit()` is true."""
  return ''.join(ch for ch in text if not ch.isdigit())

def remove_url(text: str) -> str:
    """Remove URL-like tokens from `text`.

    A token is removed when it starts with "http" (which also covers "https")
    or "www" and is followed by at least one non-whitespace character.

    The leading word boundary matters: without it, ordinary words that merely
    contain "www" or "http" in the middle were truncated (e.g. "awwww" became
    "a"). Such words are now left intact.

    Args:
        text: Input string.

    Returns:
        `text` with the matching tokens deleted; surrounding whitespace is kept.
    """
    # `http\S+` already matches https URLs, so no separate alternative is needed.
    return re.sub(r'\b(?:http|www)\S+', '', text)


def remove_emojies(text):
    """Return `text` with only its ASCII characters kept.

    Note: every non-ASCII character is dropped, not just emoji (accented
    letters, for example, are removed as well).
    """
    return ''.join(filter(str.isascii, text))

def remove_stopword(text):
  """Tokenize `text` with `word_tokenize` and drop tokens present in `stop_words`.

  Returns the surviving tokens joined by single spaces.
  """
  kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
  return ' '.join(kept_tokens)



def custom_preprocessor(text):
  """Lower-case `text`, then run the cleaning helpers over it in a fixed order.

  Order: punctuation removal, digit removal, URL removal, non-ASCII removal,
  stop-word removal. Returns the cleaned string.
  """
  cleaning_steps = (
      remove_punctuation,
      remove_numbers,
      remove_url,
      remove_emojies,
      remove_stopword,
  )

  cleaned = text.lower()
  for step in cleaning_steps:
    cleaned = step(cleaned)

  return cleaned

Writing preprocesser.py


In [None]:
# Take the preprocessor from the preprocesser.py module written by the previous
# cell instead of the notebook-level definition. pickle stores functions by module
# path, so a pipeline that references `__main__.custom_preprocessor` cannot be
# unpickled in another process, whereas `preprocesser.custom_preprocessor` can be
# resolved wherever preprocesser.py is importable.
from preprocesser import custom_preprocessor

# TF-IDF features (raw text cleaned by `custom_preprocessor`) + logistic regression.
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(preprocessor=custom_preprocessor)),
    ("clf", LogisticRegression(max_iter=1000))
])

In [None]:
pipeline.fit(df['text'], df['emotions'])

In [None]:
import pickle

# Serialize the fitted `pipeline` object to lr_pipe.pkl in the current working
# directory.
# NOTE(review): the pipeline holds a reference to `custom_preprocessor`; pickle
# records functions by reference, so loading this file presumably requires that
# function to be importable under the same module name — confirm.
with open("lr_pipe.pkl", "wb") as f:
    pickle.dump(pipeline, f)

In [None]:
# `pipeline` was fit on the full `df`, which includes the X_test rows, so scoring it
# on X_test overstates accuracy. Score an unfitted clone trained on the training
# split only; the full-data `pipeline` (the one that was pickled) is left untouched.
from sklearn.base import clone

eval_pipeline = clone(pipeline).fit(X_train, y_train)
accuracy_score(y_test, eval_pipeline.predict(X_test))

0.9484375