### End 2 End NLP Project
+ Emotion Detection In Text
+ Text Classifier

In [1]:
# Load EDA Pkgs
import pandas as pd
import numpy as np

In [2]:
file_list = ['train.csv', 'test.csv','val.csv']
column_names = ['Text', 'Emotion']
df = pd.concat(map(lambda file: pd.read_csv(file, sep=';', header=None, names=column_names), file_list))

In [3]:
df.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# Value Counts
df['Emotion'].value_counts()

joy         6761
sadness     5797
anger       2709
fear        2373
love        1641
surprise     719
Name: Emotion, dtype: int64

In [5]:
!python -m pip install neattext




[notice] A new release of pip is available: 23.0 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
# Load Text Cleaning Pkgs
import neattext.functions as nfx

In [7]:
# Data Cleaning
dir(nfx)

['BTC_ADDRESS_REGEX',
 'CURRENCY_REGEX',
 'CURRENCY_SYMB_REGEX',
 'Counter',
 'DATE_REGEX',
 'EMAIL_REGEX',
 'EMOJI_REGEX',
 'HASTAG_REGEX',
 'MASTERCard_REGEX',
 'MD5_SHA_REGEX',
 'MOST_COMMON_PUNCT_REGEX',
 'NUMBERS_REGEX',
 'PHONE_REGEX',
 'PoBOX_REGEX',
 'SPECIAL_CHARACTERS_REGEX',
 'STOPWORDS',
 'STOPWORDS_de',
 'STOPWORDS_en',
 'STOPWORDS_es',
 'STOPWORDS_fr',
 'STOPWORDS_ru',
 'STOPWORDS_yo',
 'STREET_ADDRESS_REGEX',
 'TextFrame',
 'URL_PATTERN',
 'USER_HANDLES_REGEX',
 'VISACard_REGEX',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__generate_text',
 '__loader__',
 '__name__',
 '__numbers_dict',
 '__package__',
 '__spec__',
 '_lex_richness_herdan',
 '_lex_richness_maas_ttr',
 'clean_text',
 'defaultdict',
 'digit2words',
 'extract_btc_address',
 'extract_currencies',
 'extract_currency_symbols',
 'extract_dates',
 'extract_emails',
 'extract_emojis',
 'extract_hashtags',
 'extract_html_tags',
 'extract_mastercard_addr',
 'extract_md5sha',
 'extract_numbers',
 'extr

In [8]:
# User handles
df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles)

In [9]:
# Stopwords
df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)

In [10]:
df

Unnamed: 0,Text,Emotion,Clean_Text
0,i didnt feel humiliated,sadness,didnt feel humiliated
1,i can go from feeling so hopeless to so damned...,sadness,feeling hopeless damned hopeful cares awake
2,im grabbing a minute to post i feel greedy wrong,anger,im grabbing minute post feel greedy wrong
3,i am ever feeling nostalgic about the fireplac...,love,feeling nostalgic fireplace know property
4,i am feeling grouchy,anger,feeling grouchy
...,...,...,...
1995,im having ssa examination tomorrow in the morn...,sadness,im ssa examination tomorrow morning im prepare...
1996,i constantly worry about their fight against n...,joy,constantly worry fight nature push limits inne...
1997,i feel its important to share this info for th...,joy,feel important share info experience thing
1998,i truly feel that if you are passionate enough...,joy,truly feel passionate stay true succeed


In [11]:
# Load ML Pkgs
# Estimators
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# Transformers
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix

In [12]:
# Features & Labels
Xfeatures = df['Clean_Text']
ylabels = df['Emotion']

In [13]:
#  Split Data
x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.3,random_state=42)

In [14]:
# Build Pipeline
from sklearn.pipeline import Pipeline

In [15]:
# LogisticRegression Pipeline
pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])

In [16]:
# Train and Fit Data
pipe_lr.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
pipe_lr

In [18]:
# Check Accuracy
pipe_lr.score(x_test,y_test)

0.8885

In [19]:
# Make A Prediction
sample1 = "This chocholate was very sweet it made me happy"

In [20]:
pipe_lr.predict([sample1])

array(['joy'], dtype=object)

In [21]:
y_pred = pipe_lr.predict(x_test)

class_report = classification_report(y_test, y_pred)
print(class_report)

              precision    recall  f1-score   support

       anger       0.92      0.86      0.89       831
        fear       0.85      0.83      0.84       697
         joy       0.89      0.94      0.91      1980
        love       0.82      0.77      0.79       507
     sadness       0.92      0.92      0.92      1755
    surprise       0.83      0.73      0.77       230

    accuracy                           0.89      6000
   macro avg       0.87      0.84      0.85      6000
weighted avg       0.89      0.89      0.89      6000



In [22]:
conf_matrix = confusion_matrix(y_test, y_pred)

confusion_df = pd.DataFrame(conf_matrix, columns=pipe_lr.classes_, index=pipe_lr.classes_)

print(confusion_df)

          anger  fear   joy  love  sadness  surprise
anger       718    22    32     4       53         2
fear         21   576    21     4       51        24
joy           6    15  1862    64       28         5
love          2     3   105   390        6         1
sadness      36    27    57    14     1618         3
surprise      1    37    14     0       11       167
