## Importing libraries 

In [1]:
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## Importing datasets

I had a single column containing both input and output values separated by a semicolon (';'), and I segregated them by using ';' as the delimiter to create two separate columns.

In [3]:
with open("train.txt", "r") as file:
    data = file.readlines()
df = pd.DataFrame(data, columns=["text"])

# Split each row by semicolon, extract the emotion part, and remove it from the original text
df["emotion"] = df["text"].apply(lambda x: x.split(';')[1].strip() if ';' in x else "")
df["text"] = df["text"].apply(lambda x: x.split(';')[0].strip() if ';' in x else x)
print(df)

                                                    text  emotion
0                                i didnt feel humiliated  sadness
1      i can go from feeling so hopeless to so damned...  sadness
2       im grabbing a minute to post i feel greedy wrong    anger
3      i am ever feeling nostalgic about the fireplac...     love
4                                   i am feeling grouchy    anger
...                                                  ...      ...
15995  i just had a very brief time in the beanbag an...  sadness
15996  i am now turning and i feel pathetic that i am...  sadness
15997                     i feel strong and good overall      joy
15998  i feel like this was such a rude comment and i...    anger
15999  i know a lot but i feel so stupid because i ca...  sadness

[16000 rows x 2 columns]


#unique output

In [21]:
df["emotion"].value_counts()

joy         695
sadness     581
anger       275
fear        224
love        159
surprise     66
Name: emotion, dtype: int64

In [4]:
df["text"][0]

'i didnt feel humiliated'

## data cleaning & Pre-processing

In [5]:
a=nltk.download('stopwords')# inbuild stopwords
a

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
pip install --upgrade nltk

Note: you may need to restart the kernel to use updated packages.




In [7]:
ps=PorterStemmer()
ps

<PorterStemmer>

In [8]:
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [9]:
corpus

['didnt feel humili',
 'go feel hopeless damn hope around someon care awak',
 'im grab minut post feel greedi wrong',
 'ever feel nostalg fireplac know still properti',
 'feel grouchi',
 'ive feel littl burden late wasnt sure',
 'ive take milligram time recommend amount ive fallen asleep lot faster also feel like funni',
 'feel confus life teenag jade year old man',
 'petrona year feel petrona perform well made huge profit',
 'feel romant',
 'feel like make suffer see mean someth',
 'feel run divin experi expect type spiritu encount',
 'think easiest time year feel dissatisfi',
 'feel low energi thirsti',
 'immens sympathi gener point possibl proto writer tri find time write corner life sign agent let alon publish contract feel littl preciou',
 'feel reassur anxieti side',
 'didnt realli feel embarrass',
 'feel pretti pathet time',
 'start feel sentiment doll child began collect vintag barbi doll sixti',
 'feel compromis skeptic valu everi unit work put',
 'feel irrit reject without an

### Converting Text to Vector 

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X = cv.fit_transform(corpus).toarray()

In [11]:
y=pd.get_dummies(df['emotion'])
y=y.iloc[:,1].values

## Algorithm building 

In [12]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(X, y)

## Testing data 

In [13]:
#Doing it same for testing data
with open("test.txt", "r") as file:
    data = file.readlines()

# Create a DataFrame from the data
df = pd.DataFrame(data, columns=["text"])

# Split each row by semicolon, extract the emotion part, and remove it from the original text
df["emotion"] = df["text"].apply(lambda x: x.split(';')[1].strip() if ';' in x else "")
df["text"] = df["text"].apply(lambda x: x.split(';')[0].strip() if ';' in x else x)

# Display the resulting DataFrame
print(df)

                                                   text  emotion
0     im feeling rather rotten so im not very ambiti...  sadness
1             im updating my blog because i feel shitty  sadness
2     i never make her separate from me because i do...  sadness
3     i left with my bouquet of red and yellow tulip...      joy
4       i was feeling a little vain when i did this one  sadness
...                                                 ...      ...
1995  i just keep feeling like someone is being unki...    anger
1996  im feeling a little cranky negative after this...    anger
1997  i feel that i am useful to my people and that ...      joy
1998  im feeling more comfortable with derby i feel ...      joy
1999  i feel all weird when i have to meet w people ...     fear

[2000 rows x 2 columns]


In [14]:
corpus = []
for i in range(0, len(df)):
    review = re.sub('[^a-zA-Z]', ' ', df['text'][i])
    review = review.lower()
    review = review.split()
    
    review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
    review = ' '.join(review)
    corpus.append(review)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2500)
X_test = cv.fit_transform(corpus).toarray()

In [16]:
y_test=pd.get_dummies(df['emotion'])
y_test=y_test.iloc[:,1].values

In [17]:
y_pred=spam_detect_model.predict(X_test)

## Performance matrix 

In [18]:
from sklearn.metrics import accuracy_score,classification_report

In [19]:
score=accuracy_score(y_test,y_pred)
print(score)

0.7755


In [20]:
from sklearn.metrics import classification_report
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      1741
           1       0.08      0.07      0.07       259

    accuracy                           0.78      2000
   macro avg       0.47      0.47      0.47      2000
weighted avg       0.76      0.78      0.77      2000

