In [105]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string

In [94]:
#reading the data
train_data = pd.read_csv('train.txt')
test_data = pd.read_csv('test.txt')
val_data = pd.read_csv('val.txt')

In [95]:
#data processing and cleaning training data
train_data.shift(periods=1)[0] = 'i didnt feel humiliated;sadness'
train_data.rename(columns={'i didnt feel humiliated;sadness': 'Message'}, inplace = True)
train_data[['Message', 'Class']] = train_data.Message.str.split(";", expand = True)

In [96]:
#data processing and cleaning testing data
test_data.shift(periods=1)[0] = 'im feeling rather rotten so im not very ambitious right now;sadness'
test_data.rename(columns={'im feeling rather rotten so im not very ambitious right now;sadness': 'Message'}, inplace = True)
test_data[['Message', 'Class']] = test_data.Message.str.split(";", expand = True)

In [97]:
#data processing and cleaning vaidation data
val_data.shift(periods=1)[0] = 'im feeling quite sad and sorry for myself but ill snap out of it soon;sadness'
val_data.rename(columns={'im feeling quite sad and sorry for myself but ill snap out of it soon;sadness': 'Message'}, inplace = True)
val_data[['Message', 'Class']] = val_data.Message.str.split(";", expand = True)

In [98]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\NIDHISH\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\NIDHISH\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [106]:
#the function to process the data
def text_process(mess):
    """
    Takes in a string of text, then performs the following:
    1. Remove all punctuation
    2. Remove all stopwords
    3. Returns a list of the cleaned text
    """
    # Check characters to see if they are in punctuation
    nopunc = [char for char in mess if char not in string.punctuation]

    # Join the characters again to form the string.
    nopunc = ''.join(nopunc)
    
    # Now just remove any stopwords
    return [word for word in nopunc.split() if word.lower() not in stopwords.words('english')]

In [114]:
from sklearn.svm import SVC
model = SVC()

In [115]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [130]:
from sklearn.pipeline import Pipeline

#making a pipeline for processing the data
pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', SVC()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [131]:
#fitting the data
pipeline.fit(train_data['Message'],train_data['Class'])

Pipeline(steps=[('bow',
                 CountVectorizer(analyzer=<function text_process at 0x0000016F18766048>)),
                ('tfidf', TfidfTransformer()), ('classifier', SVC())])

In [149]:
#predicting on validation data
predictions = pipeline.predict(val_data['Message'])

In [150]:
from sklearn.metrics import classification_report, confusion_matrix

In [151]:
#printing the classification report for va_data
print(classification_report(predictions,val_data['Class']))

              precision    recall  f1-score   support

       anger       0.81      0.91      0.86       246
        fear       0.77      0.84      0.80       195
         joy       0.96      0.84      0.89       803
        love       0.63      0.90      0.75       125
     sadness       0.93      0.89      0.91       574
    surprise       0.64      0.93      0.76        56

    accuracy                           0.87      1999
   macro avg       0.79      0.88      0.83      1999
weighted avg       0.88      0.87      0.87      1999



In [153]:
#predicting and printing the the classification report on test_data
predictions = pipeline.predict(test_data['Message'])
print(classification_report(predictions,test_data['Class']))

              precision    recall  f1-score   support

       anger       0.83      0.90      0.86       255
        fear       0.80      0.86      0.83       207
         joy       0.96      0.83      0.89       804
        love       0.54      0.82      0.65       105
     sadness       0.92      0.90      0.91       589
    surprise       0.52      0.87      0.65        39

    accuracy                           0.86      1999
   macro avg       0.76      0.86      0.80      1999
weighted avg       0.88      0.86      0.87      1999



In [197]:
#example from test_data
message = test_data['Message'].iloc[249]
print('message:',message)
print("Emotion:", test_data['Class'].iloc[4])
print("Expected Emotion:", predictions[4])


message: i feel like it was all in vain cant be right and feel this wrong this heart of mine is just
Emotion: fear
Expected Emotion: fear
