In [1]:
import pandas as pd
import numpy as np
import ast

from nrclex import NRCLex
from sklearn import utils
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, naive_bayes, svm
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

In [2]:
from nltk.corpus import stopwords
stopwords = stopwords.words('english')

In [3]:
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

[nltk_data] Error loading vader_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1076)>


In [4]:
df = pd.read_csv("./Dataset Creation/well-narratives.csv")

# Cleaning

In [5]:
def clean_text(df, column_name):
    df['cleaned_text'] = df[column_name].fillna('')
    df['cleaned_text'] = df['cleaned_text'].str.lower()
    df['cleaned_text'] = df['cleaned_text'].str.replace(r'(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|rt|\d+', '')
    df['cleaned_text'] = df['cleaned_text'].str.replace(r'^\s+|\s+$', '') 
    df['cleaned_text'] = df['cleaned_text'].apply(lambda x: ' '.join([w for w in x.split() if w not in (stopwords)]))
    return df

In [6]:
df_cleaned = clean_text(df, "sentence")

# Tagging

In [7]:
def tag_sentiment(string):
    result = sid.polarity_scores(string)
    if (result['compound'] > 0):
        return "POS"
    elif (result['compound'] == 0):
        return "NEU"
    else:
        return "NEG"

In [8]:
df_cleaned["sentiment_tag"] = df_cleaned["cleaned_text"].apply(lambda x : tag_sentiment(x))

# Results

In [9]:
df_cleaned['sentiment_tag'].value_counts()

POS    1415
NEU     797
NEG     164
Name: sentiment_tag, dtype: int64

In [10]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,hmid,wid,reflection_period,original_hm,modified,num_sentence,ground_truth_category,predicted_category,match,sentence,cleaned_text,sentiment_tag
0,893,3235,2706,30396.0,3304.0,24h,I was happy when I got my driver's license ren...,True,2.0,,achievement,1.0,It had been months since they were expired.,months since expired,NEU
1,1056,23577,19134,46915.0,1458.0,24h,I actually got a chance at love again. My ex f...,True,3.0,,affection,1.0,My ex from high school came to town and showed...,ex high school came town showed great time cha...,POS
2,1139,2470,2170,29858.0,10726.0,24h,"Last month, I got selection in a Government Ex...",True,1.0,,achievement,1.0,That moment of joy I cannot really express.,moment joy cannot really express,POS
3,206,14320,11257,38999.0,916.0,24h,I went to the gym and had a good workout.,True,1.0,,exercise,0.0,I went to the gym and had a good workout.,went gym good workout,POS
4,1184,9569,7579,35307.0,1929.0,24h,I went to a new italian restaurant that recent...,True,4.0,,enjoy_the_moment,0.0,I went to a new italian restaurant that recent...,went new italian restaurant recently opened,NEU


# Filtering

In [11]:
def filter_sentiment(row):
    text_object = NRCLex(row["cleaned_text"])
    return str(text_object.affect_frequencies)

In [12]:
df_cleaned["sentiments"] = df_cleaned.apply(filter_sentiment, axis = 1)

In [13]:
df_cleaned.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,hmid,wid,reflection_period,original_hm,modified,num_sentence,ground_truth_category,predicted_category,match,sentence,cleaned_text,sentiment_tag,sentiments
0,893,3235,2706,30396.0,3304.0,24h,I was happy when I got my driver's license ren...,True,2.0,,achievement,1.0,It had been months since they were expired.,months since expired,NEU,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."
1,1056,23577,19134,46915.0,1458.0,24h,I actually got a chance at love again. My ex f...,True,3.0,,affection,1.0,My ex from high school came to town and showed...,ex high school came town showed great time cha...,POS,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."
2,1139,2470,2170,29858.0,10726.0,24h,"Last month, I got selection in a Government Ex...",True,1.0,,achievement,1.0,That moment of joy I cannot really express.,moment joy cannot really express,POS,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."
3,206,14320,11257,38999.0,916.0,24h,I went to the gym and had a good workout.,True,1.0,,exercise,0.0,I went to the gym and had a good workout.,went gym good workout,POS,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."
4,1184,9569,7579,35307.0,1929.0,24h,I went to a new italian restaurant that recent...,True,4.0,,enjoy_the_moment,0.0,I went to a new italian restaurant that recent...,went new italian restaurant recently opened,NEU,"{'fear': 0.0, 'anger': 0.0, 'anticip': 0.0, 't..."


# SVM Model

In [14]:
sentiments = pd.DataFrame(columns=['cleaned_text', 'tag', 'fear', 'anger', 'anticipation', 'trust', 'surprise', 
                                   'positive', 'negative', 'sadness', 'disgust', 'joy'])

In [15]:
for index, row in df_cleaned.iterrows():
    narrative = row['cleaned_text']
    tag = row['sentiment_tag']
    breakdown = ast.literal_eval(row['sentiments'])
    
    breakdown['cleaned_text'] = narrative
    breakdown['tag'] = tag
    sentiments = sentiments.append(breakdown, ignore_index=True)

In [16]:
sentiments = sentiments.replace(np.nan, 0)

In [17]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(sentiments[['fear', 'anger', 'anticipation', 'trust', 'surprise', 
                                   'positive', 'negative', 'sadness', 'disgust', 'joy']],sentiments['tag'],test_size=0.3,random_state=42)

In [18]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(Train_X,Train_Y)

SVC(gamma='auto', kernel='linear')

In [19]:
predictions_SVM = SVM.predict(Test_X)
print("SVM Accuracy Score: ",accuracy_score(predictions_SVM, Test_Y)*100)

SVM Accuracy Score:  62.833099579242635


In [20]:
print(classification_report(Test_Y, predictions_SVM))

              precision    recall  f1-score   support

         NEG       0.00      0.00      0.00        65
         NEU       0.59      0.43      0.50       238
         POS       0.64      0.84      0.73       410

    accuracy                           0.63       713
   macro avg       0.41      0.42      0.41       713
weighted avg       0.56      0.63      0.59       713



  _warn_prf(average, modifier, msg_start, len(result))


# Naive Bayes

In [22]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(Train_X,Train_Y)

predictions_NB = Naive.predict(Test_X)

print("Naive Bayes Accuracy Score: ", accuracy_score(predictions_NB, Test_Y)*100)

Naive Bayes Accuracy Score:  57.503506311360454


In [23]:
print(classification_report(Test_Y, predictions_NB))

              precision    recall  f1-score   support

         NEG       0.00      0.00      0.00        65
         NEU       0.00      0.00      0.00       238
         POS       0.58      1.00      0.73       410

    accuracy                           0.58       713
   macro avg       0.19      0.33      0.24       713
weighted avg       0.33      0.58      0.42       713



  _warn_prf(average, modifier, msg_start, len(result))
