# Assignment 2- Sentiment Analysis


In [1]:
!pip install vaderSentiment



## Import necessary packages and libraries

In [2]:
import pandas as pd
import numpy as np
import os

import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#confusion matrix
from sklearn.metrics import confusion_matrix, classification_report,  accuracy_score
import re

import warnings
warnings.filterwarnings('ignore')

## Load dataset

- The dataset I've chosen today is the US Twitter Airline Sentiment dataset
- Link: https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment

In [3]:
file_path = './Tweets.csv'

In [4]:
df = pd.read_csv(file_path)

In [5]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


## Filter out necessary columns

- Columns we want:
  - `airline_sentiment`: containing ground truth sentiment for a given text review
  - `text`:  original tweet containing text of airline review

In [6]:
df_tweets = df[['airline_sentiment', 'text']]

## Sentiment classes
- positive
- negative
- neutral

In [7]:
df_tweets['airline_sentiment'].value_counts()

negative    9178
neutral     3099
positive    2363
Name: airline_sentiment, dtype: int64

## Work Done
- Run VADER on the dataset without any preprocessing steps - I feed the raw twitter data including special characters
- Run VADER after performing some preprocessing steps

# Part 1

## VADER without preprocessing

In [8]:
df_tweets_og = df_tweets.copy()

In [9]:
df_tweets_og

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...
...,...,...
14635,positive,@AmericanAir thank you we got on a different f...
14636,negative,@AmericanAir leaving over 20 minutes Late Flig...
14637,neutral,@AmericanAir Please bring American Airlines to...
14638,negative,"@AmericanAir you have my money, you change my ..."


Initializing the analyzer

In [10]:
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(df_tweets_og['text'][0])

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

Performing sentiment analysis on raw data

In [11]:
for i in ['pos', 'neg', 'neu', 'compound']:
  df_tweets_og[i] = df_tweets_og['text'].apply(lambda comment: analyzer.polarity_scores(comment)[i])

In [12]:
df_tweets_og[['text', 'pos', 'neg', 'neu', 'compound', 'airline_sentiment']]

Unnamed: 0,text,pos,neg,neu,compound,airline_sentiment
0,@VirginAmerica What @dhepburn said.,0.000,0.000,1.000,0.0000,neutral
1,@VirginAmerica plus you've added commercials t...,0.000,0.000,1.000,0.0000,positive
2,@VirginAmerica I didn't today... Must mean I n...,0.000,0.000,1.000,0.0000,neutral
3,@VirginAmerica it's really aggressive to blast...,0.129,0.226,0.645,-0.2716,negative
4,@VirginAmerica and it's a really big bad thing...,0.000,0.296,0.704,-0.5829,negative
...,...,...,...,...,...,...
14635,@AmericanAir thank you we got on a different f...,0.200,0.000,0.800,0.3612,positive
14636,@AmericanAir leaving over 20 minutes Late Flig...,0.071,0.136,0.793,-0.4043,negative
14637,@AmericanAir Please bring American Airlines to...,0.277,0.000,0.723,0.3182,neutral
14638,"@AmericanAir you have my money, you change my ...",0.129,0.000,0.871,0.5027,negative


In [13]:
df_tweets_og.iloc[14635]

airline_sentiment                                             positive
text                 @AmericanAir thank you we got on a different f...
pos                                                                0.2
neg                                                                0.0
neu                                                                0.8
compound                                                        0.3612
Name: 14635, dtype: object

## Categorizing reviews
- change ground truth (gt) string sentiment to numbers
  - `positive`: 1
  - `negative`: 2
  - `neutral`: 0

In [14]:
#metrics
#convert airline_sentiment to numbers
df_tweets_og['airline_sentiment_num_gt'] = ""
for i in df_tweets_og.index:
  if df_tweets_og.loc[i, 'airline_sentiment'] == 'positive':
    df_tweets_og.loc[i, 'airline_sentiment_num_gt'] = 1
  elif df_tweets_og.loc[i, 'airline_sentiment'] == 'neutral':
    df_tweets_og.loc[i, 'airline_sentiment_num_gt'] = 0
  elif df_tweets_og.loc[i, 'airline_sentiment'] == 'negative':
    df_tweets_og.loc[i, 'airline_sentiment_num_gt'] = 2

df_tweets_og['airline_sentiment_num_gt'] = df_tweets_og['airline_sentiment_num_gt'].astype(int)

In [15]:
df_tweets_og['airline_sentiment_num_pred'] = ""
for i in df_tweets_og.index:
  if df_tweets_og.loc[i, 'compound'] > 0:
    df_tweets_og.loc[i, 'airline_sentiment_num_pred'] = 1
  elif df_tweets_og.loc[i, 'compound'] < 0:
    df_tweets_og.loc[i, 'airline_sentiment_num_pred'] = 2
  else:
    df_tweets_og.loc[i, 'airline_sentiment_num_pred'] = 0

df_tweets_og['airline_sentiment_num_pred'] = df_tweets_og['airline_sentiment_num_pred'].astype(int)

Get accuracy score based on predicted sentiment and ground truth sentiment

In [16]:
accuracy_score(df_tweets_og['airline_sentiment_num_gt'].values, df_tweets_og['airline_sentiment_num_pred'].values)

0.4948770491803279

Generate a confusion matrix and classification report

In [17]:
confusion_matrix(df_tweets_og['airline_sentiment_num_gt'].values, df_tweets_og['airline_sentiment_num_pred'].values)

array([[ 966, 1767,  366],
       [ 138, 2153,   72],
       [1219, 3833, 4126]])

In [18]:
print(classification_report(df_tweets_og['airline_sentiment_num_gt'].values, df_tweets_og['airline_sentiment_num_pred'].values,
                            target_names=['neutral', 'positive', 'negative']))

              precision    recall  f1-score   support

     neutral       0.42      0.31      0.36      3099
    positive       0.28      0.91      0.43      2363
    negative       0.90      0.45      0.60      9178

    accuracy                           0.49     14640
   macro avg       0.53      0.56      0.46     14640
weighted avg       0.70      0.49      0.52     14640



## Results on Raw Data

- out of the 9178 negative sentences, 4126 are classified correctly
- out of the 3099 neutral sentences, 966 are classified correctly 
- out of the 2363 positive sentences, 2153 are classified correctly

- overall accuracy for this dataset without preprocessing is ~49.48%

## Interpretation

- `df_tweets_og.iloc[14635]['text']`: the text in index `14635` is `@AmericanAir thank you we got on a different flight to Chicago`
- The ground truth is `positive`
- VADER's scores:
  - positive: 0.2
  - negative: 0.0
  - neutral: 0.8
  - compound score: 0.3612
- We can see that VADER is sure that it isn't a statement with `negative` sentiment as the `negative` score is 0.0
- Although, it's leaning more towards the statement being `neutral` as the `positive` score is less than `neutral` score
- However, the `compound` score is 0.361, which is not as close to 1.0, which indicates not so strong positive sentiment classification.
- With my classification interpretation of `compound > 0 == positive`, the classification is taken as positive.

# Part 2
## VADER with Preprocessing

- make string lower case
- replace special chars - @, #, $ with empty string
- remove unwanted spaces
- replace end-of-line characters

- Since VADER performs better for whole sentences, I'm choosing not to remove stopwords

In [19]:
#preprocessing
df_tweets['text'] = df_tweets['text'].str.lower()
df_tweets['text'] = df_tweets['text'].str.replace("\n", "")
df_tweets['text'] = df_tweets['text'].str.replace("@", "")
df_tweets['text'] = df_tweets['text'].str.replace("$", "")
df_tweets['text'] = df_tweets['text'].str.replace("#", "")
df_tweets['text'] = df_tweets['text'].str.lstrip().str.rstrip().str.strip()

In [20]:
df_tweets['text']

0                        virginamerica what dhepburn said.
1        virginamerica plus you've added commercials to...
2        virginamerica i didn't today... must mean i ne...
3        virginamerica it's really aggressive to blast ...
4        virginamerica and it's a really big bad thing ...
                               ...                        
14635    americanair thank you we got on a different fl...
14636    americanair leaving over 20 minutes late fligh...
14637    americanair please bring american airlines to ...
14638    americanair you have my money, you change my f...
14639    americanair we have 8 ppl so we need 2 know ho...
Name: text, Length: 14640, dtype: object

## Sentiment Analysis
- load VADER sentiment analyzer
- Get scores for each text

In [21]:
analyzer = SentimentIntensityAnalyzer()
analyzer.polarity_scores(df_tweets['text'][0])

{'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}

In [22]:
for i in ['pos', 'neg', 'neu', 'compound']:
  df_tweets[i] = df_tweets['text'].apply(lambda comment: analyzer.polarity_scores(comment)[i])

In [23]:
df_tweets[['text', 'pos', 'neg', 'neu', 'compound', 'airline_sentiment']]

Unnamed: 0,text,pos,neg,neu,compound,airline_sentiment
0,virginamerica what dhepburn said.,0.000,0.000,1.000,0.0000,neutral
1,virginamerica plus you've added commercials to...,0.000,0.000,1.000,0.0000,positive
2,virginamerica i didn't today... must mean i ne...,0.000,0.000,1.000,0.0000,neutral
3,virginamerica it's really aggressive to blast ...,0.129,0.226,0.645,-0.2716,negative
4,virginamerica and it's a really big bad thing ...,0.000,0.296,0.704,-0.5829,negative
...,...,...,...,...,...,...
14635,americanair thank you we got on a different fl...,0.200,0.000,0.800,0.3612,positive
14636,americanair leaving over 20 minutes late fligh...,0.071,0.136,0.793,-0.4043,negative
14637,americanair please bring american airlines to ...,0.277,0.000,0.723,0.3182,neutral
14638,"americanair you have my money, you change my f...",0.129,0.000,0.871,0.5027,negative


In [24]:
#metrics
#convert airline_sentiment to numbers
df_tweets['airline_sentiment_num_gt'] = ""
for i in df_tweets.index:
  if df_tweets.loc[i, 'airline_sentiment'] == 'positive':
    df_tweets.loc[i, 'airline_sentiment_num_gt'] = 1
  elif df_tweets.loc[i, 'airline_sentiment'] == 'neutral':
    df_tweets.loc[i, 'airline_sentiment_num_gt'] = 0
  elif df_tweets.loc[i, 'airline_sentiment'] == 'negative':
    df_tweets.loc[i, 'airline_sentiment_num_gt'] = 2

In [25]:
df_tweets['airline_sentiment_num_gt'] = df_tweets['airline_sentiment_num_gt'].astype(int)

In [26]:
df_tweets.head()

Unnamed: 0,airline_sentiment,text,pos,neg,neu,compound,airline_sentiment_num_gt
0,neutral,virginamerica what dhepburn said.,0.0,0.0,1.0,0.0,0
1,positive,virginamerica plus you've added commercials to...,0.0,0.0,1.0,0.0,1
2,neutral,virginamerica i didn't today... must mean i ne...,0.0,0.0,1.0,0.0,0
3,negative,virginamerica it's really aggressive to blast ...,0.129,0.226,0.645,-0.2716,2
4,negative,virginamerica and it's a really big bad thing ...,0.0,0.296,0.704,-0.5829,2


## Predicted Sentiments
- we use the `compound` column to categorize predictions
  - if `compound > 0` == `positive` sentiment
  - if `compound < 0` == `negative` sentiment
  - if `compound == 0` == `neutral` sentiment

In [27]:
df_tweets['airline_sentiment_num_pred'] = ""
for i in df_tweets.index:
  if df_tweets.loc[i, 'compound'] > 0:
    df_tweets.loc[i, 'airline_sentiment_num_pred'] = 1
  elif df_tweets.loc[i, 'compound'] < 0:
    df_tweets.loc[i, 'airline_sentiment_num_pred'] = 2
  else:
    df_tweets.loc[i, 'airline_sentiment_num_pred'] = 0

In [28]:
df_tweets['airline_sentiment_num_pred'] = df_tweets['airline_sentiment_num_pred'].astype(int)

## Analyzing VADER performance after preprocessing
- construct a confusion matrix 
- construct classification report

In [29]:
accuracy_score(df_tweets['airline_sentiment_num_gt'].values, df_tweets['airline_sentiment_num_pred'].values)

0.49405737704918035

In [30]:
confusion_matrix(df_tweets['airline_sentiment_num_gt'].values, df_tweets['airline_sentiment_num_pred'].values)

array([[ 966, 1768,  365],
       [ 140, 2151,   72],
       [1222, 3840, 4116]])

In [31]:
print(classification_report(df_tweets['airline_sentiment_num_gt'].values, df_tweets['airline_sentiment_num_pred'].values,
                            target_names=['neutral', 'positive', 'negative']))

              precision    recall  f1-score   support

     neutral       0.41      0.31      0.36      3099
    positive       0.28      0.91      0.43      2363
    negative       0.90      0.45      0.60      9178

    accuracy                           0.49     14640
   macro avg       0.53      0.56      0.46     14640
weighted avg       0.70      0.49      0.52     14640



## Results after preprocessing

- out of the 9178 negative sentences, 4116 are classified correctly
- out of the 3099 neutral sentences, 966 are classified correctly 
- out of the 2363 positive sentences, 2151 are classified correctly

- overall accuracy for this dataset without preprocessing is ~49.40%

## Q1: How is your analyzer working? 

### Answer
- I've chosen VADER (Valence Aware Dictionary and Sentiment Reasoner). It is a lexicon/rule based approach to sentiment classification
- In addition to giving a positive/negative class label, it gives the strength of sentiment as well
- VADER depends on a pre-defined dictionary of sentiment words and a score given to it
- VADER's lexicon includes acronyms and emoticons as well (ex: :-) is a smile emoji) making it a good option for analyzing social media data.
## Q2: How do you "know" if your text is positive, negative, etc.? 

### Answer
- Since VADER gives a "strength" score (how positive or negative is a sentence) with each sentiment, I've made a rule that says: 
  - if the `score is >0`, the sentence has a positive sentiment
  - if the `score is <0`, the sentence has a positive sentiment
  - if the `score == 0`, the sentence is neutral

- The `compound` score is the sum of positive, negative and neutral scores which is then normalized between `[-1,1]`
- The more closer the `compound` score is to +1, the more positive the sentence is. Similarly, if the `compound` score is close to -1, the sentence is more negative.

##Q3: Do you disagree with any results? If so, why?

- For the dataset I've chosen, VADER doesn't perform too well regardless of whether we preprocess the data or not
- The accuracy scores from the classification report has a maximum value of 49%
- Since this dataset involves retweets and comments, there might not be a lot of "sentiment" associated with a sentence which could be why VADER tends to underperform.

## Performance comparison

- There's no major difference in performance whether we perform sentiment analysis on raw data or on preprocessed data
- The positive sentences are getting classified with most accuracy, then negative.
- It's unable to capture tweets with neutral sentiment with high accuracy.



## Extra: Custom Classifier

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [33]:
vectorizer = TfidfVectorizer()

In [34]:
X = vectorizer.fit_transform(df_tweets['text'])
y = df_tweets['airline_sentiment_num_gt']

In [35]:
x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2)

In [36]:
model = LogisticRegression()
model.fit(x_train, y_train)

LogisticRegression()

In [37]:
y_pred= model.predict(x_test)

In [38]:
accuracy_score(y_test, y_pred)

0.8090846994535519

In [39]:
confusion_matrix(y_test, y_pred)

array([[ 326,   50,  253],
       [  48,  305,  123],
       [  61,   24, 1738]])

In [40]:
print(classification_report(y_test, y_pred,
                            target_names=['neutral', 'positive', 'negative']))

              precision    recall  f1-score   support

     neutral       0.75      0.52      0.61       629
    positive       0.80      0.64      0.71       476
    negative       0.82      0.95      0.88      1823

    accuracy                           0.81      2928
   macro avg       0.79      0.70      0.74      2928
weighted avg       0.80      0.81      0.80      2928



In [49]:
model.predict(vectorizer.transform(["@VirginAmerica's services suck"]))

array([0])

In [42]:
model.predict(vectorizer.transform(["americanairlines is amazing"]))

array([1])

In [43]:
model.predict(vectorizer.transform(["the service was not great but the food was good"]))

array([1])

- From the custom classification, we can see there is a test accuracy of 81%
- For a custom test case `americanairlines is amazing`, we see the classifier is classifying it correctly as `positive`
- `@VirginAmerica's services suck` has a `negative` classification.
- Confusion Matrix
``` 
[[ 326,   50,  253],
[  48,  305,  123],
[  61,   24, 1738]]
``` 
- From here we can see that most of the statements are correctly classified (seen with the diagnoal values)