<a href="https://colab.research.google.com/github/stepthom/NLP_course/blob/main/sentiment_analysis/slides_study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Study

Comparing different approaches for sentiment analysis.

- Stephen W. Thomas
- Used for MMAI 891
- Simple version

# Environment Setup

In [1]:
pip install vaderSentiment simpletransformers

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting simpletransformers
  Downloading simpletransformers-0.70.0-py3-none-any.whl (315 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.5/315.5 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.17.0-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.6/536.6 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorbo

In [2]:
import logging

import pandas as pd
import torch
from pandas import option_context
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# Cap the "transformers" logger at WARNING so its INFO chatter stays out of
# the cell outputs, then configure the root logger at INFO for everything else.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

# Data Loading and Prep

In [3]:
# Column names used throughout the notebook.
text_col = "text"
label_col = "airline_sentiment"

# Airline tweets, fetched straight from the course repository.
df = pd.read_csv("https://raw.githubusercontent.com/stepthom/NLP_course/main/data/Tweets.csv")

# Replace the string sentiment labels with integer codes. The fitted encoder
# is kept so class names can be mapped to their codes later on.
label_encoder = LabelEncoder().fit(df[label_col])
df[label_col] = label_encoder.transform(df[label_col])
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,1,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,2,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,1,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,0,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,0,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [4]:
# Lift the column-width cap for this one display so whole tweets are readable.
with pd.option_context('display.max_colwidth', None):
    display(df.loc[:, [text_col, label_col]].head(15))

Unnamed: 0,text,airline_sentiment
0,@VirginAmerica What @dhepburn said.,1
1,@VirginAmerica plus you've added commercials to the experience... tacky.,2
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,1
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",0
4,@VirginAmerica and it's a really big bad thing about it,0
5,@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA,0
6,"@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)",2
7,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",1
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",2
9,"@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",2


In [5]:
# Features are the raw tweet text; targets are the encoded sentiment labels.
X, y = df[text_col], df[label_col]

# Hold out 33% for testing. Stratifying on y keeps the class proportions the
# same in both splits; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    stratify=y,
    random_state=42,
)

In [6]:
# Summarise both splits (entry count, dtype, memory) to sanity-check their sizes.
X_train.info()
X_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 9808 entries, 8934 to 7131
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
9808 non-null   object
dtypes: object(1)
memory usage: 153.2+ KB
<class 'pandas.core.series.Series'>
Int64Index: 4832 entries, 10568 to 12916
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
4832 non-null   object
dtypes: object(1)
memory usage: 75.5+ KB


In [7]:
# NOTE(review): repeats the X_test.info() call made in the previous cell.
X_test.info()

<class 'pandas.core.series.Series'>
Int64Index: 4832 entries, 10568 to 12916
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
4832 non-null   object
dtypes: object(1)
memory usage: 75.5+ KB


In [8]:
# Class distribution of the held-out labels (integer codes from label_encoder).
y_test.value_counts()

0    3029
1    1023
2     780
Name: airline_sentiment, dtype: int64

In [9]:
# Integer code the fitted encoder assigns to the 'neutral' class.
label_encoder.transform(['neutral'])[0]

1

# Rule-based Sentiment Analysis with vaderSentiment

In [10]:
def get_preds(sentences):
  """Predict a sentiment class for each sentence with the VADER rule-based analyzer.

  The analyzer's 'compound' score is mapped to a class with fixed cut-offs:
  score <= -0.05 is negative, score >= 0.05 is positive, anything in between
  is neutral. Classes are returned as the integer codes that the notebook's
  fitted `label_encoder` assigns to 'negative', 'neutral' and 'positive'.

  Args:
    sentences: iterable of strings to classify.

  Returns:
    A list with one encoded class per input sentence, in input order.
  """
  analyzer = SentimentIntensityAnalyzer()

  # The encoded class ids do not depend on the sentence, so look them up once
  # here instead of calling the encoder once or twice per sentence in the loop.
  negative, neutral, positive = label_encoder.transform(['negative', 'neutral', 'positive'])

  preds = []
  for sentence in sentences:
    score = analyzer.polarity_scores(sentence)['compound']
    if score <= -0.05:
      preds.append(negative)
    elif score >= 0.05:
      preds.append(positive)
    else:
      preds.append(neutral)
  return preds

# Run the rule-based classifier over both splits.
y_preds_train_1 = get_preds(X_train)
y_preds_test_1 = get_preds(X_test)

In [11]:
# Rule-based model on the training split: confusion matrix (rows = true class,
# columns = predicted class) followed by per-class precision / recall / F1.
print(
    confusion_matrix(y_train, y_preds_train_1),
    classification_report(y_train, y_preds_train_1),
    sep="\n",
)

[[2721  948 2480]
 [ 245  656 1175]
 [  50  104 1429]]
              precision    recall  f1-score   support

           0       0.90      0.44      0.59      6149
           1       0.38      0.32      0.35      2076
           2       0.28      0.90      0.43      1583

    accuracy                           0.49      9808
   macro avg       0.52      0.55      0.46      9808
weighted avg       0.69      0.49      0.51      9808



In [12]:
# Rule-based model on the held-out split: confusion matrix (rows = true class,
# columns = predicted class) followed by per-class precision / recall / F1.
print(
    confusion_matrix(y_test, y_preds_test_1),
    classification_report(y_test, y_preds_test_1),
    sep="\n",
)

[[1312  479 1238]
 [ 118  333  572]
 [  18   41  721]]
              precision    recall  f1-score   support

           0       0.91      0.43      0.59      3029
           1       0.39      0.33      0.36      1023
           2       0.28      0.92      0.44       780

    accuracy                           0.49      4832
   macro avg       0.53      0.56      0.46      4832
weighted avg       0.70      0.49      0.51      4832



# Shallow ML Sentiment Analysis with TF-IDF and RF

In [13]:


# TF-IDF over 1- to 3-grams. A term must occur in at least 1% and at most 80%
# of the documents, and the vocabulary is capped at 200 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=0.01,
    max_df=0.8,
    max_features=200,
    stop_words=None,
)

# Random forest of 1000 trees with unlimited depth, regularised by requiring at
# least 10 samples per leaf; fixed seed for repeatable results.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    min_samples_leaf=10,
    random_state=0,
)

In [14]:
# Learn the vocabulary and IDF weights from the training split only, and
# vectorize it in the same pass. fit_transform avoids tokenizing the training
# corpus twice, which a separate fit() followed by transform() would do.
X_vec_train = vectorizer.fit_transform(X_train)

In [15]:
# Fit the random forest on the TF-IDF features of the training split.
clf.fit(X_vec_train, y_train)

In [16]:
# Vectorize the held-out tweets with the already-fitted vectorizer (no refit).
X_vec_test = vectorizer.transform(X_test)

# Random-forest predictions for both splits.
y_preds_train_2, y_preds_test_2 = clf.predict(X_vec_train), clf.predict(X_vec_test)

In [17]:
# TF-IDF + random forest on the training split: confusion matrix (rows = true
# class, columns = predicted class) followed by per-class precision / recall / F1.
print(
    confusion_matrix(y_train, y_preds_train_2),
    classification_report(y_train, y_preds_train_2),
    sep="\n",
)

[[5848  191  110]
 [1135  820  121]
 [ 596  225  762]]
              precision    recall  f1-score   support

           0       0.77      0.95      0.85      6149
           1       0.66      0.39      0.50      2076
           2       0.77      0.48      0.59      1583

    accuracy                           0.76      9808
   macro avg       0.73      0.61      0.65      9808
weighted avg       0.75      0.76      0.73      9808



In [18]:
# TF-IDF + random forest on the held-out split: confusion matrix (rows = true
# class, columns = predicted class) followed by per-class precision / recall / F1.
print(
    confusion_matrix(y_test, y_preds_test_2),
    classification_report(y_test, y_preds_test_2),
    sep="\n",
)

[[2854  117   58]
 [ 603  357   63]
 [ 333   98  349]]
              precision    recall  f1-score   support

           0       0.75      0.94      0.84      3029
           1       0.62      0.35      0.45      1023
           2       0.74      0.45      0.56       780

    accuracy                           0.74      4832
   macro avg       0.71      0.58      0.61      4832
weighted avg       0.72      0.74      0.71      4832



In [19]:
# Put each held-out tweet next to its true label and the random-forest prediction.
_df = X_test.to_frame().assign(y_test=y_test, y_preds_test_2=y_preds_test_2)

# Show the first 15 misclassified tweets in full (column-width cap lifted).
with pd.option_context('display.max_colwidth', None):
    display(_df.loc[_df['y_test'].ne(_df['y_preds_test_2'])].head(15))

Unnamed: 0,text,y_test,y_preds_test_2
476,@VirginAmerica Are there any plans for a short haul airline in Europe? Would defiantly fly with you guys :),2,0
3563,@united flight to RSW tonight -me &amp; twin 3 year olds. A pilot who was in row by me stayed to help me get the boys &amp; bags off. Lifesaver!!,2,0
10417,@USAirways I'm on flight 623 from DIA to Ontario tomorrow morning. no access to the airport via taxi or shuttle. What should I do?,1,0
6298,"@SouthwestAir OH MY GOSH SERIOUSLY?! you just made my day, week, year!!! No one will appreciate this more than me!!!",2,0
4964,@SouthwestAir My Fav!!!!,2,1
12424,"@AmericanAir ""Thank you for contacting American. The email address you have written to is an unmonitored account”",1,2
12042,@AmericanAir thanks for getting back to me. But I will find other airlines in the future.,0,2
6008,@SouthwestAir is there any chance I could get tickets to the #DestinationDragons show in Vegas? Ive been a huge fan for years! Pretty please,1,0
11326,@USAirways I got an email asking me to checkin TMRW for a flight I meant to book for 3/19 - can someone please help! @WallStSlumLord @PKG49,1,0
10635,"@USAirways Umm, can you define 'extra time'?",0,1


# Deep Learning with Fine Tuning

In [22]:
# Optional model configuration.
# manual_seed fixes the random seed used during fine-tuning so a fresh
# Restart & Run All gives repeatable results.
model_args = ClassificationArgs(
    num_train_epochs=2,
    overwrite_output_dir=True,
    manual_seed=42,
)

# Create a ClassificationModel (3 output labels).
# Use the GPU when one is present and fall back to CPU otherwise: with
# use_cuda hard-coded to True, construction fails on a CPU-only runtime.
model = ClassificationModel(
    "roberta",
    "distilroberta-base",
    num_labels=3,
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Two-column training frame: tweet text plus its encoded label.
_df = X_train.to_frame(name="text").assign(labels=y_train)
display(_df.head())

# Fine-tune the model on the training split.
model.train_model(_df)

Unnamed: 0,text,labels
8934,@JetBlue @EllaHenderson Ah! Wish I was there! ...,1
13946,@AmericanAir if you could get the gate crew to...,1
6539,@SouthwestAir your employees were great!,2
589,"@united 441, which also had 1 working WC in co...",2
13365,@AmericanAir The website won't let me view my ...,0


ValueError: Output directory (outputs/) already exists and is not empty. Set overwrite_output_dir: True to automatically overwrite.

In [37]:
from sklearn.metrics import f1_score, accuracy_score

# Held-out split in a two-column ("text", "labels") frame.
eval_df = X_test.to_frame(name="text").assign(labels=y_test)


def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of `preds` measured against `labels`."""
    return f1_score(y_true=labels, y_pred=preds, average='micro')


# model.predict returns a pair; the first element (the predictions) is kept
# and the second is discarded into `_`.
y_preds_train_3, _ = model.predict(X_train.tolist())
y_preds_test_3, _ = model.predict(X_test.tolist())

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

In [38]:
# Fine-tuned transformer on the training split: confusion matrix (rows = true
# class, columns = predicted class) followed by per-class precision / recall / F1.
print(
    confusion_matrix(y_train, y_preds_train_3),
    classification_report(y_train, y_preds_train_3),
    sep="\n",
)

[[5972  120   57]
 [ 247 1710  119]
 [  36   73 1474]]
              precision    recall  f1-score   support

           0       0.95      0.97      0.96      6149
           1       0.90      0.82      0.86      2076
           2       0.89      0.93      0.91      1583

    accuracy                           0.93      9808
   macro avg       0.92      0.91      0.91      9808
weighted avg       0.93      0.93      0.93      9808



In [39]:
# Fine-tuned transformer on the held-out split: confusion matrix (rows = true
# class, columns = predicted class) followed by per-class precision / recall / F1.
print(
    confusion_matrix(y_test, y_preds_test_3),
    classification_report(y_test, y_preds_test_3),
    sep="\n",
)

[[2783  172   74]
 [ 249  672  102]
 [  62   71  647]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      3029
           1       0.73      0.66      0.69      1023
           2       0.79      0.83      0.81       780

    accuracy                           0.85      4832
   macro avg       0.81      0.80      0.80      4832
weighted avg       0.85      0.85      0.85      4832

