<a href="https://colab.research.google.com/github/stepthom/NLP_course/blob/main/sentiment_analysis/slides_study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Study

Comparing different approaches for sentiment analysis.

- Stephen W. thomas
- Used for MMAI 891
- Simple version

# Environment Setup

In [1]:
pip install vaderSentiment simpletransformers



In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from pandas import option_context
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from simpletransformers.classification import ClassificationModel, ClassificationArgs

import logging


logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Data Loading and Prep

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/stepthom/NLP_course/main/data/Tweets.csv")
text_col = "text"
label_col = "label"

# Later, simpletransformers won't work with str() classes. So let's convert them
# to integers as follows.
class_dict = {'negative': 0, 'neutral': 1 ,'positive': 2}


df[label_col] = df["airline_sentiment"].apply(lambda x: class_dict.get(x))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [4]:
df["airline_sentiment"].value_counts()

airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

In [5]:
df[label_col].value_counts()

label
0    9178
1    3099
2    2363
Name: count, dtype: int64

In [6]:
with option_context('display.max_colwidth', None):
  display(df[[text_col, label_col]].head(10))

Unnamed: 0,text,label
0,@VirginAmerica What @dhepburn said.,1
1,@VirginAmerica plus you've added commercials to the experience... tacky.,2
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,1
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",0
4,@VirginAmerica and it's a really big bad thing about it,0
5,@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA,0
6,"@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)",2
7,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",1
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",2
9,"@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",2


In [7]:
X = df[text_col]
y = df[label_col]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42, stratify=y)

In [8]:
X_train.info()
X_test.info()

<class 'pandas.core.series.Series'>
Index: 9808 entries, 8934 to 7131
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
9808 non-null   object
dtypes: object(1)
memory usage: 153.2+ KB
<class 'pandas.core.series.Series'>
Index: 4832 entries, 10568 to 12916
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
4832 non-null   object
dtypes: object(1)
memory usage: 75.5+ KB


In [9]:
X_test.info()

<class 'pandas.core.series.Series'>
Index: 4832 entries, 10568 to 12916
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
4832 non-null   object
dtypes: object(1)
memory usage: 75.5+ KB


In [10]:
y_test.value_counts()

label
0    3029
1    1023
2     780
Name: count, dtype: int64

# LLM Prompting

In [11]:
from transformers import pipeline
#generator = pipeline('text-generation', model = 'HuggingFaceH4/zephyr-7b-beta')
generator = pipeline('text2text-generation', model = 'google/flan-t5-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [12]:
generator("The capital city of California is")



[{'generated_text': 'San Francisco'}]

In [13]:
# since this is expensive, don't do them all
test_instances = 2000

In [14]:
pre = "Please classify the overall SENTIMENT polarity of the INPUT sentence as positive, neutral or negative."
#sentence = "@VirginAmerica Are there any plans for a short haul airline in Europe?"

y_preds_test_4 = []

for sentence in X_test.head(test_instances):
  prompt = "{}\nINPUT: {}\n SENTIMENT: ".format(pre, sentence)
  print(prompt)
  response = generator(prompt, max_new_tokens=1)[0]['generated_text']
  print(response)
  _class = class_dict.get(response, 'unknown')
  y_preds_test_4.append(_class)

print(y_preds_test_4)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INPUT: @JetBlue Have a cup coffee and relax while you check out the New Deals and Promotions at Avon, twice a month at Doug @dcoadavon
 SENTIMENT: 
positive
Please classify the overall SENTIMENT polarity of the INPUT sentence as positive, neutral or negative.
INPUT: @united Flight 1547 CMH to ORD just arrived at gate. Next segment to YVR already boarding. Will they hold it?
 SENTIMENT: 
negative
Please classify the overall SENTIMENT polarity of the INPUT sentence as positive, neutral or negative.
INPUT: Stop the madness "@JetBlue: Our fleet's on fleek. http://t.co/Q5a7jtkI5K”
 SENTIMENT: 
negative
Please classify the overall SENTIMENT polarity of the INPUT sentence as positive, neutral or negative.
INPUT: @ods1819 aren't you glad this isn't you RT @AmericanAir: Bet these birds wish they'd flown south for the #winter... http://t.co/HEpkNpuzwU
 SENTIMENT: 
negative
Please classify the overall SENTIMENT polarity of the INPUT

In [15]:
print(confusion_matrix(y_test[:test_instances], y_preds_test_4))
print(classification_report(y_test[:test_instances], y_preds_test_4))

[[1163   12   85]
 [ 218   13  192]
 [  22    0  295]]
              precision    recall  f1-score   support

           0       0.83      0.92      0.87      1260
           1       0.52      0.03      0.06       423
           2       0.52      0.93      0.66       317

    accuracy                           0.74      2000
   macro avg       0.62      0.63      0.53      2000
weighted avg       0.71      0.74      0.67      2000



# Rule-based Sentiment Analysis with vaderSentiment

In [17]:
def get_preds(sentences):
  analyzer = SentimentIntensityAnalyzer()
  preds = []
  for sentence in sentences:
      _score = analyzer.polarity_scores(sentence)['compound']
      _class = class_dict.get('neutral')
      if _score <= -0.05:
        _class = class_dict.get('negative')
      elif _score >= 0.05:
        _class = class_dict.get('positive')
      preds.append(_class)
  return preds

y_preds_train_1 = get_preds(X_train)
y_preds_test_1 = get_preds(X_test)

In [18]:
print(confusion_matrix(y_train, y_preds_train_1))
print(classification_report(y_train, y_preds_train_1))

[[2721  948 2480]
 [ 245  656 1175]
 [  50  104 1429]]
              precision    recall  f1-score   support

           0       0.90      0.44      0.59      6149
           1       0.38      0.32      0.35      2076
           2       0.28      0.90      0.43      1583

    accuracy                           0.49      9808
   macro avg       0.52      0.55      0.46      9808
weighted avg       0.69      0.49      0.51      9808



In [19]:
print(confusion_matrix(y_test, y_preds_test_1))
print(classification_report(y_test, y_preds_test_1))

[[1312  479 1238]
 [ 118  333  572]
 [  18   41  721]]
              precision    recall  f1-score   support

           0       0.91      0.43      0.59      3029
           1       0.39      0.33      0.36      1023
           2       0.28      0.92      0.44       780

    accuracy                           0.49      4832
   macro avg       0.53      0.56      0.46      4832
weighted avg       0.70      0.49      0.51      4832



# Shallow ML Sentiment Analysis with TF-IDF and RF

In [20]:


vectorizer = TfidfVectorizer(min_df=.01, max_df=.8, ngram_range=(1,3), max_features=200, stop_words=None)
clf = RandomForestClassifier(max_depth=None, n_estimators=1000, min_samples_leaf=10, random_state=0)

In [21]:
vectorizer = vectorizer.fit(X_train)
X_vec_train = vectorizer.transform(X_train)

In [22]:
clf.fit(X_vec_train, y_train)

In [23]:
X_vec_test = vectorizer.transform(X_test)
y_preds_train_2 = clf.predict(X_vec_train)
y_preds_test_2 = clf.predict(X_vec_test)

In [24]:
print(confusion_matrix(y_train, y_preds_train_2))
print(classification_report(y_train, y_preds_train_2))

[[5848  191  110]
 [1135  820  121]
 [ 596  225  762]]
              precision    recall  f1-score   support

           0       0.77      0.95      0.85      6149
           1       0.66      0.39      0.50      2076
           2       0.77      0.48      0.59      1583

    accuracy                           0.76      9808
   macro avg       0.73      0.61      0.65      9808
weighted avg       0.75      0.76      0.73      9808



In [25]:
print(confusion_matrix(y_test, y_preds_test_2))
print(classification_report(y_test, y_preds_test_2))

[[2854  117   58]
 [ 603  357   63]
 [ 333   98  349]]
              precision    recall  f1-score   support

           0       0.75      0.94      0.84      3029
           1       0.62      0.35      0.45      1023
           2       0.74      0.45      0.56       780

    accuracy                           0.74      4832
   macro avg       0.71      0.58      0.61      4832
weighted avg       0.72      0.74      0.71      4832



# Deep Learning with Fine Tuning

In [26]:
# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=2, overwrite_output_dir=True)

# Create a ClassificationModel
model = ClassificationModel("roberta", "distilroberta-base", num_labels=3, args=model_args, use_cuda=True)

config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [27]:
_df = X_train.to_frame()
_df['labels'] = y_train
_df.columns = ["text", "labels"]
display(_df.head())
model.train_model(_df)

Unnamed: 0,text,labels
8934,@JetBlue @EllaHenderson Ah! Wish I was there! ...,1
13946,@AmericanAir if you could get the gate crew to...,1
6539,@SouthwestAir your employees were great!,2
589,"@united 441, which also had 1 working WC in co...",2
13365,@AmericanAir The website won't let me view my ...,0


  0%|          | 0/19 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1226 [00:00<?, ?it/s]

Running Epoch 2 of 2:   0%|          | 0/1226 [00:00<?, ?it/s]

(2452, 0.4461321955209455)

In [28]:
from sklearn.metrics import f1_score, accuracy_score

eval_df = X_test.to_frame()
eval_df['labels'] = y_test
eval_df.columns = ["text", "labels"]

def f1_multiclass(labels, preds):
    return f1_score(labels, preds, average='micro')

y_preds_train_3, _ = model.predict(X_train.to_list())
y_preds_test_3, _ = model.predict(X_test.to_list())

  0%|          | 0/19 [00:00<?, ?it/s]

  0%|          | 0/99 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/49 [00:00<?, ?it/s]

In [29]:
print(confusion_matrix(y_train, y_preds_train_3))
print(classification_report(y_train, y_preds_train_3))

[[5959  139   51]
 [ 240 1715  121]
 [  30   57 1496]]
              precision    recall  f1-score   support

           0       0.96      0.97      0.96      6149
           1       0.90      0.83      0.86      2076
           2       0.90      0.95      0.92      1583

    accuracy                           0.93      9808
   macro avg       0.92      0.91      0.91      9808
weighted avg       0.93      0.93      0.93      9808



In [30]:
print(confusion_matrix(y_test, y_preds_test_3))
print(classification_report(y_test, y_preds_test_3))

[[2782  166   81]
 [ 240  680  103]
 [  56   67  657]]
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      3029
           1       0.74      0.66      0.70      1023
           2       0.78      0.84      0.81       780

    accuracy                           0.85      4832
   macro avg       0.81      0.81      0.81      4832
weighted avg       0.85      0.85      0.85      4832

