<a href="https://colab.research.google.com/github/stepthom/NLP_course/blob/main/sentiment_analysis/slides_study.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis Study

Comparing different approaches for sentiment analysis.

- Stephen W. Thomas
- Used for MMAI 891
- Simple version

# Environment Setup

In [1]:
pip install vaderSentiment simpletransformers panml



In [2]:
import logging

import pandas as pd
import torch
from pandas import option_context
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


# Grab the "transformers" logger, turn on INFO-level logging in general, then
# hold the transformers logger at WARNING so it does not flood the output.
transformers_logger = logging.getLogger("transformers")
logging.basicConfig(level=logging.INFO)
transformers_logger.setLevel(logging.WARNING)

# Data Loading and Prep

In [3]:
df = pd.read_csv("https://raw.githubusercontent.com/stepthom/NLP_course/main/data/Tweets.csv")
text_col = "text"
label_col = "label"

# simpletransformers needs integer class labels rather than strings, so each
# sentiment name is encoded as an integer id.
class_dict = {'negative': 0, 'neutral': 1, 'positive': 2}

# Series.map performs the dictionary lookup for every row. A sentiment value
# missing from class_dict becomes NaN, so stop here rather than carry invalid
# labels into the split and the models.
df[label_col] = df["airline_sentiment"].map(class_dict)
assert df[label_col].notna().all(), "airline_sentiment contains values missing from class_dict"
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14640 entries, 0 to 14639
Data columns (total 16 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   tweet_id                      14640 non-null  int64  
 1   airline_sentiment             14640 non-null  object 
 2   airline_sentiment_confidence  14640 non-null  float64
 3   negativereason                9178 non-null   object 
 4   negativereason_confidence     10522 non-null  float64
 5   airline                       14640 non-null  object 
 6   airline_sentiment_gold        40 non-null     object 
 7   name                          14640 non-null  object 
 8   negativereason_gold           32 non-null     object 
 9   retweet_count                 14640 non-null  int64  
 10  text                          14640 non-null  object 
 11  tweet_coord                   1019 non-null   object 
 12  tweet_created                 14640 non-null  object 
 13  t

In [4]:
# Class balance of the raw string sentiment labels.
df["airline_sentiment"].value_counts()

airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64

In [5]:
# Class balance of the integer-encoded labels; the counts should line up with
# the string label counts, one class id per sentiment name in class_dict.
df[label_col].value_counts()

label
0    9178
1    3099
2    2363
Name: count, dtype: int64

In [6]:
# Lift the column-width cap inside this block only, so each tweet is shown in full.
with pd.option_context('display.max_colwidth', None):
  display(df.head(10)[[text_col, label_col]])

Unnamed: 0,text,label
0,@VirginAmerica What @dhepburn said.,1
1,@VirginAmerica plus you've added commercials to the experience... tacky.,2
2,@VirginAmerica I didn't today... Must mean I need to take another trip!,1
3,"@VirginAmerica it's really aggressive to blast obnoxious ""entertainment"" in your guests' faces &amp; they have little recourse",0
4,@VirginAmerica and it's a really big bad thing about it,0
5,@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.\nit's really the only bad thing about flying VA,0
6,"@VirginAmerica yes, nearly every time I fly VX this “ear worm” won’t go away :)",2
7,"@VirginAmerica Really missed a prime opportunity for Men Without Hats parody, there. https://t.co/mWpG7grEZP",1
8,"@virginamerica Well, I didn't…but NOW I DO! :-D",2
9,"@VirginAmerica it was amazing, and arrived an hour early. You're too good to me.",2


In [7]:
# Features are the raw tweet text, targets the integer class ids.
X, y = df[text_col], df[label_col]

# Hold out 33% of the rows; stratify on y so both splits keep the class
# proportions, and fix the seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

In [8]:
# Entry count, dtype and non-null count of each split's text Series.
X_train.info()
X_test.info()

<class 'pandas.core.series.Series'>
Index: 9808 entries, 8934 to 7131
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
9808 non-null   object
dtypes: object(1)
memory usage: 153.2+ KB
<class 'pandas.core.series.Series'>
Index: 4832 entries, 10568 to 12916
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
4832 non-null   object
dtypes: object(1)
memory usage: 75.5+ KB


In [9]:
# NOTE(review): repeats the X_test.info() call made at the end of the previous cell.
X_test.info()

<class 'pandas.core.series.Series'>
Index: 4832 entries, 10568 to 12916
Series name: text
Non-Null Count  Dtype 
--------------  ----- 
4832 non-null   object
dtypes: object(1)
memory usage: 75.5+ KB


In [10]:
# Label distribution of the held-out split (the split was made with stratify=y).
y_test.value_counts()

label
0    3029
1    1023
2     780
Name: count, dtype: int64

# Rule-based Sentiment Analysis with vaderSentiment

In [11]:
def get_preds(sentences, threshold=0.05):
  """Classify each sentence from its VADER compound polarity score.

  Args:
    sentences: Iterable of strings to score.
    threshold: Half-width of the neutral band around zero. A compound score
      at or below ``-threshold`` is negative, at or above ``threshold`` is
      positive, anything in between is neutral. The default of 0.05
      reproduces the previously hard-coded cut-offs.

  Returns:
    A list with one integer class id (a value of ``class_dict``) per sentence.
  """
  analyzer = SentimentIntensityAnalyzer()
  # The class ids do not change between sentences, so look them up once.
  negative = class_dict['negative']
  neutral = class_dict['neutral']
  positive = class_dict['positive']

  preds = []
  for sentence in sentences:
    score = analyzer.polarity_scores(sentence)['compound']
    if score <= -threshold:
      preds.append(negative)
    elif score >= threshold:
      preds.append(positive)
    else:
      preds.append(neutral)
  return preds

# Score both splits. get_preds fits nothing (it only queries the analyzer), so
# the train predictions simply give a like-for-like comparison with the
# trained models' train metrics.
y_preds_train_1 = get_preds(X_train)
y_preds_test_1 = get_preds(X_test)

In [12]:
# Confusion matrix (rows: true class, columns: predicted class) followed by the
# per-class precision/recall/F1 report for the VADER train predictions.
print(
    confusion_matrix(y_train, y_preds_train_1),
    classification_report(y_train, y_preds_train_1),
    sep="\n",
)

[[2721  948 2480]
 [ 245  656 1175]
 [  50  104 1429]]
              precision    recall  f1-score   support

           0       0.90      0.44      0.59      6149
           1       0.38      0.32      0.35      2076
           2       0.28      0.90      0.43      1583

    accuracy                           0.49      9808
   macro avg       0.52      0.55      0.46      9808
weighted avg       0.69      0.49      0.51      9808



In [13]:
# Confusion matrix (rows: true class, columns: predicted class) followed by the
# per-class precision/recall/F1 report for the VADER test predictions.
print(
    confusion_matrix(y_test, y_preds_test_1),
    classification_report(y_test, y_preds_test_1),
    sep="\n",
)

[[1312  479 1238]
 [ 118  333  572]
 [  18   41  721]]
              precision    recall  f1-score   support

           0       0.91      0.43      0.59      3029
           1       0.39      0.33      0.36      1023
           2       0.28      0.92      0.44       780

    accuracy                           0.49      4832
   macro avg       0.53      0.56      0.46      4832
weighted avg       0.70      0.49      0.51      4832



# Shallow ML Sentiment Analysis with TF-IDF and RF

In [14]:


# TF-IDF over 1- to 3-grams, keeping terms that occur in at least 1% and at
# most 80% of documents, with the vocabulary capped at 200 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    min_df=0.01,
    max_df=0.8,
    max_features=200,
    stop_words=None,
)

# Random forest of 1000 trees with no depth limit and at least 10 samples per
# leaf; seeded so that repeated runs give the same model.
clf = RandomForestClassifier(
    n_estimators=1000,
    max_depth=None,
    min_samples_leaf=10,
    random_state=0,
)

In [15]:
# Learn the vocabulary and IDF weights from the training text only, and
# vectorise it in the same pass. fit_transform avoids tokenising the corpus
# twice and the redundant `vectorizer = vectorizer.fit(...)` self-assignment
# (fit returns the estimator itself).
X_vec_train = vectorizer.fit_transform(X_train)

In [16]:
# Train the random forest on the TF-IDF training matrix and the integer labels.
clf.fit(X_vec_train, y_train)

In [17]:
# Predictions on the data the forest was trained on.
y_preds_train_2 = clf.predict(X_vec_train)

# The test text is only transformed with the already-fitted vectorizer (never
# re-fitted), then scored with the same forest.
X_vec_test = vectorizer.transform(X_test)
y_preds_test_2 = clf.predict(X_vec_test)

In [18]:
# Confusion matrix (rows: true class, columns: predicted class) followed by the
# per-class precision/recall/F1 report for the random forest train predictions.
print(
    confusion_matrix(y_train, y_preds_train_2),
    classification_report(y_train, y_preds_train_2),
    sep="\n",
)

[[5848  191  110]
 [1135  820  121]
 [ 596  225  762]]
              precision    recall  f1-score   support

           0       0.77      0.95      0.85      6149
           1       0.66      0.39      0.50      2076
           2       0.77      0.48      0.59      1583

    accuracy                           0.76      9808
   macro avg       0.73      0.61      0.65      9808
weighted avg       0.75      0.76      0.73      9808



In [19]:
# Confusion matrix (rows: true class, columns: predicted class) followed by the
# per-class precision/recall/F1 report for the random forest test predictions.
print(
    confusion_matrix(y_test, y_preds_test_2),
    classification_report(y_test, y_preds_test_2),
    sep="\n",
)

[[2854  117   58]
 [ 603  357   63]
 [ 333   98  349]]
              precision    recall  f1-score   support

           0       0.75      0.94      0.84      3029
           1       0.62      0.35      0.45      1023
           2       0.74      0.45      0.56       780

    accuracy                           0.74      4832
   macro avg       0.71      0.58      0.61      4832
weighted avg       0.72      0.74      0.71      4832



In [20]:
# Place each test tweet next to its true label and the random forest's prediction.
_df = X_test.to_frame().assign(y_test=y_test, y_preds_test_2=y_preds_test_2)

# Show, untruncated, the first 15 rows where the prediction disagrees with the label.
with option_context('display.max_colwidth', None):
  display(_df.loc[_df['y_test'].ne(_df['y_preds_test_2'])].head(15))

Unnamed: 0,text,y_test,y_preds_test_2
476,@VirginAmerica Are there any plans for a short haul airline in Europe? Would defiantly fly with you guys :),2,0
3563,@united flight to RSW tonight -me &amp; twin 3 year olds. A pilot who was in row by me stayed to help me get the boys &amp; bags off. Lifesaver!!,2,0
10417,@USAirways I'm on flight 623 from DIA to Ontario tomorrow morning. no access to the airport via taxi or shuttle. What should I do?,1,0
6298,"@SouthwestAir OH MY GOSH SERIOUSLY?! you just made my day, week, year!!! No one will appreciate this more than me!!!",2,0
4964,@SouthwestAir My Fav!!!!,2,1
12424,"@AmericanAir ""Thank you for contacting American. The email address you have written to is an unmonitored account”",1,2
12042,@AmericanAir thanks for getting back to me. But I will find other airlines in the future.,0,2
6008,@SouthwestAir is there any chance I could get tickets to the #DestinationDragons show in Vegas? Ive been a huge fan for years! Pretty please,1,0
11326,@USAirways I got an email asking me to checkin TMRW for a flight I meant to book for 3/19 - can someone please help! @WallStSlumLord @PKG49,1,0
10635,"@USAirways Umm, can you define 'extra time'?",0,1


# Deep Learning with Fine Tuning

In [21]:
# Model configuration: two fine-tuning epochs, overwrite any output directory
# left by an earlier run, and fix the seed so this stage is repeatable like the
# seeded split and random forest above.
model_args = ClassificationArgs(
    num_train_epochs=2,
    overwrite_output_dir=True,
    manual_seed=42,
)

# Create a 3-class DistilRoBERTa ClassificationModel. Hard-coding use_cuda=True
# makes simpletransformers raise when no GPU is visible, so request CUDA only
# when torch reports that it is available (falls back to CPU otherwise).
model = ClassificationModel(
    "roberta",
    "distilroberta-base",
    num_labels=3,
    args=model_args,
    use_cuda=torch.cuda.is_available(),
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Build the two-column training frame ("text" plus integer "labels") that is
# handed to train_model, preview it, then fine-tune.
_df = X_train.to_frame(name="text").assign(labels=y_train)
display(_df.head())
model.train_model(_df)

Unnamed: 0,text,labels
8934,@JetBlue @EllaHenderson Ah! Wish I was there! ...,1
13946,@AmericanAir if you could get the gate crew to...,1
6539,@SouthwestAir your employees were great!,2
589,"@united 441, which also had 1 working WC in co...",2
13365,@AmericanAir The website won't let me view my ...,0


  0%|          | 0/19 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 1 of 2:   0%|          | 0/1226 [00:00<?, ?it/s]

Running Epoch 2 of 2:   0%|          | 0/1226 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import f1_score, accuracy_score

# Test-split counterpart of the training frame: "text" plus integer "labels".
eval_df = X_test.to_frame(name="text").assign(labels=y_test)

def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of ``preds`` measured against ``labels``."""
    return f1_score(y_true=labels, y_pred=preds, average='micro')

# model.predict takes a plain list of strings and returns a pair; only the
# first element (the predictions) is kept, the second is discarded.
y_preds_train_3, _ = model.predict(list(X_train))
y_preds_test_3, _ = model.predict(list(X_test))

In [None]:
# Confusion matrix (rows: true class, columns: predicted class) followed by the
# per-class precision/recall/F1 report for the fine-tuned model's train predictions.
print(
    confusion_matrix(y_train, y_preds_train_3),
    classification_report(y_train, y_preds_train_3),
    sep="\n",
)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class) followed by the
# per-class precision/recall/F1 report for the fine-tuned model's test predictions.
print(
    confusion_matrix(y_test, y_preds_test_3),
    classification_report(y_test, y_preds_test_3),
    sep="\n",
)

# LLM Prompting

In [None]:
from panml.models import ModelPack
# Alternative checkpoint, currently disabled:
#lm = ModelPack(model='google/flan-t5-large', source='huggingface')
# Load the distilgpt2 checkpoint through panml, using the 'huggingface' source.
lm = ModelPack(model='distilgpt2', source='huggingface')

In [None]:
# Build the prompt frame from the first 30 test tweets only.
_prompt_df = pd.DataFrame({'input_prompts': X_test.iloc[:30].tolist()})

In [None]:
# Instruction prefixes meant to be passed to predict via prompt_modifier.
# NOTE(review): `prompts` is not used at the moment -- the prompt_modifier and
# keep_history arguments are commented out in the predict call below, so the
# model only receives the raw tweet text.
prompts = [
    {'prepend': 'you are an expert at sentiment analysis.'},
    {'prepend': 'classify each tweet as negative, neutral, or positive.'},
    #{'prepend': 'output only the class.'},
]

y_preds_test_4 = lm.predict(_prompt_df['input_prompts']) #, prompt_modifier=prompts, keep_history=True)

In [None]:
# Only the first len(_prompt_df) test tweets were sent to the LLM, so compare
# its predictions with the matching leading slice of y_test. Passing the full
# y_test makes sklearn raise because the sample counts are inconsistent.
# NOTE(review): lm.predict presumably returns generated text rather than the
# integer ids in class_dict -- confirm, and map the outputs to class ids first
# if so, otherwise these metrics are not meaningful.
y_test_4 = y_test.head(len(_prompt_df))
print(confusion_matrix(y_test_4, y_preds_test_4))
print(classification_report(y_test_4, y_preds_test_4))