### VADER Lexicon Models

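Out-of-sample performance (macro-average F1-score on the 25% held-out test split):
* Tuned VADER (default lexicon extended with custom domain terms) - 0.56
* Untuned VADER (default lexicon only) - 0.49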

In [None]:
import pandas as pd

In [None]:
# Load the labelled comment dataset and print a summary of its columns,
# non-null counts and dtypes.
df = pd.read_csv('tweets_comments_combined_df.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4300 entries, 0 to 4299
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   comment    4300 non-null   object
 1   sentiment  4300 non-null   object
dtypes: object(2)
memory usage: 67.3+ KB


In [None]:
# Preview the first rows of the labelled data.
df.head()

Unnamed: 0,comment,sentiment
0,Come on you battery obsessed retards I want to...,neutral
1,"Selling my Amazon, Nvidia and amd shares to al...",neutral
2,"Texas basically opening May 1st, schools close...",negative
3,I need LeBron to dunk on someone’s face right ...,neutral
4,PLTR either forming a nice bull flag or it’s a...,neutral


In [None]:
# Count the rows per sentiment label to check the class balance.
df['sentiment'].value_counts()

positive    1650
neutral     1337
negative    1313
Name: sentiment, dtype: int64

In [None]:
! pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 19.6 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 28.1 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 17.8 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 12.3 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 9.6 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 11.3 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 11.8 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 11.8 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 12.7 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 13.0 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 13.0 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 13.0 MB/s eta 0:00:01[K     |████████████████████████████████| 125 k

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# UNTUNED VADER: analyser left with the lexicon it is constructed with.
untuned_analyser = SentimentIntensityAnalyzer()

# TUNED VADER: a second analyser instance whose lexicon is extended below.
tuned_analyser = SentimentIntensityAnalyzer()

# Hand-assigned valence scores for trading / forum slang: positive values for
# bullish terms, negative values for bearish ones.
# Keep every key unique: a repeated key in a dict literal silently keeps only
# the last value, which hides conflicting scores.
# NOTE(review): VADER appears to score single whitespace-delimited tokens and to
# replace emoji with text descriptions before lookup, so the emoji keys and the
# multi-word key 'rainbow bear' may never match -- verify against the library.
new_words = {
    'moon': 4.0, 'mooning': 4.0, 'long': 3.0, 'short': -3.0,
    'call': 4.0, 'calls': 4.0, 'put': -4.0, 'puts': -4.0,
    'break': 2.0, 'tendie': 2.0, 'tendies': 1.0, 'overvalued': -3.0,
    'undervalued': 3.0, 'buy': 4.0, 'sell': -4.0, 'gone': -2.0, 'gtfo': -1.7,
    'bullish': 3.7, 'bearish': -3.7, 'bull': 3.7, 'hold': 4.0,
    'buying': 4.0, 'selling': -4.0,
    'bear': -3.7, 'down': -4.0, 'sold': -2.0,
    'bagholder': -3.0, 'stonk': 2.4, 'green': 2.4, 'money': 1.2,
    'rocket': 3.0, 'pumping': 2.0, 'pump': 2.5, 'pamp': 2.0,
    'sus': -3.0, 'rip': -4.0, 'rope': -3.0,
    'downgrade': -3.0, 'upgrade': 3.0, 'hot': 1.5,
    'drop': -2.5, 'rebound': 1.5, 'uppies': 3.0, 'downies': -3.0,
    '🤡': -3.0, 'clown': -3.0,
    '🌈🐻': -3.0, 'rainbow bear': -3.0, 'fire': -2.0,
}

# Merge the custom scores into the tuned analyser's lexicon; a word that is
# already present gets its score replaced.
tuned_analyser.lexicon.update(new_words)

In [None]:
from sklearn.model_selection import train_test_split

# Features are the raw comment text; targets are the sentiment labels.
X, y = df['comment'], df['sentiment']

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Score each held-out comment with the tuned analyser, keeping only the
# 'compound' entry of the scores it returns.
y_preds_compound = X_test.apply(
    lambda text: tuned_analyser.polarity_scores(text)['compound']
)

# Map scores to labels: strictly above +threshold is positive, strictly below
# -threshold is negative, anything in between (inclusive) is neutral.
threshold = 0.07
y_preds_labels = [
    'positive' if score > threshold
    else 'negative' if score < -threshold
    else 'neutral'
    for score in y_preds_compound
]

from sklearn.metrics import classification_report
print("Tuned Vader Sentiment Scores")
print(classification_report(y_test, y_preds_labels))

Tuned Vader Sentiment Scores
              precision    recall  f1-score   support

    negative       0.58      0.62      0.60       337
     neutral       0.53      0.42      0.47       322
    positive       0.57      0.64      0.61       416

    accuracy                           0.57      1075
   macro avg       0.56      0.56      0.56      1075
weighted avg       0.57      0.57      0.56      1075



In [None]:
# Load the comments to be scored; the frame's shape is shown as the cell output.
inference = pd.read_csv('inference_1month_comments.csv')
inference.shape

(18676, 2)

In [None]:
# Time how long the tuned analyser takes to score every inference comment.
import time
t1 = time.perf_counter()

# Keep only the 'compound' score for each comment.
inference_predictions = inference['comment'].apply(lambda x: tuned_analyser.polarity_scores(x)['compound'])

t2 = time.perf_counter()
# Elapsed time between the two perf_counter readings, in seconds.
print('time taken to run:',t2-t1)

time taken to run: 2.397669834000226


In [None]:
# Report how many comments were scored, then preview the first five scores.
# Only a cell's last expression is displayed automatically, so the length is
# printed explicitly instead of being evaluated and discarded.
print(len(inference_predictions))
inference_predictions[:5]

0    0.9014
1    0.5093
2   -0.8467
3    0.0000
4   -0.3182
Name: comment, dtype: float64

In [None]:
# Score each held-out comment with the untuned analyser, keeping only the
# 'compound' entry of the scores it returns.
y_preds_compound = X_test.apply(
    lambda text: untuned_analyser.polarity_scores(text)['compound']
)

# Same labelling rule and the same `threshold` as the tuned evaluation:
# strictly above +threshold is positive, strictly below -threshold is negative,
# anything in between (inclusive) is neutral.
y_preds_label = [
    'positive' if score > threshold
    else 'negative' if score < -threshold
    else 'neutral'
    for score in y_preds_compound
]

from sklearn.metrics import classification_report
print("Untuned Vader Sentiment Scores")
print(classification_report(y_test, y_preds_label))

Untuned Vader Sentiment Scores
              precision    recall  f1-score   support

    negative       0.56      0.50      0.53       337
     neutral       0.42      0.49      0.45       322
    positive       0.50      0.47      0.49       416

    accuracy                           0.49      1075
   macro avg       0.49      0.49      0.49      1075
weighted avg       0.49      0.49      0.49      1075



In [None]:
# Time how long the untuned analyser takes to score every inference comment.
import time
t1 = time.perf_counter()

# NOTE: this rebinds inference_predictions, replacing the tuned-analyser scores
# assigned to the same name earlier in the notebook.
inference_predictions = inference['comment'].apply(lambda x: untuned_analyser.polarity_scores(x)['compound'])

t2 = time.perf_counter()
# Elapsed time between the two perf_counter readings, in seconds.
print('time taken to run:',t2-t1)

time taken to run: 2.0619974580004055


In [None]:
# Print the first five scores, then show the total number of scored comments
# as the cell's last expression.
print(inference_predictions[:5])
len(inference_predictions)


0    0.9014
1    0.5093
2   -0.8467
3    0.0000
4   -0.3182
Name: comment, dtype: float64


18676