#### Load Tweets Dataset

In [1]:
# Make a directory i.e. data
!mkdir ./data/


# Download the cleaned tweet data file from drive and 
# save into data folder
!gdown 1W_QFa1hv4eHGa491vENourYdV3iXJFeC -O ./data/

Downloading...
From: https://drive.google.com/uc?id=1W_QFa1hv4eHGa491vENourYdV3iXJFeC
To: /content/data/cleaned_data.csv
100% 17.0M/17.0M [00:00<00:00, 39.6MB/s]


#### Install Dependencies

In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.24.0-py3-none-any.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 16.5 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.0-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 50.3 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 50.9 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.0 tokenizers-0.13.2 transformers-4.24.0


### Import Packages

In [None]:
# import required packages
import os
import numpy as np
import pandas as pd

# train test split
from sklearn.model_selection import train_test_split

# metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import f1_score, roc_auc_score, confusion_matrix, roc_curve

# visualization
import matplotlib.pyplot as plt

# TensorFlow library
import tensorflow as tf
# Transformer Models from huggingface
from transformers import pipeline
from transformers import AutoTokenizer, TFBertForSequenceClassification

### Load dataset

In [4]:
# Read the cleaned CSV file. Its first column is the index written out when the
# file was saved (it otherwise appears as an "Unnamed: 0" column), so load it
# as the index instead of as data.
dataset = pd.read_csv("./data/cleaned_data.csv", index_col=0)
dataset.head(5)

Unnamed: 0,tweet,sentiment
0,$twtr are you feeling confident bear tick tock...,Bullish
1,$tsla lot thing may happen over the weekend if...,Bullish
2,$twtr look like the crook and thief at twtr wi...,Bearish
3,$doge.x why are you selling twitter deal will ...,Bullish
4,$deso.x if elon is planning on cutting up to 7...,Bullish


### Label Mapping

In [5]:
# Class id <-> class name lookup tables
id2label = {0: "Bearish", 1: "Bullish"}
label2id = dict(zip(id2label.values(), id2label.keys()))

# Encode every sentiment name as its integer id
# (an unknown sentiment raises KeyError)
dataset['label'] = [label2id[sentiment] for sentiment in dataset['sentiment']]

# Model inputs (tweet text) and targets (label ids)
X = dataset['tweet']
y = dataset['label']

### Train and validation Set Splitting

In [6]:
# Hold out 10% of the tweets for evaluation. Stratifying on the labels keeps
# the class ratio equal in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.1,
    random_state=2000,
    stratify=y,
)

In [56]:
# Shapes of the four split parts, in the order train X, test X, train y, test y
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((116871,), (12986,), (116871,), (12986,))

#### Function for Model metrics analysis

In [52]:
def print_metrices(Y_test, Y_pred):
  """Print classification metrics for the Bearish (0) / Bullish (1) task.

  Prints accuracy, per-class precision / recall / F1, the support-weighted
  averages of those three scores, and how many samples of each class were
  predicted correctly.

  Accuracy alone is misleading on imbalanced data: with 100 samples of
  class 0 and 10 samples of class 1, predicting class 0 for everything still
  scores 100 / 110 > 90%. The per-class numbers are printed for that reason.

  Args:
    Y_test: True label ids (0 = Bearish, 1 = Bullish).
    Y_pred: Predicted label ids, aligned with Y_test.

  Returns:
    None. Everything is written to stdout.
  """
  print("ACCURACY: {}".format(round(accuracy_score(Y_test, Y_pred), 4)))

  metric_fns = (("PRECISION", precision_score),
                ("RECALL", recall_score),
                ("F1-SCORE", f1_score))

  # Per-class scores: each class in turn is treated as the positive label.
  for class_name, class_id in (("BEARISH", 0), ("BULLISH", 1)):
    for metric_name, metric_fn in metric_fns:
      score = metric_fn(Y_test, Y_pred, pos_label=class_id)
      print("{} {}: {}".format(class_name, metric_name, round(score, 4)))

  # Averages weighted by the number of true samples of each class.
  for metric_name, metric_fn in metric_fns:
    score = metric_fn(Y_test, Y_pred, average='weighted')
    print("Weighted {}: {}".format(metric_name, round(score, 4)))

  # Compute the confusion matrix once. labels=[0, 1] pins it to 2x2 even when
  # one class is missing from both inputs, so the row lookups below cannot fail.
  # Row i holds the true samples of class i; the diagonal holds the correct ones.
  cm = confusion_matrix(Y_test, Y_pred, labels=[0, 1])
  total_bearish, total_bullish = cm.sum(axis=1)
  true_pred_bearish, true_pred_bullish = cm[0][0], cm[1][1]

  print("{} bearish samples are correctly predicted out of {}".format(true_pred_bearish,
                                                                      total_bearish))
  print("{} bullish samples are correctly predicted out of {}".format(true_pred_bullish,
                                                                      total_bullish))
  

# Plot Graphs
def plot_graphs(history, metric):
  """Draw the training and validation curves of one metric on the current axes.

  Args:
    history: Object whose `.history` mapping holds one list of values per
      metric name, with validation series stored under 'val_' + name.
    metric: Name of the training metric to plot.
  """
  val_metric = 'val_' + metric
  for series_name in (metric, val_metric):
    plt.plot(history.history[series_name])
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, val_metric])

### Apply Transformer Model

#### Input data Pipeline

In [9]:
# Tokenizer that belongs to the FinBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")

# Same encoding settings for both splits: cut tweets at 70 tokens, pad,
# and return TensorFlow tensors.
encode_options = dict(truncation=True, padding=True, max_length=70, return_tensors="tf")
train_encodings = tokenizer(X_train.tolist(), **encode_options)
val_encodings = tokenizer(X_test.tolist(), **encode_options)

Downloading:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [57]:
# Pair each encoded tweet (a dict of tensors) with its label id
train_features, train_labels = dict(train_encodings), y_train.tolist()
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

val_features, val_labels = dict(val_encodings), y_test.tolist()
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))

In [58]:
BUFFER_SIZE = 10000
BATCH_SIZE = 128

# NOTE: this cell rebinds train_dataset / val_dataset to their batched versions,
# so running it a second time would batch the already batched data.
# Only the training data is shuffled; validation keeps its original order.
train_dataset = (
    train_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
val_dataset = (
    val_dataset
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)

In [59]:
# Sanity check: show the tensor shapes and the labels of one training batch
for features, labels in train_dataset.take(1):
  for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(features[field].shape)
  print(labels)

(128, 70)
(128, 70)
(128, 70)
tf.Tensor(
[1 1 1 1 1 1 1 0 0 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 0 1 0 0 1 1 0 1 1 0 1 1
 1 1 1 1 1 1 0 0 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1
 1 0 1 0 1 1 0 0 1 0 1 0 1 1 1 1 1], shape=(128,), dtype=int32)


#### Loading Model with Pretrained checkpoints

In [49]:
# Load the FinBERT checkpoint with a two-class head whose outputs are named
# through id2label / label2id.
model = TFBertForSequenceClassification.from_pretrained(
    "ProsusAI/finbert",
    num_labels=2,
    id2label=id2label,
    label2id=label2id,
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at ProsusAI/finbert and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Performance Analysis on Pretrained Model

In [60]:
# Score the validation set before any fine-tuning; the predicted class is the
# one with the largest logit.
y_pred_logits = model.predict(val_dataset)
y_pred = np.argmax(y_pred_logits.logits, axis=-1)



In [61]:
# Baseline metrics of the model before fine-tuning
print_metrices(Y_test=y_test, Y_pred=y_pred)

ACCURACY: 0.3236
BEARISH PRECISION: 0.2225
BEARISH RECALL: 0.9424
BEARISH F1-SCORE: 0.36
BULLISH PRECISION: 0.9198
BULLISH RECALL: 0.167
BULLISH F1-SCORE: 0.2827
Weighted PRECISION: 0.779
Weighted RECALL: 0.3236
Weighted F1-SCORE: 0.2983
2471 bearish samples are correctly predicted out of 2622
1731 bullish samples are correctly predicted out of 10364


Setting Up for Fine-Tuning 

In [62]:
# Learning-rate schedule: starts at 5e-4 and decays by a factor of 0.9
# per 1000 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=5e-4, decay_steps=1000, decay_rate=0.9)

# Adam optimizer driven by the schedule above
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

# Labels are integer class ids and the loss is applied to the raw logits
# (from_logits=True).
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=accuracy_metric)

#### Model Fine-Tuning

In [None]:
# Fine-tune for 2 epochs. validation_steps=10 limits the per-epoch validation
# pass to 10 batches of val_dataset.
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    validation_steps=10,
    epochs=2,
)

Epoch 1/2
Epoch 2/2

Convergence Plot

In [None]:
# The model is compiled with a SparseCategoricalAccuracy() metric object, which
# Keras records under its own name ('sparse_categorical_accuracy') rather than
# 'accuracy'. Look the key up in the history so the plot works under either name.
accuracy_key = next(key for key in history.history
                    if key.endswith('accuracy') and not key.startswith('val_'))

plt.figure(figsize=(16, 8))

# Left panel: training vs. validation accuracy
plt.subplot(1, 2, 1)
plot_graphs(history, accuracy_key)
plt.ylim(None, 1)

# Right panel: training vs. validation loss
plt.subplot(1, 2, 2)
plot_graphs(history, 'loss')
plt.ylim(0, None);

#### Check Testing Accuracy

In [88]:
# Loss and accuracy of the fine-tuned model on the full validation set
val_loss, val_accuracy = model.evaluate(val_dataset)

for caption, value in (('Test Loss:', val_loss), ('Test Accuracy:', val_accuracy)):
  print(caption, value)

Test Loss: 0.2655167579650879
Test Accuracy: 0.899969220161438


Performance Analysis on Fine-Tuned Model

In [None]:
# Score the validation set with the current model; the predicted class is the
# index of the largest logit.
y_pred_logits = model.predict(val_dataset)
y_pred = np.argmax(y_pred_logits.logits, axis=-1)

In [None]:
# Full metric report for the predictions computed in the previous cell
print_metrices(Y_test=y_test, Y_pred=y_pred)

### Merge Tokenizer and Model for stock tweet sentiment analysis

In [17]:
# Bundle tokenizer and model into one callable that takes raw text
pipe = pipeline(
    'sentiment-analysis',
    model=model,
    tokenizer=tokenizer,
)

In [89]:
# NOTE(review): same call as the previous cell; it rebinds `pipe` to a new
# pipeline around whatever `model` and `tokenizer` currently refer to.
pipe = pipeline(
    'sentiment-analysis',
    tokenizer=tokenizer,
    model=model,
)

In [93]:
# First five held-out tweets next to their true label ids
tuple(zip(X_test.iloc[:5].tolist(), y_test.iloc[:5].tolist()))

(('$btc.x so far 19k ha been the mendoza line. it 39 a buy below. todamoonbubba http news.bitcoin.com robert kiyosaki predicts u dollar will crash by january suggests buying bitcoin utm_source onesignalpush amp utm_medium notification amp utm_campaign pushnotifications',
  1),
 ('$xom $brk.b $cci $ge are our stock suggestion for buying long. more information about buy price range chart expected buy return and option price are given in our closing bell video 8 19 2020 only on justrading youtube channel check it out. if you would like our daily suggested stock report and get more than 60% of capital return per year with the new strategy that we are using please subscribe on our patreon page thanks. part 1',
  1),
 ('$goos $sbux own some equity or dated call in both these name new leader imo',
  1),
 ('$dkng weekly look ready to cook some bull cut of meat', 0),
 ('$riot here we go', 1))

In [96]:
# Run the pipeline on the first 1000 held-out tweets
first_1000_tweets = X_test.iloc[:1000].tolist()
y_preds = pipe(first_1000_tweets)

In [98]:
# The pipeline names each prediction using model.config.id2label. The model is
# loaded with a custom id2label ("Bearish" / "Bullish"), so testing for the
# default name 'LABEL_1' would turn every prediction into 0. Converting through
# the model's own label2id mapping is correct whichever names the config holds.
y_pred = [model.config.label2id[pred['label']] for pred in y_preds]

In [100]:
# Accuracy on the same first 1000 held-out tweets that were sent to the pipeline
accuracy_score(y_test.iloc[:1000], y_pred)

0.91

In [None]:
# NOTE(review): this sample is a movie review, not a stock tweet — consider
# replacing it with in-domain text.
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')

# The BERT model consumes token-id tensors, not raw strings, so the text is
# encoded with the same tokenizer settings used for the training data first.
sample_encodings = tokenizer([sample_text], truncation=True, padding=True,
                             max_length=70, return_tensors="tf")
predictions = model.predict(dict(sample_encodings))

# Show the predicted class name (the class with the largest logit)
id2label[int(np.argmax(predictions.logits, axis=1)[0])]