Here's the link to the dataset: https://www.kaggle.com/datasets/BidecInnovations/stock-price-and-news-realted-to-it?resource=download

This notebook preprocesses that data and applies a finance sentiment-analysis classifier to the news to create a new dataset.


To Do:

Make sure predict / evaluate work with the model

Surely there are more efficient ways of doing this than converting the data so many times. Consider using .apply() in pandas.

In [None]:
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.0-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 4.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.5 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 12.1 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling P

In [None]:
import pandas as pd

import scipy
import sklearn
import numpy as np

from transformers import AutoTokenizer, DataCollatorWithPadding, TFAutoModelForSequenceClassification, create_optimizer
from transformers.keras_callbacks import KerasMetricCallback
from tensorflow.keras.callbacks import TensorBoard

import datasets

import tensorflow as tf
from datasets import load_dataset, load_metric

In [None]:
# Load the Microsoft price/news table that the rest of the notebook
# preprocesses (the day's change is derived from it in a later cell).
df = pd.read_csv('/content/drive/MyDrive/Data_fincent/MicrosoftNewsStock.csv')

# Print the schema: column names, dtypes and non-null counts.
df.info()

# Preview the first rows. This must be the last expression of the cell,
# otherwise the notebook silently discards the result.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  2517 non-null   int64  
 1   Date        2517 non-null   object 
 2   Open        2517 non-null   float64
 3   High        2517 non-null   float64
 4   Low         2517 non-null   float64
 5   Close       2517 non-null   float64
 6   Adj Close   2517 non-null   float64
 7   Volume      2517 non-null   int64  
 8   News        1341 non-null   object 
dtypes: float64(5), int64(2), object(2)
memory usage: 177.1+ KB


In [None]:
# Rows without a headline get an explicit placeholder string so that every
# row can be passed to the tokenizer later on.
missing_news = df['News'].isna()
df.loc[missing_news, 'News'] = '<No News>'

In [None]:
# Intraday move expressed as a percentage of the opening price:
# (Close - Open) / Open * 100.
intraday_move = df['Close'] - df['Open']
df['percent_change_at_close'] = intraday_move.div(df['Open']).mul(100)

df.head()

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,News,percent_change_at_close
0,0,2006-12-01,29.23,29.299999,28.9,29.120001,29.120001,72257000,<No News>,-0.376322
1,1,2006-12-04,29.23,29.52,29.17,29.33,29.33,55123400,The Retooling of a Search Engine : Ask.com is ...,0.342114
2,2,2006-12-05,29.360001,29.4,29.030001,29.129999,29.129999,45606000,<No News>,-0.783386
3,3,2006-12-06,29.1,29.129999,28.870001,28.99,28.99,48564100,Combat as Usual? Not With These Games : A few ...,-0.378007
4,4,2006-12-07,28.959999,29.07,28.809999,28.85,28.85,46831100,Vista Is Ready. Are You? : Why it might be a b...,-0.379831


In [None]:
# Wrap the DataFrame in a Hugging Face Dataset so it can be tokenized with
# Dataset.map() in the cells below.
dataset = datasets.Dataset.from_pandas(df=df)
dataset

Dataset({
    features: ['Unnamed: 0', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'News', 'percent_change_at_close'],
    num_rows: 2517
})

In [None]:
# Task and model configuration.
#   task             - GLUE task name; 'sst2' (Stanford Sentiment Treebank) is
#                      the closest match to this supervised sentiment problem.
#   model_checkpoint - must be compatible with sequence classification, e.g.
#                      roberta-base, roberta-large, ProsusAI/finbert.
task, model_checkpoint = 'sst2', 'roberta-base'

# Preprocessing: load the tokenizer that belongs to the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

def preprocess_function(examples):
  """Tokenize the 'News' field of a batch of examples.

  Args:
    examples: mapping that provides the text(s) to tokenize under the
      'News' key (this is what Dataset.map passes in).

  Returns:
    Whatever the module-level `tokenizer` returns for those texts, with
    truncation enabled.
  """
  news_texts = examples['News']
  return tokenizer(news_texts, truncation=True)


# Tokenize the whole dataset in batches; the tokenizer's outputs are added
# as new columns next to the original ones.
encoded_dataset = dataset.map(function=preprocess_function, batched=True)
encoded_dataset


Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

Dataset({
    features: ['Unnamed: 0', 'Date', 'Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'News', 'percent_change_at_close', 'input_ids', 'attention_mask'],
    num_rows: 2517
})

In [None]:
# --- Hyperparameters --------------------------------------------------------
batch_size = 16   # may need tweaking depending on task and model
num_epochs = 2
num_labels = 3    # size of the classification head (three sentiment classes)

# --- Metric -----------------------------------------------------------------
# GLUE metric for the chosen task (accuracy here); a different metric could
# be swapped in.
metric = load_metric('glue', task)

# --- Loss and model ---------------------------------------------------------
# The model outputs raw logits, hence from_logits=True.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_labels,
)

# --- Optimizer and compilation ----------------------------------------------
# No train/test split exists in this notebook, so nothing is trained here;
# the step count only parameterizes the learning-rate schedule and could be
# an arbitrary number.
batches_per_epoch = len(encoded_dataset) // batch_size
total_train_steps = batches_per_epoch * num_epochs

# create_optimizer() is AdamW with weight and learning rate decay.
optimizer, schedule = create_optimizer(
    init_lr=3e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# --- Fine-tuned weights -----------------------------------------------------
# Checkpoint produced by the notebook author's own earlier training run.
model.load_weights('/content/drive/MyDrive/roberta_base_2_sentfin/pretrained_ckpt')



Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/627M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f23d4336c50>

In [None]:
# Quick sanity check: classify one hand-written sentence with the loaded model.
input_tweet = "Microsoft does nothing"
sentiments = ['positive', 'neutral', 'negative']

inputs = tokenizer(input_tweet, return_tensors="tf")
outputs = model(**inputs)
logits = outputs.logits

# Only one sentence was passed in, so row 0 holds its class scores; the index
# of the largest score selects the predicted label.
predicted_class_id = int(tf.math.argmax(logits[0]))

print(
    f'logits: {logits}',
    f'Input tweet: {input_tweet}',
    f'Highest probability prediction: {sentiments[predicted_class_id]}',
    sep='\n',
)

logits: [[-2.5988214   2.071279    0.72800076]]
Input tweet: Microsoft does nothing
Highest probability prediction: neutral


In [None]:
# The token-id lists were produced without padding, so rows can differ in
# length. dtype=object is required to build an array from such ragged rows:
# without it NumPy >= 1.24 raises ValueError (older versions only warned).
model_inputs = np.asarray(encoded_dataset['input_ids'], dtype=object)
type(model_inputs)

In [None]:
# Run the classifier on every headline, one row at a time, and keep the raw
# logits tensor of each row (same order as df).
dummy_list = []

for news_text in df['News']:

  # truncation=True caps the text at the tokenizer's maximum sequence length.
  # Without it, over-long headlines (a 565-token one was reported against a
  # 512 limit) are fed to the model unchanged, which the tokenizer warns
  # "will result in indexing errors". This also matches preprocess_function.
  encoded_news = tokenizer(str(news_text), truncation=True, return_tensors='tf')
  pred = model(encoded_news).logits
  dummy_list.append(pred)

Token indices sequence length is longer than the specified maximum sequence length for this model (565 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# The number of predictions should equal the number of rows in df.
print(df.shape[0])
len(dummy_list)

2517


2517

In [None]:
# Inspect the scalar type of a single logit (row 1, first class score).
sample_logits = dummy_list[1].numpy()
print(type(sample_logits[0, 0]))

<class 'numpy.float32'>


In [None]:
# Split the per-row logits into one float column per sentiment class.
# Each entry of dummy_list is the logits tensor for one row of df (same
# order); element [0] is that row's three class scores. Building one
# (n_rows, 3) array and assigning whole columns replaces the row-by-row
# df.loc writes. float64 keeps the column dtype the loop-based version produced.
logit_matrix = np.asarray(
    [pred.numpy()[0] for pred in dummy_list], dtype='float64'
).reshape(-1, 3)

for class_idx, class_name in enumerate(['positive', 'neutral', 'negative']):
  df[class_name] = logit_matrix[:, class_idx]

In [None]:
# Store each row's logits as a plain list of Python floats. Keeping the raw
# tf.Tensor objects in the column would write their repr ("tf.Tensor(...)")
# to the CSV saved at the end of the notebook, which cannot be parsed back.
df['logits'] = [pred.numpy()[0].tolist() for pred in dummy_list]

In [None]:
# Preview the first five rows, now including the sentiment columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,News,percent_change_at_close,logits,positive,neutral,negative
0,0,2006-12-01,29.23,29.299999,28.9,29.120001,29.120001,72257000,<No News>,-0.376322,"((tf.Tensor(-2.6761162, shape=(), dtype=float3...",-2.676116,2.118815,0.881641
1,1,2006-12-04,29.23,29.52,29.17,29.33,29.33,55123400,The Retooling of a Search Engine : Ask.com is ...,0.342114,"((tf.Tensor(-1.4502404, shape=(), dtype=float3...",-1.45024,1.712203,-0.008445
2,2,2006-12-05,29.360001,29.4,29.030001,29.129999,29.129999,45606000,<No News>,-0.783386,"((tf.Tensor(-2.6761162, shape=(), dtype=float3...",-2.676116,2.118815,0.881641
3,3,2006-12-06,29.1,29.129999,28.870001,28.99,28.99,48564100,Combat as Usual? Not With These Games : A few ...,-0.378007,"((tf.Tensor(-0.5955663, shape=(), dtype=float3...",-0.595566,1.342662,-0.402697
4,4,2006-12-07,28.959999,29.07,28.809999,28.85,28.85,46831100,Vista Is Ready. Are You? : Why it might be a b...,-0.379831,"((tf.Tensor(-0.5480019, shape=(), dtype=float3...",-0.548002,1.999854,-1.358057


In [None]:
# Print the schema again (column names, non-null counts, dtypes) to check the
# columns added by the prediction cells.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2517 entries, 0 to 2516
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Unnamed: 0               2517 non-null   int64  
 1   Date                     2517 non-null   object 
 2   Open                     2517 non-null   float64
 3   High                     2517 non-null   float64
 4   Low                      2517 non-null   float64
 5   Close                    2517 non-null   float64
 6   Adj Close                2517 non-null   float64
 7   Volume                   2517 non-null   int64  
 8   News                     2517 non-null   object 
 9   percent_change_at_close  2517 non-null   float64
 10  logits                   2517 non-null   object 
 11  positive                 2517 non-null   float64
 12  neutral                  2517 non-null   float64
 13  negative                 2517 non-null   float64
dtypes: float64(9), int64(2),

In [None]:
# Drop the 'Unnamed: 0' column that came in with the CSV.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [None]:
# Save to Google Drive for easier reuse.
# index=False: the default RangeIndex carries no information, and writing it
# would bring back an extra 'Unnamed: 0' column when the file is read again
# (the same kind of column that was just dropped above).
df.to_csv('/content/drive/MyDrive/Data_fincent/MSNews_Predicted.csv', index=False)