In [158]:
!pip install -U transformers



In [159]:
!pip install datasets



In [160]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [161]:
# Import necessary packages
import os
import gc
import random
import pandas as pd
import numpy as np
import torch
try:
    print(torch.cuda.get_device_name())
except:
    print('Only CPU :(')
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, BertPreTrainedModel
from transformers.models.bert.modeling_bert import BertForSequenceClassification
from datasets import load_dataset, load_metric, Dataset
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score

Tesla K80


In [162]:
# Some definitions
STOCKS = ['F', 'GM', 'TSLA']
MAX_LENGTH = 120
START_DATE = '2011-01-01'
START_VAL = '2020-01-01'
START_TEST = '2021-01-01'
END_DATE = '2021-12-01'
DATA_DIR = '/content/drive/MyDrive/Pictet_Assignement/data'
THR_PCT_PRICE_CHANGE = 0.05

In [163]:
# Set seed for reproducibility
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
set_seed(42)

In [164]:
# Utility function to load price data
def load_stock_data(stock):
    dfs = pd.read_csv(os.path.join(DATA_DIR, f'{stock}.csv'), parse_dates=['Date'])
    dfs = dfs[(dfs['Date'] >= START_DATE) & (dfs['Date'] < END_DATE)].copy()
    dfs = dfs.sort_values('Date')
    dfs['pct'] = dfs['Close'].pct_change()
    dfs['is_signif'] = (dfs['pct'].abs() > THR_PCT_PRICE_CHANGE).astype(int)
    return dfs.dropna(subset=['pct'])

In [165]:
# Load and concatenate price data for stocks
stock_df = {}
for stock in STOCKS:
    print(stock)
    stock_df[stock] = load_stock_data(stock)

df_price = None
for stock_name, df in stock_df.items():
    df = df.rename({c: c + '_' + stock_name for c in df.columns if c != 'Date'}, axis=1)
    if df_price is None:
        df_price = df.copy()
    else:
        df_price = df_price.merge(df, on='Date', how='inner')

F
GM
TSLA


In [168]:
# Define target
df_price = df_price.set_index('Date').sort_index()
labels_columns = [f'label_{stock}' for stock in STOCKS]
for stock in STOCKS:
    df_price[f'label_{stock}'] = df_price[f'is_signif_{stock}'].astype(float)
df_price['label'] = df_price.apply(lambda x: [x[c] for c in labels_columns], axis=1)

# Split train/val/test
df_price_train = df_price[: START_VAL]
df_price_val = df_price[START_VAL: START_TEST]
df_price_test = df_price[START_TEST:]

In [169]:
df_price_train.shape, df_price_val.shape, df_price_test.shape

((2263, 28), (253, 28), (230, 28))

In [170]:
# Load NYTimes data
df_nyt = pd.read_csv(os.path.join(DATA_DIR, 'HL_source3_nytimeshl_v3.csv'), parse_dates=['date'])
df_nyt['date'] = df_nyt['date'].dt.tz_localize(None)

# Need to account for weekends and late news
df_nyt['hour'] = df_nyt['date'].dt.hour
df_nyt['weekday'] = df_nyt['date'].dt.weekday
df_nyt['offset_day'] = ((df_nyt['hour'] >= 19) & (~df_nyt['weekday'].isin([5, 6]))).astype(int)
df_nyt['offset_saturday'] = 2 * (df_nyt['weekday'] == 5).astype(int)
df_nyt['offset_day'] += df_nyt['offset_saturday']
df_nyt['offset_sunday'] = 1 * (df_nyt['weekday'] == 6).astype(int)
df_nyt['offset_day'] += df_nyt['offset_sunday']
days_to_shift = pd.TimedeltaIndex(df_nyt['offset_day'], unit='D')
df_nyt['new_date'] = df_nyt['date'] + days_to_shift
df_nyt['Date'] = df_nyt['new_date'].dt.normalize()

In [171]:
# Load all the news data
df_atn = pd.read_csv(os.path.join(DATA_DIR, 'HL_source1_all_the_news.csv'), parse_dates=['date'])

# Need to account for weekends and late news
df_atn['hour'] = df_atn['date'].dt.hour
df_atn['weekday'] = df_atn['date'].dt.weekday
df_atn['offset_day'] = ((df_atn['hour'] >= 19) & (~df_atn['weekday'].isin([5, 6]))).astype(int)
df_atn['offset_saturday'] = 2 * (df_atn['weekday'] == 5).astype(int)
df_atn['offset_day'] += df_atn['offset_saturday']
df_atn['offset_sunday'] = 1 * (df_atn['weekday'] == 6).astype(int)
df_atn['offset_day'] += df_atn['offset_sunday']
days_to_shift = pd.TimedeltaIndex(df_atn['offset_day'], unit='D')
df_atn['new_date'] = df_atn['date'] + days_to_shift
df_atn['Date'] = df_atn['new_date'].dt.normalize()

In [172]:
# Concatenate all news
df_news = pd.concat([df_nyt[['Date', 'title']].reset_index(), df_atn[['Date', 'title']].reset_index()])

# Cleanup to save some RAM
del df_nyt, df_atn
gc.collect()

247

In [173]:
# Create a single headline string per date
headlines = df_news.groupby('Date')['title'].apply('. '.join).rename('headlines').reset_index()

In [174]:
# Fill missing dates
headlines = headlines.set_index('Date').sort_index()
headlines = headlines.reindex(df_price.index).fillna('No news today').sort_index()
headlines.shape

(2746, 1)

In [175]:
# Split train/val/test
headlines_train = headlines[: START_VAL]
headlines_val = headlines[START_VAL: START_TEST]
headlines_test = headlines[START_TEST:]
headlines_train.shape, headlines_val.shape, headlines_test.shape

((2263, 1), (253, 1), (230, 1))

In [176]:
# Join with price data
train = headlines_train.join(df_price_train[['label']], on='Date', how='inner').reset_index().drop('Date', axis=1)
val = headlines_val.join(df_price_val[['label']], on='Date', how='inner').reset_index().drop('Date', axis=1)
test = headlines_test.join(df_price_test[['label']], on='Date', how='inner').reset_index().drop('Date', axis=1)

In [177]:
# Create datasets for huggingface
train_ds = Dataset.from_pandas(train, split='train').shuffle(42)# .select(range(500))
val_ds = Dataset.from_pandas(val, split='test')
test_ds = Dataset.from_pandas(test, split='test')

In [179]:
# Download BERT model to fine tune
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Reset the head to learn our task
model.classifier = nn.Linear(768, 3)
torch.nn.init.normal_(model.classifier.weight, std=0.02)
torch.nn.init.normal_(model.classifier.bias, 0)
model.num_labels = 1

loading configuration file https://huggingface.co/ProsusAI/finbert/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2120f4f96b5830e5a91fe94d242471b0133b0976c8d6e081594ab837ac5f17bc.ef97278c578016c8bb785f15296476b12eae86423097fed78719d1c8197a3430
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",


In [180]:
# Tokenize datasets
def preprocess_function(examples):
    return tokenizer(examples['headlines'], padding="max_length", truncation=True, max_length=MAX_LENGTH)

tokenized_train_ds = train_ds.map(preprocess_function, batched=True)
tokenized_train_ds = tokenized_train_ds.remove_columns('headlines')
tokenized_val_ds = val_ds.map(preprocess_function, batched=True)
tokenized_val_ds = tokenized_val_ds.remove_columns('headlines')
tokenized_test_ds = test_ds.map(preprocess_function, batched=True)
tokenized_test_ds = tokenized_test_ds.remove_columns('headlines')

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [182]:
# Utility functions to compute metrics
def sigmoid(x):
    return 1/(1 + np.exp(-x))

def metrics(labels, probs, predictions):
    return {
        'roc_auc': roc_auc_score(labels, probs, average='macro'),
        'f1': f1_score(labels, predictions, average='macro'),
        'precision': precision_score(labels, predictions, average='macro'),
        'recall': recall_score(labels, predictions, average='macro'),                
        }

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    probs = sigmoid(logits)
    predictions = (probs > 0.5).astype(float)    
    return metrics(labels, probs, predictions)

In [183]:
# Run the fit
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=8,
    weight_decay=0.01,
    evaluation_strategy='epoch',
    logging_strategy='epoch'    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

train_output = trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 2263
  Num Epochs = 8
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 568


Epoch,Training Loss,Validation Loss,Roc Auc,F1,Precision,Recall
1,0.0606,0.162809,0.504623,0.271108,0.170321,0.672269
2,0.0462,0.160967,0.542568,0.214064,0.131723,0.642857
3,0.0424,0.159618,0.517407,0.208769,0.129143,0.595238
4,0.039,0.160283,0.537852,0.277194,0.201444,0.548033
5,0.0349,0.169353,0.544585,0.244027,0.49855,0.41428
6,0.0293,0.163197,0.551546,0.267721,0.238696,0.656784
7,0.0246,0.160265,0.559039,0.283446,0.197159,0.66861
8,0.0215,0.160991,0.573855,0.271928,0.20391,0.659299


***** Running Evaluation *****
  Num examples = 253
  Batch size = 32
***** Running Evaluation *****
  Num examples = 253
  Batch size = 32
***** Running Evaluation *****
  Num examples = 253
  Batch size = 32
***** Running Evaluation *****
  Num examples = 253
  Batch size = 32
***** Running Evaluation *****
  Num examples = 253
  Batch size = 32
***** Running Evaluation *****
  Num examples = 253
  Batch size = 32
***** Running Evaluation *****
  Num examples = 253
  Batch size = 32
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 253
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)




In [184]:
# Predict the test set
labels_test = np.array(tokenized_test_ds['label'])
pred_output = trainer.predict(tokenized_test_ds)
logits_test = pred_output.predictions
probs_test = sigmoid(logits_test)
preds_test = (probs_test > 0.5).astype(float)
print(metrics(labels_test, probs_test, preds_test))

***** Running Prediction *****
  Num examples = 230
  Batch size = 32


{'roc_auc': 0.5704674803389757, 'f1': 0.15393368440551483, 'precision': 0.11521959888296522, 'recall': 0.6988636363636364}
