In [None]:
!pip install evaluate
!pip install datasets polars scikit-learn

Collecting evaluate
  Downloading evaluate-0.4.6-py3-none-any.whl.metadata (9.5 kB)
Downloading evaluate-0.4.6-py3-none-any.whl (84 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 84.1/84.1 kB 7.1 MB/s eta 0:00:00
Installing collected packages: evaluate
Successfully installed evaluate-0.4.6


In [None]:
# imports
import os
import re
import random
import numpy as np
import pandas as pd
import polars as pl
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, r2_score
import torch
import torch.nn as nn
from datasets import DatasetDict, Dataset, load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    pipeline,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from huggingface_hub import notebook_login
from peft import LoraConfig, TaskType, get_peft_model
import evaluate
import matplotlib.pyplot as plt
import wandb


## Data Preparation

In [None]:
def set_seed(seed_value=6893):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch.
    When CUDA is available, the CUDA cache is emptied, every GPU generator is
    seeded, and cuDNN is put in deterministic mode with benchmarking off.

    Args:
        seed_value: Integer seed applied to all generators.
    """
    # Same seed for the three CPU-side generators.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.manual_seed_all(seed_value)

        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False

    print(f"Global seed set to {seed_value}")


set_seed()

In [None]:
# Download the train split of the public news/price dataset from the Hugging Face Hub.
dataset = load_dataset(
    "siddharthgowda/articles_with_11day_ohlc_local",
    split="train"
)

# Wrap the dataset's backing Arrow table in a Polars DataFrame.
df_og = pl.from_arrow(dataset.data.table)

# Working alias of the raw frame; the target-building cell derives `df` from df_og again.
df = df_og

# The status marker was stored mis-encoded (UTF-8 bytes decoded as Mac Roman); restored.
print("✅ Successfully loaded data from public Hugging Face dataset.")
print(df.head(), df_og.shape)

In [None]:
# Regression target: log return between the two closes, ln(Close_1 / Close_0).
# NOTE(review): presumably Close_0 is the close on the article date and Close_1
# the following one — confirm against the dataset card.
df = df_og.with_columns(
    log_return=(pl.col("Close_1") / pl.col("Close_0")).log()
).with_columns(
    target=pl.col("log_return")
).filter(
    # A null, zero or negative close gives a null / NaN / ±inf log return. One
    # such label turns the MSE loss into NaN, so those rows are removed here.
    pl.col("target").is_finite()
)

print(f"Dropped {df_og.height - df.height:,} rows with a non-finite target.")

# The leading emoji was stored mis-encoded; restored to the intended character.
print("\n📝 DataFrame with new Increase Target Label:")
print(df.select([
    "Stock_symbol",
    "Article_title",
    "Close_0",
    "Close_1",
    "log_return",
    "target"
]).tail())

In [None]:

# Symbols treated as "large cap" for the purpose of this notebook.
# NOTE(review): "AT&T" is a company name (the exchange ticker is "T") and
# "BRK.B" is sometimes spelled "BRK-B" — confirm how the dataset spells these,
# otherwise they silently match nothing.
LARGE_CAP_TICKERS = [
    "AAPL", "MSFT", "GOOGL", "GOOG", "AMZN", "NVDA", "META", "TSLA", "BRK.B",
    "JPM", "V", "LLY", "XOM", "UNH", "JNJ", "WMT", "HD", "PG", "MA", "CVX",
    "BAC", "COST", "PFE", "ABBV", "KO", "AVGO", "CSCO", "PEP", "MRK", "TMO",
    "CRM", "CMCSA", "DIS", "PM", "ADBE", "ACN", "NKE", "INTU", "QCOM", "MCD",
    "UPS", "ORCL", "GILD", "SBUX", "TXN", "AMGN", "LOW", "CAT", "MS", "AXP",
    "BA", "GE", "LMT", "DE", "HON", "MMM", "GS", "SCHW", "CVS", "RTX", "MDLZ",
    "FDX", "USB", "C", "MDT", "WFC", "BKNG", "TGT", "ISRG", "BIIB", "BMY",
    "COP", "SLB", "EOG", "OXY", "KMI", "DHR", "EMR", "PPL", "SRE", "AEP",
    "SO", "PCAR", "DTE", "EXC", "DUK", "NEE", "PEG", "AFL", "CB", "MMC",
    "SPG", "AMT", "PLD", "EQIX", "ABT", "AT&T", "VZ", "TMUS", "LVS", "HCA"
]

# The original cell KEPT the rows whose symbol is in the list while every
# message called the result "Non-Large-Cap". The row selection is unchanged
# (True = keep large caps); the printed label now follows this flag so the
# output can no longer contradict the filter.
# NOTE(review): the W&B project names say "no-large-stocks" — if excluding
# large caps was the intent, set this flag to False.
KEEP_LARGE_CAP = True

is_large_cap = pl.col("Stock_symbol").is_in(LARGE_CAP_TICKERS)
df_filtered = df.filter(is_large_cap if KEEP_LARGE_CAP else ~is_large_cap)
subset_label = "Large-Cap" if KEEP_LARGE_CAP else "Non-Large-Cap"

print(f"Original total rows in DataFrame: {len(df):,}")
print(f"Number of large-cap tickers in filter list: {len(LARGE_CAP_TICKERS)}")
print(f"Filtered rows ({subset_label} Stocks only): {len(df_filtered):,}")

print("\nSample of Filtered Data:")
print(df_filtered.head(5))

present_tickers = df_filtered["Stock_symbol"].unique().sort()
print(f"\nUnique {subset_label} Tickers found in your dataset: {len(present_tickers)}")
print(present_tickers.to_list()[:10], '...')


In [None]:

# Seed for both shuffles below (same value as set_seed's default).
RANDOM_SEED = 6893

original_filtered_rows = df_filtered.height
print(f"Starting with full data size: {original_filtered_rows:,} rows")

# scikit-learn works on pandas objects, so split there and convert back to Polars.
df_pd = df_filtered.to_pandas()
X = df_pd.drop(columns=["target"])
y = df_pd["target"]

# 70/15/15 train/validation/test split, done in two stages.
# NOTE(review): this is a purely random row split; if several articles share a
# ticker and date, related rows can land in different splits — confirm that is
# acceptable for the evaluation being reported.
TEST_SIZE = 0.15  # 15% of all rows for the final test set

X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    shuffle=True,
    random_state=RANDOM_SEED
)

# The validation share is expressed relative to what remains after the test
# rows are removed, so it still amounts to 15% of the full data.
VAL_SIZE_RELATIVE = 0.15 / (1.0 - TEST_SIZE)

X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=VAL_SIZE_RELATIVE,
    shuffle=True,
    random_state=RANDOM_SEED
)

# Re-attach the target column and go back to Polars.
df_train = pl.from_pandas(pd.concat([X_train, y_train], axis=1))
df_val = pl.from_pandas(pd.concat([X_val, y_val], axis=1))
df_test = pl.from_pandas(pd.concat([X_test, y_test], axis=1))

# Every input row must end up in exactly one split.
assert df_train.height + df_val.height + df_test.height == original_filtered_rows, \
    "train/val/test sizes do not add up to the filtered data size"

# The status marker was stored mis-encoded; restored to the intended character.
print("\n✅ Split Verification:")
print(f"Train Set Size: {df_train.height:,} rows (Actual: {df_train.height/original_filtered_rows:.2%})")
print(f"Validation Set Size: {df_val.height:,} rows (Actual: {df_val.height/original_filtered_rows:.2%})")
print(f"Test Set Size: {df_test.height:,} rows (Actual: {df_test.height/original_filtered_rows:.2%})")

print("\nDistribution of 'target' in each set (describe for regression):")
for split_name, split_df in (
    ("Training", df_train),
    ("Validation", df_val),
    ("Test", df_test),
):
    print(f"--- {split_name} Set ---")
    print(split_df['target'].describe())
    if split_name != "Test":
        # Blank line between sections, as in the original layout.
        print()

In [None]:
# Canonical (lowercase) Hugging Face Hub id of the RoBERTa base checkpoint.
# The mixed-case spelling only works where the Hub resolves ids case-insensitively.
base_model_name = "roberta-base"

In [None]:
# create tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model_name)

def tokenize_fn(examples):
    """Tokenize a batch of article titles.

    Titles are truncated to at most 512 tokens and NOT padded here. Padding is
    left to the ``DataCollatorWithPadding`` passed to the Trainer, which pads
    each batch only to its own longest sequence. Padding every title to 512
    tokens up front made each forward pass process the full 512 positions.

    Args:
        examples: Batch dict from ``Dataset.map(batched=True)``; must contain
            an ``'Article_title'`` list of strings.

    Returns:
        The tokenizer output for the batch (unpadded).
    """
    return tokenizer(
        examples['Article_title'],
        truncation=True,
        max_length=512
    )


# Keep only the text and the regression target; the Trainer expects the label
# column to be called "labels".
train_dataset = Dataset.from_pandas(df_train[['Article_title', 'target']].to_pandas().rename(columns={'target': 'labels'}))

tokenized_dataset = train_dataset.map(tokenize_fn, batched=True)

# The raw text is no longer needed once it has been tokenized.
tokenized_dataset = tokenized_dataset.remove_columns(['Article_title'])

print(tokenized_dataset[0])

In [None]:
# Validation split: keep title + target, rename the target to "labels",
# tokenize in batches, then drop the raw text column.
df_val_pd = df_val.select(['Article_title', 'target']).to_pandas()
val_dataset = Dataset.from_pandas(
    df_val_pd.rename(columns={'target': 'labels'})
)
tokenized_val_dataset = (
    val_dataset
    .map(tokenize_fn, batched=True)
    .remove_columns(['Article_title'])
)

In [None]:

# Test split: keep title + target, rename the target to "labels",
# tokenize in batches, then drop the raw text column.
df_test_pd = df_test.select(['Article_title', 'target']).to_pandas()
test_dataset = Dataset.from_pandas(
    df_test_pd.rename(columns={'target': 'labels'})
)
tokenized_test_dataset = (
    test_dataset
    .map(tokenize_fn, batched=True)
    .remove_columns(['Article_title'])
)

## Head + Last Encoder Layer Fine-tuning

In [None]:
# Finish any W&B run left open in this kernel before starting the first training stage.
wandb.finish()

In [None]:
# Authenticate with Weights & Biases; the training runs below report to it.
wandb.login()

In [None]:
# Pretrained encoder with a fresh sequence-classification head configured for
# regression: a single output unit (num_labels=1) and problem_type="regression".
# AutoModelForSequenceClassification is already imported in the imports cell.
model_to_classify = AutoModelForSequenceClassification.from_pretrained(
    base_model_name,
    problem_type="regression",
    num_labels=1,
)

In [None]:
def print_trainable_parameters(model):
    """Print a table of trainable vs. total parameter counts.

    One row is printed for the encoder (``model.roberta``), one for the head
    (``model.classifier``), plus a TOTAL row over ``model.parameters()`` with
    the trainable share as a percentage.

    Args:
        model: A module exposing ``roberta`` and ``classifier`` sub-modules.

    Returns:
        None. The table is written to stdout.
    """

    def _count(module, trainable_only=False):
        # Number of scalar weights, optionally only those with requires_grad.
        return sum(
            p.numel()
            for p in module.parameters()
            if p.requires_grad or not trainable_only
        )

    def _status(trainable_count):
        # The padlock icons were stored mis-encoded (UTF-8 read as Mac Roman);
        # restored to the intended characters.
        return '🔓 Trainable' if trainable_count > 0 else '🔒 Frozen'

    # Count parameters in each component
    encoder_total = _count(model.roberta)
    encoder_trainable = _count(model.roberta, trainable_only=True)

    head_total = _count(model.classifier)
    head_trainable = _count(model.classifier, trainable_only=True)

    total = _count(model)
    trainable = _count(model, trainable_only=True)

    # Avoid ZeroDivisionError for a module that has no parameters at all.
    trainable_pct = trainable / total * 100 if total else 0.0

    print(f"{'Component':<20} {'Trainable':<15} {'Total':<15} {'Status'}")
    print("-" * 65)
    print(f"{'Encoder (roberta)':<20} {encoder_trainable:<15,} {encoder_total:<15,} {_status(encoder_trainable)}")
    print(f"{'Head (classifier)':<20} {head_trainable:<15,} {head_total:<15,} {_status(head_trainable)}")
    print("-" * 65)
    print(f"{'TOTAL':<20} {trainable:<15,} {total:<15,} {trainable_pct:.1f}% trainable")

In [None]:
# Stage 1: freeze the whole encoder first ...
for param in model_to_classify.base_model.parameters():
    param.requires_grad = False

# ... then unfreeze the last encoder layer. The index comes from the model
# config instead of a hard-coded 11, so it stays correct for checkpoints with
# a different depth.
last_layer_index = model_to_classify.config.num_hidden_layers - 1
last_layer_prefix = f'roberta.encoder.layer.{last_layer_index}.'
unfrozen_tensors = 0
for name, param in model_to_classify.named_parameters():
    if name.startswith(last_layer_prefix):
        param.requires_grad = True
        unfrozen_tensors += 1

# A prefix that matches nothing would silently leave the encoder fully frozen.
if unfrozen_tensors == 0:
    raise RuntimeError(
        f"No parameters matched prefix {last_layer_prefix!r}; "
        "the last encoder layer was not unfrozen."
    )

# unfreezing classification head
for param in model_to_classify.classifier.parameters():
    param.requires_grad = True

In [None]:
# Show which parts of the model are trainable after the freezing step above.
print_trainable_parameters(model_to_classify)

In [None]:
# WandB Setup
# NOTE(review): the project name mentions "normalized", "no-large-stocks" and
# "tfid", none of which is visible in this notebook (the filter cell keeps the
# large-cap tickers) — confirm the name is still accurate.
os.environ["WANDB_PROJECT"] = "stock-prediction-fine-tuning-project-regression-v2-head-normalized-no-large-stocks-tfid"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"

training_args = TrainingArguments(
    output_dir="./headResults",
    # Optimisation
    num_train_epochs=2,
    learning_rate=5e-5,
    per_device_train_batch_size=128,
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Warm-up is set through the ratio only; the redundant `warmup_steps=0`
    # (the default) was removed so the two settings cannot contradict each other.
    warmup_ratio=0.10,
    # Mixed precision only when a CUDA device is present: fp16=True raises on
    # a machine without a GPU.
    fp16=torch.cuda.is_available(),
    # Logging
    logging_dir="./headLogs",
    logging_strategy="steps",
    logging_steps=10,
    report_to="wandb",
    # Checkpointing and evaluation happen once per epoch; both strategies must
    # match for load_best_model_at_end to work.
    save_strategy="epoch",
    eval_strategy="epoch",
    # Model selection: keep the checkpoint with the lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


trainer = Trainer(
    model=model_to_classify,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_val_dataset, # Use the tokenized validation dataset
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

In [None]:
# Run the first training stage with the trainer configured in the previous cell.
print(f"Training will run for {training_args.num_train_epochs} epochs.")
trainer.train()

## Second Round of Fine-tuning

In [None]:
# Close the W&B run of the first stage so the second stage logs to a new run.
wandb.finish()

In [None]:
# Second stage trains the full model: switch gradients on for every weight.
# model.parameters() already includes the classifier head, so a single pass
# covers both encoder and head.
for weight in model_to_classify.parameters():
    weight.requires_grad_(True)

print_trainable_parameters(model_to_classify)

In [None]:
# W&B project for the full-model stage; checkpoints are logged as artifacts.
# NOTE(review): the project name mentions "normalized", "no-large-stocks" and
# "tfid", none of which is visible in this notebook — confirm it is still accurate.
os.environ["WANDB_PROJECT"] = "stock-prediction-fine-tuning-project-regression-v1-full-normalized-no-large-stocks-tfid"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"


training_args = TrainingArguments(
    output_dir="./fullModelResults",
    # Optimisation: more epochs and a smaller learning rate than stage 1,
    # since every weight is being updated now.
    num_train_epochs=3,
    learning_rate=1e-5,
    per_device_train_batch_size=128,
    weight_decay=0.01,
    max_grad_norm=1.0,
    # Warm-up is set through the ratio only; the redundant `warmup_steps=0`
    # (the default) was removed so the two settings cannot contradict each other.
    warmup_ratio=0.06,
    # Mixed precision only when a CUDA device is present: fp16=True raises on
    # a machine without a GPU.
    fp16=torch.cuda.is_available(),
    # Logging
    logging_dir="./fullModelLogs",
    logging_strategy="steps",
    logging_steps=10,
    report_to="wandb",
    # Checkpointing and evaluation happen once per epoch; both strategies must
    # match for load_best_model_at_end to work.
    save_strategy="epoch",
    eval_strategy="epoch",
    # Model selection: keep the checkpoint with the lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


# A new Trainer over the same (already partially tuned) model object.
trainer = Trainer(
    model=model_to_classify,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer)
)

In [None]:
# Run the second (full-model) training stage with the trainer configured above.
print(f"Training will run for {training_args.num_train_epochs} epochs.")
trainer.train()

In [None]:
# Close the W&B run of the second stage before the evaluation runs are opened.
wandb.finish()

## Evaluation

In [None]:
# training set
# Open a separate W&B run for this evaluation; predictions come from the
# `trainer` object that is still in memory from the second training stage.
wandb.init(project="stock-prediction-fine-tuning-project-regression-v1-full-normalized-no-large-stocks-tfid", job_type="evaluation_train_set")

print("\n--- Evaluating on Training Dataset ---")
prediction_output_train = trainer.predict(test_dataset=tokenized_dataset)
true_labels_train = prediction_output_train.label_ids
# Flatten the model output to a 1-D array of predictions.
predictions_train = prediction_output_train.predictions.flatten()

mse_train = mean_squared_error(true_labels_train, predictions_train)
r2_train = r2_score(true_labels_train, predictions_train)

print(f"Mean Squared Error (MSE) on Training Set: {mse_train:.4f}")
print(f"R-squared (R2) on Training Set: {r2_train:.4f}")

# Scatter of predicted vs. true values with the y = x reference line.
fig_train, ax_train = plt.subplots(figsize=(10, 6))
ax_train.scatter(true_labels_train, predictions_train, alpha=0.3)
diagonal_train = [min(true_labels_train), max(true_labels_train)]
ax_train.plot(diagonal_train, diagonal_train, color='red', linestyle='--', label='Perfect Prediction')
ax_train.set_title('True Log Return vs. Predicted Log Return (Training Set)')
ax_train.set_xlabel('True Log Return')
ax_train.set_ylabel('Predicted Log Return')
ax_train.grid(True)
ax_train.legend()
plt.show()

wandb.finish()

In [None]:
# test set
# Open a separate W&B run for this evaluation; predictions come from the
# `trainer` object that is still in memory from the second training stage.
wandb.init(project="stock-prediction-fine-tuning-project-regression-v1-full-normalized-no-large-stocks-tfid", job_type="evaluation_test_set")

print("\n--- Evaluating on Test Dataset ---")
prediction_output_test = trainer.predict(test_dataset=tokenized_test_dataset)
true_labels_test = prediction_output_test.label_ids
# Flatten the model output to a 1-D array of predictions.
predictions_test = prediction_output_test.predictions.flatten()

mse_test = mean_squared_error(true_labels_test, predictions_test)
r2_test = r2_score(true_labels_test, predictions_test)

print(f"Mean Squared Error (MSE) on Test Set: {mse_test:.4f}")
print(f"R-squared (R2) on Test Set: {r2_test:.4f}")

# Scatter of predicted vs. true values with the y = x reference line.
fig_test, ax_test = plt.subplots(figsize=(10, 6))
ax_test.scatter(true_labels_test, predictions_test, alpha=0.3)
diagonal_test = [min(true_labels_test), max(true_labels_test)]
ax_test.plot(diagonal_test, diagonal_test, color='red', linestyle='--', label='Perfect Prediction')
ax_test.set_title('True Log Return vs. Predicted Log Return (Test Set)')
ax_test.set_xlabel('True Log Return')
ax_test.set_ylabel('Predicted Log Return')
ax_test.grid(True)
ax_test.legend()
plt.show()

wandb.finish()

In [None]:
# Log in to the Hugging Face Hub from the notebook; needed for the upload in the next cell.
notebook_login()

In [None]:
# Target repository on the Hugging Face Hub for both model and tokenizer.
model_repo_name = "siddharthgowda/roberta-stock-news-regression-prediction"

# Trainer.push_to_hub takes the COMMIT MESSAGE as its first positional
# argument, not a repo id. The original call therefore used the repo name as
# a commit message and uploaded to the repo derived from the training
# arguments instead of `model_repo_name`. Pushing the model held by the
# trainer puts the weights in the same repo as the tokenizer.
trainer.model.push_to_hub(model_repo_name)
tokenizer.push_to_hub(model_repo_name)