In [2]:
print("Hello, Colab!")

Hello, Colab!


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
import torch
import warnings
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.model_selection import TimeSeriesSplit

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertModel
from transformers import AutoConfig
from transformers import Trainer, TrainingArguments

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, matthews_corrcoef
)

# Ignore all warnings
warnings.filterwarnings("ignore")

In [5]:
filepath_MRK ='/content/drive/My Drive/nlp/dataset_final/daily_news/mrk/MRK_text_annotated.csv'
filepath_num_MRK = '/content/drive/My Drive/nlp/dataset_final/daily_news/mrk/MRK_numerical_LR.csv'
data_MRK = pd.read_csv(filepath_MRK)
data_num_MRK = pd.read_csv(filepath_num_MRK)
data_MRK.head()
data_num_MRK.head()

Unnamed: 0,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
0,2020-06-01,64.97,64.46,65.19,6269241.0,0,-1.587786,0.882498,65.5,65.52,11352984.0,-1.488946,0.99389
1,2020-05-29,64.38,65.4,65.58,13517104.0,0,1.458269,1.350375,64.46,65.19,6269241.0,-1.587786,0.882498
2,2020-05-28,63.69,64.04,65.06,13180906.0,0,-2.079511,1.55561,65.4,65.58,13517104.0,1.458269,1.350375
3,2020-05-27,62.39,62.84,62.91,12828987.0,0,-1.873829,1.456981,64.04,65.06,13180906.0,-2.079511,1.55561
4,2020-05-26,63.41,62.6,63.69,19146017.0,1,-0.381922,1.469883,62.84,62.91,12828987.0,-1.873829,1.456981


In [6]:
filepath_MU ='/content/drive/My Drive/nlp/dataset_final/daily_news/mu/MU_text_annotated.csv'
filepath_num_MU = '/content/drive/My Drive/nlp/dataset_final/daily_news/mu/MU_numerical_LR.csv'
data_MU = pd.read_csv(filepath_MU)
data_num_MU = pd.read_csv(filepath_num_MU)
data_MU.head()
data_num_MU.head()

Unnamed: 0,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
0,2020-05-29,45.53,46.79,46.9,26337000.0,1,3.380468,3.472126,45.26,46.55,21809100.0,-1.006124,2.223016
1,2020-05-28,47.56,45.38,47.65,38289300.0,1,-3.013464,3.255123,46.79,46.9,26337000.0,3.380468,3.472126
2,2020-05-27,44.87,48.29,48.42,56071500.0,0,6.412517,4.4945,45.38,47.65,38289300.0,-3.013464,3.255123
3,2020-05-26,45.14,44.73,45.6,24303900.0,0,-7.372127,5.396255,48.29,48.42,56071500.0,6.412517,4.4945
4,2020-05-22,44.09,43.89,44.25,13385400.0,0,-1.877934,5.437901,44.73,45.6,24303900.0,-7.372127,5.396255


In [7]:
filepath_MS ='/content/drive/My Drive/nlp/dataset_final/daily_news/ms/MS_text_annotated.csv'
filepath_num_MS = '/content/drive/My Drive/nlp/dataset_final/daily_news/ms/MS_numerical_LR.csv'
data_MS = pd.read_csv(filepath_MS)
data_num_MS = pd.read_csv(filepath_num_MS)
data_MS.head()
data_num_MS.head()

Unnamed: 0,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
0,2020-06-01,37.64,37.79,38.1,8649500.0,0,-1.920581,0.963911,38.53,39.11,11179200.0,-2.849218,1.260023
1,2020-05-29,37.21,37.37,37.87,13102100.0,0,-1.111405,1.001892,37.79,38.1,8649500.0,-1.920581,0.963911
2,2020-05-28,39.95,37.54,40.0,17615900.0,1,0.45491,1.587032,37.37,37.87,13102100.0,-1.111405,1.001892
3,2020-05-27,38.7,39.62,39.67,25892400.0,1,5.540757,3.314597,37.54,40.0,17615900.0,0.45491,1.587032
4,2020-05-26,35.4,36.94,37.02,19709900.0,0,-6.76426,4.43557,39.62,39.67,25892400.0,5.540757,3.314597


In [8]:
filepath_NVDA ='/content/drive/My Drive/nlp/dataset_final/daily_news/nvda/NVDA_text_annotated.csv'
filepath_num_NVDA = '/content/drive/My Drive/nlp/dataset_final/daily_news/nvda/NVDA_numerical_LR.csv'
data_NVDA = pd.read_csv(filepath_NVDA)
data_num_NVDA = pd.read_csv(filepath_num_NVDA)
data_NVDA.head()
data_num_NVDA.head()

Unnamed: 0,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
0,2020-05-29,8.52,8.84,8.84,745256000.0,0,0.798176,1.041836,8.77,8.8,389876000.0,-0.227531,1.186951
1,2020-05-28,8.38,8.45,8.72,734896000.0,0,-4.411765,2.158446,8.84,8.84,745256000.0,0.798176,1.041836
2,2020-05-27,8.59,8.49,8.6,1175892000.0,1,0.473373,2.202952,8.45,8.72,734896000.0,-4.411765,2.158446
3,2020-05-26,9.12,8.68,9.14,770780000.0,1,2.237927,2.506473,8.49,8.6,1175892000.0,0.473373,2.202952
4,2020-05-22,8.79,8.99,9.05,1038764000.0,1,3.571429,3.027515,8.68,9.14,770780000.0,2.237927,2.506473


In [9]:
def merge_with_prev_day_news(text_path, num_path):
    # Load data
    data = pd.read_csv(text_path)
    data_num = pd.read_csv(num_path)

    # Convert dates
    data['Date'] = pd.to_datetime(data['Date'])
    data_num['Date'] = pd.to_datetime(data_num['Date'])

    # Shift news dates FORWARD by 1 to align them with the next day's movement
    data['Date'] = data['Date'] + pd.Timedelta(days=1)

    # Merge today's movement with yesterday's news
    merged = pd.merge(data_num, data, on='Date', how='inner')  # keeps movement from data_num
    return merged


In [10]:
merged_MRK = merge_with_prev_day_news(filepath_MRK, filepath_num_MRK)
merged_MU = merge_with_prev_day_news(filepath_MU, filepath_num_MU)
merged_MS = merge_with_prev_day_news(filepath_MS, filepath_num_MS)
merged_NVDA = merge_with_prev_day_news(filepath_NVDA, filepath_num_NVDA)

# Combine
all_data = pd.concat([merged_MRK, merged_MU, merged_MS, merged_NVDA], ignore_index=True)
all_data = all_data.drop(columns=['stock']) if 'stock' in all_data.columns else all_data
all_data = all_data.sample(frac=1.0, random_state=42).reset_index(drop=True)

# Prepare inputs
texts = all_data['title'].tolist()  # or 'title'
labels = all_data['Movement'].tolist()


In [11]:
all_data.head()

Unnamed: 0.1,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1,Unnamed: 0,title,sentiment_score
0,2010-10-20,17.88,18.15,18.42,41015300.0,1,3.183627,1.544789,17.59,18.02,22775800.0,0.285063,1.156674,848405,Important Earnings Tomorrow,0.06
1,2016-10-14,24.88,24.75,25.31,13734100.0,1,0.896861,1.239535,24.53,24.83,10114300.0,-1.643945,1.014282,846474,The Big Bank Earnings Season Begins Friday,0.11
2,2012-04-05,24.29,24.37,24.38,14547078.0,1,0.412031,0.475232,24.27,24.43,12287066.0,0.705394,0.658136,841654,Decoding Wall St.: Express Scripts and Medco M...,-0.05
3,2017-11-14,5.26,5.29,5.3,529292000.0,1,2.123552,1.338387,5.18,5.23,501948000.0,-0.766284,0.649238,918505,"Long-Time Nvidia Bear Upgrades Stock, Lifts Ta...",0.02
4,2019-07-10,67.24,67.25,67.48,5996132.0,1,4.718156,2.509469,64.22,66.34,22146126.0,1.581778,1.181344,839356,FDA Accepts Merck's Supplemental Biologics Lic...,0.01


In [12]:
all_data.columns

Index(['Date', 'Open', 'Close', 'High', 'Volume', 'Movement', 'Daily_Return',
       'Volatility', 'Close_lag1', 'High_lag1', 'Volume_lag1',
       'Daily_Return_lag1', 'Volatility_lag1', 'Unnamed: 0', 'title',
       'sentiment_score'],
      dtype='object')

In [14]:
# drop all the numeric-/fuson-related columns
cols_to_drop = [
    'Date',
    'Open', 'Close', 'High', 'Volume',
    'Daily_Return', 'Volatility',
    'Close_lag1', 'High_lag1', 'Volume_lag1',
    'Daily_Return_lag1', 'Volatility_lag1',
    'Unnamed: 0', 'sentiment_score'
]
df_finetune = all_data.drop(columns=cols_to_drop)

# (optional) if you don't need the Date column during training, drop it too:
# df_finetune = df_finetune.drop(columns=['Date'])

# sanity check
print(df_finetune.columns)
# should only see ['Date', 'title', 'Movement'] (or without Date if dropped)

# save to disk for your Trainer
df_finetune.to_csv("/content/drive/My Drive/nlp/dataset_final/deberta_finetune_data.csv", index=False)


Index(['Movement', 'title'], dtype='object')
