# Relevant Imports

In [1]:
import numpy as np
import pandas as pd
import torch
import warnings
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.model_selection import TimeSeriesSplit

from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertModel
from transformers import AutoConfig
from transformers import Trainer, TrainingArguments

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, matthews_corrcoef
)

# Ignore all warnings
warnings.filterwarnings("ignore")

# Data Preprocessing

In [2]:
# MRK inputs: the annotated text CSV and the numerical CSV (paths are reused by later cells).
filepath_MRK, filepath_num_MRK = (
    '../../../dataset_final/Daily_Financial_News/MRK/MRK_text_annotated.csv',
    '../../../dataset_final/Daily_Financial_News/MRK/MRK_numerical_LR.csv',
)
# Read both files, text first and numerical second.
data_MRK, data_num_MRK = map(pd.read_csv, (filepath_MRK, filepath_num_MRK))
# Only the final expression of a cell is rendered, so just the numerical preview is shown.
data_MRK.head()
data_num_MRK.head()

Unnamed: 0,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
0,2009-08-05,28.5,28.25,28.79,13375100.0,1,-0.772743,0.474785,28.47,28.88,16077892.0,-0.593575,0.395866
1,2009-08-06,28.32,28.01,28.32,11499494.0,1,-0.849558,0.495732,28.25,28.79,13375100.0,-0.772743,0.474785
2,2009-08-07,28.34,28.72,29.1,13656278.0,1,2.534809,1.42062,28.01,28.32,11499494.0,-0.849558,0.495732
3,2009-08-10,28.9,29.19,29.58,19050963.0,1,1.63649,1.581916,28.72,29.1,13656278.0,2.534809,1.42062
4,2009-08-11,29.06,28.94,29.37,11423829.0,0,-0.856458,1.626562,29.19,29.58,19050963.0,1.63649,1.581916


In [3]:
# MU inputs: the annotated text CSV and the numerical CSV (paths are reused by later cells).
filepath_MU, filepath_num_MU = (
    '../../../dataset_final/Daily_Financial_News/MU/MU_text_annotated.csv',
    '../../../dataset_final/Daily_Financial_News/MU/MU_numerical_LR.csv',
)
# Read both files, text first and numerical second.
data_MU, data_num_MU = map(pd.read_csv, (filepath_MU, filepath_num_MU))
# Only the final expression of a cell is rendered, so just the numerical preview is shown.
data_MU.head()
data_num_MU.head()

Unnamed: 0,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
0,2011-05-02,11.36,11.23,11.4,21585500.0,1,-0.707339,2.194295,11.31,11.44,36678800.0,-1.394943,2.306145
1,2011-05-03,11.19,10.89,11.22,37000000.0,0,-3.027605,1.917838,11.23,11.4,21585500.0,-0.707339,2.194295
2,2011-05-04,10.86,10.73,10.97,29570000.0,0,-1.469238,0.992006,10.89,11.22,37000000.0,-3.027605,1.917838
3,2011-05-05,10.67,10.9,11.06,35304000.0,0,1.584343,1.677184,10.73,10.97,29570000.0,-1.469238,0.992006
4,2011-05-06,11.1,10.86,11.13,22662600.0,1,-0.366972,1.680123,10.9,11.06,35304000.0,1.584343,1.677184


In [4]:
# MS inputs: the annotated text CSV and the numerical CSV (paths are reused by later cells).
filepath_MS, filepath_num_MS = (
    '../../../dataset_final/Daily_Financial_News/MS/MS_text_annotated.csv',
    '../../../dataset_final/Daily_Financial_News/MS/MS_numerical_LR.csv',
)
# Read both files, text first and numerical second.
data_MS, data_num_MS = map(pd.read_csv, (filepath_MS, filepath_num_MS))
# Only the final expression of a cell is rendered, so just the numerical preview is shown.
data_MS.head()
data_num_MS.head()

Unnamed: 0,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
0,2010-01-29,27.82,26.78,27.85,21166100.0,1,-2.547307,1.887446,27.48,28.25,25480700.0,-1.681574,2.748412
1,2010-02-01,26.95,27.55,28.0,19559900.0,1,2.87528,2.501603,26.78,27.85,21166100.0,-2.547307,1.887446
2,2010-02-02,27.6,28.05,28.2,20992500.0,1,1.814882,2.483122,27.55,28.0,19559900.0,2.87528,2.501603
3,2010-02-03,28.15,27.88,28.43,18498800.0,1,-0.606061,2.304377,28.05,28.2,20992500.0,1.814882,2.483122
4,2010-02-04,27.61,26.62,27.64,29785300.0,0,-4.519369,3.043955,27.88,28.43,18498800.0,-0.606061,2.304377


In [5]:
# NVDA inputs: the annotated text CSV and the numerical CSV (paths are reused by later cells).
filepath_NVDA, filepath_num_NVDA = (
    '../../../dataset_final/Daily_Financial_News/NVDA/NVDA_text_annotated.csv',
    '../../../dataset_final/Daily_Financial_News/NVDA/NVDA_numerical_LR.csv',
)
# Read both files, text first and numerical second.
data_NVDA, data_num_NVDA = map(pd.read_csv, (filepath_NVDA, filepath_num_NVDA))
# Only the final expression of a cell is rendered, so just the numerical preview is shown.
data_NVDA.head()
data_num_NVDA.head()

Unnamed: 0,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1
0,2011-03-14,0.45,0.45,0.46,1003360000.0,0,0.0,3.731099,0.45,0.46,1486832000.0,2.272727,3.731099
1,2011-03-15,0.43,0.44,0.44,1256280000.0,0,-2.222222,3.202386,0.45,0.46,1003360000.0,0.0,3.731099
2,2011-03-16,0.43,0.43,0.45,1476520000.0,0,-2.272727,3.209415,0.44,0.44,1256280000.0,-2.222222,3.202386
3,2011-03-17,0.44,0.44,0.45,1238516000.0,1,2.325581,2.273491,0.43,0.45,1476520000.0,-2.272727,3.209415
4,2011-03-18,0.45,0.44,0.45,886960000.0,1,0.0,1.908577,0.44,0.45,1238516000.0,2.325581,2.273491


In [9]:
def merge_with_prev_day_news(text_path, num_path, lag_days=1):
    """Join each numerical (price/movement) row with news dated `lag_days` earlier.

    Both CSVs must contain a ``Date`` column. News dates are shifted forward by
    ``lag_days`` calendar days and then inner-joined onto the numerical rows, so
    with the default of 1 every returned row pairs a day's numerical values with
    the news dated the calendar day before.

    Parameters
    ----------
    text_path : str or path-like
        CSV of news rows (read with ``pd.read_csv``).
    num_path : str or path-like
        CSV of numerical rows (read with ``pd.read_csv``).
    lag_days : int, default 1
        Number of calendar days to shift the news forward before joining.
        ``0`` joins same-day news.

    Returns
    -------
    pandas.DataFrame
        Numerical columns first, then the news columns, one row per
        (numerical row, matching news row) pair.

    Notes
    -----
    The shift is in calendar days and the join is ``how='inner'``: any news row
    whose shifted date has no row in the numerical file is dropped, and any
    numerical row without matching news is dropped as well.
    """
    news = pd.read_csv(text_path)
    prices = pd.read_csv(num_path)

    # Parse to datetimes and strip any time-of-day component so the join is by
    # calendar day; otherwise a timestamp such as "2020-01-02 15:30" would never
    # equal a midnight-valued date on the other side and the row would be lost.
    news['Date'] = pd.to_datetime(news['Date']).dt.normalize()
    prices['Date'] = pd.to_datetime(prices['Date']).dt.normalize()

    # Move the news forward so it lines up with the later numerical row.
    news['Date'] = news['Date'] + pd.Timedelta(days=lag_days)

    # Left operand is the numerical frame, so its columns (e.g. Movement) come first.
    return pd.merge(prices, news, on='Date', how='inner')


In [11]:
# Build one merged frame per ticker (numerical rows joined with the prior day's news).
merged_MRK, merged_MU, merged_MS, merged_NVDA = (
    merge_with_prev_day_news(text_csv, num_csv)
    for text_csv, num_csv in (
        (filepath_MRK, filepath_num_MRK),
        (filepath_MU, filepath_num_MU),
        (filepath_MS, filepath_num_MS),
        (filepath_NVDA, filepath_num_NVDA),
    )
)

# Stack the tickers, discard a 'stock' column if one exists, then shuffle every row
# with a fixed seed. After this step the rows are no longer in chronological order.
all_data = (
    pd.concat([merged_MRK, merged_MU, merged_MS, merged_NVDA], ignore_index=True)
    .drop(columns=['stock'], errors='ignore')
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

# Model inputs: headline text and the Movement label, as plain Python lists.
texts = all_data['title'].tolist()
labels = all_data['Movement'].tolist()


In [16]:
# Preview the first five rows of the combined frame.
all_data.head(n=5)

Unnamed: 0.1,Date,Open,Close,High,Volume,Movement,Daily_Return,Volatility,Close_lag1,High_lag1,Volume_lag1,Daily_Return_lag1,Volatility_lag1,Unnamed: 0,title,sentiment_score
0,2020-02-20,7.81,7.71,7.9,810052000.0,0,-1.908397,3.994351,7.86,7.88,694332000.0,6.072874,3.224204,917288,NVIDIA shares are trading higher after Bernste...,0.57
1,2012-10-16,44.59,45.0,45.1,12528002.0,1,1.237345,1.30666,44.45,44.49,10774802.0,2.113485,1.255734,841532,"Barclays Maintains Merck & at Overweight, Rais...",0.0
2,2014-11-07,0.51,0.49,0.51,450800000.0,1,-2.0,1.703145,0.5,0.5,394088000.0,0.0,1.806511,919576,"Earning, Economic & IPO Calendar for Thursday ...",0.01
3,2019-08-23,82.27,81.04,82.95,10608694.0,0,-2.054629,1.465715,82.74,83.2,6588776.0,-0.217077,1.125672,839300,Analysts Share Stocks To Buy On Fed Uncertainty,-0.01
4,2011-02-02,29.79,29.65,30.01,9230300.0,0,-0.902406,2.248305,29.92,30.06,11630100.0,1.803334,2.283847,848109,Biogen Beats by a Wide Margin - Analyst Blog,0.33


In [17]:
# List the combined frame's columns to check which fields are present after the merge.
all_data.columns

Index(['Date', 'Open', 'Close', 'High', 'Volume', 'Movement', 'Daily_Return',
       'Volatility', 'Close_lag1', 'High_lag1', 'Volume_lag1',
       'Daily_Return_lag1', 'Volatility_lag1', 'Unnamed: 0', 'title',
       'sentiment_score'],
      dtype='object')

In [6]:
# Disabled legacy same-day merge, kept as a bare string literal: nothing here executes,
# the cell only echoes the text. The opening delimiter has four quotes, so the literal
# begins with a stray apostrophe. The names `data` and `data_num` are not defined at
# top level in the cells shown here. TODO: delete this cell if it is no longer needed.
''''# Convert the Date columns to datetime
data['Date'] = pd.to_datetime(data['Date'])
data_num['Date'] = pd.to_datetime(data_num['Date'])

# Merge data with the Daily_Return column from data_num using an inner join
merged_data = pd.merge(data, data_num, on='Date', how='inner')

# Optionally, view the result
merged_data.head()'''

"'# Convert the Date columns to datetime\ndata['Date'] = pd.to_datetime(data['Date'])\ndata_num['Date'] = pd.to_datetime(data_num['Date'])\n\n# Merge data with the Daily_Return column from data_num using an inner join\nmerged_data = pd.merge(data, data_num, on='Date', how='inner')\n\n# Optionally, view the result\nmerged_data.head()"

In [7]:
# Disabled legacy cleanup, kept as a bare string literal: nothing here executes, the cell
# only echoes the text. `merged_data` is not assigned by any executable code shown here.
# TODO: delete this cell if it is no longer needed.
'''if 'Unnamed: 0' in merged_data.columns:
    merged_data = merged_data.drop(columns=['Unnamed: 0'])
merged_data.head()'''


"if 'Unnamed: 0' in merged_data.columns:\n    merged_data = merged_data.drop(columns=['Unnamed: 0'])\nmerged_data.head()"

In [8]:
# Disabled legacy inspection, kept as a bare string literal: nothing here executes, the
# cell only echoes the text. TODO: delete this cell if it is no longer needed.
'''merged_data.value_counts'''

'merged_data.value_counts'