# **Data Cleaning and Management using GitHub**

**Connecting to Github Repo and creating data version folders**

In [99]:
# Set your Git identity
# `--global` writes these values to the user-level git config of this runtime,
# so the commits made by the later `!git commit` cells carry this name and e-mail.
!git config --global user.email "bimalsen07@gmail.com"
!git config --global user.name "rajbhanb"


In [100]:
#GitHub repo
repo_url = "https://github.com/rajbhanb/MLOPSFinalProject.git"

!git clone {repo_url}
%cd MLOPSFinalProject


Cloning into 'MLOPSFinalProject'...
remote: Enumerating objects: 19, done.[K
remote: Counting objects: 100% (19/19), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 19 (delta 5), reused 11 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (19/19), 1011.76 KiB | 8.09 MiB/s, done.
Resolving deltas: 100% (5/5), done.
/content/MLOPSFinalProject/MLOPSFinalProject/MLOPSFinalProject/MLOPSFinalProject/MLOPSFinalProject/MLOPSFinalProject


In [119]:
import os

# One directory per dataset version: raw -> cleaned/merged -> model-ready.
folders = [f"data/{version}" for version in ("v0_raw", "v1_clean", "v2_final")]

# exist_ok=True lets the cell be re-run without failing on existing folders.
for f in folders:
    os.makedirs(f, exist_ok=True)

print("Folders created!")


Folders created!


In [91]:
# --- Snapshot of every dataset version ---
# The four frames written here are created by cells further down the notebook.
# On a fresh kernel they do not exist yet, so report that instead of raising
# NameError and aborting a "Run All".
required_frames = ("df", "news_df", "merged", "final_df")
missing_frames = [name for name in required_frames if name not in globals()]

if missing_frames:
    print(f"Datasets not saved - run the cells that create {missing_frames} first.")
else:
    # --- Version 0 (Raw Data) ---
    # `df` is the market frame and `news_df` holds the headlines; the two file
    # names used to be swapped. The market frame is indexed by Date, so the
    # index is written out (index=False would drop the dates).
    df.to_csv("data/v0_raw/raw_market_data.csv", index_label="Date")
    news_df.to_csv("data/v0_raw/raw_news.csv", index=False)

    # --- Version 1 (Cleaned + Merged) ---
    merged.to_csv("data/v1_clean/cleaned_merged_df.csv", index=False)

    # --- Version 2 (Feature Engineered) ---
    final_df.to_csv("data/v2_final/features_ready_df.csv", index=False)

    print("All datasets saved!")

All datasets saved!


**Get stock data from yfinance (Yahoo Finance API)**

SPY: Represents the S&P 500 index, often used as a proxy for the overall market.

VIX: Known as the "fear index," it measures market volatility and is a common indicator of investor sentiment.


In [113]:
import pandas as pd
import yfinance as yf


In [114]:
tickers = ["SPY", "^VIX"]

# Date window and price adjustment for the download.
download_options = dict(start="2008-01-02", end="2024-03-04", auto_adjust=True)
data = yf.download(tickers, **download_options)

# The result has multi-index columns; take the "Close" level once and pull
# one renamed series per ticker out of it.
close_prices = data["Close"]
spy = close_prices["SPY"].rename("SPY_Close")
vix = close_prices["^VIX"].rename("VIX_Close")


[*********************100%***********************]  2 of 2 completed


In [44]:
vix

Unnamed: 0_level_0,VIX_Close
Date,Unnamed: 1_level_1
2008-01-02,23.170000
2008-01-03,22.490000
2008-01-04,23.940001
2008-01-07,23.790001
2008-01-08,25.430000
...,...
2024-02-26,13.740000
2024-02-27,13.430000
2024-02-28,13.840000
2024-02-29,13.400000


In [45]:
spy

Unnamed: 0_level_0,SPY_Close
Date,Unnamed: 1_level_1
2008-01-02,104.084908
2008-01-03,104.034660
2008-01-04,101.485130
2008-01-07,101.398979
2008-01-08,99.761536
...,...
2024-02-26,495.242188
2024-02-27,496.162231
2024-02-28,495.506531
2024-02-29,497.287781


In [115]:
# Put the SPY and VIX close series side by side, aligned on their shared index,
# and make sure that index is a datetime index.
df = pd.concat((spy, vix), axis="columns")
df.index = pd.to_datetime(df.index)

In [116]:
df.dtypes

Unnamed: 0,0
SPY_Close,float64
VIX_Close,float64


In [117]:
df

Unnamed: 0_level_0,SPY_Close,VIX_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2008-01-02,104.084908,23.170000
2008-01-03,104.034660,22.490000
2008-01-04,101.485130,23.940001
2008-01-07,101.398979,23.790001
2008-01-08,99.761536,25.430000
...,...,...
2024-02-26,495.242188,13.740000
2024-02-27,496.162231,13.430000
2024-02-28,495.506531,13.840000
2024-02-29,497.287781,13.400000


**Save raw stock data as version 0**

In [121]:
# --- Version 0 (Raw Data) ---
# `df` is indexed by Date. Writing with index=False dropped that index, leaving
# a file of prices with no dates attached, so the index is written explicitly.
df.to_csv("data/v0_raw/raw_market_data.csv", index=True, index_label="Date")



In [122]:
!git add .
!git commit -m "Add dataset inital raw stock data"

# Push to main branch
token = "Put your Github token here"

!git push https://{token}@github.com/rajbhanb/MLOPSFinalProject.git main

[main 55ee9e9] Add dataset inital raw stock data
 1 file changed, 4070 insertions(+)
 create mode 100644 data/v0_raw/raw_market_data.csv
Enumerating objects: 6, done.
Counting objects: 100% (6/6), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (5/5), 42.33 KiB | 5.29 MiB/s, done.
Total 5 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/rajbhanb/MLOPSFinalProject.git
   bcf3fe0..55ee9e9  main -> main


In [123]:
def _vix_regime(vix_close):
    """Name the regime for one VIX close: above 30 'fear', below 20 'greed', otherwise 'neutral'."""
    if vix_close > 30:
        return 'fear'
    if vix_close < 20:
        return 'greed'
    return 'neutral'


# Integer code used as the model target for each regime name.
regime_codes = {'fear': 0, 'greed': 1, 'neutral': 2}

df['Sentiment'] = df['VIX_Close'].apply(_vix_regime)
df['Sentiment_Label'] = df['VIX_Close'].apply(lambda level: regime_codes[_vix_regime(level)])

In [124]:
 # Non-lagged calculations (based on today's/current row's data)
spy_close = df["SPY_Close"]
vix_close = df["VIX_Close"]

# Returns and moving averages over 1, 7 and 30 rows of df (same-day values).
df["SPY_Daily_Return"] = spy_close.pct_change()
df["SPY_7D_Return"] = spy_close.pct_change(periods=7)
df["SPY_30D_Return"] = spy_close.pct_change(periods=30)
df["VIX_Daily_Change"] = vix_close.pct_change()
df["VIX_7D_MA"] = vix_close.rolling(window=7).mean()
df["VIX_30D_MA"] = vix_close.rolling(window=30).mean()

# Shift every feature down one row so the value on a row comes from the
# previous row (features at t use data from t-1), avoiding leakage.
columns_to_lag = [
    "SPY_Close", "SPY_Daily_Return", "SPY_7D_Return", "SPY_30D_Return",
    "VIX_Daily_Change", "VIX_7D_MA", "VIX_30D_MA", "VIX_Close",
]
for col in columns_to_lag:
    df[f'Lag_1_{col}'] = df[col].shift(1)



In [125]:
 # --- 5. FINAL DATASET ASSEMBLY ---

    # Define the final feature set (X)
# Features (X): every lagged column. Target (Y): the integer sentiment label.
feature_cols = [name for name in df.columns if name.startswith('Lag_1_')]
target_col = 'Sentiment_Label'

# Rows with NaN come from the shift(1) and the rolling/pct_change warm-up
# windows; dropna() returns a new frame, so df itself is left untouched.
final_df = df.loc[:, [*feature_cols, target_col]].dropna()

print("\n--- Dataset Summary ---")
print(f"Final model-ready dataset shape: {final_df.shape}")
print("Feature Columns (X):", feature_cols)
print(f"Target Column (Y): {target_col}")
print("\nSentiment Label Distribution (0=Fear, 1=Greed, 2=Neutral):")
label_counts = final_df['Sentiment_Label'].value_counts().sort_index()
print(label_counts)



--- Dataset Summary ---
Final model-ready dataset shape: (4038, 9)
Feature Columns (X): ['Lag_1_SPY_Close', 'Lag_1_SPY_Daily_Return', 'Lag_1_SPY_7D_Return', 'Lag_1_SPY_30D_Return', 'Lag_1_VIX_Daily_Change', 'Lag_1_VIX_7D_MA', 'Lag_1_VIX_30D_MA', 'Lag_1_VIX_Close']
Target Column (Y): Sentiment_Label

Sentiment Label Distribution (0=Fear, 1=Greed, 2=Neutral):
Sentiment_Label
0     430
1    2511
2    1097
Name: count, dtype: int64


In [52]:
df.shape

(4069, 18)

In [126]:
df

Unnamed: 0_level_0,SPY_Close,VIX_Close,Sentiment,Sentiment_Label,SPY_Daily_Return,SPY_7D_Return,SPY_30D_Return,VIX_Daily_Change,VIX_7D_MA,VIX_30D_MA,Lag_1_SPY_Close,Lag_1_SPY_Daily_Return,Lag_1_SPY_7D_Return,Lag_1_SPY_30D_Return,Lag_1_VIX_Daily_Change,Lag_1_VIX_7D_MA,Lag_1_VIX_30D_MA,Lag_1_VIX_Close
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2008-01-02,104.084908,23.170000,neutral,2,,,,,,,,,,,,,,
2008-01-03,104.034660,22.490000,neutral,2,-0.000483,,,-0.029348,,,104.084908,,,,,,,23.170000
2008-01-04,101.485130,23.940001,neutral,2,-0.024507,,,0.064473,,,104.034660,-0.000483,,,-0.029348,,,22.490000
2008-01-07,101.398979,23.790001,neutral,2,-0.000849,,,-0.006266,,,101.485130,-0.024507,,,0.064473,,,23.940001
2008-01-08,99.761536,25.430000,neutral,2,-0.016149,,,0.068936,,,101.398979,-0.000849,,,-0.006266,,,23.790001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-02-26,495.242188,13.740000,greed,1,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,497.062714,0.000690,0.027870,0.065658,-0.054333,14.525714,13.750667,13.750000
2024-02-27,496.162231,13.430000,greed,1,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,495.242188,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,13.740000
2024-02-28,495.506531,13.840000,greed,1,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,496.162231,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,13.430000
2024-02-29,497.287781,13.400000,greed,1,0.003595,0.022787,0.075780,-0.031792,14.005714,13.772000,495.506531,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,13.840000


**S&P 500 with Financial News Headlines (2008–2024)**

In [127]:
import kagglehub

# Fetch the latest version of the headlines dataset and keep the local
# directory that kagglehub reports for it.
dataset_handle = "dyutidasmahaptra/s-and-p-500-with-financial-news-headlines-20082024"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Using Colab cache for faster access to the 's-and-p-500-with-financial-news-headlines-20082024' dataset.
Path to dataset files: /kaggle/input/s-and-p-500-with-financial-news-headlines-20082024


In [128]:
import pandas as pd
import os

# Path returned by kagglehub
# Use the directory reported by dataset_download() in the previous cell rather
# than a hard-coded cache location: the download printed a /kaggle/input/...
# path, so a fixed /root/.cache/... path is not guaranteed to exist.
dataset_path = path

# List the files in the dataset directory
os.listdir(dataset_path)


['sp500_headlines_2008_2024.csv']

In [129]:
# Load the single headlines CSV found in the dataset directory.
headlines_csv = os.path.join(dataset_path, "sp500_headlines_2008_2024.csv")
news_df = pd.read_csv(headlines_csv)
news_df


Unnamed: 0,Title,Date,CP
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02,1447.16
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02,1447.16
2,2008 predictions for the S&P 500,2008-01-02,1447.16
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03,1447.16
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07,1416.18
...,...,...,...
19122,REITs vs. Stocks: What Does the Data Say?,2024-03-04,5130.95
19123,"Nasdaq Index, Dow Jones, S&P 500 News: Futures...",2024-03-04,5130.95
19124,"Nasdaq 100, Dow Jones, S&P 500 News: Cautious ...",2024-03-04,5130.95
19125,"Bank of America boosts S&P 500 target to 5,400...",2024-03-04,5130.95


In [130]:
# Remove the CP column; only Title and Date are used from here on.
news_df = news_df.drop('CP', axis='columns')
news_df

Unnamed: 0,Title,Date
0,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",2008-01-02
1,Dow Tallies Biggest First-session-of-year Poin...,2008-01-02
2,2008 predictions for the S&P 500,2008-01-02
3,"U.S. Stocks Higher After Economic Data, Monsan...",2008-01-03
4,U.S. Stocks Climb As Hopes Increase For More F...,2008-01-07
...,...,...
19122,REITs vs. Stocks: What Does the Data Say?,2024-03-04
19123,"Nasdaq Index, Dow Jones, S&P 500 News: Futures...",2024-03-04
19124,"Nasdaq 100, Dow Jones, S&P 500 News: Cautious ...",2024-03-04
19125,"Bank of America boosts S&P 500 target to 5,400...",2024-03-04


In [131]:
# Ensure titles are strings (missing values become the literal 'nan' / 'None')
titles = news_df['Title'].astype(str)

# Mark empty, whitespace-only and stringified-missing titles as missing so the
# 'first' aggregation below skips them. Stripping before the comparison also
# catches titles such as '   ' or '\t', which an exact match on '' and ' '
# let through. The stored title text itself is not modified.
is_blank = titles.str.strip().isin(['', 'nan', 'None'])
news_df['Title'] = titles.mask(is_blank)

# Group by date and keep the first non-null title.
# kind='stable' preserves the file order of headlines that share a date; the
# default sort algorithm gives no such guarantee, which would make "first"
# depend on the sort implementation.
news_df = (
    news_df
    .sort_values('Date', kind='stable')
    .groupby('Date', as_index=False)
    .agg({'Title': 'first'})             # first non-null per group
)
news_df

Unnamed: 0,Date,Title
0,2008-01-02,"JPMorgan Predicts 2008 Will Be ""Nothing But Net"""
1,2008-01-03,"U.S. Stocks Higher After Economic Data, Monsan..."
2,2008-01-07,U.S. Stocks Climb As Hopes Increase For More F...
3,2008-01-09,How Investing in Intangibles -- Like Employee ...
4,2008-01-10,U.S. Stocks Zigzag Higher As Bernanke Speech S...
...,...,...
3502,2024-02-27,Ahead of Market: 10 things that will decide D-...
3503,2024-02-28,"Nasdaq Index, Dow Jones, S&P 500 News: US Stoc..."
3504,2024-02-29,S&P 500 History & Trends: What Does it Say Abo...
3505,2024-03-01,US stock market: Equities gain on in-line US i...


In [132]:
# Parse the Date column to datetimes, then show the resulting dtypes.
news_df = news_df.assign(Date=pd.to_datetime(news_df['Date']))
news_df.dtypes

Unnamed: 0,0
Date,datetime64[ns]
Title,object


**Add news data in Version 0 Data Folder**

In [136]:
# Write the per-date headlines to the version-0 folder without the positional
# index, then stage everything, commit and push to main.
# `token` is the variable assigned in the earlier push cell.
news_df.to_csv("data/v0_raw/raw_news.csv", index=False)
!git add .
!git commit -m "Add dataset inital raw news data"
!git push https://{token}@github.com/rajbhanb/MLOPSFinalProject.git main


[main dad50bf] Add dataset inital raw news data
 1 file changed, 3508 insertions(+)
 create mode 100644 data/v0_raw/raw_news.csv
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 94.53 KiB | 3.38 MiB/s, done.
Total 5 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/rajbhanb/MLOPSFinalProject.git
   55ee9e9..dad50bf  main -> main


In [137]:
# Inner join on Date: keep only the dates that appear in both the headlines
# and the market frame.
merged = pd.merge(news_df, df, how="inner", on="Date")
merged

Unnamed: 0,Date,Title,SPY_Close,VIX_Close,Sentiment,Sentiment_Label,SPY_Daily_Return,SPY_7D_Return,SPY_30D_Return,VIX_Daily_Change,VIX_7D_MA,VIX_30D_MA,Lag_1_SPY_Close,Lag_1_SPY_Daily_Return,Lag_1_SPY_7D_Return,Lag_1_SPY_30D_Return,Lag_1_VIX_Daily_Change,Lag_1_VIX_7D_MA,Lag_1_VIX_30D_MA,Lag_1_VIX_Close
0,2008-01-02,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",104.084908,23.170000,neutral,2,,,,,,,,,,,,,,
1,2008-01-03,"U.S. Stocks Higher After Economic Data, Monsan...",104.034660,22.490000,neutral,2,-0.000483,,,-0.029348,,,104.084908,,,,,,,23.170000
2,2008-01-07,U.S. Stocks Climb As Hopes Increase For More F...,101.398979,23.790001,neutral,2,-0.000849,,,-0.006266,,,101.485130,-0.024507,,,0.064473,,,23.940001
3,2008-01-09,How Investing in Intangibles -- Like Employee ...,100.810043,24.120001,neutral,2,0.010510,,,-0.051514,,,99.761536,-0.016149,,,0.068936,,,25.430000
4,2008-01-10,U.S. Stocks Zigzag Higher As Bernanke Speech S...,101.470741,23.450001,neutral,2,0.006554,,,-0.027778,23.770000,,100.810043,0.010510,,,-0.051514,,,24.120001
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3501,2024-02-26,Berkshire Hathaway Stock Rises To Record High—...,495.242188,13.740000,greed,1,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,497.062714,0.000690,0.027870,0.065658,-0.054333,14.525714,13.750667,13.750000
3502,2024-02-27,Ahead of Market: 10 things that will decide D-...,496.162231,13.430000,greed,1,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,495.242188,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,13.740000
3503,2024-02-28,"Nasdaq Index, Dow Jones, S&P 500 News: US Stoc...",495.506531,13.840000,greed,1,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,496.162231,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,13.430000
3504,2024-02-29,S&P 500 History & Trends: What Does it Say Abo...,497.287781,13.400000,greed,1,0.003595,0.022787,0.075780,-0.031792,14.005714,13.772000,495.506531,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,13.840000


In [138]:
# Cast titles to str but leave missing values missing. A plain astype(str)
# turns NaN/NA into the literal text 'nan'/'<NA>', which the sentiment
# functions would then score as if it were a real headline instead of
# skipping it through their pd.isna() guard.
merged['Title'] = merged['Title'].map(str, na_action='ignore')


In [63]:
merged.dtypes

Unnamed: 0,0
Date,datetime64[ns]
Title,object
SPY_Close,float64
VIX_Close,float64
Sentiment,object
Sentiment_Label,int64
SPY_Daily_Return,float64
SPY_7D_Return,float64
SPY_30D_Return,float64
VIX_Daily_Change,float64


**Add merged feature engineered stock and news data to Version one Data Folder**

In [139]:
# --- Version 1 (Cleaned + Merged) ---
# Write the merged headlines + market frame (Date is a regular column here, so
# the positional index is omitted), then stage everything, commit and push.
# `token` is the variable assigned in the earlier push cell.
merged.to_csv("data/v1_clean/cleaned_merged_df.csv", index=False)
!git add .
!git commit -m "Add merged feature engineered stock and news data"
!git push https://{token}@github.com/rajbhanb/MLOPSFinalProject.git main

[main 057b67b] Add merged feature engineered stock and news data
 1 file changed, 3507 insertions(+)
 create mode 100644 data/v1_clean/cleaned_merged_df.csv
Enumerating objects: 7, done.
Counting objects: 100% (7/7), done.
Delta compression using up to 2 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 418.64 KiB | 3.99 MiB/s, done.
Total 5 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/rajbhanb/MLOPSFinalProject.git
   dad50bf..057b67b  main -> main


In [141]:
# Install the VADER sentiment package at runtime; no version is pinned, so the
# installed release can differ between runs.
!pip install vaderSentiment



In [142]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER analyzer
analyzer = SentimentIntensityAnalyzer()

In [143]:
def vader_compound_score(text):
    """Return the VADER 'compound' score for ``text``, or None when ``text`` is missing.

    The text is converted with ``str()`` before it is passed to the
    module-level ``analyzer``.
    """
    is_missing = pd.isna(text) or text is None
    if not is_missing:
        # polarity_scores() returns a dict; only its 'compound' entry is used.
        return analyzer.polarity_scores(str(text))['compound']
    return None

In [144]:
# Score each headline individually with VADER.
merged['VADER_Compound_Score'] = merged['Title'].map(vader_compound_score)


In [145]:
merged


Unnamed: 0,Date,Title,SPY_Close,VIX_Close,Sentiment,Sentiment_Label,SPY_Daily_Return,SPY_7D_Return,SPY_30D_Return,VIX_Daily_Change,...,VIX_30D_MA,Lag_1_SPY_Close,Lag_1_SPY_Daily_Return,Lag_1_SPY_7D_Return,Lag_1_SPY_30D_Return,Lag_1_VIX_Daily_Change,Lag_1_VIX_7D_MA,Lag_1_VIX_30D_MA,Lag_1_VIX_Close,VADER_Compound_Score
0,2008-01-02,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",104.084908,23.170000,neutral,2,,,,,...,,,,,,,,,,0.0000
1,2008-01-03,"U.S. Stocks Higher After Economic Data, Monsan...",104.034660,22.490000,neutral,2,-0.000483,,,-0.029348,...,,104.084908,,,,,,,23.170000,0.0000
2,2008-01-07,U.S. Stocks Climb As Hopes Increase For More F...,101.398979,23.790001,neutral,2,-0.000849,,,-0.006266,...,,101.485130,-0.024507,,,0.064473,,,23.940001,0.6249
3,2008-01-09,How Investing in Intangibles -- Like Employee ...,100.810043,24.120001,neutral,2,0.010510,,,-0.051514,...,,99.761536,-0.016149,,,0.068936,,,25.430000,0.6597
4,2008-01-10,U.S. Stocks Zigzag Higher As Bernanke Speech S...,101.470741,23.450001,neutral,2,0.006554,,,-0.027778,...,,100.810043,0.010510,,,-0.051514,,,24.120001,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3501,2024-02-26,Berkshire Hathaway Stock Rises To Record High—...,495.242188,13.740000,greed,1,-0.003663,0.014882,0.062223,-0.000727,...,13.794000,497.062714,0.000690,0.027870,0.065658,-0.054333,14.525714,13.750667,13.750000,0.0000
3502,2024-02-27,Ahead of Market: 10 things that will decide D-...,496.162231,13.430000,greed,1,0.001858,0.009800,0.063460,-0.022562,...,13.818333,495.242188,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,13.740000,0.0000
3503,2024-02-28,"Nasdaq Index, Dow Jones, S&P 500 News: US Stoc...",495.506531,13.840000,greed,1,-0.001322,0.013513,0.065968,0.030529,...,13.818333,496.162231,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,13.430000,0.1027
3504,2024-02-29,S&P 500 History & Trends: What Does it Say Abo...,497.287781,13.400000,greed,1,0.003595,0.022787,0.075780,-0.031792,...,13.772000,495.506531,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,13.840000,0.0000


In [146]:
import pandas as pd
from textblob import TextBlob
import numpy as np

# --- 1. Define TextBlob Scoring Function ---

def get_textblob_scores(text):
    """
    Return ``(polarity, subjectivity)`` for ``text`` as computed by TextBlob.

    Polarity: -1.0 (Negative) to +1.0 (Positive)
    Subjectivity: 0.0 (Objective/Factual) to 1.0 (Subjective/Opinion)

    A missing ``text`` (None or NaN) yields ``(nan, nan)``.
    """
    is_missing = pd.isna(text) or text is None
    if is_missing:
        return np.nan, np.nan

    # TextBlob is given a str, whatever type the caller passed in.
    sentiment = TextBlob(str(text)).sentiment
    return sentiment.polarity, sentiment.subjectivity


# --- 2. Apply the Function to the 'Title' Column ---

# Score every title once, then expand the (polarity, subjectivity) tuples into
# two columns with a single DataFrame construction. Wrapping each result in a
# pd.Series inside .apply() yields the same values but allocates one Series
# per row, which is far slower.
new_columns = ['TextBlob_Polarity', 'TextBlob_Subjectivity']

textblob_scores = merged['Title'].map(get_textblob_scores)
merged[new_columns] = pd.DataFrame(
    textblob_scores.tolist(), index=merged.index, columns=new_columns
)

merged

Unnamed: 0,Date,Title,SPY_Close,VIX_Close,Sentiment,Sentiment_Label,SPY_Daily_Return,SPY_7D_Return,SPY_30D_Return,VIX_Daily_Change,...,Lag_1_SPY_Daily_Return,Lag_1_SPY_7D_Return,Lag_1_SPY_30D_Return,Lag_1_VIX_Daily_Change,Lag_1_VIX_7D_MA,Lag_1_VIX_30D_MA,Lag_1_VIX_Close,VADER_Compound_Score,TextBlob_Polarity,TextBlob_Subjectivity
0,2008-01-02,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",104.084908,23.170000,neutral,2,,,,,...,,,,,,,,0.0000,0.000,0.000
1,2008-01-03,"U.S. Stocks Higher After Economic Data, Monsan...",104.034660,22.490000,neutral,2,-0.000483,,,-0.029348,...,,,,,,,23.170000,0.0000,0.225,0.350
2,2008-01-07,U.S. Stocks Climb As Hopes Increase For More F...,101.398979,23.790001,neutral,2,-0.000849,,,-0.006266,...,-0.024507,,,0.064473,,,23.940001,0.6249,0.500,0.500
3,2008-01-09,How Investing in Intangibles -- Like Employee ...,100.810043,24.120001,neutral,2,0.010510,,,-0.051514,...,-0.016149,,,0.068936,,,25.430000,0.6597,0.000,0.000
4,2008-01-10,U.S. Stocks Zigzag Higher As Bernanke Speech S...,101.470741,23.450001,neutral,2,0.006554,,,-0.027778,...,0.010510,,,-0.051514,,,24.120001,0.0000,0.075,0.250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3501,2024-02-26,Berkshire Hathaway Stock Rises To Record High—...,495.242188,13.740000,greed,1,-0.003663,0.014882,0.062223,-0.000727,...,0.000690,0.027870,0.065658,-0.054333,14.525714,13.750667,13.750000,0.0000,0.000,0.000
3502,2024-02-27,Ahead of Market: 10 things that will decide D-...,496.162231,13.430000,greed,1,0.001858,0.009800,0.063460,-0.022562,...,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,13.740000,0.0000,0.100,0.100
3503,2024-02-28,"Nasdaq Index, Dow Jones, S&P 500 News: US Stoc...",495.506531,13.840000,greed,1,-0.001322,0.013513,0.065968,0.030529,...,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,13.430000,0.1027,0.000,0.000
3504,2024-02-29,S&P 500 History & Trends: What Does it Say Abo...,497.287781,13.400000,greed,1,0.003595,0.022787,0.075780,-0.031792,...,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,13.840000,0.0000,0.000,0.125


In [147]:
# Shift each sentiment score down by one row of merged, so a row carries the
# scores of the row before it.
for score_col in ('VADER_Compound_Score', 'TextBlob_Polarity', 'TextBlob_Subjectivity'):
    merged[f'Lag_1_{score_col}'] = merged[score_col].shift(1)
merged

Unnamed: 0,Date,Title,SPY_Close,VIX_Close,Sentiment,Sentiment_Label,SPY_Daily_Return,SPY_7D_Return,SPY_30D_Return,VIX_Daily_Change,...,Lag_1_VIX_Daily_Change,Lag_1_VIX_7D_MA,Lag_1_VIX_30D_MA,Lag_1_VIX_Close,VADER_Compound_Score,TextBlob_Polarity,TextBlob_Subjectivity,Lag_1_VADER_Compound_Score,Lag_1_TextBlob_Polarity,Lag_1_TextBlob_Subjectivity
0,2008-01-02,"JPMorgan Predicts 2008 Will Be ""Nothing But Net""",104.084908,23.170000,neutral,2,,,,,...,,,,,0.0000,0.000,0.000,,,
1,2008-01-03,"U.S. Stocks Higher After Economic Data, Monsan...",104.034660,22.490000,neutral,2,-0.000483,,,-0.029348,...,,,,23.170000,0.0000,0.225,0.350,0.0000,0.000000,0.000000
2,2008-01-07,U.S. Stocks Climb As Hopes Increase For More F...,101.398979,23.790001,neutral,2,-0.000849,,,-0.006266,...,0.064473,,,23.940001,0.6249,0.500,0.500,0.0000,0.225000,0.350000
3,2008-01-09,How Investing in Intangibles -- Like Employee ...,100.810043,24.120001,neutral,2,0.010510,,,-0.051514,...,0.068936,,,25.430000,0.6597,0.000,0.000,0.6249,0.500000,0.500000
4,2008-01-10,U.S. Stocks Zigzag Higher As Bernanke Speech S...,101.470741,23.450001,neutral,2,0.006554,,,-0.027778,...,-0.051514,,,24.120001,0.0000,0.075,0.250,0.6597,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3501,2024-02-26,Berkshire Hathaway Stock Rises To Record High—...,495.242188,13.740000,greed,1,-0.003663,0.014882,0.062223,-0.000727,...,-0.054333,14.525714,13.750667,13.750000,0.0000,0.000,0.000,0.5106,0.433333,0.733333
3502,2024-02-27,Ahead of Market: 10 things that will decide D-...,496.162231,13.430000,greed,1,0.001858,0.009800,0.063460,-0.022562,...,-0.000727,14.434286,13.794000,13.740000,0.0000,0.100,0.100,0.0000,0.000000,0.000000
3503,2024-02-28,"Nasdaq Index, Dow Jones, S&P 500 News: US Stoc...",495.506531,13.840000,greed,1,-0.001322,0.013513,0.065968,0.030529,...,-0.022562,14.351429,13.818333,13.430000,0.1027,0.000,0.000,0.0000,0.100000,0.100000
3504,2024-02-29,S&P 500 History & Trends: What Does it Say Abo...,497.287781,13.400000,greed,1,0.003595,0.022787,0.075780,-0.031792,...,0.030529,14.294286,13.818333,13.840000,0.0000,0.000,0.125,0.1027,0.000000,0.000000


In [148]:
# Every lagged column (market and sentiment) is a model feature.
is_lagged = merged.columns.str.startswith('Lag_1_')
feature_cols = merged.columns[is_lagged].tolist()
feature_cols


['Lag_1_SPY_Close',
 'Lag_1_SPY_Daily_Return',
 'Lag_1_SPY_7D_Return',
 'Lag_1_SPY_30D_Return',
 'Lag_1_VIX_Daily_Change',
 'Lag_1_VIX_7D_MA',
 'Lag_1_VIX_30D_MA',
 'Lag_1_VIX_Close',
 'Lag_1_VADER_Compound_Score',
 'Lag_1_TextBlob_Polarity',
 'Lag_1_TextBlob_Subjectivity']

In [149]:
# Here target_col is bound to the 'Sentiment_Label' column of merged (a Series).
# The earlier dataset-assembly cell used the same name for the column-name
# string 'Sentiment_Label'.
target_col= merged['Sentiment_Label']
target_col

Unnamed: 0,Sentiment_Label
0,2
1,2
2,2
3,2
4,2
...,...
3501,1
3502,1
3503,1
3504,1


In [150]:
# New frame holding the lagged features plus the target as the last column.
final_df = merged[feature_cols].assign(Sentiment_Label=target_col)

In [74]:
final_df

Unnamed: 0,Lag_1_SPY_Close,Lag_1_SPY_Daily_Return,Lag_1_SPY_7D_Return,Lag_1_SPY_30D_Return,Lag_1_VIX_Daily_Change,Lag_1_VIX_7D_MA,Lag_1_VIX_30D_MA,Lag_1_VIX_Close,Lag_1_VADER_Compound_Score,Lag_1_TextBlob_Polarity,Lag_1_TextBlob_Subjectivity,Sentiment_Label
0,,,,,,,,,,,,2
1,104.084908,,,,,,,23.170000,0.0000,0.000000,0.000000,2
2,101.485130,-0.024507,,,0.064473,,,23.940001,0.0000,0.225000,0.350000,2
3,99.761536,-0.016149,,,0.068936,,,25.430000,0.6249,0.500000,0.500000,2
4,100.810043,0.010510,,,-0.051514,,,24.120001,0.6597,0.000000,0.000000,2
...,...,...,...,...,...,...,...,...,...,...,...,...
3501,497.062714,0.000690,0.027870,0.065658,-0.054333,14.525714,13.750667,13.750000,0.5106,0.433333,0.733333,1
3502,495.242188,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,13.740000,0.0000,0.000000,0.000000,1
3503,496.162231,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,13.430000,0.0000,0.100000,0.100000,1
3504,495.506531,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,13.840000,0.1027,0.000000,0.000000,1


In [151]:
# Keep only the rows where every feature and the target are present.
final_df = final_df.dropna()
final_df

Unnamed: 0,Lag_1_SPY_Close,Lag_1_SPY_Daily_Return,Lag_1_SPY_7D_Return,Lag_1_SPY_30D_Return,Lag_1_VIX_Daily_Change,Lag_1_VIX_7D_MA,Lag_1_VIX_30D_MA,Lag_1_VIX_Close,Lag_1_VADER_Compound_Score,Lag_1_TextBlob_Polarity,Lag_1_TextBlob_Subjectivity,Sentiment_Label
11,97.614151,0.002952,0.021417,-0.037326,-0.046503,25.622857,26.366667,24.400000,-0.5106,0.000000,0.000000,2
12,96.802612,-0.008314,0.007775,-0.029660,0.029508,25.268572,26.356333,25.120001,0.0000,0.380000,0.770000,2
13,99.265999,-0.001012,0.022791,-0.021659,0.036073,23.827143,26.274000,22.690001,0.0000,0.200000,0.200000,2
14,98.296463,-0.009767,0.009962,-0.009408,0.037021,23.532857,26.280333,23.530001,0.5267,0.000000,0.000000,2
15,95.509941,-0.003820,-0.019392,0.007042,-0.028919,24.212858,26.224334,25.520000,-0.4019,-0.062500,0.625000,2
...,...,...,...,...,...,...,...,...,...,...,...,...
3501,497.062714,0.000690,0.027870,0.065658,-0.054333,14.525714,13.750667,13.750000,0.5106,0.433333,0.733333,1
3502,495.242188,-0.003663,0.014882,0.062223,-0.000727,14.434286,13.794000,13.740000,0.0000,0.000000,0.000000,1
3503,496.162231,0.001858,0.009800,0.063460,-0.022562,14.351429,13.818333,13.430000,0.0000,0.100000,0.100000,1
3504,495.506531,-0.001322,0.013513,0.065968,0.030529,14.294286,13.818333,13.840000,0.1027,0.000000,0.000000,1


**Put the final dataset to be used for modeling in the Version 2 Final Data Folder**

In [153]:
# --- Version 2 (Feature Engineered with News Sentiment Scores) ---
# Write the model-ready frame (lagged features + Sentiment_Label) without the
# positional index, then stage everything, commit and push to main.
# `token` is the variable assigned in the earlier push cell.
final_df.to_csv("data/v2_final/final_data.csv", index=False)
!git add .
!git commit -m "Add final_data"
!git push https://{token}@github.com/rajbhanb/MLOPSFinalProject.git main

[main 7ae9e46] Add final_data
 1 file changed, 3496 insertions(+)
 create mode 100644 data/v2_final/final_data.csv
Enumerating objects: 6, done.
Counting objects: 100% (6/6), done.
Delta compression using up to 2 threads
Compressing objects: 100% (3/3), done.
Writing objects: 100% (4/4), 402 bytes | 402.00 KiB/s, done.
Total 4 (delta 1), reused 0 (delta 0), pack-reused 0
remote: Resolving deltas: 100% (1/1), completed with 1 local object.[K
To https://github.com/rajbhanb/MLOPSFinalProject.git
   057b67b..7ae9e46  main -> main
