In [11]:
pip install yfinance

Note: you may need to restart the kernel to use updated packages.


In [12]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install talib

[31mERROR: Could not find a version that satisfies the requirement talib (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for talib[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [14]:
import numpy as np
import pandas as pd
import yfinance as yf
import talib
from sklearn.model_selection import train_test_split, TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report


## NO NEWS SENTIMENT

In [117]:
# 1. Fetch Stock Data
def get_stock_data(ticker, start, end):
    data = yf.download(ticker, start=start, end=end)
    return data

# 2. Feature Engineering


def add_technical_indicators(df):
    close = df['Close'].values.ravel()
    df['SMA_10'] = talib.SMA(close, timeperiod=10)
    df['SMA_50'] = talib.SMA(close, timeperiod=50)
    df['RSI'] = talib.RSI(close, timeperiod=14)
    df['MACD'], df['MACD_signal'], _ = talib.MACD(close)
    df['Volatility'] = df['Close'].rolling(10).std()
    df['Returns'] = df['Close'].pct_change()
    df['Target'] = (df['Returns'] >= 0).astype(int)    
    df.dropna(inplace=True)
    return df

# 3. Data Preparation
def prepare_data(df):
    features = ['SMA_10', 'SMA_50', 'RSI', 'MACD', 'MACD_signal', 'Volatility']
    X = df[features]
    y = df['Target']
    # Using a time-aware train/test split (no shuffling)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

# 4. Hyperparameter Tuning
def hyperparameter_tuning(X_train, y_train):
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Grid for RandomForestClassifier
    rf_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    }
    
    rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), 
                           param_grid=rf_param_grid, 
                           cv=tscv, 
                           scoring='accuracy',
                           n_jobs=-1)
    rf_grid.fit(X_train, y_train)
    print("Best parameters for RandomForest:", rf_grid.best_params_)
    print("Best cross-validated accuracy (RandomForest):", rf_grid.best_score_)
    
    # Grid for XGBClassifier
    xgb_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    }
    
    xgb_grid = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=42), 
                            param_grid=xgb_param_grid, 
                            cv=tscv, 
                            scoring='accuracy',
                            n_jobs=-1)
    xgb_grid.fit(X_train, y_train)
    print("Best parameters for XGBoost:", xgb_grid.best_params_)
    print("Best cross-validated accuracy (XGBoost):", xgb_grid.best_score_)
    
    return rf_grid.best_estimator_, xgb_grid.best_estimator_

# 5. Model Training & Evaluation (using tuned hyperparameters)
def train_and_evaluate(X_train, X_test, y_train, y_test, rf_model, xgb_model):
    models = {
        'RandomForest': rf_model,
        'XGBoost': xgb_model
    }
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f'\n{name} Model Performance:')
        print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
        print(classification_report(y_test, y_pred))

In [118]:
# All in one test, for new date range
data = get_stock_data('AAPL', '2020-03-01', '2023-12-10')
data = add_technical_indicators(data)

X_train, X_test, y_train, y_test = prepare_data(data)

# Hyperparameter tuning on training data
best_rf, best_xgb = hyperparameter_tuning(X_train, y_train)

# Evaluate the tuned models on the test set
train_and_evaluate(X_train, X_test, y_train, y_test, best_rf, best_xgb)

[*********************100%***********************]  1 of 1 completed


Best parameters for RandomForest: {'max_depth': 10, 'min_samples_split': 2, 'n_estimators': 100}
Best cross-validated accuracy (RandomForest): 0.65
Best parameters for XGBoost: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50}
Best cross-validated accuracy (XGBoost): 0.6683333333333333

RandomForest Model Performance:
Accuracy: 0.64
              precision    recall  f1-score   support

           0       0.59      0.59      0.59        80
           1       0.68      0.68      0.68       101

    accuracy                           0.64       181
   macro avg       0.64      0.64      0.64       181
weighted avg       0.64      0.64      0.64       181


XGBoost Model Performance:
Accuracy: 0.69
              precision    recall  f1-score   support

           0       0.65      0.64      0.64        80
           1       0.72      0.72      0.72       101

    accuracy                           0.69       181
   macro avg       0.68      0.68      0.68       181
weighted avg  

# USE News Sentiment

In [None]:
pip install datasets

In [None]:
pip install transformers torch

In [119]:
from datasets import load_dataset
from transformers import pipeline

ModuleNotFoundError: No module named 'datasets'

In [None]:
#function to get finbert sentiment score; and to get sentiment_score.csv with rolling sentiment mean

def get_finbert_score(article):
    finbert_score = finbert(article, truncation=True, max_length=512)
    prediction = finbert_score[0]
    
    # Convert the label to lowercase for consistent comparison
    label = prediction['label'].lower()
    score = prediction['score']
    
    if label == "positive":
        return score
    elif label == "negative":
        return -score
    else:
        return 0

def get_scores(dataset): #Get sentiment_score.csv with rolling mean for score, USE: "sabareesh88/FNSPID_nasdaq"
    news_dataset = load_dataset(dataset)
    news_dataset = news_dataset['train'].to_pandas()
    apple_news = news_dataset[news_dataset['Stock_symbol']=="AAPL"]
    reduced_apple_news = apple_news[["Date","Article_title"]]
    reduced_apple_news['finbert_sentiment'] = reduced_apple_news['Article_title'].apply(get_finbert_score)
    # Convert reduced_apple_news df to entry per date
    reduced_apple_news["Date"] = pd.to_datetime(reduced_apple_news["Date"].astype(str).str.replace(" UTC", ""))

    # Now create a new column with the formatted date (YYYY-MM-DD)
    reduced_apple_news["Date"] = reduced_apple_news["Date"].dt.strftime("%Y-%m-%d")
    
    grouped_score = reduced_apple_news.groupby("Date")["finbert_sentiment"].mean().reset_index()
    grouped_score["Date"] = pd.to_datetime(grouped_score["Date"])

    # Create a DataFrame with a full date range
    full_date_range = pd.date_range(start="2020-03-09", end="2023-12-10")
    full_dates_df = pd.DataFrame({"Date": full_date_range})

    # Perform a left outer join: keep all dates in full_dates_df and join grouped_score on "Date"
    joined_df = pd.merge(full_dates_df, grouped_score, on="Date", how="left")

    # Fill missing values with 0
    joined_df = joined_df.fillna(0)
    joined_df["rolling_mean_score"] = joined_df["finbert_sentiment"].rolling(window=5, min_periods=1).mean()
    joined_df.to_csv('sentiment_score.csv', index=False)


In [16]:
sentiment_score = pd.read_csv("sentiment_score.csv")
sentiment_score.head()

Unnamed: 0,Date,rolling_mean_score
0,2020-03-13,-0.183713
1,2020-03-14,-0.117102
2,2020-03-15,-0.007185
3,2020-03-16,0.046511
4,2020-03-17,0.068373


In [None]:
# 1. Fetch Stock Data
def get_stock_data(ticker, start, end):
    data = yf.download(ticker, start=start, end=end)
    return data

# 2. Feature Engineering

def add_technical_indicators(df, sentiment_score):
    # Ensure the stock data has a Date column
    df = df.reset_index()  # 'Date' becomes a column
    if isinstance(df.columns, pd.MultiIndex):
        df.columns = df.columns.get_level_values(0)
    
    # Convert the 'Date' column to a date (dropping time info)
    df['Date'] = pd.to_datetime(df['Date']).dt.date

    # Compute technical indicators as before:
    close = df['Close'].values.ravel()
    df['SMA_10'] = talib.SMA(close, timeperiod=10)
    df['SMA_50'] = talib.SMA(close, timeperiod=50)
    df['RSI'] = talib.RSI(close, timeperiod=14)
    df['MACD'], df['MACD_signal'], _ = talib.MACD(close)
    df['Volatility'] = df['Close'].rolling(10).std()
    df['Returns'] = df['Close'].pct_change()
    df['Target'] = (df['Returns'] >= 0).astype(int)

    df.reset_index(inplace=True)
    df.columns = df.columns.get_level_values(0)

    # Convert 'Date' column in stock data to datetime

    df['Date'] = pd.to_datetime(df['Date'])
    sentiment_score["Date"] = pd.to_datetime(sentiment_score["Date"])
    #merge
    df = pd.merge(df, sentiment_score, on="Date", how="right")
    
    df["rolling_mean_score"] = df["rolling_mean_score"].fillna(0)


    return df.dropna()

# 3. Data Preparation
def prepare_data(df):
    features = ['SMA_10', 'SMA_50', 'RSI', 'MACD', 'MACD_signal', 'Volatility']
    X = df[features]
    y = df['Target']
    # Using a time-aware train/test split (no shuffling)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train, X_test, y_train, y_test

# 4. Hyperparameter Tuning
def hyperparameter_tuning(X_train, y_train):
    tscv = TimeSeriesSplit(n_splits=5)
    
    # Grid for RandomForestClassifier
    rf_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 5, 10],
        'min_samples_split': [2, 5, 10]
    }
    
    rf_grid = GridSearchCV(RandomForestClassifier(random_state=42), 
                           param_grid=rf_param_grid, 
                           cv=tscv, 
                           scoring='accuracy',
                           n_jobs=-1)
    rf_grid.fit(X_train, y_train)
    print("Best parameters for RandomForest:", rf_grid.best_params_)
    print("Best cross-validated accuracy (RandomForest):", rf_grid.best_score_)
    
    # Grid for XGBClassifier
    xgb_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.2]
    }
    
    xgb_grid = GridSearchCV(XGBClassifier(eval_metric='logloss', random_state=42), 
                            param_grid=xgb_param_grid, 
                            cv=tscv, 
                            scoring='accuracy',
                            n_jobs=-1)
    xgb_grid.fit(X_train, y_train)
    print("Best parameters for XGBoost:", xgb_grid.best_params_)
    print("Best cross-validated accuracy (XGBoost):", xgb_grid.best_score_)
    
    return rf_grid.best_estimator_, xgb_grid.best_estimator_

# 5. Model Training & Evaluation (using tuned hyperparameters)
def train_and_evaluate(X_train, X_test, y_train, y_test, rf_model, xgb_model):
    models = {
        'RandomForest': rf_model,
        'XGBoost': xgb_model
    }
    
    for name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        print(f'\n{name} Model Performance:')
        print(f'Accuracy: {accuracy_score(y_test, y_pred):.2f}')
        print(classification_report(y_test, y_pred))

In [112]:
data = get_stock_data('AAPL', '2020-01-01', '2023-12-10')
data = add_technical_indicators(data, sentiment_score)
data

[*********************100%***********************]  1 of 1 completed


Unnamed: 0,index,Date,Close,High,Low,Open,Volume,SMA_10,SMA_50,RSI,MACD,MACD_signal,Volatility,Returns,Target,rolling_mean_score
0,49.0,2020-03-13,67.457481,67.930708,61.385652,64.283243,370732000.0,68.580121,74.122980,45.429018,-2.624300,-1.985395,3.957141,0.119808,1.0,-0.183713
3,50.0,2020-03-16,58.779289,62.873280,58.242967,58.716190,322423600.0,67.206557,73.844244,37.049831,-3.189920,-2.226300,4.745027,-0.128647,0.0,0.046511
4,51.0,2020-03-17,61.363819,62.516541,57.854679,60.065485,324056000.0,66.321749,73.631339,40.565776,-3.390543,-2.459149,4.943167,0.043970,1.0,0.068373
5,52.0,2020-03-18,59.861633,60.669755,57.544048,58.187150,300233600.0,64.961046,73.376913,39.195500,-3.628921,-2.693103,4.619449,-0.024480,0.0,-0.062579
6,53.0,2020-03-19,59.402969,61.358962,58.876356,60.036362,271857200.0,63.792789,73.120141,38.764930,-3.810917,-2.916666,4.368935,-0.007662,0.0,0.034781
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1361,987.0,2023-12-04,188.299454,188.915764,186.331275,188.846174,43389500.0,189.218945,178.515839,61.089395,3.301959,3.527222,0.772456,-0.009465,0.0,0.128789
1362,988.0,2023-12-05,192.265640,193.239787,189.044972,189.074806,66628400.0,189.414767,178.865175,68.322916,3.402263,3.502230,1.205741,0.021063,1.0,0.082106
1363,989.0,2023-12-06,191.172211,193.597636,190.963457,193.289488,41089700.0,189.581764,179.274442,64.749212,3.354851,3.472754,1.328591,-0.005687,0.0,0.116419
1364,990.0,2023-12-07,193.110580,193.836219,192.434631,192.474401,47477700.0,189.875999,179.752855,67.949691,3.434101,3.465024,1.736191,0.010139,1.0,0.159583


In [115]:
# Fetch stock data
data = get_stock_data('AAPL', '2020-03-15', '2023-12-10')

data.reset_index(inplace=True)
data.columns = data.columns.get_level_values(0)

# Convert 'Date' column in stock data to datetime

data['Date'] = pd.to_datetime(data['Date'])
sentiment_score["Date"] = pd.to_datetime(sentiment_score["Date"])

#merge
data = pd.merge(data, sentiment_score, on="Date", how="left")
data.head()



[*********************100%***********************]  1 of 1 completed


Unnamed: 0,Date,Close,High,Low,Open,Volume,rolling_mean_score
0,2020-03-16,58.779282,62.873271,58.242959,58.716183,322423600,0.046511
1,2020-03-17,61.363815,62.516537,57.854675,60.065481,324056000,0.068373
2,2020-03-18,59.861633,60.669755,57.544048,58.18715,300233600,-0.062579
3,2020-03-19,59.402969,61.358962,58.876356,60.036362,271857200,0.034781
4,2020-03-20,55.631748,61.113867,55.330825,59.985407,401693200,0.134781


In [116]:
#TRY
data = get_stock_data('AAPL', '2020-01-01', '2023-12-10')
data = add_technical_indicators(data, sentiment_score)

X_train, X_test, y_train, y_test = prepare_data(data)

# Hyperparameter tuning on training data
best_rf, best_xgb = hyperparameter_tuning(X_train, y_train)

# Evaluate the tuned models on the test set
train_and_evaluate(X_train, X_test, y_train, y_test, best_rf, best_xgb)


[*********************100%***********************]  1 of 1 completed


Best parameters for RandomForest: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
Best cross-validated accuracy (RandomForest): 0.6384000000000001
Best parameters for XGBoost: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100}
Best cross-validated accuracy (XGBoost): 0.6607999999999999

RandomForest Model Performance:
Accuracy: 0.62
              precision    recall  f1-score   support

         0.0       0.56      0.67      0.61        82
         1.0       0.70      0.59      0.64       107

    accuracy                           0.62       189
   macro avg       0.63      0.63      0.62       189
weighted avg       0.64      0.62      0.63       189


XGBoost Model Performance:
Accuracy: 0.64
              precision    recall  f1-score   support

         0.0       0.58      0.63      0.60        82
         1.0       0.70      0.64      0.67       107

    accuracy                           0.64       189
   macro avg       0.64      0.64      0.64       189
w