In [5]:
#######################################################
# Compare_Models_Apple.ipynb
#######################################################

#############################
# 1) Imports
#############################

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")


In [3]:
!pip install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.4


In [6]:
!pip uninstall xgboost -y
!pip install xgboost --no-cache-dir

Found existing installation: xgboost 2.1.4
Uninstalling xgboost-2.1.4:
  Successfully uninstalled xgboost-2.1.4
Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.4-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-2.1.4


In [1]:

# ML models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

# For model evaluation
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt



In [2]:

#############################
# 2) Helper functions for technical indicators
#############################

def compute_rsi(series, period=14):
    """
    Simple RSI calculation with rolling means of gains/losses.
    """
    delta = series.diff()
    gains = delta.where(delta > 0, 0.0)
    losses = -delta.where(delta < 0, 0.0)
    avg_gain = gains.rolling(window=period).mean()
    avg_loss = losses.rolling(window=period).mean()
    rs = avg_gain / (avg_loss + 1e-9)
    rsi = 100.0 - (100.0 / (1.0 + rs))
    return rsi

def compute_macd(series, fastperiod=12, slowperiod=26, signalperiod=9):
    """
    MACD using exponential moving averages (commonly used approach).
    """
    ema_fast = series.ewm(span=fastperiod, adjust=False).mean()
    ema_slow = series.ewm(span=slowperiod, adjust=False).mean()
    macd_line = ema_fast - ema_slow
    signal_line = macd_line.ewm(span=signalperiod, adjust=False).mean()
    macd_hist = macd_line - signal_line
    return macd_line, signal_line, macd_hist

def compute_ma(series, window=20):
    """
    Simple moving average (window=20 by default).
    """
    return series.rolling(window=window).mean()



In [None]:
#############################
# 3) Load Apple Price & News Data
#############################

#--- A) Load and clean Apple price data
#   Adjust skiprows/header logic depending on your actual CSV format:
df_price_raw = pd.read_csv("AAPL.csv", skiprows=2, header=None)

# We'll rename columns carefully based on your sample snippet:
df_price_raw.columns = ["Date","Close","High","Low","Open","extra1","extra2","Volume","extra3","extra4"]
df_price_raw = df_price_raw[["Date","Open","High","Low","Close","Volume"]]

df_price_raw["Date"] = pd.to_datetime(df_price_raw["Date"], errors="coerce")
df_price_raw.dropna(subset=["Date"], inplace=True)
df_price_raw.sort_values("Date", inplace=True)
df_price_raw.reset_index(drop=True, inplace=True)

#--- B) Load Apple news data
df_news = pd.read_csv("apple_news_data.csv")
df_news["Date"] = pd.to_datetime(df_news["Date"], errors="coerce")
df_news.dropna(subset=["Date"], inplace=True)
df_news.sort_values("Date", inplace=True)
df_news.reset_index(drop=True, inplace=True)



In [6]:
#############################
# 4) Merge on date
#############################
# df_merged = pd.merge(df_price_raw, df_news, on="Date", how="inner")

df_merged = pd.read_csv("merged_news_stock_data.csv")
df_merged.sort_values("Date", inplace=True)
df_merged.dropna(subset=["Date"], inplace=True)
df_merged.reset_index(drop=True, inplace=True)


In [7]:
#############################
# 5) Compute Technical Indicators
#############################
df_merged["RSI"] = compute_rsi(df_merged["Close_Price"], period=14)
macd_line, macd_signal, macd_hist = compute_macd(df_merged["Close_Price"])
df_merged["MACD_line"] = macd_line
df_merged["MACD_signal"] = macd_signal
df_merged["MACD_hist"] = macd_hist
df_merged["MA20"] = compute_ma(df_merged["Close_Price"], window=20)

# Drop any rows with NaNs introduced by rolling calculations
df_merged.dropna(inplace=True)
df_merged.reset_index(drop=True, inplace=True)



In [8]:
#############################
# 6) Create next-day target (Regression)
#############################
# We'll try to predict the *next day's Close* price:
df_merged["Close_next"] = df_merged["Close_Price"].shift(-1)
df_merged.dropna(subset=["Close_next"], inplace=True)



In [9]:
#############################
# 7) Choose your features
#############################
# From the PDF or your approach, we want to use RSI, MACD lines, MA20, plus sentiment columns.
# Suppose the CSV has columns: sentiment_polarity, sentiment_neg, sentiment_neu, sentiment_pos
feature_cols = [
    "RSI", 
    "MACD_line","MACD_signal","MACD_hist",
    "MA20",
    "sentiment_polarity", 
    "sentiment_neg",
    "sentiment_neu",
    "sentiment_pos"
]

X = df_merged[feature_cols].copy()
y = df_merged["Close_next"].copy()



In [10]:
#############################
# 8) Partition by Year: Train, Validation, Test
#############################

# Let's assume we want train=2016-2019, valid=2020, test=2021+ 
# (Adjust these year boundaries as you like, depending on your dataset range)

df_merged["Year"] = df_merged["Date"].dt.year
# We'll keep the same indexing approach for X,y

train_mask = (df_merged["Year"] >= 2016) & (df_merged["Year"] <= 2019)
val_mask   = (df_merged["Year"] == 2020)
test_mask  = (df_merged["Year"] >= 2021)

X_train = X[train_mask].values
y_train = y[train_mask].values

X_val = X[val_mask].values
y_val = y[val_mask].values

X_test = X[test_mask].values
y_test = y[test_mask].values

print("Train shape:", X_train.shape, y_train.shape)
print("Val shape:  ", X_val.shape,   y_val.shape)
print("Test shape: ", X_test.shape,  y_test.shape)



AttributeError: Can only use .dt accessor with datetimelike values

In [None]:
#############################
# 9) Fit Multiple Models
#############################

# (A) Linear Regression
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)

# Evaluate on val
y_val_pred_lr = lr_model.predict(X_val)
rmse_val_lr = sqrt(mean_squared_error(y_val, y_val_pred_lr))
r2_val_lr = r2_score(y_val, y_val_pred_lr)

print("Linear Regression val RMSE:", rmse_val_lr, "R^2:", r2_val_lr)

# (B) Random Forest
rf_model = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=42)
rf_model.fit(X_train, y_train)

y_val_pred_rf = rf_model.predict(X_val)
rmse_val_rf = sqrt(mean_squared_error(y_val, y_val_pred_rf))
r2_val_rf = r2_score(y_val, y_val_pred_rf)

print("RandomForest val RMSE:", rmse_val_rf, "R^2:", r2_val_rf)

# (C) XGBoost
xgb_model = XGBRegressor(n_estimators=100, max_depth=5, random_state=42)
xgb_model.fit(X_train, y_train,
              eval_set=[(X_val, y_val)],
              early_stopping_rounds=10,
              verbose=False)

y_val_pred_xgb = xgb_model.predict(X_val)
rmse_val_xgb = sqrt(mean_squared_error(y_val, y_val_pred_xgb))
r2_val_xgb = r2_score(y_val, y_val_pred_xgb)

print("XGBoost val RMSE:", rmse_val_xgb, "R^2:", r2_val_xgb)

# (D) Simple Neural Network
# e.g. an MLP with 2 hidden layers
nn_model = Sequential()
nn_model.add(Dense(32, input_dim=X_train.shape[1], activation='relu'))
nn_model.add(Dense(16, activation='relu'))
nn_model.add(Dense(1, activation='linear'))

nn_model.compile(optimizer=Adam(learning_rate=0.001), loss='mse')

# Train with validation data
nn_model.fit(X_train, y_train,
             validation_data=(X_val, y_val),
             epochs=30,
             batch_size=32,
             verbose=0)

y_val_pred_nn = nn_model.predict(X_val).flatten()
rmse_val_nn = sqrt(mean_squared_error(y_val, y_val_pred_nn))
r2_val_nn = r2_score(y_val, y_val_pred_nn)

print("Neural Network val RMSE:", rmse_val_nn, "R^2:", r2_val_nn)



In [None]:
#############################
# 10) Select best model by validation RMSE, Evaluate on Test
#############################

val_results = {
    "LinearRegression": (rmse_val_lr, r2_val_lr),
    "RandomForest": (rmse_val_rf, r2_val_rf),
    "XGBoost": (rmse_val_xgb, r2_val_xgb),
    "NeuralNet": (rmse_val_nn, r2_val_nn)
}

# We can choose best by minimum RMSE
best_model_name = min(val_results, key=lambda k: val_results[k][0])
print("\nBest model by val RMSE is:", best_model_name)

# Evaluate that best model on test set:
if best_model_name == "LinearRegression":
    y_test_pred = lr_model.predict(X_test)
elif best_model_name == "RandomForest":
    y_test_pred = rf_model.predict(X_test)
elif best_model_name == "XGBoost":
    y_test_pred = xgb_model.predict(X_test)
else:
    y_test_pred = nn_model.predict(X_test).flatten()

rmse_test = sqrt(mean_squared_error(y_test, y_test_pred))
r2_test = r2_score(y_test, y_test_pred)
print(f"\nOn Test set, {best_model_name} => RMSE: {rmse_test:.4f}, R^2: {r2_test:.4f}")



In [None]:
#############################
# 11) Plot predictions vs actual (test)
#############################

plt.figure(figsize=(10,5))
plt.plot(y_test, label='Actual', c='blue')
plt.plot(y_test_pred, label='Predicted', c='red')
plt.title(f"{best_model_name} on Test Set - Next-Day Price Prediction")
plt.xlabel("Test Data Index")
plt.ylabel("Price")
plt.legend()
plt.show()

#############################
# End
#############################
