<a href="https://colab.research.google.com/github/rayan-arya/rayan-arya/blob/main/quantsignalpredictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Part 1 - Get data

In [1]:
import yfinance as yf
import pandas as pd
import matplotlib.pyplot as plt

# Download the most recent 7 days of SPY at 5-minute resolution from Yahoo Finance.
ticker = yf.Ticker("SPY")
df = ticker.history(period="7d", interval="5m")

# Preview the first rows of the downloaded frame.
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Dividends,Stock Splits,Capital Gains
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-08-13 09:30:00-04:00,644.909973,645.309998,644.48999,645.159973,2767643,0.0,0.0,0.0
2025-08-13 09:35:00-04:00,645.190002,645.559998,644.971985,645.475098,836062,0.0,0.0,0.0
2025-08-13 09:40:00-04:00,645.409973,645.539978,645.200012,645.530029,1086503,0.0,0.0,0.0
2025-08-13 09:45:00-04:00,645.539978,645.88501,645.375,645.859985,1081051,0.0,0.0,0.0
2025-08-13 09:50:00-04:00,645.869995,646.190002,645.859985,646.054993,935435,0.0,0.0,0.0


In [None]:
# Line chart of the 5-minute closing prices.

ax = df['Close'].plot(figsize=(15,5), title="SPY 5 Min Close Prices")
ax.set_xlabel("Time")
ax.set_ylabel("Price")
ax.grid(True)
plt.show()

In [None]:
# Keep only the OHLCV columns:
#   Open   - price at the start of the interval
#   High   - highest price during the interval
#   Low    - lowest price during the interval
#   Close  - price at the end of the interval
#   Volume - number of shares traded during the interval
df_cleaned = df.loc[:, ['Open', 'High', 'Low', 'Close', 'Volume']]

# Persist the trimmed frame so later parts can reload it from disk.
df_cleaned.to_csv("spy_intraday.csv")


In [None]:
import os

# List the current working directory to confirm the CSV was written.
os.listdir(".")

Part 2 - Features

In [None]:
import pandas as pd

# Reload the saved bars; the first CSV column is the timestamp index.
df = pd.read_csv(
    "spy_intraday.csv",
    parse_dates=True,
    index_col=0,
)
df.head()



In [None]:
import os

# Show the files present in the current working directory.
os.listdir(".")


In [None]:
#simple (arithmetic) returns, not log returns - fractional change in Close versus the previous bar
#(the first bar has no predecessor, so its Return is NaN)

df['Return'] = (df['Close'] / df['Close'].shift(1))-1

In [None]:
# Lagged returns: return_lag_k holds the Return observed k bars earlier (k = 1..5).

for lag in range(1, 6):
    column = f'return_lag_{lag}'
    df[column] = df['Return'].shift(lag)

In [None]:
pip install ta

In [None]:
from ta.trend import MACD
from ta.momentum import RSIIndicator
from ta.volatility import BollingerBands

# Build the three indicator objects from the closing price (library default windows).
rsi = RSIIndicator(close=df['Close'])
macd = MACD(close=df['Close'])
bb = BollingerBands(close=df['Close'])

# Relative Strength Index
df['RSI'] = rsi.rsi()

# Momentum: value returned by macd_diff()
df['MACD'] = macd.macd_diff()

# Volatility: Bollinger upper band, lower band and band width
df['BB_High'] = bb.bollinger_hband()
df['BB_Low'] = bb.bollinger_lband()
df['BB_Width'] = bb.bollinger_wband()

In [None]:
# Discard every row that has a missing value in any column
# (e.g. the earliest bars, whose shifted returns are undefined).
df = df.dropna(axis=0, how="any")
df.head()

In [None]:
# Write the engineered feature table to disk for the next part.
df.to_csv(path_or_buf="spy_features.csv")

Part 3 - Target EDA

In [None]:
import pandas as pd

# Reload the feature table; the first CSV column is the timestamp index.
df = pd.read_csv(
    "spy_features.csv",
    parse_dates=True,
    index_col=0,
)
df.head()

In [None]:
# Target: 1 if the NEXT bar's return is positive; 0 if it is negative or flat.
next_return = df['Return'].shift(-1)
df['Target'] = (next_return > 0).astype(int)

# The final bar has no following bar, so its outcome is unknown. `NaN > 0` is
# False, which would silently label it 0, and a later dropna() cannot catch it
# because the integer Target column holds no NaN. Drop that row explicitly.
df = df[next_return.notna()].copy()

In [None]:
# Remove any row that contains a missing value in any column.
df = df.dropna(axis=0, how="any")


In [None]:
# Class balance: share of up (1) versus down/flat (0) labels in the dataset.
target = df['Target']
target.value_counts(normalize=True)

In [None]:
!pip install seaborn

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Correlation of every column with Target, strongest positive first.
correlation = df.corr().loc[:, 'Target'].sort_values(ascending=False)
print(correlation)

In [None]:
# Annotated heatmap of the full pairwise correlation matrix.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(df.corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Feature Correlation Matrix")
plt.show()


In [None]:
# Distribution of RSI split by the direction of the following bar.
ax = sns.boxplot(data=df, x='Target', y='RSI')
ax.set_title("RSI vs. Next-Bar Direction")
plt.show()


In [None]:
# Overwrite the feature file so it also carries the Target column.
df.to_csv(path_or_buf="spy_features.csv")


Part 4 - Model Training

In [None]:
import pandas as pd

# Reload the labelled feature table; the first CSV column is the timestamp index.
df = pd.read_csv(
    "spy_features.csv",
    parse_dates=True,
    index_col=0,
)

In [None]:
# Model inputs: volume, current return, five lagged returns and the indicators.
lag_columns = [f'return_lag_{k}' for k in range(1, 6)]
features = [
    'Volume', 'Return',
    *lag_columns,
    'RSI', 'MACD', 'BB_High', 'BB_Low', 'BB_Width',
]

# Feature matrix and label vector.
X = df.loc[:, features]
y = df.loc[:, 'Target']

In [None]:
# Chronological 80/20 split: the first 80% of rows train, the remainder tests.
split = int(0.8 * len(df))
X_train, y_train = X.iloc[:split], y.iloc[:split]
X_test, y_test = X.iloc[split:], y.iloc[split:]


In [None]:
!pip install scikit-learn


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline classifier: 100-tree random forest with a fixed seed, fitted on the training rows.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict the held-out rows and report accuracy plus per-class metrics.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", test_report)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix of test-set predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt="d", cmap="Blues")
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
plt.show()

In [None]:
import numpy as np

# Print the fitted model's feature importances, largest first.
importances = model.feature_importances_
sorted_indices = importances.argsort()[::-1]

for position in sorted_indices:
    feature_name = features[position]
    weight = importances[position]
    print(f"{feature_name}: {weight:.4f}")

In [None]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestClassifier


# Candidate hyper-parameters: 3 x 3 x 2 = 18 combinations.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5],
}

# Grid search with 3 forward-chaining folds.
# The rows are time-ordered bars, so every validation fold must come AFTER the
# data it was trained on. A plain integer `cv=3` builds folds in which the
# model is trained on later bars and scored on earlier ones, which leaks
# future information into the CV score. TimeSeriesSplit avoids that.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=TimeSeriesSplit(n_splits=3),
    scoring='accuracy',
    verbose=1
)

grid_search.fit(X_train, y_train)

# Best parameters and mean cross-validated accuracy
print("Best Params:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


In [None]:
# Take the best estimator found by the grid search and fit it on the training set.
# NOTE(review): the search above does not pass `refit`, and GridSearchCV
# refits the best estimator by default, so this fit looks redundant - confirm.
best_model = grid_search.best_estimator_
best_model.fit(X=X_train, y=y_train)


In [None]:
# Generate signals for the held-out test rows only. Predicting on all of X
# would also score the rows best_model was fitted on, making the backtest
# in-sample and inflating every performance figure derived from "Signal".
# Assigning a Series aligns on the index, so rows outside X_test become NaN.
df["Signal"] = pd.Series(best_model.predict(X_test), index=X_test.index)


In [None]:
# Move each signal forward one bar so it lines up with the return of the bar
# it was predicting (Target was built from the next bar's Return).
# Caution: re-running this cell shifts the column again.
df["Signal"] = df["Signal"].shift(periods=1)


In [None]:
# Per-bar strategy return: the bar's market return scaled by the signal
# (zero when the signal is 0, i.e. when the model is out of the market).
df["Strategy_Returns"] = df["Return"] * df["Signal"]


In [None]:
import matplotlib.pyplot as plt

# Compare both curves over exactly the same bars: keep only rows where the
# strategy has a defined return (the shifted signal is NaN where no prior
# prediction exists), so the market line does not get a head start.
backtest = df.loc[df["Strategy_Returns"].notna(), ["Return", "Strategy_Returns"]]

# Compound the per-bar simple returns. Summing them only approximates the
# cumulative return and is inconsistent with the cumprod-based equity curve
# used for the drawdown calculation.
((1 + backtest).cumprod() - 1).plot(figsize=(10, 5))
plt.title("Cumulative Returns: Market vs Model Strategy")
plt.show()


In [None]:
import numpy as np

strategy_returns = df["Strategy_Returns"].dropna()

# Annualisation must use the number of return observations per year. The data
# are 5-minute bars (interval="5m"), i.e. 12 bars per hour over a 6.5-hour
# session, for 252 trading days. Using 60 bars per hour would treat the series
# as 1-minute data and overstate the Sharpe ratio by sqrt(5).
trading_days_per_year = 252
bars_per_day = 6.5 * 12
bars_per_year = trading_days_per_year * bars_per_day

sharpe_ratio = np.mean(strategy_returns) / np.std(strategy_returns) * np.sqrt(bars_per_year)
print("Sharpe Ratio:", sharpe_ratio)

In [None]:
# Equity curve obtained by compounding the per-bar strategy returns.
cumulative = strategy_returns.add(1).cumprod()

# Running high-water mark and the fractional shortfall from it.
peak = cumulative.cummax()
drawdown = cumulative.sub(peak).div(peak)

# The most negative shortfall is the maximum drawdown.
max_drawdown = drawdown.min()
print("Max Drawdown:", max_drawdown)


In [None]:
# Entries: bars where the signal steps up by exactly 1 (flat -> long).
num_trades = (df["Signal"].diff() == 1).sum()

# Per-bar outcomes. `win_trades` counts profitable BARS, not round-trip trades.
win_trades = df.loc[df["Strategy_Returns"] > 0, "Strategy_Returns"].count()
bars_in_market = (df["Signal"] > 0).sum()

# Win rate = profitable bars / bars held. The denominator is bars held rather
# than num_trades: one entry spans several bars, so wins-per-entry mixes units
# and can exceed 100%. Guard against the model never taking a position.
win_rate = win_trades / bars_in_market if bars_in_market else float("nan")

# Mean size of winning and losing bars.
avg_win = df.loc[df["Strategy_Returns"] > 0, "Strategy_Returns"].mean()
avg_loss = df.loc[df["Strategy_Returns"] < 0, "Strategy_Returns"].mean()

print("Number of Trades:", num_trades)
print("Win Rate:", win_rate)
print("Avg Win:", avg_win)
print("Avg Loss:", avg_loss)


In [None]:
# Use the model's predicted probability of class 1 instead of a hard 0/1 call,
# so the position size scales with the model's confidence.
# Only the held-out test rows are scored: predicting on all of X would include
# the rows best_model was fitted on and make the backtest in-sample.
# Assigning a Series aligns on the index, so rows outside X_test become NaN.
df["Prob_1"] = pd.Series(best_model.predict_proba(X_test)[:, 1], index=X_test.index)

# The probability computed on one bar is traded on the next bar, hence the shift.
df["Signal"] = df["Prob_1"].shift(1)

df["Strategy_Returns"] = df["Signal"] * df["Return"]


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Alternative classifier: 100-stage gradient boosting with a fixed seed.
# Note that this rebinds `model`, replacing the random forest fitted earlier.
model = GradientBoostingClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)
