# Stock Price Predictor - Enhanced with Full Evaluation

A Random Forest classifier that predicts next-minute price direction, evaluated with a comprehensive set of metrics.

In [None]:
import pandas as pd
import numpy as np
import requests
from datetime import datetime, timedelta
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, confusion_matrix, classification_report,
    precision_recall_fscore_support, roc_curve, roc_auc_score
)
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

# Notebook-wide plotting defaults: seaborn dark grid and a 12x6 default figure
sns.set_style("darkgrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

In [None]:
import os
import warnings

# Run configuration for the notebook.
#
# The Polygon.io key is read from the POLYGON_API_KEY environment variable so
# the secret never lives in the notebook or its version history.
# NOTE(review): the key that used to be hard-coded here should be treated as
# compromised and rotated.
API_KEY = os.environ.get("POLYGON_API_KEY", "")
if not API_KEY:
    warnings.warn("POLYGON_API_KEY is not set; Polygon API requests will fail.")

TICKER = "AAPL"            # symbol to download and model
START_DATE = "2025-10-01"  # first day of the training window (YYYY-MM-DD)
END_DATE = "2025-11-01"    # last day of the training window (YYYY-MM-DD)

In [None]:
def pull_polygon_data(ticker, start, end, api_key):
    """Download 1-minute aggregate bars for *ticker* from the Polygon.io REST API.

    Args:
        ticker: Symbol to query, e.g. ``"AAPL"``.
        start: Start of the range; interpolated into the URL path, so a
            ``YYYY-MM-DD`` string or a ``datetime.date`` both work.
        end: End of the range, same format as ``start``.
        api_key: Polygon API key, sent as the ``apiKey`` query parameter.

    Returns:
        DataFrame with columns ``timestamp``, ``open``, ``high``, ``low``,
        ``close``, ``volume`` -- one row per bar, in ascending time order.
        ``timestamp`` is converted from the millisecond epoch field ``t``.

    Raises:
        requests.HTTPError: If the API answers with an HTTP error status.
        ValueError: If fewer than two bars are returned in total.
    """
    url = f"https://api.polygon.io/v2/aggs/ticker/{ticker}/range/1/minute/{start}/{end}"
    # Request the largest page size in ascending order; the key travels as a
    # query parameter instead of being baked into the URL string.
    params = {"sort": "asc", "limit": 50000, "apiKey": api_key}

    bars = []
    while url:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, params=params, timeout=30)
        response.raise_for_status()
        data = response.json()
        bars.extend(data.get('results') or [])

        # Long ranges are paginated: next_url already carries the cursor, so
        # only the API key has to be re-attached for the follow-up request.
        url = data.get('next_url')
        params = {"apiKey": api_key}

    if len(bars) < 2:
        raise ValueError("Not enough data returned from Polygon API")

    df = pd.DataFrame(bars)
    df['timestamp'] = pd.to_datetime(df['t'], unit='ms')
    df = df.rename(columns={'o': 'open', 'h': 'high', 'l': 'low', 'c': 'close', 'v': 'volume'})
    return df[['timestamp', 'open', 'high', 'low', 'close', 'volume']]

In [None]:
def calculate_features(df):
    """Derive the model's input features and next-minute target from OHLCV bars.

    The input frame is copied, not modified. It must provide the columns
    ``timestamp``, ``open``, ``close`` and ``volume``.

    Added columns, in order: ``momentum_1min``, ``volatility_1min``,
    ``price_direction``, ``vwap``, ``vwap_dev``, ``hour``, ``minute``,
    ``next_return``, ``target``.

    Returns:
        The augmented copy with every row containing a missing value removed
        (the first row has no previous close, the last has no next close).
    """
    out = df.copy()
    close = out['close']
    volume = out['volume']
    stamps = out['timestamp']

    # One-bar return, and its square as a crude volatility proxy
    bar_return = close.pct_change()
    out['momentum_1min'] = bar_return
    out['volatility_1min'] = bar_return.pow(2)

    # 1 when the bar closed above its open, else 0
    out['price_direction'] = close.gt(out['open']).astype(int)

    # Volume-weighted average of the close, accumulated from the first row,
    # and the close's relative distance from it
    running_vwap = close.mul(volume).cumsum().div(volume.cumsum())
    out['vwap'] = running_vwap
    out['vwap_dev'] = close.sub(running_vwap).div(running_vwap)

    # Clock components of the bar timestamp
    out['hour'] = stamps.dt.hour
    out['minute'] = stamps.dt.minute

    # Label: 1 when the following bar closes higher than this one
    upcoming_return = close.shift(-1).div(close).sub(1)
    out['next_return'] = upcoming_return
    out['target'] = upcoming_return.gt(0).astype(int)

    return out.dropna()

In [None]:
# Download the raw bars and turn them into a feature/target table
df = calculate_features(pull_polygon_data(TICKER, START_DATE, END_DATE, API_KEY))

# Model inputs (column order is the order the classifier is trained on)
features = ['momentum_1min', 'volatility_1min', 'price_direction', 'vwap_dev', 'hour', 'minute']
X, y = df[features], df['target']

# Time-ordered split: the first 80% of rows train, the remainder tests,
# so the model is never evaluated on bars older than its training data
split_index = int(0.8 * len(X))
X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]
X_test, y_test = X.iloc[split_index:], y.iloc[split_index:]

# Sizes and share of UP labels in each split
print(f"train size: {len(X_train)}")
print(f"test size: {len(X_test)}")
print(f"train UP ratio: {y_train.mean():.3f}")
print(f"test UP ratio: {y_test.mean():.3f}")

In [None]:
# Fit a 100-tree forest on the training slice (seeded for repeatability)
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Hard class predictions for both splits
y_pred_train, y_pred_test = model.predict(X_train), model.predict(X_test)

# Second probability column for both splits (consumed by the ROC cell)
y_prob_train = model.predict_proba(X_train)[:, 1]
y_prob_test = model.predict_proba(X_test)[:, 1]

print("Train Accuracy:", accuracy_score(y_train, y_pred_train))
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))

## Confusion Matrix

In [None]:
# Pin the label order so both matrices are always 2x2 (rows/columns = DOWN, UP),
# even if a split happens to contain only one class; otherwise the 4-way
# unpacking of cm_test.ravel() below would fail.
class_labels = [0, 1]
class_names = ['DOWN', 'UP']

cm_train = confusion_matrix(y_train, y_pred_train, labels=class_labels)
cm_test = confusion_matrix(y_test, y_pred_test, labels=class_labels)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# One heatmap per split, drawn side by side with identical styling
panels = zip(
    axes,
    (cm_train, cm_test),
    ('Training Set Confusion Matrix', 'Test Set Confusion Matrix'),
)
for ax, matrix, title in panels:
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=class_names, yticklabels=class_names)
    ax.set_title(title)
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')

plt.tight_layout()
plt.show()

# Row-major flattening of the 2x2 matrix yields TN, FP, FN, TP
tn, fp, fn, tp = cm_test.ravel()
print("\nTest Set Breakdown:")
print(f"True Negatives (correct DOWN predictions): {tn}")
print(f"False Positives (predicted UP, was DOWN): {fp}")
print(f"False Negatives (predicted DOWN, was UP): {fn}")
print(f"True Positives (correct UP predictions): {tp}")

## Precision, Recall, F1-Score

In [None]:
# Per-class precision / recall / F1 for each split.
# labels=[0, 1] keeps the two target_names aligned with the classes even when
# a split (or the predictions for it) contains only one of them; without it
# classification_report raises on the name/class count mismatch.
print("\n=== TRAINING SET ===")
print(classification_report(y_train, y_pred_train, labels=[0, 1], target_names=['DOWN', 'UP']))

print("\n=== TEST SET ===")
print(classification_report(y_test, y_pred_test, labels=[0, 1], target_names=['DOWN', 'UP']))

## ROC Curve and AUC

In [None]:
# Calculate ROC curve coordinates for both splits from the predicted
# probabilities; the third return value (thresholds) is not used.
fpr_train, tpr_train, _ = roc_curve(y_train, y_prob_train)
fpr_test, tpr_test, _ = roc_curve(y_test, y_prob_test)

# Area under each curve; auc_train / auc_test are reused by the
# train-vs-test comparison cell further down.
auc_train = roc_auc_score(y_train, y_prob_train)
auc_test = roc_auc_score(y_test, y_prob_test)

# Overlay both curves, with the diagonal as the chance-level reference
plt.figure(figsize=(10, 6))
plt.plot(fpr_train, tpr_train, label=f'Train (AUC={auc_train:.4f})', linewidth=2)
plt.plot(fpr_test, tpr_test, label=f'Test (AUC={auc_test:.4f})', linewidth=2)
plt.plot([0, 1], [0, 1], 'k--', label='Random (AUC=0.5)', alpha=0.5)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate (Recall)')
plt.title('ROC Curve')
plt.legend()
plt.grid(True)
plt.show()

# Numeric summary; the gap is train AUC minus test AUC
print(f"\nTrain AUC: {auc_train:.4f}")
print(f"Test AUC: {auc_test:.4f}")
print(f"Overfitting gap: {auc_train - auc_test:.4f}")

## Feature Importance

In [None]:
# Pair each input column with the importance the fitted forest assigned to it,
# ranked from most to least important
importances = model.feature_importances_
importance_df = pd.DataFrame(
    list(zip(features, importances)),
    columns=['feature', 'importance'],
).sort_values(by='importance', ascending=False)

print("\nFeature Importance:")
print(importance_df.to_string(index=False))

# Horizontal bar chart of the ranking
plt.figure(figsize=(10, 6))
plt.barh(y=importance_df['feature'], width=importance_df['importance'])
plt.title('Random Forest Feature Importance')
plt.xlabel('Importance')
# barh draws the first row at the bottom; flip so the top-ranked feature is on top
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## Train vs Test Performance Comparison

In [None]:
# Headline metrics for each split (precision/recall/F1 are support-weighted
# averages over the classes; the fourth return value is unused)
train_acc = accuracy_score(y_train, y_pred_train)
test_acc = accuracy_score(y_test, y_pred_test)

train_prec, train_rec, train_f1, _ = precision_recall_fscore_support(
    y_train, y_pred_train, average='weighted'
)
test_prec, test_rec, test_f1, _ = precision_recall_fscore_support(
    y_test, y_pred_test, average='weighted'
)

# One row per metric; gap = train - test
comparison = pd.DataFrame({
    'metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score', 'AUC'],
    'train': [train_acc, train_prec, train_rec, train_f1, auc_train],
    'test': [test_acc, test_prec, test_rec, test_f1, auc_test],
})
comparison['gap'] = comparison['train'] - comparison['test']

print("\n=== TRAIN VS TEST COMPARISON ===")
print(comparison.to_string(index=False))

# Grouped bars: train and test side by side for every metric
fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(comparison))
width = 0.35

ax.bar(x - width / 2, comparison['train'], width, label='Train', alpha=0.8)
ax.bar(x + width / 2, comparison['test'], width, label='Test', alpha=0.8)

ax.set(ylabel='Score', title='Model Performance: Train vs Test')
ax.set_xticks(x)
ax.set_xticklabels(comparison['metric'])
ax.legend()
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## Save Model

In [None]:
# Persist the fitted classifier to the working directory so it can be
# reloaded by the live-prediction cell without retraining
joblib.dump(model, "trained_stock_model.pkl")
print("Model saved to trained_stock_model.pkl")

## Live Prediction Function

In [None]:
def get_recent_minute_decision(ticker, api_key, model, prob_threshold=0.55, lookback_days=1):
    """Fetch the latest minute bars and turn the model's output into BUY/SELL/HOLD.

    Features for the most recent bar are rebuilt from the last two bars
    (momentum, volatility, price direction, clock fields) plus the cumulative
    VWAP over everything that was fetched.

    Args:
        ticker: Symbol to query.
        api_key: Polygon API key, forwarded to ``pull_polygon_data``.
        model: Fitted classifier exposing ``predict_proba`` and ``classes_``.
        prob_threshold: Minimum class probability required to act; below it
            for both classes the decision is ``"HOLD"``.
        lookback_days: How many calendar days before today to start fetching
            from. The default of 1 matches the previous fixed behaviour; a
            larger value helps after weekends or holidays when the previous
            calendar day has no bars.

    Returns:
        Tuple ``(decision, last_bar, feature_row, pred_proba)``: the decision
        string, the most recent bar as a Series, the one-row feature frame fed
        to the model, and the model's raw probability array for that row.

    Raises:
        ValueError: If ``lookback_days`` is less than 1, or (from
            ``pull_polygon_data``) if fewer than two bars come back.
    """
    if lookback_days < 1:
        raise ValueError("lookback_days must be at least 1")

    today = datetime.now().date()
    start = today - timedelta(days=lookback_days)

    # Pull data
    df = pull_polygon_data(ticker, start, today, api_key)

    # Only the last two bars are needed for momentum, volatility and direction
    last_two = df.iloc[-2:]
    prev_close = last_two['close'].iloc[0]
    last_close = last_two['close'].iloc[1]

    momentum_1min = (last_close - prev_close) / prev_close
    volatility_1min = momentum_1min ** 2
    price_direction = int(last_close > last_two['open'].iloc[1])

    # VWAP deviation using the cumulative VWAP over the fetched window
    vwap = (df['close'] * df['volume']).cumsum() / df['volume'].cumsum()
    vwap_dev = (last_close - vwap.iloc[-1]) / vwap.iloc[-1]

    last_stamp = last_two['timestamp'].iloc[1]

    # Keys are listed in the same order as the training feature list
    feature_row = pd.DataFrame([{
        'momentum_1min': momentum_1min,
        'volatility_1min': volatility_1min,
        'price_direction': price_direction,
        'vwap_dev': vwap_dev,
        'hour': last_stamp.hour,
        'minute': last_stamp.minute
    }])

    # Model prediction
    pred_proba = model.predict_proba(feature_row)[0]

    # Look probabilities up by class label rather than by column position, so
    # a model that saw only one class in training cannot raise an IndexError
    # or have its single column misread as the other class.
    proba_by_class = dict(zip(model.classes_, pred_proba))
    prob_up = proba_by_class.get(1, 0.0)
    prob_down = proba_by_class.get(0, 0.0)

    # Decision logic with HOLD for uncertain predictions
    if prob_up > prob_threshold:
        decision = "BUY"
    elif prob_down > prob_threshold:
        decision = "SELL"
    else:
        decision = "HOLD"

    return decision, last_two.iloc[1], feature_row, pred_proba

In [None]:
# Reload the classifier saved earlier (this rebinds the global `model`).
# NOTE(review): joblib files are pickle-based -- only load files you trust.
model = joblib.load("trained_stock_model.pkl")

# Run one live prediction for the configured ticker with the default threshold
decision, last_bar, features_used, pred_proba = get_recent_minute_decision(TICKER, API_KEY, model)
print("Decision:", decision)
print("Last bar:\n", last_bar)
print("Features:\n", features_used)
print("Predicted probabilities (DOWN, UP):", pred_proba)