In [1]:
import pandas as pd

# Load the three processed tables; in each file the first CSV column is used
# as the row index.
df_merged, df_numeric, df_pca = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../datasets/processed/eda.csv',
        '../datasets/processed/numeric_data.csv',
        '../datasets/processed/pca_data.csv',
    )
)

In [19]:
# Keep only the date, price/volume columns, both moving averages and the
# signal label from the merged frame.
ohlc_data_daily = df_merged.loc[:, [
    'date', 'close', 'high', 'low', 'open',
    'volume', '30_DMA', '10_DMA', 'signal',
]]

In [20]:
# Put the PCA columns next to the OHLC columns. axis=1 concatenates
# column-wise and aligns rows on the index, so both frames are assumed to
# share the same index -- TODO confirm; unmatched labels would produce NaN rows.
df_overall= pd.concat([ohlc_data_daily,df_pca],axis=1)

In [21]:
# Inspect the column names of the PCA table.
df_pca.columns

Index(['PC1', 'PC2', 'PC3', 'PC4'], dtype='object')

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def prepare_features(df):
    """
    Build the feature matrix and the binary target used by the classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the price/volume columns, both moving averages,
        'PC1'..'PC4' and a 'signal' column.

    Returns
    -------
    tuple
        (X, y) where X holds the selected feature columns in a fixed order
        and y is 1 where 'signal' equals 'BUY' and 0 for every other value.
    """
    price_volume_cols = ['close', 'high', 'low', 'open', 'volume']
    moving_average_cols = ['30_DMA', '10_DMA']
    component_cols = ['PC1', 'PC2', 'PC3', 'PC4']

    X = df.loc[:, price_volume_cols + moving_average_cols + component_cols]
    # Anything that is not exactly 'BUY' is encoded as 0.
    y = df['signal'].eq('BUY').astype(int)

    return X, y

def train_model(X, y, test_size=0.2, random_state=42):
    """
    Split the data and fit a logistic regression classifier on the train part.

    Parameters
    ----------
    X, y : feature matrix and target, as returned by prepare_features.
    test_size : fraction of rows held out for testing.
    random_state : seed passed to both the split and the classifier.

    Returns
    -------
    tuple
        (model, X_train, X_test, y_train, y_test)
    """
    split = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
    )
    X_train, X_test, y_train, y_test = split

    # fit() returns the estimator itself, so the call can be chained.
    model = LogisticRegression(random_state=random_state).fit(X_train, y_train)

    return model, X_train, X_test, y_train, y_test

def plot_confusion_matrix(y_test, y_pred):
    """
    Plot a 2x2 confusion matrix for the binary BUY/SELL target using plotly.

    Parameters
    ----------
    y_test : true labels encoded as 0 (sell) / 1 (buy).
    y_pred : predicted labels with the same encoding.

    Returns
    -------
    plotly.graph_objects.Figure
        Heatmap with one cell per (true, predicted) label pair.
    """
    # Pin the label order so the matrix is always 2x2 and lines up with the
    # axis labels below, even when one class is absent from y_test / y_pred.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

    fig = go.Figure(data=go.Heatmap(
        z=cm,
        x=['Predicted Sell', 'Predicted Buy'],
        y=['Actual Sell', 'Actual Buy'],
        text=cm,
        texttemplate="%{text}",
        textfont={"size": 16},
        colorscale='RdBu'
    ))

    fig.update_layout(
        title='Confusion Matrix',
        xaxis_title='Predicted Label',
        yaxis_title='True Label',
        width=600,
        height=600
    )

    return fig

def plot_roc_curve(y_test, y_pred_proba):
    """
    Build a plotly figure with the ROC curve and a chance-level diagonal.

    Parameters
    ----------
    y_test : true binary labels.
    y_pred_proba : predicted scores for the positive class.

    Returns
    -------
    plotly.graph_objects.Figure
        Two line traces: the ROC curve (its AUC is shown in the legend)
        and the dashed 'Random' reference line.
    """
    fpr, tpr, _thresholds = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    model_trace = go.Scatter(
        x=fpr, y=tpr,
        name=f'ROC curve (AUC = {roc_auc:.2f})',
        mode='lines'
    )
    # Reference line from (0, 0) to (1, 1).
    chance_trace = go.Scatter(
        x=[0, 1], y=[0, 1],
        name='Random',
        mode='lines',
        line=dict(dash='dash')
    )

    fig = go.Figure(data=[model_trace, chance_trace])
    fig.update_layout(
        title='Receiver Operating Characteristic (ROC) Curve',
        xaxis_title='False Positive Rate',
        yaxis_title='True Positive Rate',
        width=800,
        height=600,
        showlegend=True
    )

    return fig

def evaluate_model(model, X_test, y_test):
    """
    Score a fitted classifier on the test split and show its diagnostics.

    Prints the classification report, then displays the confusion matrix
    and the ROC curve.

    Returns
    -------
    tuple
        (y_pred, y_pred_proba): hard predictions and the second column of
        predict_proba.
    """
    y_pred = model.predict(X_test)
    # Column 1 of predict_proba is the score for the second class.
    y_pred_proba = model.predict_proba(X_test)[:, 1]

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred).show()
    plot_roc_curve(y_test, y_pred_proba).show()

    return y_pred, y_pred_proba

def plot_feature_importance(model, feature_names):
    """
    Bar chart of the absolute coefficients of a fitted linear classifier.

    Parameters
    ----------
    model : fitted estimator exposing coef_; only the first row is used.
    feature_names : labels for the bars, in the same order as the columns
        the model was trained on.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    coef_magnitude = abs(model.coef_[0])

    bars = go.Bar(
        x=feature_names,
        y=coef_magnitude,
        text=coef_magnitude.round(3),
        textposition='auto',
    )
    layout_options = dict(
        title='Feature Importance',
        xaxis_title='Features',
        yaxis_title='Absolute Coefficient Value',
        width=1000,
        height=600,
    )

    fig = go.Figure(bars)
    fig.update_layout(**layout_options)
    return fig

# Main execution
def run_stock_prediction_model(df):
    """
    Run the logistic-regression pipeline end to end.

    Steps: build features, split and fit, evaluate on the test split
    (report plus plots), then show the coefficient bar chart.

    Returns
    -------
    tuple
        (model, y_pred, y_pred_proba) for the held-out test rows.
    """
    X, y = prepare_features(df)

    # Only the test part of the split is needed for the evaluation below.
    model, _, X_test, _, y_test = train_model(X, y)

    y_pred, y_pred_proba = evaluate_model(model, X_test, y_test)

    plot_feature_importance(model, X.columns).show()

    return model, y_pred, y_pred_proba

# Run the pipeline on the combined OHLC + PCA frame; the fitted model and the
# test-split predictions are kept as notebook-level variables.
model, y_pred, y_pred_proba = run_stock_prediction_model(df_overall)


Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.20      0.29       215
           1       0.63      0.89      0.74       327

    accuracy                           0.61       542
   macro avg       0.58      0.54      0.51       542
weighted avg       0.59      0.61      0.56       542



## Model Components

### 1. Data Preparation
- Selects relevant features:
  - OHLC prices
  - Volume
  - Moving averages
  - PCA components (PC1–PC4)
- Converts target variable to binary format (BUY/SELL)

### 2. Model Training
- Splits data into train/test sets
- Trains logistic regression model
- Uses a single 80/20 hold-out split (`test_size=0.2`, `random_state=42`); cross-validation is not implemented yet

### 3. Evaluation Metrics
- Classification report showing:
  - Precision
  - Recall
  - F1-score
  - Support
- Confusion matrix visualization
- ROC curve with AUC score
- Feature importance plot

### 4. Visualizations
- Interactive Plotly plots for all visualizations
- Clear titles and labels
- Color-coded results for better interpretation
- Includes:
  - Confusion matrix heatmap
  - ROC curve plot
  - Feature importance bar chart
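  - Signal prediction overlay on price chart (drawn by `plot_trading_signals` in the SARIMAX + Random Forest cell, not by the logistic-regression pipeline)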

In [29]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def prepare_sarimax_data(df):
    """
    Split a frame into the SARIMAX target and its regressors.

    Returns
    -------
    tuple
        (endog, exog): the 'close' column, and a frame with the remaining
        price/volume columns, both moving averages and PC1-PC4.
    """
    exog_columns = [
        'open', 'high', 'low', 'volume',
        '10_DMA', '30_DMA',
        'PC1', 'PC2', 'PC3', 'PC4',
    ]
    return df['close'], df.loc[:, exog_columns]

def fit_sarimax_model(endog, exog, order=(1,1,1), seasonal_order=(1,1,1,12)):
    """
    Construct a SARIMAX model with exogenous regressors and fit it.

    Parameters
    ----------
    endog : series to model.
    exog : regressors aligned with endog.
    order : (p, d, q) of the non-seasonal part.
    seasonal_order : (P, D, Q, s) of the seasonal part.

    Returns
    -------
    The results object returned by SARIMAX.fit().
    """
    model_spec = dict(
        exog=exog,
        order=order,
        seasonal_order=seasonal_order,
    )
    return SARIMAX(endog, **model_spec).fit()

def plot_sarimax_predictions(df, actual, predicted):
    """
    Overlay actual and predicted prices on one plotly figure.

    Parameters
    ----------
    df : frame whose index supplies the x axis for both lines.
    actual : observed prices (solid blue line).
    predicted : model predictions (dashed red line).

    Returns
    -------
    plotly.graph_objects.Figure
    """
    x_values = df.index
    series_specs = (
        (actual, 'Actual Price', dict(color='blue')),
        (predicted, 'Predicted Price', dict(color='red', dash='dash')),
    )

    fig = go.Figure()
    for values, trace_name, line_style in series_specs:
        fig.add_trace(go.Scatter(
            x=x_values,
            y=values,
            name=trace_name,
            line=line_style
        ))

    fig.update_layout(
        title='SARIMAX: Actual vs Predicted Prices',
        xaxis_title='Date',
        yaxis_title='Price',
        width=1200,
        height=600,
        showlegend=True
    )

    return fig

def prepare_rf_features(df, predicted_prices):
    """
    Prepare features for Random Forest including SARIMAX predictions.

    Two derived columns are added on a copy of ``df`` (the caller's frame is
    left untouched, so re-running the cell gives the same result):

    - 'predicted_close': ``predicted_prices``; a pandas Series is aligned on
      the frame's index.
    - 'price_diff': 'predicted_close' minus 'close'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the price/volume columns, '10_DMA', '30_DMA' and 'signal'.
    predicted_prices : array-like or pandas.Series
        One predicted close price per row of ``df``.

    Returns
    -------
    tuple
        (X, y) where X holds the feature columns and y is 1 where 'signal'
        equals 'BUY' and 0 otherwise.
    """
    enriched = df.assign(predicted_close=predicted_prices)
    enriched = enriched.assign(
        price_diff=enriched['predicted_close'] - enriched['close']
    )

    # The original list ended with an empty-string entry, which is not a
    # column name and made the selection below raise KeyError.
    features = [
        'close', 'high', 'low', 'open', 'volume',
        '10_DMA', '30_DMA', 'predicted_close', 'price_diff'
    ]

    X = enriched[features]
    y = (enriched['signal'] == 'BUY').astype(int)

    return X, y

def train_random_forest(X, y, test_size=0.2, random_state=42):
    """
    Split the data and fit a 100-tree random forest on the train part.

    Parameters
    ----------
    X, y : feature matrix and binary target.
    test_size : fraction of rows held out for testing.
    random_state : seed passed to both the split and the forest.

    Returns
    -------
    tuple
        (model, X_train, X_test, y_train, y_test)
    """
    split = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
    )
    X_train, X_test, y_train, y_test = split

    forest = RandomForestClassifier(n_estimators=100, random_state=random_state)
    forest.fit(X_train, y_train)

    return forest, X_train, X_test, y_train, y_test

def plot_feature_importance_rf(model, feature_names):
    """
    Bar chart of a fitted model's feature_importances_.

    Parameters
    ----------
    model : fitted estimator exposing feature_importances_.
    feature_names : labels for the bars, in training-column order.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    scores = model.feature_importances_

    bars = go.Bar(
        x=feature_names,
        y=scores,
        text=scores.round(3),
        textposition='auto',
    )
    layout_options = dict(
        title='Random Forest Feature Importance',
        xaxis_title='Features',
        yaxis_title='Importance Score',
        width=1000,
        height=600,
    )

    fig = go.Figure(bars)
    fig.update_layout(**layout_options)
    return fig

def run_combined_model(df):
    """
    Run the two-stage pipeline: SARIMAX price model, then Random Forest.

    Stage 1 fits SARIMAX on 'close' with the exogenous columns and plots
    its predictions against the actual prices. Stage 2 feeds those
    predictions into the Random Forest features, fits the classifier,
    prints its classification report and shows the confusion matrix,
    ROC curve and feature-importance plots.

    Returns
    -------
    tuple
        (sarimax_results, rf_model, predicted_prices, y_pred, y_pred_proba);
        the last two refer to the Random Forest test split.
    """
    # ---- Stage 1: SARIMAX ----
    print("Fitting SARIMAX model...")
    close_series, regressors = prepare_sarimax_data(df)
    sarimax_results = fit_sarimax_model(close_series, regressors)

    # predict() is called without a range argument.
    predicted_prices = sarimax_results.predict()

    plot_sarimax_predictions(df, close_series, predicted_prices).show()

    # ---- Stage 2: Random Forest ----
    print("\nTraining Random Forest model...")
    X, y = prepare_rf_features(df, predicted_prices)
    rf_model, _, X_test, _, y_test = train_random_forest(X, y)

    y_pred = rf_model.predict(X_test)
    y_pred_proba = rf_model.predict_proba(X_test)[:, 1]

    print("\nRandom Forest Classification Report:")
    print(classification_report(y_test, y_pred))

    plot_confusion_matrix(y_test, y_pred).show()
    plot_roc_curve(y_test, y_pred_proba).show()
    plot_feature_importance_rf(rf_model, X.columns).show()

    return sarimax_results, rf_model, predicted_prices, y_pred, y_pred_proba

def plot_trading_signals(df, predicted_signals, test_indices):
    """
    Plot the close price with predicted buy/sell markers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'close' column; its index supplies the x axis.
    predicted_signals : array-like of 0/1
        Predicted label per test row (1 = buy, 0 = sell). Lists and pandas
        Series are accepted as well as NumPy arrays.
    test_indices : array-like of int
        Positional row numbers in ``df``, one per entry of
        ``predicted_signals`` and in the same order.

    Returns
    -------
    plotly.graph_objects.Figure
        A price line plus green up-triangles for buys and red
        down-triangles for sells.
    """
    # Coerce to arrays so the boolean masks below work element-wise; with a
    # plain list, `predicted_signals == 1` would be a single False.
    positions = np.asarray(test_indices)
    signals = np.asarray(predicted_signals)
    close = df['close']

    fig = go.Figure()

    # Price line over the test rows
    fig.add_trace(go.Scatter(
        x=df.index[positions],
        y=close.iloc[positions],
        name='Close Price',
        line=dict(color='blue')
    ))

    # One marker trace per predicted class
    marker_specs = (
        (1, 'Buy Signal', 'triangle-up', 'green'),
        (0, 'Sell Signal', 'triangle-down', 'red'),
    )
    for label_value, trace_name, symbol, color in marker_specs:
        hits = positions[signals == label_value]
        fig.add_trace(go.Scatter(
            x=df.index[hits],
            y=close.iloc[hits],
            mode='markers',
            name=trace_name,
            marker=dict(
                symbol=symbol,
                size=12,
                color=color
            )
        ))

    fig.update_layout(
        title='Predicted Trading Signals',
        xaxis_title='Date',
        yaxis_title='Price',
        width=1200,
        height=600,
        showlegend=True
    )

    return fig

# Run the complete pipeline
sarimax_model, rf_model, predicted_prices, y_pred, y_pred_proba = run_combined_model(df_overall)

# Plot final trading signals
# train_random_forest() holds out a *shuffled* 20% sample, so the test rows
# are not the last len(y_pred) rows of the frame. The split depends only on
# the number of rows, test_size and random_state, so repeating it on the row
# positions with train_random_forest's defaults (0.2, 42) recovers which row
# each prediction belongs to.
_, test_positions = train_test_split(
    np.arange(len(df_overall)), test_size=0.2, random_state=42
)
assert len(test_positions) == len(y_pred), "recovered test split does not match y_pred"

# Sort chronologically so the price line is drawn left to right, and reorder
# the predictions the same way to keep each one attached to its row.
chronological = np.argsort(test_positions)
test_indices = test_positions[chronological]
signals_fig = plot_trading_signals(df_overall, np.asarray(y_pred)[chronological], test_indices)
signals_fig.show()

Fitting SARIMAX model...



Maximum Likelihood optimization failed to converge. Check mle_retvals




Training Random Forest model...

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.67      0.69       215
           1       0.79      0.82      0.80       327

    accuracy                           0.76       542
   macro avg       0.75      0.74      0.75       542
weighted avg       0.76      0.76      0.76       542



# SARIMAX and Random Forest Trading Model

## Model Components

### 1. SARIMAX Modeling
- Uses close price as endogenous variable
- Uses other features as exogenous variables
- Produces in-sample price predictions via `results.predict()` (no out-of-sample forecast is made)

### 2. Random Forest Classification
- Uses SARIMAX predictions as additional features
- Predicts buy/sell signals
- Includes feature importance analysis

### 3. Visualizations
- SARIMAX predictions vs actual prices
- Confusion matrix
- ROC curve
- Feature importance
- Final trading signals on price chart

### 4. Evaluation Metrics
- Classification report
- ROC-AUC score
- Confusion matrix

## Usage Instructions

1. **Data Preparation**
   - Ensure DataFrame has all required columns:
     - OHLC data (open, high, low, close)
     - Volume
     - Technical indicators (10_DMA, 30_DMA)
     - PCA components (PC1, PC2, PC3, PC4)
     - `signal` column containing the `BUY` label

2. **Model Execution**
   ```python
   run_combined_model(df)
   ```

3. **Output**
   - Displays all relevant plots
   - Prints performance metrics
   - Returns model objects and predictions

## Approach Benefits

1. **Two-Stage Prediction**
   - First predicts price movements using SARIMAX
   - Uses predictions to enhance signal generation

2. **Comprehensive Analysis**
   - Provides detailed visualization of results
   - Includes multiple performance metrics

3. **Model Interpretability**
   - Feature importance analysis
   - Visual representation of predictions
   - Clear performance metrics

4. **Easy Evaluation**
   - Multiple evaluation metrics
   - Visual confirmation of predictions
   - Trading signal visualization

