<a href="https://colab.research.google.com/github/pyt3r/springboard-package/blob/master/capstone/2_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount Colab Drive

In [1]:
import pathlib

from google.colab import drive

# Attach Google Drive to the Colab runtime at /content/drive.
drive.mount('/content/drive')

# Base folder inside the mounted drive; the parquet inputs are resolved relative to it.
root = pathlib.Path('/content/drive/MyDrive/home/Research/Springboard/Colab Notebooks')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

## Read Features and Target

In [3]:
# Load the feature table from the project folder and preview its first five rows.
Features = pd.read_parquet( root.joinpath( '2. features.parquet' ) )
Features.head( 5 )

Unnamed: 0_level_0,sma(30),ema(30),"x(sma(15),sma(50))","bollUpper(20,2)","bollLower(20,2)","macdDistance(12, 26, 9)",rsiSma(14),rsiEma(14),mfi(14)
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2024-10-30,65422.667667,66722.644315,1.0,72663.833481,62161.088519,420.346515,66.640357,75.020891,57.71112
2024-10-31,65735.273667,66955.221805,1.0,72638.366976,62963.565024,310.777749,58.735537,58.709448,56.491327
2024-11-01,66030.263,67123.168955,1.0,72486.842207,63744.034793,166.342215,53.360096,54.033862,47.100807
2024-11-02,66314.592,67266.837741,1.0,72060.759885,64813.906115,41.115159,52.901738,52.755618,43.71217
2024-11-03,66537.046667,67364.444311,1.0,72017.14281,65127.02219,-87.963996,49.179247,48.963673,38.483594


In [4]:
# Load the target table and collapse it with squeeze() in a single expression,
# then preview the first five entries.
Target = pd.read_parquet( root.joinpath( '2. target.parquet' ) ).squeeze()
Target.head( 5 )

Unnamed: 0_level_0,Target
Date,Unnamed: 1_level_1
2024-10-30,0
2024-10-31,0
2024-11-01,0
2024-11-02,0
2024-11-03,0


In [5]:
# Row counts of both objects, followed by their total number of missing values.
print( Features.shape[0], Target.shape[0] )
print( Features.isna().sum().sum(), Target.isna().sum() )

# Frequency of each label value.
Target.value_counts()

315 315
0 0


Unnamed: 0_level_0,count
Target,Unnamed: 1_level_1
1,160
0,155


## Train Logistic Regression

In [6]:
def crossValidationForLogisticRegression( features, target, n_splits=5, random_state=42 ):
    """Cross-validate a scaled logistic regression using time-ordered splits.

    For each fold produced by ``TimeSeriesSplit`` a fresh
    ``StandardScaler`` -> ``LogisticRegression`` pipeline is fitted on the
    training rows and evaluated on the test rows.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table, one column per feature. Rows are used positionally
        (via ``.values``), so the index is ignored.
    target : pandas.Series
        Labels, positionally aligned with ``features``.
        NOTE(review): the scoring below (``predict_proba[:, 1]``, default
        ``f1_score``) presumes a binary 0/1 target -- confirm for new data.
    n_splits : int, default 5
        Number of splits passed to ``TimeSeriesSplit``.
    random_state : int, default 42
        Seed passed to ``LogisticRegression``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        * metrics: one row per fold with columns ``roc_auc``, ``accuracy``
          and ``f1``.
        * importances: indexed by feature name, one column per fold, holding
          the fitted coefficients of the (standardised) features.
    """
    metrics = []
    importances = []

    # Take the column names from the frame that was actually passed in.
    # Reading them from the notebook-level `Features` global would break (or
    # silently mislabel the coefficients) for any other input frame.
    featureNames = list( features.columns )

    X = features.values
    y = target.values
    tscv = TimeSeriesSplit( n_splits=n_splits )
    for trainIdx, testIdx in tscv.split(X):

        # A new pipeline per fold, so the scaler and the model are fitted
        # on that fold's training rows only.
        pipeline = Pipeline(steps=[
            ("scaler", StandardScaler()),
            ("logistic", LogisticRegression(
                solver       = 'liblinear',
                max_iter     = 2000,
                random_state = random_state
            ))
        ])
        pipeline.fit(X[trainIdx], y[trainIdx])

        y_pred  = pipeline.predict(X[testIdx])
        # Second column of predict_proba: probability of classes_[1].
        y_proba = pipeline.predict_proba(X[testIdx])[:, 1]

        metrics.append({
            'roc_auc'  : roc_auc_score( y[testIdx], y_proba ),
            'accuracy' : accuracy_score( y[testIdx], y_pred ),
            'f1'       : f1_score( y[testIdx], y_pred ),
        })

        lr = pipeline.named_steps["logistic"]
        importances.append( dict( zip( featureNames, lr.coef_[0] )))

    # Transpose so that features are rows and folds are columns.
    return pd.DataFrame( metrics ), pd.DataFrame( importances ).T


# Run the 5-split time-series cross-validation on the loaded features and target.
metrics, importances = crossValidationForLogisticRegression(
    features = Features,
    target   = Target,
    n_splits = 5,
)

# Each score averaged over the folds.
print( metrics.mean() )

# Average every feature's coefficient across the folds (axis=1), then rank
# the features by the magnitude of that average.
importances.mean( axis=1 ).abs().sort_values( ascending=False )


roc_auc     0.546083
accuracy    0.503846
f1          0.326203
dtype: float64


Unnamed: 0,0
rsiSma(14),0.583134
"macdDistance(12, 26, 9)",0.525296
"bollLower(20,2)",0.373629
ema(30),0.282009
sma(30),0.222065
"x(sma(15),sma(50))",0.180008
rsiEma(14),0.168882
"bollUpper(20,2)",0.149608
mfi(14),0.072163
