## Logistic Regression Grid Search Results

### Example Usage
```python
lr_500 = LR_training('LR_500_normal', train_data=train_df, save_path=None, test_size=0.15, threshold=0.98)
```
### Grid Search Results
- Best parameters found: {'C': 100, 'max_iter': 300, 'solver': 'saga'}
- Best estimator: LogisticRegression(C=100, max_iter=300, solver='saga')
- Logistic regression model score: 0.876136700080362

### Classification report
```text
              precision    recall  f1-score   support

         0.0       0.87      0.88      0.88     23584
         1.0       0.88      0.87      0.88     23702

    accuracy                           0.88     47286
   macro avg       0.88      0.88      0.88     47286
weighted avg       0.88      0.88      0.88     47286
```

### Classification report - Thresholded

```text
              precision    recall  f1-score   support

         0.0       0.99      0.12      0.21     23584
         1.0       0.53      1.00      0.69     23702

    accuracy                           0.56     47286
   macro avg       0.76      0.56      0.45     47286
weighted avg       0.76      0.56      0.45     47286
```

# Find Optimal Parameters for LR model

In [8]:
import yfinance as yf
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from scipy.signal import argrelextrema
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
import logging
import os
from pathlib import Path
import pickle
from tqdm import tqdm
from sklearn.model_selection import GridSearchCV

logging.basicConfig(level=logging.INFO)

In [9]:
class LR_training:
    """Grid-search, evaluate and optionally persist a logistic-regression model.

    Constructing an instance runs the whole pipeline: the input frame is
    copied, the ``target`` column is split off, the remaining columns are
    min-max scaled, a train/test split is made, and a ``GridSearchCV`` over
    ``LogisticRegression`` hyper-parameters is fitted and evaluated on the
    held-out split.  When ``save_path`` is truthy the fitted model, the
    scaler and both confusion-matrix figures are written below that directory.

    Attributes set along the way:
        train_x, test_x, train_y, test_y: arrays from ``train_test_split``;
            the label arrays are column vectors of shape ``(n, 1)``.
        best_params_, best_estimator_: results of the grid search.
        trained_model_: the most recently fitted estimator (the grid-search
            winner, or ``self.lr`` after ``fit_model``); this is what
            ``save_model`` pickles.
        predictions, score: hard predictions and ``score()`` on the test split.
        predictions_proba, predictions_proba_thresholded: class probabilities
            on the test split and the labels derived from them by
            ``_threshold``.
        cmd, cmd_thresholded: ``ConfusionMatrixDisplay`` objects built by
            ``confusion_matrix``.
    """

    def __init__(self, model_name: str, save_path: str, train_data: pd.DataFrame, test_size: float=0.05, threshold: float = 0.98):
        """Run the training pipeline.

        Args:
            model_name: Identifier used in saved file names and figure titles.
            save_path: Directory for the saved artifacts; nothing is saved
                when this is falsy (e.g. ``None``).
            train_data: Frame holding the feature columns plus a ``target``
                column.  It is copied, so the caller's frame is not modified.
            test_size: Fraction of rows held out for evaluation.
            threshold: Probability cut-off passed to ``_threshold``.
        """
        self.model_version = model_name
        self.save_path = save_path
        self.threshold = threshold
        self.test_size = test_size

        # Work on a copy: create_train_test() pops the target column.
        self.main_df = train_data.copy()

        # Scaler and the base estimator handed to the grid search.
        self.scaler = MinMaxScaler()
        self.lr = LogisticRegression()

        # Run logistic regression.
        self.create_train_test().grid_search()

        if save_path:
            self.save_model()

    def create_train_test(self):
        """Scale the features and split them into train and test sets.

        Removes ``target`` from ``self.main_df`` (so this can only be called
        once per instance) and returns ``self`` for chaining.
        """
        self.main_df['target'] = self.main_df['target'].astype('category')

        y = self.main_df.pop('target').to_numpy()
        y = y.reshape(y.shape[0], 1)
        # NOTE(review): the scaler is fitted on all rows before the split, so
        # the test rows' min/max influence the training features.  Left
        # unchanged to keep previously reported scores reproducible.
        x = self.scaler.fit_transform(self.main_df)

        # Fixed random_state keeps the split reproducible between runs.
        self.train_x, self.test_x, self.train_y, self.test_y = train_test_split(
            x, y, test_size=self.test_size, random_state=50, shuffle=True)

        logging.info('Created test and train data...')
        return self

    def _evaluate_on_test(self, model):
        """Store predictions, score and thresholded labels of *model* on the test split."""
        self.predictions = model.predict(self.test_x)
        self.score = model.score(self.test_x, self.test_y.ravel())
        logging.info(f'Logistic regression model score: {self.score}')

        # Public API instead of the private ``_predict_proba_lr``.
        self.predictions_proba = model.predict_proba(self.test_x)
        self.predictions_proba_thresholded = self._threshold(self.predictions_proba, self.threshold)

    def grid_search(self):
        """
        Perform grid search to find the best parameters for Logistic Regression

        The search uses 5-fold cross-validation scored by precision.  The
        winning estimator is evaluated on the test split and the confusion
        matrices are built.  Returns ``self`` for chaining.
        """
        param_grid = {
            'C': [0.1, 1, 10, 100],
            'solver': ['liblinear', 'saga'],
            'max_iter': [100, 200, 300]
        }

        grid_search = GridSearchCV(self.lr, param_grid, cv=5, scoring='precision')
        # Estimators expect 1-D labels; train_y is stored as a column vector.
        grid_search.fit(self.train_x, self.train_y.ravel())

        self.best_params_ = grid_search.best_params_
        self.best_estimator_ = grid_search.best_estimator_
        # GridSearchCV fits clones, so self.lr itself stays unfitted; remember
        # the estimator that was actually trained so save_model() can use it.
        self.trained_model_ = self.best_estimator_

        logging.info(f'Best parameters found: {self.best_params_}')
        logging.info(f'Best estimator: {self.best_estimator_}')

        # Predict the test data with the best estimator.
        self._evaluate_on_test(self.best_estimator_)
        logging.info(f'Predictions with threshold: {self.predictions_proba_thresholded}')
        self.confusion_matrix()
        return self

    def fit_model(self):
        """Fit the plain ``self.lr`` estimator (no search) and evaluate it.

        Returns ``self`` for chaining.  Unlike ``grid_search`` this does not
        rebuild the confusion matrices.
        """
        logging.info('Training model...')
        # ravel() avoids the column-vector DataConversionWarning.
        self.lr.fit(self.train_x, self.train_y.ravel())
        self.trained_model_ = self.lr

        self._evaluate_on_test(self.lr)
        return self

    def confusion_matrix(self):
        """Build confusion-matrix displays and log classification reports.

        Done once for the plain predictions and once for the thresholded
        ones.  Returns ``self`` for chaining.
        """
        y_true = self.test_y.ravel()

        cm = confusion_matrix(y_true, self.predictions)
        self.cmd = ConfusionMatrixDisplay(cm)

        logging.info("classification report")
        logging.info(classification_report(y_true, self.predictions))

        cm_thresholded = confusion_matrix(y_true, self.predictions_proba_thresholded)
        self.cmd_thresholded = ConfusionMatrixDisplay(cm_thresholded)

        logging.info("classification report with threshold")
        logging.info(classification_report(y_true, self.predictions_proba_thresholded))
        return self

    def _threshold(self, predictions, threshold):
        """Turn class probabilities into labels using a cut-off on column 0.

        A row is labelled 0 only when its first-column probability is
        strictly greater than ``threshold``; every other row is labelled 1.
        (Presumably column 0 is the probability of class 0 - confirm against
        the estimator's ``classes_``.)

        Args:
            predictions: 2-D array of per-class probabilities.
            threshold: Cut-off applied to ``predictions[:, 0]``.

        Returns:
            1-D integer array of 0/1 labels, one per row.
        """
        return np.where(predictions[:, 0] > threshold, 0, 1)

    def _save_pickles(self):
        """Pickle the trained model and the scaler into ``self.save_path``."""
        saved_models_dir = self.save_path
        Path(saved_models_dir).mkdir(parents=True, exist_ok=True)

        # Fall back to self.lr only if no training step recorded a model.
        model = getattr(self, 'trained_model_', self.lr)

        model_file = os.path.join(saved_models_dir, f'lr_{self.model_version}.sav')
        with open(model_file, 'wb') as fh:
            pickle.dump(model, fh)

        scaler_file = os.path.join(saved_models_dir, f'scaler_{self.model_version}.sav')
        with open(scaler_file, 'wb') as fh:
            pickle.dump(self.scaler, fh)

        logging.info(f'Saved the model and scaler in {saved_models_dir}')

    def _save_figures(self):
        """Write both confusion-matrix plots to ``<save_path>/figures``."""
        # If folder is non-existent, create it
        cm_path = os.path.join(self.save_path, 'figures')
        Path(cm_path).mkdir(parents=True, exist_ok=True)

        # plot() creates its own figure, so no extra plt.figure() is needed;
        # title and save go through the display's own axes/figure handles.
        display = self.cmd.plot()
        display.ax_.set_title(f'Confusion Matrix for {self.model_version}')
        display.figure_.savefig(os.path.join(cm_path, f'cm_{self.model_version}.jpg'))

        display = self.cmd_thresholded.plot()
        display.ax_.set_title(
            f'Confusion Matrix with Threshold {self.threshold * 100}% for {self.model_version}')
        display.figure_.savefig(os.path.join(cm_path, f'cm_thresholded_{self.model_version}.jpg'))
        logging.info(f'Figures saved in {cm_path}')

    def save_model(self):
        """Persist the trained model, the scaler and the confusion-matrix figures.

        Files are written under ``self.save_path`` (created if missing):
        ``lr_<name>.sav``, ``scaler_<name>.sav`` and ``figures/cm_*.jpg``.
        """
        self._save_pickles()
        self._save_figures()

## Train using relative Volume

In [10]:
# Read the training set from data/train500.csv; the first CSV column becomes the index.
train_csv = os.path.join('data', 'train500.csv')
train_df = pd.read_csv(train_csv, index_col=0)
train_df

Unnamed: 0,relative_volume,normalized_value,3_reg,5_reg,10_reg,20_reg,target
0,1.704803,0.000000,-1.238008e-02,-0.010873,-0.008358,-0.005767,0.0
1,0.595399,0.812497,-3.833234e-20,0.001292,0.002277,0.002852,1.0
2,0.429421,0.125013,-5.383641e-04,-0.001507,-0.001683,0.002107,0.0
3,1.114311,0.375006,8.612275e-03,0.006028,0.000776,-0.000189,1.0
4,1.274003,0.812507,4.626080e-03,0.008310,0.004896,0.001601,1.0
...,...,...,...,...,...,...,...
315230,1.121881,0.426447,2.750015e-01,-0.189001,0.162727,0.339736,0.0
315231,0.618599,0.850681,-2.245003e+00,-0.248000,0.646667,0.246474,1.0
315232,0.502217,0.260869,-7.150040e-01,-0.466002,-0.333697,0.196820,0.0
315233,0.965139,0.524875,2.310005e+00,1.498000,0.811576,0.727023,1.0


In [11]:
# Train and evaluate on the loaded frame; save_path=None means nothing is written to disk.
lr_500 = LR_training(
    model_name='LR_500_normal',
    save_path=None,
    train_data=train_df,
    test_size=0.15,
    threshold=0.98,
)

INFO:root:Created test and train data...
INFO:root:Best parameters found: {'C': 100, 'max_iter': 300, 'solver': 'saga'}
INFO:root:Best estimator: LogisticRegression(C=100, max_iter=300, solver='saga')
INFO:root:Logistic regression model score: 0.876136700080362
INFO:root:Predictions with threshold: [1 1 1 ... 1 1 1]
INFO:root:classification report
INFO:root:              precision    recall  f1-score   support

         0.0       0.87      0.88      0.88     23584
         1.0       0.88      0.87      0.88     23702

    accuracy                           0.88     47286
   macro avg       0.88      0.88      0.88     47286
weighted avg       0.88      0.88      0.88     47286

INFO:root:classification report with threshold
INFO:root:              precision    recall  f1-score   support

         0.0       0.99      0.12      0.21     23584
         1.0       0.53      1.00      0.69     23702

    accuracy                           0.56     47286
   macro avg       0.76      0.56     