In [5]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import brier_score_loss, confusion_matrix, classification_report, balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option('display.max_columns', 50)

## Modeling

- Simple Data
    - Logistic Regression
        - ~~simple~~
        - ~~tuned + cross validation~~
    - Gaussian NB
        - simple
        - cross validation
    - Random Forest
        - untuned
        - tuned
    - XGBoost
        - untuned
        - tuned
- Scaled Data
    - Logistic Regression
        - simple
        - cross validation
    - Gaussian NB
        - simple
        - cross validation
    - Random Forest
        - untuned
        - tuned
    - XGBoost
        - untuned
        - tuned
- PCA 
    - Logistic Regression
        - simple
        - cross validation
    - Gaussian NB
        - simple
        - cross validation
    - Random Forest
        - untuned
        - tuned
    - XGBoost
        - untuned
        - tuned

In [6]:
final_data = pd.read_csv('computed_data/reg_avg_data.csv')
st1_data = final_data[final_data['Season']<2017]
st1_data_x = st1_data.iloc[:,4:-1]
st1_data_y = st1_data.iloc[:,-1]
st2_data = final_data[final_data['Season']<2023]
st2_data_x = st2_data.iloc[:,4:-1]
st2_data_y = st2_data.iloc[:,-1]

In [7]:
scoring = 'neg_brier_score'

#Creating model class so that testing and tuning is easy to run all the different models.
class Model_Data:
    def __init__(self, model_ud, X, y, scoring='f1'):
        self.user_defined_model=model_ud
        self.X= X
        self.y = y
        self.scoring = scoring
        
    def split(self, test_size):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, test_size = test_size)
    
    '''@staticmethod
    def scaledata(X_train, X_test):
        scaler = StandardScaler()
        idx=X_train.columns.get_loc("is_lhb")
        colnames = X_train.columns[0:idx]
        features_train= X_train[colnames]
        features_test= X_test[colnames]
        scaler.fit(features_train)
        features_train = scaler.transform(features_train)
        X_train[colnames] = features_train
        features_test = scaler.transform(features_test)
        X_test[colnames] = features_test
        return X_train, X_test'''
            
    def tune(self, params):

        cv = KFold(n_splits=3)
        self.rand_search = GridSearchCV(estimator=self.user_defined_model, param_grid=params, n_jobs=8, 
                                         cv=cv, scoring=self.scoring, verbose=2)

            
        self.rand_search = self.rand_search.fit(self.X_train, self.y_train)
        # summarize results
        print("Best: %f using %s" % (self.rand_search.best_score_, self.rand_search.best_params_))

        means = self.rand_search.cv_results_['mean_test_score']
        stds = self.rand_search.cv_results_['std_test_score']
        parameters = self.rand_search.cv_results_['params']
        for mean, stdev, parameters in zip(means, stds, parameters):
            print("%f (%f) with: %r" % (mean, stdev, parameters))
            
    def tune_comp(self, params):

        cv = KFold(n_splits=2)
        self.rand_search = GridSearchCV(estimator=self.user_defined_model, param_grid=params, n_jobs=8,
                                     cv=cv, scoring=self.scoring, verbose=2)
            
        self.rand_search = self.rand_search.fit(self.X, self.y)
        # summarize results
        print("Best: %f using %s" % (self.rand_search.best_score_, self.rand_search.best_params_))

        means = self.rand_search.cv_results_['mean_test_score']
        stds = self.rand_search.cv_results_['std_test_score']
        parameters = self.rand_search.cv_results_['params']
        for mean, stdev, parameters in zip(means, stds, parameters):
            print("%f (%f) with: %r" % (mean, stdev, parameters))
    
    def fit(self):
        self.user_defined_model.set_params(**self.rand_search.best_params_)
        self.user_defined_model = self.user_defined_model.fit(self.X_train, self.y_train)
        
    def fit_comp(self):
        self.user_defined_model.set_params(**self.rand_search.best_params_)
        self.user_defined_model = self.user_defined_model.fit(self.X, self.y)
    
    def predict(self, input_value=None):
        if input_value == None:
            result = self.user_defined_model.predict(self.X_test)
        else: 
            result = self.user_defined_model.predict(np.array([input_value]))
        return result

    def AccuracyReport(self, predictions, input_value=None):
        if input_value == None:
            print(confusion_matrix(self.y_test,predictions))
            print(classification_report(self.y_test,predictions))
            acc2 = balanced_accuracy_score(self.y_test,predictions)
            acc3 = brier_score_loss(self.y_test,predictions)
            print(acc2)
            print(acc3)
        else:
            print(confusion_matrix(np.array([input_value]),predictions))
            print(classification_report(np.array([input_value]),predictions))
            acc2 = balanced_accuracy_score(np.array([input_value]),predictions)
            acc3 = brier_score_loss(self.y_test,predictions)
            print(acc2)

params_rand = {
    "learning_rate"    : [0.1, 0.25, 0.3] ,
    "n_estimators": [50, 100 , 150],
    "max_depth"        : [3, 4, 5],
    "min_child_weight" : [1, 3, 5],
    "colsample_bytree" : [0.5, 0.7, 0.8 ]
}

params_logreg =    {
    'penalty' : ['l1', 'l2', 'elasticnet'],
    'C' : np.logspace(-4, 4, 20),
    'solver' : ['lbfgs','newton-cg','liblinear','sag','saga'],
    'max_iter' : [100, 1000,2500, 5000]
}

#### Basic Logistic Regression model

In [8]:
lr = Model_Data(LogisticRegression(), st1_data_x, st1_data_y, 'neg_brier_score')
lr.split(test_size=0.3)
lr.tune(params_logreg)
lr.fit()
preds = lr.predict()
lr.AccuracyReport(preds)

Fitting 3 folds for each of 1200 candidates, totalling 3600 fits


KeyboardInterrupt: 

#### Logistic Regression Tuned + KFold

#### Random Forest Simple