In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from data import resample_nba_data as re
from data import clean_and_split_nba_data as clean
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from models import plot_validation_curve as vc
from models import eval_model as evm
from joblib import dump
import joblib
import altair as alt

In [2]:
sp_wk3_linear_reg_model	 = joblib.load("../models/sp_wk3_linear_reg_model.joblib")
sp_wk3_logistic_reg_model = joblib.load("../models/sp_wk3_logistic_reg_model.joblib")
sp_wk3_rf_downsample_model = joblib.load("../models/sp_wk3_rf_downsample_model.joblib")
sp_wk3_rf_upsample_model = joblib.load("../models/sp_wk3_rf_upsample_model.joblib")
sp_wk3_xgboost_downsample_model = joblib.load("../models/sp_wk3_xgboost_downsample_model.joblib")
sp_wk3_xgboost_upsample_model = joblib.load("../models/sp_wk3_xgboost_upsample_model.joblib")
sp_wk3_kmean_model = joblib.load("../models/sp_wk3_kmean_model.joblib")
sp_wk3_gussian_mixture_model = joblib.load("../models/sp_wk3_gussian_mixture_model.joblib")


scaler = np.load('../models/sp_wk3_scaler_logistic_regression.joblib',allow_pickle=True)

In [3]:
df=pd.read_csv('../data/raw/train.csv')

In [4]:
def clean_nba_data(df):
    """
    Imports the file and splits it into Train, Valdiation and Test set.
    It returns all splits.
    
    Parameters
    ----------
    file : input csv file with the path
    """
    
    df_cleaned = df.copy()
    df_cleaned[ df_cleaned<0 ] = 0
    df_cleaned.loc[df_cleaned['3P Made'] > df_cleaned['3PA'], ['3P Made' , '3PA', 'CALC3P%']] = 0, 0, 0
    df_cleaned.loc[df_cleaned['FGM'] > df_cleaned['FGA'], ['FGM', 'FGA', 'CALCFG%']] = 0, 0, 0
    df_cleaned.loc[df_cleaned['FTM'] > df_cleaned['FTA'], ['FTM', 'FTA', 'CALCFT%']] = 0, 0, 0
    df_cleaned.loc[df_cleaned['3P Made'] > 0, ['CALC3P%']] = df_cleaned['3P Made']/df_cleaned['3PA']*100
    df_cleaned.loc[df_cleaned['FGM'] > 0, ['CALCFG%']] =df_cleaned['FGM']/df_cleaned['FGA']*100
    df_cleaned.loc[df_cleaned['FTM'] > 0, ['CALCFT%']] = df_cleaned['FTM']/df_cleaned['FTA']*100
    df_cleaned = df_cleaned.drop(['3P%','FT%','FG%','Id_old','Id','TARGET_5Yrs'],axis=1)
    df_cleaned = df_cleaned.fillna(0)
    return df_cleaned

In [5]:
x_data_stg1 = clean_nba_data(df)

In [6]:
x_data_stg1.shape

(8000, 19)

In [7]:
x_data_stg1.columns

Index(['GP', 'MIN', 'PTS', 'FGM', 'FGA', '3P Made', '3PA', 'FTM', 'FTA',
       'OREB', 'DREB', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'CALC3P%', 'CALCFG%',
       'CALCFT%'],
      dtype='object')

In [13]:
def _pred_prob_for_model(mod,df):
    y_pred=0.0
    y_pred=mod.predict_proba(df)
    print('Classes',mod.classes_)
#     print('Unique Values ',np.unique(y_pred,return_counts=True))
    print('Unique Values ',np.unique(y_pred.round(),return_counts=True))
    print('Unique Values for Prediction Class',np.unique(y_pred[:,1].round(),return_counts=True))
    return y_pred

# For GMM

In [16]:
y_pred_gmm = sp_wk3_gussian_mixture_model.predict(x_data_stg1)
y_pred_prob_gmm = sp_wk3_gussian_mixture_model.predict_proba(x_data_stg1)

In [18]:
print('Unique Values for Prediction Class',np.unique(y_pred_gmm,return_counts=True))

Unique Values for Prediction Class (array([0, 1], dtype=int64), array([5651, 2349], dtype=int64))


In [19]:
print('Unique Values for Prediction Class',np.unique(y_pred_prob_gmm[:,0].round(),return_counts=True))
print('Unique Values for Prediction Class',np.unique(y_pred_prob_gmm[:,1].round(),return_counts=True))

Unique Values for Prediction Class (array([0., 1.]), array([2349, 5651], dtype=int64))
Unique Values for Prediction Class (array([0., 1.]), array([5651, 2349], dtype=int64))


In [21]:
df['gmm_probability']=y_pred_prob_gmm[:,1]

# Logistic Regression

In [22]:
x_data_stg1_lir= scaler.fit_transform(x_data_stg1)

In [23]:
y_pred_log_reg = _pred_prob_for_model(sp_wk3_logistic_reg_model,x_data_stg1_lir)

Classes [0 1]
Unique Values  (array([0., 1.]), array([8000, 8000], dtype=int64))
Unique Values for Prediction Class (array([0., 1.]), array([2418, 5582], dtype=int64))


In [24]:
df['logistic_reg_probability']=y_pred_log_reg[:,1]

# Random Forest Down-Sample Prediction

In [25]:
y_pred_rf_ds = _pred_prob_for_model(sp_wk3_rf_downsample_model,x_data_stg1)

Classes [0 1]
Unique Values  (array([0., 1.]), array([8000, 8000], dtype=int64))
Unique Values for Prediction Class (array([0., 1.]), array([3188, 4812], dtype=int64))


In [26]:
df['rf_ds_probability']=y_pred_rf_ds[:,1]

# Random Forest Up-Sample Prediction

In [27]:
y_pred_rf_us = _pred_prob_for_model(sp_wk3_rf_upsample_model,x_data_stg1)

Classes [0 1]
Unique Values  (array([0., 1.]), array([8000, 8000], dtype=int64))
Unique Values for Prediction Class (array([0., 1.]), array([3419, 4581], dtype=int64))


In [28]:
df['rf_us_probability']=y_pred_rf_us[:,1]

# XGBoost Down-Sample Prediction

In [29]:
y_pred_xg_ds = _pred_prob_for_model(sp_wk3_xgboost_downsample_model,x_data_stg1)

Classes [0 1]
Unique Values  (array([0., 1.], dtype=float32), array([8000, 8000], dtype=int64))
Unique Values for Prediction Class (array([0., 1.], dtype=float32), array([3135, 4865], dtype=int64))


In [30]:
df['xg_ds_probability']=y_pred_xg_ds[:,1]

# XGBoost Up-Sample Prediction

In [31]:
y_pred_xg_us = _pred_prob_for_model(sp_wk3_xgboost_upsample_model,x_data_stg1)

Classes [0 1]
Unique Values  (array([0., 1.], dtype=float32), array([8000, 8000], dtype=int64))
Unique Values for Prediction Class (array([0., 1.], dtype=float32), array([3312, 4688], dtype=int64))


In [32]:
df['xg_us_probability']=y_pred_xg_us[:,1]

In [33]:
df

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,STL,BLK,TOV,TARGET_5Yrs,gmm_probability,logistic_reg_probability,rf_ds_probability,rf_us_probability,xg_ds_probability,xg_us_probability
0,10556,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,...,1.1,0.2,1.6,1,0.0,0.510778,0.637954,0.585195,0.676121,0.673573
1,5342,3800,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,...,0.5,0.6,1.4,1,1.0,0.514854,0.637954,0.581390,0.676121,0.673573
2,5716,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,...,0.4,0.2,0.6,1,0.0,0.501034,0.586783,0.474740,0.487850,0.508030
3,13790,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,...,0.4,0.1,1.9,1,0.0,0.505454,0.616250,0.573161,0.612665,0.596278
4,5470,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,...,0.4,0.6,0.7,1,0.0,0.502677,0.595122,0.509322,0.564377,0.562738
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,2996,11794,32,9.2,1.8,0.7,1.8,40.3,-0.1,-0.2,...,0.3,0.2,0.4,1,1.0,0.493806,0.377884,0.432964,0.362439,0.386535
7996,11679,11795,54,6.0,1.8,0.7,1.4,48.7,0.1,0.1,...,0.0,0.3,0.3,1,0.0,0.495506,0.368252,0.424285,0.395084,0.377646
7997,5537,11796,85,28.2,10.7,4.0,9.0,45.1,0.2,0.6,...,1.2,0.2,1.8,1,0.0,0.512967,0.637954,0.586038,0.676121,0.673573
7998,1292,11797,39,7.7,2.5,1.0,2.3,40.1,-0.3,-0.5,...,0.3,0.3,0.5,1,1.0,0.494742,0.368252,0.423860,0.373433,0.377646


In [34]:
df.to_csv('../data/processed/Train_Stg1_Output.csv',index=False)