# Predict Trained Model on Actual Test Data 

In [41]:
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import joblib
from sklearn.preprocessing import Normalizer
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Load the saved model

In [3]:
rf_upsample_mod = joblib.load("../models/sp_wk2_upsample_model.joblib")
rf_upsample_mod

RandomForestClassifier(max_depth=26, max_features=5, min_samples_leaf=6,
                       n_estimators=378, random_state=8)

In [68]:
rf_downsample_mod = joblib.load("../models/sp_wk2_downsample_model.joblib")
rf_downsample_mod

RandomForestClassifier(max_depth=2, max_features=10, min_samples_leaf=11,
                       n_estimators=230, random_state=8)

All good.

Load the Test data from csv file

In [43]:
df=pd.read_csv('../data/raw/test.csv')

Performing basic investigation into the data 

In [44]:
df.head(5)

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,1,0,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,...,0.7,1.2,63.4,1.2,0.8,1.7,0.4,0.2,0.3,0.8
1,8194,1,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,...,1.8,2.5,75.3,0.5,0.9,1.5,3.5,0.6,0.0,1.8
2,3,2,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,...,1.8,2.7,71.2,1.3,3.3,4.5,2.5,1.3,0.3,2.0
3,8196,3,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,...,4.5,6.3,70.9,1.5,3.2,5.0,4.1,0.9,0.1,3.6
4,8197,4,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,...,1.1,1.3,76.9,0.2,0.6,0.9,1.5,0.5,-0.4,0.9


In [45]:
df.shape

(3799, 21)

In [46]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3799 entries, 0 to 3798
Data columns (total 21 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Id_old   3799 non-null   int64  
 1   Id       3799 non-null   int64  
 2   GP       3799 non-null   int64  
 3   MIN      3799 non-null   float64
 4   PTS      3799 non-null   float64
 5   FGM      3799 non-null   float64
 6   FGA      3799 non-null   float64
 7   FG%      3799 non-null   float64
 8   3P Made  3799 non-null   float64
 9   3PA      3799 non-null   float64
 10  3P%      3799 non-null   float64
 11  FTM      3799 non-null   float64
 12  FTA      3799 non-null   float64
 13  FT%      3799 non-null   float64
 14  OREB     3799 non-null   float64
 15  DREB     3799 non-null   float64
 16  REB      3799 non-null   float64
 17  AST      3799 non-null   float64
 18  STL      3799 non-null   float64
 19  BLK      3799 non-null   float64
 20  TOV      3799 non-null   float64
dtypes: float64(18)

In [47]:
df.describe()

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
count,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,...,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0,3799.0
mean,7010.614109,1899.0,62.853909,18.650224,7.328034,2.835404,6.30258,44.599079,0.255962,0.79692,...,1.399842,1.953567,71.612924,1.096025,2.179495,3.275783,1.636483,0.653593,0.257726,1.25791
std,3954.173641,1096.821164,17.15174,8.727259,4.294724,1.688427,3.579221,6.040168,0.380987,1.052862,...,0.92614,1.250376,10.457336,0.785678,1.371935,2.070646,1.335496,0.410573,0.63966,0.712449
min,1.0,0.0,6.0,3.7,0.7,0.3,0.8,25.1,-1.0,-2.7,...,0.0,0.0,23.7,0.0,0.2,0.3,0.0,0.0,-7.1,0.1
25%,3644.0,949.5,51.0,12.2,4.2,1.6,3.7,40.5,0.0,0.1,...,0.7,1.0,65.0,0.5,1.2,1.8,0.6,0.4,0.1,0.7
50%,7062.0,1899.0,63.0,17.0,6.4,2.5,5.5,44.6,0.3,0.8,...,1.2,1.7,71.5,0.9,1.9,2.8,1.3,0.6,0.2,1.1
75%,10402.5,2848.5,74.0,23.3,9.4,3.7,8.1,48.5,0.5,1.5,...,1.9,2.6,78.0,1.5,2.9,4.3,2.3,0.9,0.4,1.6
max,13792.0,3798.0,126.0,68.0,33.0,13.4,26.2,74.6,1.6,4.3,...,7.8,9.8,127.1,6.9,12.0,18.5,9.0,2.7,14.8,5.2


In [48]:
df_cleaned = df.copy()

In [49]:
df_cleaned[ df_cleaned<0 ] = 0

In [50]:
df_cleaned.loc[df_cleaned['3P Made'] <= 0, ['3P Made', '3PA', 'CALC3P%']] = 0, 0, 0
df_cleaned.loc[df_cleaned['FGM'] <= 0, ['FGM', 'FGA', 'CALCFG%']] = 0, 0, 0
df_cleaned.loc[df_cleaned['FTM'] <= 0, ['FTM', 'FTA', 'CALCFT%']] = 0, 0, 0

In [51]:
df_cleaned.loc[df_cleaned['3P Made'] > df_cleaned['3PA'], ['3P Made' , '3PA', 'CALC3P%']] = 0, 0, 0
df_cleaned.loc[df_cleaned['FGM'] > df_cleaned['FGA'], ['FGM', 'FGA', 'CALCFG%']] = 0, 0, 0
df_cleaned.loc[df_cleaned['FTM'] > df_cleaned['FTA'], ['FTM', 'FTA', 'CALCFT%']] = 0, 0, 0

In [52]:
df_cleaned.loc[df_cleaned['3P Made'] > 0, ['CALC3P%']] = df_cleaned['3P Made']/df_cleaned['3PA']*100
df_cleaned.loc[df_cleaned['FGM'] > 0, ['CALCFG%']] =df_cleaned['FGM']/df_cleaned['FGA']*100
df_cleaned.loc[df_cleaned['FTM'] > 0, ['CALCFT%']] = df_cleaned['FTM']/df_cleaned['FTA']*100

In [53]:
for cols in df.columns:
    chk_rows = df_cleaned[df_cleaned[cols]<0].shape[0]
    if chk_rows > 0 :
        print(f'Column Name {cols},\tRows with Negative Value {chk_rows},\tPercentage {chk_rows/len(df)*100}')

In [54]:
df_cleaned

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,OREB,DREB,REB,AST,STL,BLK,TOV,CALC3P%,CALCFG%,CALCFT%
0,1,0,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,...,1.2,0.8,1.7,0.4,0.2,0.3,0.8,33.333333,43.243243,58.333333
1,8194,1,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,...,0.5,0.9,1.5,3.5,0.6,0.0,1.8,35.294118,45.679012,72.000000
2,3,2,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,...,1.3,3.3,4.5,2.5,1.3,0.3,2.0,26.315789,46.226415,66.666667
3,8196,3,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,...,1.5,3.2,5.0,4.1,0.9,0.1,3.6,27.777778,42.767296,71.428571
4,8197,4,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,...,0.2,0.6,0.9,1.5,0.5,0.0,0.9,29.411765,40.000000,84.615385
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3794,8175,3794,84,21.2,8.7,3.4,6.7,50.2,0.0,0.0,...,1.9,2.3,3.9,1.5,0.6,0.3,2.0,0.000000,50.746269,68.000000
3795,8176,3795,49,16.3,6.4,2.9,6.6,44.4,0.0,0.0,...,1.7,2.8,4.4,0.4,0.4,0.4,0.7,0.000000,43.939394,52.631579
3796,8178,3796,53,9.9,2.1,0.8,1.8,43.1,0.0,0.0,...,0.7,1.0,1.7,0.4,0.4,0.2,0.5,0.000000,44.444444,60.000000
3797,8181,3797,89,38.3,14.5,5.4,11.8,45.2,0.5,1.2,...,1.5,4.0,5.5,3.7,1.3,0.3,2.4,41.666667,45.762712,86.206897


In [55]:
df_cleaned = df_cleaned.drop(['3P%','FT%','FG%','Id_old','Id'],axis=1)

In [56]:
df_cleaned.shape

(3799, 19)

In [57]:
x=df_cleaned.copy()

Applying the same Feature Scaling 

#### Predict Result and Export to CSV for uploading to Kaggle

In [69]:
# y_upsample_pred_proba=rf_upsample_mod.predict_proba(x)
y_downsample_pred_proba=rf_downsample_mod.predict_proba(x)

In [70]:
# y_upsample_pred=rf_upsample_mod.predict(x)
y_downsample_pred=rf_downsample_mod.predict(x)

In [71]:
# print(np.unique(y_upsample_pred_proba,return_counts=True))
print(np.unique(y_downsample_pred_proba,return_counts=True))

(array([0.22936107, 0.23305444, 0.23313792, ..., 0.76686208, 0.76694556,
       0.77063893]), array([1, 1, 1, ..., 1, 1, 1], dtype=int64))


In [72]:
# print(np.unique(y_upsample_pred,return_counts=True))
print(np.unique(y_downsample_pred,return_counts=True))

(array([0, 1], dtype=int64), array([2255, 1544], dtype=int64))


In [73]:
y_upsample_pred_proba, y_downsample_pred_proba

(array([[0.49215471, 0.50784529],
        [0.49777465, 0.50222535],
        [0.78563487, 0.21436513],
        ...,
        [0.51817487, 0.48182513],
        [0.83045137, 0.16954863],
        [0.60688437, 0.39311563]]),
 array([[0.41469435, 0.58530565],
        [0.44610223, 0.55389777],
        [0.73297866, 0.26702134],
        ...,
        [0.33737208, 0.66262792],
        [0.74379788, 0.25620212],
        [0.39603466, 0.60396534]]))

In [74]:
print(rf_upsample_mod.classes_)
print(rf_downsample_mod.classes_)

[0 1]
[0 1]


In [75]:
# df_cleaned_upsample_result = df.copy()
df_cleaned_downsample_result = df.copy()

In [76]:
# df_cleaned_upsample_result['TARGET_5Yrs'] = y_upsample_pred_proba[:,0]
df_cleaned_downsample_result['TARGET_5Yrs'] = y_downsample_pred_proba[:,0]

In [77]:
# print(df_cleaned_upsample_result['TARGET_5Yrs'].round().value_counts())
print(df_cleaned_downsample_result['TARGET_5Yrs'].round().value_counts())

1.0    2255
0.0    1544
Name: TARGET_5Yrs, dtype: int64


In [78]:
# df_cleaned_upsample_result.to_csv('../data/processed/TestResult_RF_UpSample_Result.csv',index=False,columns=['Id', 'TARGET_5Yrs'])
df_cleaned_downsample_result.to_csv('../data/processed/TestResult_RF_DownSample_Result.csv',index=False,columns=['Id', 'TARGET_5Yrs'])