##### Import Package

In [1]:
import os
import sys
# Add the parent of the current working directory to sys.path so that the
# project-local `src` package imported further down can be resolved.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from joblib import dump
from src.models import eval_model as evm
from src.models import eval_baseline as evb
# Enable IPython's autoreload extension in mode 2 so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [22]:
# Read the raw training set (path is relative to the working directory).
raw_train_csv = "../data/raw/train.csv"
df = pd.read_csv(raw_train_csv)

In [3]:
# Summarise the raw frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id_old       8000 non-null   int64  
 1   Id           8000 non-null   int64  
 2   GP           8000 non-null   int64  
 3   MIN          8000 non-null   float64
 4   PTS          8000 non-null   float64
 5   FGM          8000 non-null   float64
 6   FGA          8000 non-null   float64
 7   FG%          8000 non-null   float64
 8   3P Made      8000 non-null   float64
 9   3PA          8000 non-null   float64
 10  3P%          8000 non-null   float64
 11  FTM          8000 non-null   float64
 12  FTA          8000 non-null   float64
 13  FT%          8000 non-null   float64
 14  OREB         8000 non-null   float64
 15  DREB         8000 non-null   float64
 16  REB          8000 non-null   float64
 17  AST          8000 non-null   float64
 18  STL          8000 non-null   float64
 19  BLK   

In [4]:
# (rows, columns) of the raw frame.
df.shape

(8000, 22)

In [20]:
# Count how many rows fall into each TARGET_5Yrs value.
df['TARGET_5Yrs'].value_counts()

1    6669
0    1331
Name: TARGET_5Yrs, dtype: int64

In [6]:
# Summary statistics for every column; the min/max rows are a quick way to
# spot out-of-range values such as negative counts or percentages above 100.
df.describe(include='all')

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,...,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,6856.971,7798.5,62.777875,18.576662,7.267088,2.807037,6.231212,44.6089,0.264525,0.816562,...,1.947788,71.365825,1.077838,2.1685,3.2453,1.624513,0.648687,0.245212,1.257763,0.833625
std,3977.447579,2309.54541,17.118774,8.935263,4.318732,1.693373,3.584559,6.155453,0.384093,1.060964,...,1.252352,10.430447,0.78567,1.392224,2.085154,1.355986,0.407626,0.821037,0.72327,0.37244
min,4.0,3799.0,-8.0,2.9,0.8,0.3,0.8,21.3,-1.1,-3.1,...,0.0,-13.3,0.0,0.2,0.3,0.0,0.0,-17.9,0.1,0.0
25%,3413.75,5798.75,51.0,12.0,4.1,1.6,3.6,40.4,0.0,0.1,...,1.0,65.0,0.5,1.1,1.7,0.7,0.3,0.1,0.7,1.0
50%,6787.5,7798.5,63.0,16.8,6.3,2.4,5.4,44.4,0.3,0.8,...,1.7,71.4,0.9,1.9,2.8,1.3,0.6,0.2,1.1,1.0
75%,10299.25,9798.25,74.0,23.5,9.5,3.7,8.1,48.7,0.5,1.5,...,2.6,77.5,1.5,2.9,4.3,2.2,0.9,0.4,1.6,1.0
max,13798.0,11798.0,123.0,73.8,34.2,13.1,28.9,67.2,1.7,4.7,...,11.1,168.9,5.5,11.0,15.9,12.8,3.6,18.9,5.3,1.0


In [23]:
# Clean a copy so the raw frame `df` is left unchanged.
df_cleaned = df.copy()

In [24]:
# Replace every negative entry, in any column, with 0.
df_cleaned = df_cleaned.clip(lower=0)

In [25]:
# For each (made, attempted, percentage) triple: where nothing was made, set
# all three values to 0. The CALC*% columns are not among the raw columns, so
# these assignments are what first introduce them.
# NOTE(review): this also zeroes the attempts of rows that attempted shots but
# made none — confirm that discarding those attempts is intended.
shot_columns = [
    ('3P Made', '3PA', 'CALC3P%'),
    ('FGM', 'FGA', 'CALCFG%'),
    ('FTM', 'FTA', 'CALCFT%'),
]
for made, attempted, pct in shot_columns:
    none_made = df_cleaned[made] <= 0
    df_cleaned.loc[none_made, [made, attempted, pct]] = 0, 0, 0

In [26]:
# A row cannot have more makes than attempts; where it does, treat the whole
# (made, attempted, percentage) triple as invalid and set it to 0.
shot_columns = [
    ('3P Made', '3PA', 'CALC3P%'),
    ('FGM', 'FGA', 'CALCFG%'),
    ('FTM', 'FTA', 'CALCFT%'),
]
for made, attempted, pct in shot_columns:
    more_made_than_attempted = df_cleaned[made] > df_cleaned[attempted]
    df_cleaned.loc[more_made_than_attempted, [made, attempted, pct]] = 0, 0, 0

In [27]:
# Recompute each percentage as made / attempted * 100 for rows with makes > 0.
# Only the selected rows are written; the right-hand side is aligned on index.
# Assumes the preceding cleaning cells already zeroed rows where makes exceed
# attempts, so the divisor is positive for every selected row — verify if the
# cell order ever changes.
shot_columns = [
    ('3P Made', '3PA', 'CALC3P%'),
    ('FGM', 'FGA', 'CALCFG%'),
    ('FTM', 'FTA', 'CALCFT%'),
]
for made, attempted, pct in shot_columns:
    has_makes = df_cleaned[made] > 0
    df_cleaned.loc[has_makes, [pct]] = df_cleaned[made] / df_cleaned[attempted] * 100

In [28]:
# Sanity check: report any column of the cleaned frame that still contains
# negative values (prints nothing when the frame is clean).
# The loop walks df_cleaned's own columns so the derived CALC*% columns are
# checked as well, and so the cell does not raise a KeyError when it is re-run
# after columns have been dropped from df_cleaned.
for cols in df_cleaned.columns:
    chk_rows = int((df_cleaned[cols] < 0).sum())
    if chk_rows > 0:
        print(f'Column Name {cols},\tRows with Negative Value {chk_rows},\tPercentage {chk_rows/len(df_cleaned)*100}')

In [29]:
# Drop the raw percentage columns and the two id columns.
columns_to_drop = ['3P%', 'FT%', 'FG%', 'Id_old', 'Id']
df_cleaned = df_cleaned.drop(columns=columns_to_drop)

In [38]:
# Start TARGET_5Yrs_Inv as a plain copy of TARGET_5Yrs, then show the joint
# counts of the two columns.
label_columns = ['TARGET_5Yrs', 'TARGET_5Yrs_Inv']
df_cleaned['TARGET_5Yrs_Inv'] = df_cleaned['TARGET_5Yrs']
df_cleaned[label_columns].value_counts()

TARGET_5Yrs  TARGET_5Yrs_Inv
1            1                  6669
0            0                  1331
dtype: int64

In [39]:
# Build the inverted label (0 <-> 1 swapped) from the ORIGINAL target column
# rather than from TARGET_5Yrs_Inv itself, so that re-running this cell always
# yields the same result instead of flipping the labels back.
df_cleaned['TARGET_5Yrs_Inv'] = df_cleaned['TARGET_5Yrs'].replace({0: 1, 1: 0})

In [40]:
# Joint counts of the original and inverted labels; every row should pair a
# value with its opposite.
df_cleaned[['TARGET_5Yrs','TARGET_5Yrs_Inv']].value_counts()

TARGET_5Yrs  TARGET_5Yrs_Inv
1            0                  6669
0            1                  1331
dtype: int64

In [53]:
# Features are every column except the two label columns; the label is the
# inverted target.
label_columns = ['TARGET_5Yrs', 'TARGET_5Yrs_Inv']
x = df_cleaned.drop(columns=label_columns)
y = df_cleaned['TARGET_5Yrs_Inv']

In [54]:
from sklearn.model_selection import train_test_split

# Two stratified splits with the same fixed seed: first hold out 20% of all
# rows as the test set, then hold out 20% of the remainder for validation.
split_options = {'test_size': 0.2, 'random_state': 8}
x_data, x_test, y_data, y_test = train_test_split(
    x, y, stratify=y, **split_options
)
x_train, x_val, y_train, y_val = train_test_split(
    x_data, y_data, stratify=y_data, **split_options
)

In [55]:
# Share of each class in the training labels.
y_train.value_counts(normalize=True)

0    0.833594
1    0.166406
Name: TARGET_5Yrs_Inv, dtype: float64

In [56]:
# Share of each class in the validation labels.
y_val.value_counts(normalize=True)

0    0.833594
1    0.166406
Name: TARGET_5Yrs_Inv, dtype: float64

In [57]:
# Share of each class in the test labels.
y_test.value_counts(normalize=True)

0    0.83375
1    0.16625
Name: TARGET_5Yrs_Inv, dtype: float64

In [58]:
# Keep the feature and label column names as plain lists and display both.
x_col_names = list(x.columns)
y_col_names = ['TARGET_5Yrs_Inv']
(x_col_names, y_col_names)

(['GP',
  'MIN',
  'PTS',
  'FGM',
  'FGA',
  '3P Made',
  '3PA',
  'FTM',
  'FTA',
  'OREB',
  'DREB',
  'REB',
  'AST',
  'STL',
  'BLK',
  'TOV',
  'CALC3P%',
  'CALCFG%',
  'CALCFT%'],
 ['TARGET_5Yrs_Inv'])

In [60]:
# Write the prepared data to .npy files under ../data/processed.
processed_dir = '../data/processed'
# np.save does not create missing directories, so make sure the target exists
# (e.g. on a fresh checkout) before writing anything.
os.makedirs(processed_dir, exist_ok=True)

arrays_to_save = {
    # for cross validation training
    'sp_w2_x': x,
    'sp_w2_y': y,
    # for normal train/validate/test
    'sp_w2_x_train': x_train,
    'sp_w2_x_val': x_val,
    'sp_w2_x_test': x_test,
    'sp_w2_y_train': y_train,
    'sp_w2_y_val': y_val,
    'sp_w2_y_test': y_test,
    # column names, stored separately from the data arrays
    'sp_w2_x_col_name': x_col_names,
    'sp_w2_y_col_name': y_col_names,
}
for file_stem, data in arrays_to_save.items():
    np.save(f'{processed_dir}/{file_stem}', data)

In [62]:
# Save the cleaned frame as CSV without the index column.
cleaned_csv_path = '../data/processed/df_cleaned_nba_prediction.csv'
# to_csv fails if the output directory is missing, so create it when needed.
os.makedirs(os.path.dirname(cleaned_csv_path), exist_ok=True)
df_cleaned.to_csv(cleaned_csv_path, index=False)