## Set up

Install required packages and load relevant data.

These functions derived from Roger's prepare data notebook

In [5]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from scipy import stats
from joblib import dump
from src.data import make_dataset
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from  matplotlib.ticker import FuncFormatter
import seaborn as sns

In [6]:
project_dir = Path.cwd().parent
report_dir = project_dir / 'reports'
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
interim_data_dir = data_dir / 'interim'
processed_data_dir = data_dir / 'processed'
models_dir = project_dir / 'models'

## Download and Prepare Data

In [7]:
competition = 'uts-advdsi-nba-career-prediction'
make_dataset.download_data(competition=competition,
                           path=raw_data_dir,
                           unzip=True)

In [8]:
df_train = pd.read_csv(raw_data_dir / 'train.csv')
df_train.describe()

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,...,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,6856.971,7798.5,62.777875,18.576662,7.267088,2.807037,6.231212,44.6089,0.264525,0.816562,...,1.947788,71.365825,1.077838,2.1685,3.2453,1.624513,0.648687,0.245212,1.257763,0.833625
std,3977.447579,2309.54541,17.118774,8.935263,4.318732,1.693373,3.584559,6.155453,0.384093,1.060964,...,1.252352,10.430447,0.78567,1.392224,2.085154,1.355986,0.407626,0.821037,0.72327,0.37244
min,4.0,3799.0,-8.0,2.9,0.8,0.3,0.8,21.3,-1.1,-3.1,...,0.0,-13.3,0.0,0.2,0.3,0.0,0.0,-17.9,0.1,0.0
25%,3413.75,5798.75,51.0,12.0,4.1,1.6,3.6,40.4,0.0,0.1,...,1.0,65.0,0.5,1.1,1.7,0.7,0.3,0.1,0.7,1.0
50%,6787.5,7798.5,63.0,16.8,6.3,2.4,5.4,44.4,0.3,0.8,...,1.7,71.4,0.9,1.9,2.8,1.3,0.6,0.2,1.1,1.0
75%,10299.25,9798.25,74.0,23.5,9.5,3.7,8.1,48.7,0.5,1.5,...,2.6,77.5,1.5,2.9,4.3,2.2,0.9,0.4,1.6,1.0
max,13798.0,11798.0,123.0,73.8,34.2,13.1,28.9,67.2,1.7,4.7,...,11.1,168.9,5.5,11.0,15.9,12.8,3.6,18.9,5.3,1.0


In [9]:
X_test = pd.read_csv(raw_data_dir / 'test.csv')
X_test

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,1,0,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,...,0.7,1.2,63.4,1.2,0.8,1.7,0.4,0.2,0.3,0.8
1,8194,1,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,...,1.8,2.5,75.3,0.5,0.9,1.5,3.5,0.6,0.0,1.8
2,3,2,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,...,1.8,2.7,71.2,1.3,3.3,4.5,2.5,1.3,0.3,2.0
3,8196,3,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,...,4.5,6.3,70.9,1.5,3.2,5.0,4.1,0.9,0.1,3.6
4,8197,4,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,...,1.1,1.3,76.9,0.2,0.6,0.9,1.5,0.5,-0.4,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3794,8175,3794,84,21.2,8.7,3.4,6.7,50.2,0.0,0.0,...,1.7,2.5,68.1,1.9,2.3,3.9,1.5,0.6,0.3,2.0
3795,8176,3795,49,16.3,6.4,2.9,6.6,44.4,-0.1,-0.4,...,1.0,1.9,50.2,1.7,2.8,4.4,0.4,0.4,0.4,0.7
3796,8178,3796,53,9.9,2.1,0.8,1.8,43.1,-0.4,-0.6,...,0.6,1.0,63.9,0.7,1.0,1.7,0.4,0.4,0.2,0.5
3797,8181,3797,89,38.3,14.5,5.4,11.8,45.2,0.5,1.2,...,2.5,2.9,89.2,1.5,4.0,5.5,3.7,1.3,0.3,2.4


In [10]:
df_train.drop(columns=['Id_old', 'Id'], inplace=True)
X_test.drop(columns=['Id_old'], inplace=True)
test_id = X_test.pop('Id')

## Modelling
TODO:
* Linear model - exploratory first step
* Consider PCA regression - Roger's PCA suggests high colinearity among predictors
* Use random forest as a good out of the box tree method to handle colinearity

In [35]:
target = 'TARGET_5Yrs'
X, y = make_dataset.separate_target(df_train, target=target)
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.1,
                                                  random_state=1)

### LogisticRegression

This is a basic Random Forest with no parameter tuning

In [36]:
from sklearn.linear_model import LogisticRegression


## Tuning

In [37]:
pipe = Pipeline([        ('scaler', StandardScaler()),
        ('pca', PCA()),
        ('classifier',LogisticRegression())
])

param_dist = {
    'pca__n_components': stats.randint(1, X_train.shape[1]),
    'classifier__penalty': ['l2','none'],
    'classifier__class_weight': ['balanced']
    
}

cv = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    random_state=42,
    n_iter=40,
    cv=10,
    n_jobs=7,
    verbose=10
)

cv.fit(X_train, y_train)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


RandomizedSearchCV(cv=10,
                   estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                             ('pca', PCA()),
                                             ('classifier',
                                              LogisticRegression())]),
                   n_iter=40, n_jobs=7,
                   param_distributions={'classifier__class_weight': ['balanced'],
                                        'classifier__penalty': ['l2', 'none'],
                                        'pca__n_components': <scipy.stats._distn_infrastructure.rv_frozen object at 0x1240d7358>},
                   random_state=42, verbose=10)

In [38]:
pd.DataFrame(cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__class_weight,param_classifier__penalty,param_pca__n_components,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.451176,0.038487,0.029319,0.009362,balanced,l2,15,"{'classifier__class_weight': 'balanced', 'clas...",0.644444,0.625,...,0.644444,0.651389,0.65,0.651389,0.652778,0.641667,0.613889,0.639444,0.013752,21
1,0.25807,0.052814,0.021751,0.003519,balanced,l2,8,"{'classifier__class_weight': 'balanced', 'clas...",0.630556,0.622222,...,0.622222,0.640278,0.640278,0.651389,0.652778,0.634722,0.606944,0.633889,0.013296,23
2,0.20867,0.01819,0.018968,0.006902,balanced,l2,7,"{'classifier__class_weight': 'balanced', 'clas...",0.631944,0.613889,...,0.626389,0.636111,0.636111,0.656944,0.655556,0.626389,0.608333,0.631806,0.014809,29
3,0.310107,0.016618,0.026474,0.015274,balanced,none,11,"{'classifier__class_weight': 'balanced', 'clas...",0.633333,0.633333,...,0.6375,0.65,0.648611,0.655556,0.65,0.631944,0.620833,0.639861,0.010245,19
4,0.23065,0.031819,0.020961,0.010697,balanced,l2,4,"{'classifier__class_weight': 'balanced', 'clas...",0.605556,0.573611,...,0.580556,0.590278,0.611111,0.616667,0.601389,0.593056,0.576389,0.59375,0.013934,32
5,0.177611,0.038864,0.018015,0.010941,balanced,none,3,"{'classifier__class_weight': 'balanced', 'clas...",0.602778,0.566667,...,0.568056,0.581944,0.594444,0.613889,0.597222,0.583333,0.573611,0.586389,0.014643,37
6,0.155736,0.014002,0.019119,0.006019,balanced,none,2,"{'classifier__class_weight': 'balanced', 'clas...",0.602778,0.569444,...,0.569444,0.591667,0.594444,0.609722,0.593056,0.586111,0.579167,0.588333,0.01241,35
7,0.236887,0.015424,0.017063,0.006611,balanced,none,12,"{'classifier__class_weight': 'balanced', 'clas...",0.654167,0.616667,...,0.629167,0.630556,0.647222,0.661111,0.659722,0.647222,0.613889,0.640417,0.016095,12
8,0.181601,0.012601,0.013738,0.004632,balanced,none,6,"{'classifier__class_weight': 'balanced', 'clas...",0.631944,0.615278,...,0.623611,0.6375,0.638889,0.656944,0.654167,0.625,0.6125,0.632083,0.014211,27
9,0.138145,0.030769,0.015635,0.005036,balanced,none,1,"{'classifier__class_weight': 'balanced', 'clas...",0.601389,0.544444,...,0.541667,0.568056,0.573611,0.602778,0.588889,0.570833,0.566667,0.571528,0.02012,40


In [39]:
preds = cv.predict(X_val)
probs = cv.predict_proba(X_val)

In [40]:
fpr, tpr, thresholds = roc_curve(y_val, probs[:, 1])
roc_auc_score(y_val, probs[:, 1])

0.6769336734144793

In [32]:
print(len(y_val))
confusion_matrix(y_val, preds)

800


array([[ 80,  36],
       [226, 458]])

In [33]:
from kaggle.api.kaggle_api_extended import KaggleApi
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())
api = KaggleApi()
api.authenticate()

In [34]:
preds = cv.predict(X_test)
pred_name = 'TARGET_5Yrs'
pred_path = processed_data_dir / 'preds_rf_cv.csv'

submission = pd.DataFrame({'id':test_id,'TARGET_5Yrs': preds})

submission.to_csv(pred_path, index = False)

api.competition_submit(file_name=pred_path,
                       message="including randomised CV search",
                       competition=competition,
                       quiet=False)

100%|██████████| 24.9k/24.9k [00:09<00:00, 2.59kB/s]


Successfully submitted to [UTS AdvDSI] NBA Career Prediction