## Set up

Install required packages and load relevant data.

These functions derived from Roger's prepare data notebook

In [73]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from scipy import stats
from joblib import dump
from src.data import make_dataset
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
from  matplotlib.ticker import FuncFormatter
import seaborn as sns

In [80]:
project_dir = Path.cwd().parent
report_dir = project_dir / 'reports'
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
interim_data_dir = data_dir / 'interim'
processed_data_dir = data_dir / 'processed'
models_dir = project_dir / 'models'

## Download and Prepare Data

In [63]:
competition = 'uts-advdsi-nba-career-prediction'
make_dataset.download_data(competition=competition,
                           path=raw_data_dir,
                           unzip=True)

In [64]:
df_train = pd.read_csv(raw_data_dir / 'train.csv')
df_train.describe()

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,...,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,6856.971,7798.5,62.777875,18.576662,7.267088,2.807037,6.231212,44.6089,0.264525,0.816562,...,1.947788,71.365825,1.077838,2.1685,3.2453,1.624513,0.648687,0.245212,1.257763,0.833625
std,3977.447579,2309.54541,17.118774,8.935263,4.318732,1.693373,3.584559,6.155453,0.384093,1.060964,...,1.252352,10.430447,0.78567,1.392224,2.085154,1.355986,0.407626,0.821037,0.72327,0.37244
min,4.0,3799.0,-8.0,2.9,0.8,0.3,0.8,21.3,-1.1,-3.1,...,0.0,-13.3,0.0,0.2,0.3,0.0,0.0,-17.9,0.1,0.0
25%,3413.75,5798.75,51.0,12.0,4.1,1.6,3.6,40.4,0.0,0.1,...,1.0,65.0,0.5,1.1,1.7,0.7,0.3,0.1,0.7,1.0
50%,6787.5,7798.5,63.0,16.8,6.3,2.4,5.4,44.4,0.3,0.8,...,1.7,71.4,0.9,1.9,2.8,1.3,0.6,0.2,1.1,1.0
75%,10299.25,9798.25,74.0,23.5,9.5,3.7,8.1,48.7,0.5,1.5,...,2.6,77.5,1.5,2.9,4.3,2.2,0.9,0.4,1.6,1.0
max,13798.0,11798.0,123.0,73.8,34.2,13.1,28.9,67.2,1.7,4.7,...,11.1,168.9,5.5,11.0,15.9,12.8,3.6,18.9,5.3,1.0


In [65]:
X_test = pd.read_csv(raw_data_dir / 'test.csv')
X_test

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,1,0,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,...,0.7,1.2,63.4,1.2,0.8,1.7,0.4,0.2,0.3,0.8
1,8194,1,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,...,1.8,2.5,75.3,0.5,0.9,1.5,3.5,0.6,0.0,1.8
2,3,2,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,...,1.8,2.7,71.2,1.3,3.3,4.5,2.5,1.3,0.3,2.0
3,8196,3,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,...,4.5,6.3,70.9,1.5,3.2,5.0,4.1,0.9,0.1,3.6
4,8197,4,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,...,1.1,1.3,76.9,0.2,0.6,0.9,1.5,0.5,-0.4,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3794,8175,3794,84,21.2,8.7,3.4,6.7,50.2,0.0,0.0,...,1.7,2.5,68.1,1.9,2.3,3.9,1.5,0.6,0.3,2.0
3795,8176,3795,49,16.3,6.4,2.9,6.6,44.4,-0.1,-0.4,...,1.0,1.9,50.2,1.7,2.8,4.4,0.4,0.4,0.4,0.7
3796,8178,3796,53,9.9,2.1,0.8,1.8,43.1,-0.4,-0.6,...,0.6,1.0,63.9,0.7,1.0,1.7,0.4,0.4,0.2,0.5
3797,8181,3797,89,38.3,14.5,5.4,11.8,45.2,0.5,1.2,...,2.5,2.9,89.2,1.5,4.0,5.5,3.7,1.3,0.3,2.4


In [66]:
df_train.drop(columns=['Id_old', 'Id'], inplace=True)
X_test.drop(columns=['Id_old'], inplace=True)
test_id = X_test.pop('Id')

## Exploration

In [67]:
profile_report = ProfileReport(df_train,
                               title='Raw data report',
                               explorative=True)
profile_report.to_file(report_dir / 'profile_report.html')

Summarize dataset:   0%|          | 0/33 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

### Exploration Notes
TODO: summarise

## Modelling
TODO:
* Linear model - exploratory first step
* Consider PCA regression - Roger's PCA suggests high colinearity among predictors
* Use random forest as a good out of the box tree method to handle colinearity

In [68]:
target = 'TARGET_5Yrs'
X, y = make_dataset.separate_target(df_train, target=target)
X_train, X_val, y_train, y_val = train_test_split(X, y,
                                                  test_size=0.2,
                                                  random_state=42)

### Random Forest

This is a basic Random Forest with no parameter tuning

In [3]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=3, random_state=0, oob_score = True,class_weight = 'balanced')

clf.fit(X_train, y_train)

In [70]:
y_pred = clf.predict(X_train)
print(classification_report(y_train,y_pred))

              precision    recall  f1-score   support

           0       0.27      0.67      0.38      1068
           1       0.91      0.63      0.75      5332

    accuracy                           0.64      6400
   macro avg       0.59      0.65      0.56      6400
weighted avg       0.80      0.64      0.69      6400



In [71]:
confusion_matrix(y_train,y_pred)

array([[ 719,  349],
       [1957, 3375]])

In [30]:
y_pred = clf.predict(X_val)
y_p = clf.predict_proba(X_val)

In [31]:
print(classification_report(y_val,y_pred))

              precision    recall  f1-score   support

           0       0.26      0.65      0.37       263
           1       0.90      0.64      0.75      1337

    accuracy                           0.64      1600
   macro avg       0.58      0.64      0.56      1600
weighted avg       0.80      0.64      0.68      1600



In [55]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_val,y_p[:,[1]])
metrics.auc(fpr, tpr)

0.7095165102053005

In [56]:
y_test_pred = clf.predict(X_test)

In [57]:
submission = pd.DataFrame({'id':test_id,'TARGET_5Yrs': y_test_pred})

In [59]:
submission.to_csv('mark_submission.csv', index = False)

## Submissions
To submit, you can submit from the command line:
```bash
kaggle competitions submit -c uts-advdsi-nba-career-prediction -f submission.csv -m "Message"
```

## Tuning

In [88]:
pipe = Pipeline([
        ('classifier', RandomForestClassifier(oob_score = True))
])

param_dist = {
    'classifier__n_estimators': stats.randint(150, 1000),
    'classifier__max_depth': [3, 4, 5, 6, 7, 8, 9],
    'classifier__max_features': ["auto",'sqrt','log2'],
    'classifier__class_weight': ['balanced','balanced_subsample']
}

cv = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    random_state=42,
    n_iter=15,
    cv=5,
    n_jobs=7,
    verbose=10
)

cv.fit(X_train, y_train)

Fitting 5 folds for each of 15 candidates, totalling 75 fits


RandomizedSearchCV(cv=5,
                   estimator=Pipeline(steps=[('classifier',
                                              RandomForestClassifier(oob_score=True))]),
                   n_iter=15, n_jobs=7,
                   param_distributions={'classifier__class_weight': ['balanced',
                                                                     'balanced_subsample'],
                                        'classifier__max_depth': [3, 4, 5, 6, 7,
                                                                  8, 9],
                                        'classifier__max_features': ['auto',
                                                                     'sqrt',
                                                                     'log2'],
                                        'classifier__n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x12f134320>},
                   random_state=42, verbose=10)

In [89]:
preds = cv.predict(X_val)
probs = cv.predict_proba(X_val)

In [90]:
fpr, tpr, thresholds = roc_curve(y_val, probs[:, 1])
roc_auc_score(y_val, probs[:, 1])

0.7019233230289708

In [91]:
confusion_matrix(y_val, preds)

array([[ 119,  144],
       [ 260, 1077]])

In [92]:
from kaggle.api.kaggle_api_extended import KaggleApi
from dotenv import find_dotenv, load_dotenv
load_dotenv(find_dotenv())
api = KaggleApi()
api.authenticate()

In [93]:
preds = cv.predict(X_test)
pred_name = 'TARGET_5Yrs'
pred_path = processed_data_dir / 'preds_rf_cv.csv'

submission = pd.DataFrame({'id':test_id,'TARGET_5Yrs': preds})

submission.to_csv(pred_path, index = False)

api.competition_submit(file_name=pred_path,
                       message="including randomised CV search",
                       competition=competition,
                       quiet=False)

100%|██████████| 24.9k/24.9k [00:08<00:00, 3.17kB/s]


Successfully submitted to [UTS AdvDSI] NBA Career Prediction

In [1]:
now()

NameError: name 'now' is not defined

In [11]:
str(clf)

"RandomForestClassifier(class_weight='balanced', max_depth=3, oob_score=True,\n                       random_state=0)"