In [1]:
import os
import sys
# Put the parent of the current working directory on sys.path (once);
# presumably this is where the `src` package imported below lives.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

# OPTIONAL: load the IPython "autoreload" extension so edits to imported modules are picked up without restarting the kernel
%load_ext autoreload

# OPTIONAL: mode 2 reloads all modules before each cell runs, so as you change code in src, it gets loaded
%autoreload 2
import joblib
import math
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

from src.data import getAbsolute, resampling, explore_dataset as ex
from src.features import kpw_build_features, standardization
from src.models import cross_validation, save_predictions, blending
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split
import xgboost as xgb

## Explore Data

Examine the training and test sets to identify data issues and determine which data preparation steps are required.

In [2]:
# Print an overview of the training set; the output starts with a "dataframe info" section.
# NOTE(review): a bare file name is passed here while later cells read ../data/raw/train.csv —
# presumably explore() resolves the data directory itself; confirm.
ex.explore("train.csv")

=== dataframe info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 22 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id_old       8000 non-null   int64  
 1   Id           8000 non-null   int64  
 2   GP           8000 non-null   int64  
 3   MIN          8000 non-null   float64
 4   PTS          8000 non-null   float64
 5   FGM          8000 non-null   float64
 6   FGA          8000 non-null   float64
 7   FG%          8000 non-null   float64
 8   3P Made      8000 non-null   float64
 9   3PA          8000 non-null   float64
 10  3P%          8000 non-null   float64
 11  FTM          8000 non-null   float64
 12  FTA          8000 non-null   float64
 13  FT%          8000 non-null   float64
 14  OREB         8000 non-null   float64
 15  DREB         8000 non-null   float64
 16  REB          8000 non-null   float64
 17  AST          8000 non-null   float64
 18  STL          8000 non-nul

In [3]:
# Print the same overview for the test set (it has one column fewer than the training set in the output).
# NOTE(review): a bare file name is passed here while later cells read ../data/raw/test.csv —
# presumably explore() resolves the data directory itself; confirm.
ex.explore("test.csv")

=== dataframe info ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3799 entries, 0 to 3798
Data columns (total 21 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Id_old   3799 non-null   int64  
 1   Id       3799 non-null   int64  
 2   GP       3799 non-null   int64  
 3   MIN      3799 non-null   float64
 4   PTS      3799 non-null   float64
 5   FGM      3799 non-null   float64
 6   FGA      3799 non-null   float64
 7   FG%      3799 non-null   float64
 8   3P Made  3799 non-null   float64
 9   3PA      3799 non-null   float64
 10  3P%      3799 non-null   float64
 11  FTM      3799 non-null   float64
 12  FTA      3799 non-null   float64
 13  FT%      3799 non-null   float64
 14  OREB     3799 non-null   float64
 15  DREB     3799 non-null   float64
 16  REB      3799 non-null   float64
 17  AST      3799 non-null   float64
 18  STL      3799 non-null   float64
 19  BLK      3799 non-null   float64
 20  TOV      3799 non-null   floa

## Data Quality Concerns
- GP, 3P Made, 3PA, 3P%, FT% and BLK have negative minimum values - **Convert to absolute value**
- Potential outliers: there is a large jump between the 75th percentile and the maximum across all features
- Check that every "made" value does not exceed the corresponding "attempted" value
- All % values are a bit off and do not match made / attempted - **Consider dropping these fields or recreating them**
- BLK has outliers - **Fix it or drop these**
- 3PA and FTA contain 0 values - need to ensure 3P Made and FTM are also 0 in these cases
- Imbalanced data - 21 : 4 Ratio

## Data Preparation

Convert to absolute value

In [4]:
# Read each raw CSV and pass it through getAbsolute.abs together with a label naming the split
# (presumably this takes absolute values of the numeric columns, per the heading above — confirm).
raw_train = pd.read_csv("../data/raw/train.csv")
df_train_abs = getAbsolute.abs(raw_train, 'train')

raw_test = pd.read_csv("../data/raw/test.csv")
df_test_abs = getAbsolute.abs(raw_test, 'test')

## Feature Engineering

- Recalculate percentage features
- Add new features
- Drop features based on coefficient


In [5]:
# Apply the same feature-building routine to both splits, train first and then test.
build_features = kpw_build_features.build
df_train_clean = build_features(df_train_abs)
df_test_clean = build_features(df_test_abs)

In [6]:
# Separate the target from the features without mutating df_train_clean.
# (DataFrame.pop removes the column in place, so re-running the cell would raise KeyError.)
y = df_train_clean['TARGET_5Yrs']
# Drop the target, then skip the first two columns of each frame
# (presumably the Id_old / Id identifiers — confirm against kpw_build_features.build).
X = df_train_clean.drop(columns='TARGET_5Yrs').iloc[:, 2:]
X_test = df_test_clean.iloc[:, 2:]

In [7]:
# Convert the features, target and test features to NumPy arrays in one pass.
X, y, X_test = (np.asarray(part) for part in (X, y, X_test))

Create a validation set

In [8]:
# Hold out a stratified 20% validation set with a fixed seed.
# NOTE: X and y are rebound to the remaining 80%, so re-running this cell on its own
# would split the already-reduced training data again.
train_val_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=8)
X, X_val, y, y_val = train_val_parts

Dump all processed data

In [9]:
# Persist every processed array, including the held-out validation split, so the saved
# training data can later be paired with the exact validation set it was split from.
joblib.dump(X, "../data/processed/X")
joblib.dump(y, "../data/processed/y")
joblib.dump(X_val, "../data/processed/X_val")
joblib.dump(y_val, "../data/processed/y_val")
# Kept last so the cell's displayed result is still the X_test path.
joblib.dump(X_test, "../data/processed/X_test")

['../data/processed/X_test']

## Week 1 baseline

In [10]:
# Best combined score seen so far in this kernel session.
max_roc_auc_score = 0.0


def get_roc_auc_score(classifier):
    """Cross-validate ``classifier`` and report whether it beats the best score so far.

    The training and validation scores come from ``cross_validation.cv`` run on the
    notebook-level ``X`` and ``y``. They are combined into one number: the validation
    score scaled down by the absolute train/validation gap, so a model whose scores
    diverge is penalised. When the combined score is strictly greater than the
    module-level ``max_roc_auc_score``, that global is updated.

    Parameters
    ----------
    classifier
        Estimator handed to ``cross_validation.cv``.

    Returns
    -------
    None
        The result is reported via ``print`` and the ``max_roc_auc_score`` global.
    """
    global max_roc_auc_score

    train_auc, val_auc = cross_validation.cv(classifier, X, y)
    gap = abs(train_auc - val_auc)
    combined_roc_auc_score = val_auc * (1 - gap)

    fire = "\U0001F525"
    snowflake = "\U00002744"

    # Guard clause: anything that is not a strict improvement leaves the best score alone.
    if not (max_roc_auc_score < combined_roc_auc_score):
        print(f"{snowflake} The score {combined_roc_auc_score!s} is not better {snowflake}")
        return

    print(f"{fire} The score {combined_roc_auc_score!s} is better than {max_roc_auc_score!s} so save the model {fire}")
    # Persisting the model is currently disabled:
    # joblib.dump(classifier, "../models/kpw_best_classifier_assignmentA")
    max_roc_auc_score = combined_roc_auc_score

In [11]:
# Week-1 baseline: elastic-net regularised logistic regression.
# The saga solver shuffles the data, so pin random_state (same seed as the train/validation
# split) to make the cross-validated scores reproducible across runs.
classifier = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    l1_ratio=0.3,
    C=0.01,
    random_state=8,
)
get_roc_auc_score(classifier)

Avg ROC AUC score of training set is: 0.7041085862252553
Avg ROC AUC score of valuation set is: 0.7009464471929986
🔥 The score 0.6987299570728079 is better than 0.0 so save the model 🔥


## Blending approach
Create a collection of classifiers trained on different training datasets

In [12]:
# Rebuild a DataFrame from the training arrays: integer-named feature columns plus the target.
df_binned_clean = pd.DataFrame(X).assign(TARGET_5Yrs=y)

In [13]:
# Keep only the rows whose feature column 19 equals 3, then separate target from features.
# NOTE(review): column 19 and the value 3 are positional magic numbers — presumably a
# binned feature produced during feature engineering; confirm.
in_bin_3 = df_binned_clean[19] == 3
df3 = df_binned_clean.loc[in_bin_3]
y3 = df3.pop('TARGET_5Yrs')
X3 = df3.iloc[:, :]

In [14]:
# Convert the subset's features and target to NumPy arrays.
X3, y3 = map(np.asarray, (X3, y3))

In [15]:
# Base learners for the blend: four random-forest style and two boosted XGBoost classifiers.
# NOTE(review): XGBoost's random-forest documentation describes learning_rate=1 as the
# setting for XGBRF* estimators — confirm the smaller rates below are intentional.
OBJECTIVE = 'binary:logistic'
depth4_slow = dict(max_depth=4, learning_rate=0.03, objective=OBJECTIVE)

model1 = xgb.XGBRFClassifier(max_depth=3, learning_rate=0.1, objective=OBJECTIVE)
model2 = xgb.XGBRFClassifier(scale_pos_weight=0.4, subsample=0.6, **depth4_slow)
model3 = xgb.XGBRFClassifier(scale_pos_weight=0.2, subsample=0.3, **depth4_slow)
model4 = xgb.XGBClassifier(max_depth=5, learning_rate=0.001, objective=OBJECTIVE)
model5 = xgb.XGBClassifier(scale_pos_weight=0.4, subsample=0.4, **depth4_slow)
model6 = xgb.XGBRFClassifier(scale_pos_weight=0.4, subsample=0.3, **depth4_slow)

In [16]:
# Models 1-5 are fitted on the column-19 == 3 subset; fit() hands back the estimator itself,
# so the names keep pointing at the same (now fitted) objects.
for subset_model in (model1, model2, model3, model4, model5):
    subset_model.fit(X3, y3)

# Model 6 is fitted on the full training split rather than the subset.
# NOTE(review): presumably intentional ("different training datasets") — confirm.
model6 = model6.fit(X, y)



## Train models, save models and generate predictions using the blending approach

In [17]:
# First blend: all six base models.
# NOTE: a later cell writes to this same CSV path, so this run's predictions get overwritten.
submission_path = '../data/predictions/kpw_submission_assignmentB.csv'
clfs = [model1, model2, model3, model4, model5, model6]
blending.blend(X, y, X_val, y_val, X_test, clfs, submission_path)

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
Creating train, validation and test sets for blending.
Blending.
==== ROC AUC Score for training set ====
0.78670987499505
==== ROC AUC Score for valuation set ====
0.6856181871470279
Linear stretch of predictions to [0,1]
Saving Results.


In [18]:
# Second blend: only model3 and model6.
# NOTE: this writes to the same CSV path as the six-model blend above, replacing its output.
submission_path = '../data/predictions/kpw_submission_assignmentB.csv'
clfs = [model3, model6]
blending.blend(X, y, X_val, y_val, X_test, clfs, submission_path)

StratifiedKFold(n_splits=10, random_state=None, shuffle=False)
Creating train, validation and test sets for blending.
Blending.
==== ROC AUC Score for training set ====
0.75366351184269
==== ROC AUC Score for valuation set ====
0.6919604107720576
Linear stretch of predictions to [0,1]
Saving Results.
