In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
import time
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.datasets import make_classification
import xgboost as xgb

# will need to: pip install scikit-learn catboost xgboost

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)

path = 'training.csv' # path to the file we're looking at. 
# in this case the file is in the same folder as my notebook, so I don't need to specify the full path
train_df = pd.read_csv(path)

path = 'test.csv'
test_df = pd.read_csv(path)

# want to keep info about the patient themselves also some specifc regional data that could have an impact on the patient's health
# i'm excluding the demographic info that's not a characteristic of the patients themselves
cols = [
    'patient_race',
    'payer_type',
    'patient_state',
    'patient_age',
    'patient_gender',
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
    'patient_zip3'
]

train_df_simple = train_df[cols]
test_df_simple = test_df[cols]

display(train_df_simple)
display(test_df_simple)

Unnamed: 0,patient_race,payer_type,patient_state,patient_age,patient_gender,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,,MEDICAID,CA,84,F,C50919,C7989,924
1,White,COMMERCIAL,CA,62,F,C50411,C773,928
2,White,COMMERCIAL,TX,43,F,C50112,C773,760
3,White,COMMERCIAL,CA,45,F,C50212,C773,926
4,,COMMERCIAL,ID,55,F,1749,C773,836
...,...,...,...,...,...,...,...,...
12901,White,,OH,50,F,C50411,C773,436
12902,,COMMERCIAL,CA,50,F,C50912,C773,945
12903,,COMMERCIAL,CA,61,F,C50912,C7931,926
12904,,,NY,37,F,1749,C773,112


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,patient_gender,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,White,MEDICAID,IN,54,F,C50412,C773,467
1,,COMMERCIAL,FL,52,F,C50912,C787,337
2,Hispanic,MEDICAID,CA,61,F,C50911,C773,917
3,Hispanic,MEDICARE ADVANTAGE,CA,64,F,C50912,C779,917
4,Black,,CA,40,F,C50412,C779,900
...,...,...,...,...,...,...,...,...
5787,White,,KY,63,F,C50411,C773,404
5788,White,,IA,69,F,C50912,C773,507
5789,,MEDICARE ADVANTAGE,WA,84,F,C50411,C773,980
5790,,COMMERCIAL,OK,58,F,1749,C773,740


In [4]:

# fill in nulls
categorical_cols = train_df_simple.select_dtypes(include=['object']).columns.to_list() + ['patient_zip3']
numerical_cols = train_df_simple.select_dtypes(exclude=['object']).columns.to_list()

print('CATEGORICAL: ', categorical_cols)
print('NUMERICAL: ', numerical_cols)

# median for numerical - zip3 is technically categorical so skip it for now
for c in numerical_cols :
    if c == 'patient_zip3':
        continue
    train_df_simple[c].fillna(value=train_df_simple[c].median(), inplace=True)
    test_df_simple[c].fillna(value=test_df_simple[c].median(), inplace=True)
    
# 'Unknown' for categorical (the rest)
for c in categorical_cols:
    train_df_simple[c].fillna('Unknown', inplace=True)
    test_df_simple[c].fillna('Unknown', inplace=True)


display(train_df_simple)
display(test_df_simple)



CATEGORICAL:  ['patient_race', 'payer_type', 'patient_state', 'patient_gender', 'breast_cancer_diagnosis_code', 'metastatic_cancer_diagnosis_code', 'patient_zip3']
NUMERICAL:  ['patient_age', 'patient_zip3']


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,patient_gender,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,Unknown,MEDICAID,CA,84,F,C50919,C7989,924
1,White,COMMERCIAL,CA,62,F,C50411,C773,928
2,White,COMMERCIAL,TX,43,F,C50112,C773,760
3,White,COMMERCIAL,CA,45,F,C50212,C773,926
4,Unknown,COMMERCIAL,ID,55,F,1749,C773,836
...,...,...,...,...,...,...,...,...
12901,White,Unknown,OH,50,F,C50411,C773,436
12902,Unknown,COMMERCIAL,CA,50,F,C50912,C773,945
12903,Unknown,COMMERCIAL,CA,61,F,C50912,C7931,926
12904,Unknown,Unknown,NY,37,F,1749,C773,112


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,patient_gender,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,White,MEDICAID,IN,54,F,C50412,C773,467
1,Unknown,COMMERCIAL,FL,52,F,C50912,C787,337
2,Hispanic,MEDICAID,CA,61,F,C50911,C773,917
3,Hispanic,MEDICARE ADVANTAGE,CA,64,F,C50912,C779,917
4,Black,Unknown,CA,40,F,C50412,C779,900
...,...,...,...,...,...,...,...,...
5787,White,Unknown,KY,63,F,C50411,C773,404
5788,White,Unknown,IA,69,F,C50912,C773,507
5789,Unknown,MEDICARE ADVANTAGE,WA,84,F,C50411,C773,980
5790,Unknown,COMMERCIAL,OK,58,F,1749,C773,740


In [8]:
# encode categorical variables
# ordinal encoder - fit and transform on train df - ordinal encoder will label each category from 0 to n. This is the opposite of one-hot encoding, which will add a column for each category
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1)
# for each categorical col, fit the encoder on train data, encode the train data, and encode the test data
for c in categorical_cols:
    train_df_simple[c] = encoder.fit_transform(train_df_simple[[c]])
    test_df_simple[c] = encoder.transform(test_df_simple[[c]])

display(train_df_simple)
display(test_df_simple)


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,patient_gender,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,4.0,1.0,4.0,84,0.0,48.0,41.0,675.0
1,5.0,0.0,4.0,62,0.0,29.0,3.0,679.0
2,5.0,0.0,43.0,43,0.0,18.0,3.0,556.0
3,5.0,0.0,4.0,45,0.0,22.0,3.0,677.0
4,4.0,0.0,13.0,55,0.0,7.0,3.0,619.0
...,...,...,...,...,...,...,...,...
12901,5.0,3.0,34.0,50,0.0,29.0,3.0,299.0
12902,4.0,0.0,4.0,50,0.0,47.0,3.0,693.0
12903,4.0,0.0,4.0,61,0.0,47.0,27.0,677.0
12904,4.0,3.0,33.0,37,0.0,7.0,3.0,10.0


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,patient_gender,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,5.0,1.0,15.0,54,0.0,-1.0,3.0,329.0
1,4.0,0.0,9.0,52,0.0,-1.0,18.0,214.0
2,2.0,1.0,4.0,61,0.0,-1.0,3.0,668.0
3,2.0,2.0,4.0,64,0.0,-1.0,7.0,668.0
4,1.0,3.0,4.0,40,0.0,-1.0,7.0,653.0
...,...,...,...,...,...,...,...,...
5787,5.0,3.0,17.0,63,0.0,-1.0,3.0,270.0
5788,5.0,3.0,12.0,69,0.0,-1.0,3.0,369.0
5789,4.0,2.0,47.0,84,0.0,-1.0,3.0,721.0
5790,4.0,0.0,35.0,58,0.0,-1.0,3.0,538.0


In [9]:
# standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df_simple)
X_test = scaler.transform(test_df_simple)

y_train = train_df['DiagPeriodL90D']

display(X_train)
display(X_test)

array([[ 0.20044331, -0.00533137, -1.20201911, ...,  1.04585614,
         2.12828255,  1.2201719 ],
       [ 0.96557831, -0.91068215, -1.20201911, ..., -0.10508549,
        -0.68912921,  1.2380891 ],
       [ 0.96557831, -0.91068215,  1.4181687 , ..., -0.77142011,
        -0.68912921,  0.68713522],
       ...,
       [ 0.20044331, -0.91068215, -1.20201911, ...,  0.98528027,
         1.09028874,  1.2291305 ],
       [ 0.20044331,  1.80537018,  0.74632567, ..., -1.43775474,
        -0.68912921, -1.7585625 ],
       [-1.32982667, -0.00533137, -0.86609759, ..., -0.04450961,
        -0.68912921, -0.84926463]])

array([[ 0.96557831, -0.00533137, -0.46299178, ..., -1.92236174,
        -0.68912921, -0.32966585],
       [ 0.20044331, -0.91068215, -0.86609759, ..., -1.92236174,
         0.42300701, -0.84478533],
       [-1.32982667, -0.00533137, -1.20201911, ..., -1.92236174,
        -0.68912921,  1.1888168 ],
       ...,
       [ 0.20044331,  0.90001941,  1.68690591, ..., -1.92236174,
        -0.68912921,  1.42621969],
       [ 0.20044331, -0.91068215,  0.88069428, ..., -1.92236174,
        -0.68912921,  0.60650782],
       [-1.32982667,  0.90001941,  0.74632567, ..., -1.92236174,
        -0.68912921, -1.7496039 ]])

In [10]:
# train models
iteration = 0

In [11]:
start = time.time()
# initialize XGBoost classifier
xgb = xgb.XGBClassifier(eval_metric='logloss')

# these are different parameters we can test for XGB. we will iterate through ALL possible combinations and see which one gives best accuracy
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 500, 1000],
    'max_depth': [3, 5, 7],
    'subsample': [0.7, 0.8, 0.9],
}

# use the gridsearchcv function to train every model using the aforementioned params
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring='accuracy', cv=3, verbose=1) # cv = how many ways to split the training data to train the model. the higher the number the longer it will take to train
# the number of times a model will fit = combos_of_param_grid * cv
# in this example, we have 5 features with 3 possible values and cv = 3. therefore = 3^4 * 3 = 3^5 = 243 fits.
grid_search.fit(X_train, y_train)

# Print best parameters and best score 
print("\nBest parameters found: ", grid_search.best_params_)
print("Best accuracy found: ", grid_search.best_score_)

# Print the time
end_time = time.time() 
elapsed_time_seconds = end_time - start_time
print(f"\nElapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds")


Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.6; total time=   0.2s
[CV] END learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.6; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.01, max_depth=4, min_child_weight=1, n_estimators=100, subsample=1.0; total time=   0.0s
[CV] END learning_rate=0.

In [14]:

# Predict on test data with the best model. 
# we want to use the predict_proba function to get the class probabilities, since that is what we're submitting. it will give us the probabilites of the output being 0 or 1
best_model = grid_search.best_estimator_
y_pred = best_model.predict_proba(X_test)

print(y_pred)

[[0.9013593  0.0986407 ]
 [0.9077699  0.09223005]
 [0.8508215  0.14917852]
 ...
 [0.6300889  0.36991107]
 [0.8303647  0.16963528]
 [0.8420317  0.1579683 ]]


In [15]:
# save the predictions of output = 1
probabilities = y_pred[:,1]

# add the probabilities back to the test dataframe 
test_df_simple['DiagPeriodL90D'] = probabilities

display(test_df_simple)

Unnamed: 0,patient_race,payer_type,patient_state,patient_age,patient_gender,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3,DiagPeriodL90D
0,5.0,1.0,15.0,54,0.0,-1.0,3.0,329.0,0.098641
1,4.0,0.0,9.0,52,0.0,-1.0,18.0,214.0,0.092230
2,2.0,1.0,4.0,61,0.0,-1.0,3.0,668.0,0.149179
3,2.0,2.0,4.0,64,0.0,-1.0,7.0,668.0,0.123827
4,1.0,3.0,4.0,40,0.0,-1.0,7.0,653.0,0.064773
...,...,...,...,...,...,...,...,...,...
5787,5.0,3.0,17.0,63,0.0,-1.0,3.0,270.0,0.152798
5788,5.0,3.0,12.0,69,0.0,-1.0,3.0,369.0,0.245274
5789,4.0,2.0,47.0,84,0.0,-1.0,3.0,721.0,0.369911
5790,4.0,0.0,35.0,58,0.0,-1.0,3.0,538.0,0.169635


In [None]:
# export patient_id, DiagPeriodL90D to submission file
filename = 'simplified_submission_xgb_' + str(iteration) + '.csv'
iteration += 1
test_df_simple[['patient_id', 'DiagPeriodL90D']].to_csv(filename, index=False)
print('successfully saved file: ' + filename)
