In [19]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
import time
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.datasets import make_classification
import xgboost as xgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier

# will need to: pip install scikit-learn catboost xgboost lightgbm 

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)

path = 'training.csv' # path to the file we're looking at. 
# in this case the file is in the same folder as my notebook, so I don't need to specify the full path
train_df = pd.read_csv(path)

path = 'test.csv'
test_df = pd.read_csv(path)

# want to keep info about the patient themselves also some specifc regional data that could have an impact on the patient's health
# i'm excluding the demographic info that's not a characteristic of the patients themselves
cols = [
    'patient_race',
    'payer_type',
    'patient_state',
    'patient_age',
    # 'patient_gender',
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
    'patient_zip3'
]

train_df_simple = train_df[cols]
test_df_simple = test_df[cols]

display(train_df_simple)
display(test_df_simple)

Unnamed: 0,patient_race,payer_type,patient_state,patient_age,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,,MEDICAID,CA,84,C50919,C7989,924
1,White,COMMERCIAL,CA,62,C50411,C773,928
2,White,COMMERCIAL,TX,43,C50112,C773,760
3,White,COMMERCIAL,CA,45,C50212,C773,926
4,,COMMERCIAL,ID,55,1749,C773,836
...,...,...,...,...,...,...,...
12901,White,,OH,50,C50411,C773,436
12902,,COMMERCIAL,CA,50,C50912,C773,945
12903,,COMMERCIAL,CA,61,C50912,C7931,926
12904,,,NY,37,1749,C773,112


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,White,MEDICAID,IN,54,C50412,C773,467
1,,COMMERCIAL,FL,52,C50912,C787,337
2,Hispanic,MEDICAID,CA,61,C50911,C773,917
3,Hispanic,MEDICARE ADVANTAGE,CA,64,C50912,C779,917
4,Black,,CA,40,C50412,C779,900
...,...,...,...,...,...,...,...
5787,White,,KY,63,C50411,C773,404
5788,White,,IA,69,C50912,C773,507
5789,,MEDICARE ADVANTAGE,WA,84,C50411,C773,980
5790,,COMMERCIAL,OK,58,1749,C773,740


In [20]:

# fill in nulls
categorical_cols = train_df_simple.select_dtypes(include=['object']).columns.to_list() + ['patient_zip3']
numerical_cols = train_df_simple.select_dtypes(exclude=['object']).columns.to_list()

print('CATEGORICAL: ', categorical_cols)
print('NUMERICAL: ', numerical_cols)

# median for numerical - zip3 is technically categorical so skip it for now
for c in numerical_cols :
    if c == 'patient_zip3':
        continue
    train_df_simple[c].fillna(value=train_df_simple[c].median(), inplace=True)
    test_df_simple[c].fillna(value=test_df_simple[c].median(), inplace=True)
    
# 'Unknown' for categorical (the rest)
for c in categorical_cols:
    train_df_simple[c].fillna('Unknown', inplace=True)
    test_df_simple[c].fillna('Unknown', inplace=True)


display(train_df_simple)
display(test_df_simple)



CATEGORICAL:  ['patient_race', 'payer_type', 'patient_state', 'breast_cancer_diagnosis_code', 'metastatic_cancer_diagnosis_code', 'patient_zip3']
NUMERICAL:  ['patient_age', 'patient_zip3']


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,Unknown,MEDICAID,CA,84,C50919,C7989,924
1,White,COMMERCIAL,CA,62,C50411,C773,928
2,White,COMMERCIAL,TX,43,C50112,C773,760
3,White,COMMERCIAL,CA,45,C50212,C773,926
4,Unknown,COMMERCIAL,ID,55,1749,C773,836
...,...,...,...,...,...,...,...
12901,White,Unknown,OH,50,C50411,C773,436
12902,Unknown,COMMERCIAL,CA,50,C50912,C773,945
12903,Unknown,COMMERCIAL,CA,61,C50912,C7931,926
12904,Unknown,Unknown,NY,37,1749,C773,112


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,White,MEDICAID,IN,54,C50412,C773,467
1,Unknown,COMMERCIAL,FL,52,C50912,C787,337
2,Hispanic,MEDICAID,CA,61,C50911,C773,917
3,Hispanic,MEDICARE ADVANTAGE,CA,64,C50912,C779,917
4,Black,Unknown,CA,40,C50412,C779,900
...,...,...,...,...,...,...,...
5787,White,Unknown,KY,63,C50411,C773,404
5788,White,Unknown,IA,69,C50912,C773,507
5789,Unknown,MEDICARE ADVANTAGE,WA,84,C50411,C773,980
5790,Unknown,COMMERCIAL,OK,58,1749,C773,740


In [21]:
# encode categorical variables
# ordinal encoder - fit and transform on train df - ordinal encoder will label each category from 0 to n. This is the opposite of one-hot encoding, which will add a column for each category
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1)
# for each categorical col, fit the encoder on train data, encode the train data, and encode the test data
for c in categorical_cols:
    train_df_simple[c] = encoder.fit_transform(train_df_simple[[c]])
    test_df_simple[c] = encoder.transform(test_df_simple[[c]])

display(train_df_simple)
display(test_df_simple)


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,4.0,1.0,4.0,84,48.0,41.0,675.0
1,5.0,0.0,4.0,62,29.0,3.0,679.0
2,5.0,0.0,43.0,43,18.0,3.0,556.0
3,5.0,0.0,4.0,45,22.0,3.0,677.0
4,4.0,0.0,13.0,55,7.0,3.0,619.0
...,...,...,...,...,...,...,...
12901,5.0,3.0,34.0,50,29.0,3.0,299.0
12902,4.0,0.0,4.0,50,47.0,3.0,693.0
12903,4.0,0.0,4.0,61,47.0,27.0,677.0
12904,4.0,3.0,33.0,37,7.0,3.0,10.0


Unnamed: 0,patient_race,payer_type,patient_state,patient_age,breast_cancer_diagnosis_code,metastatic_cancer_diagnosis_code,patient_zip3
0,5.0,1.0,15.0,54,30.0,3.0,329.0
1,4.0,0.0,9.0,52,47.0,18.0,214.0
2,2.0,1.0,4.0,61,46.0,3.0,668.0
3,2.0,2.0,4.0,64,47.0,7.0,668.0
4,1.0,3.0,4.0,40,30.0,7.0,653.0
...,...,...,...,...,...,...,...
5787,5.0,3.0,17.0,63,29.0,3.0,270.0
5788,5.0,3.0,12.0,69,47.0,3.0,369.0
5789,4.0,2.0,47.0,84,29.0,3.0,721.0
5790,4.0,0.0,35.0,58,7.0,3.0,538.0


In [22]:
# standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df_simple)
X_test = scaler.transform(test_df_simple)

y_train = train_df['DiagPeriodL90D']

display(X_train)
display(X_test)

array([[ 0.20044331, -0.00533137, -1.20201911, ...,  1.04585614,
         2.12828255,  1.2201719 ],
       [ 0.96557831, -0.91068215, -1.20201911, ..., -0.10508549,
        -0.68912921,  1.2380891 ],
       [ 0.96557831, -0.91068215,  1.4181687 , ..., -0.77142011,
        -0.68912921,  0.68713522],
       ...,
       [ 0.20044331, -0.91068215, -1.20201911, ...,  0.98528027,
         1.09028874,  1.2291305 ],
       [ 0.20044331,  1.80537018,  0.74632567, ..., -1.43775474,
        -0.68912921, -1.7585625 ],
       [-1.32982667, -0.00533137, -0.86609759, ..., -0.04450961,
        -0.68912921, -0.84926463]])

array([[ 0.96557831, -0.00533137, -0.46299178, ..., -0.04450961,
        -0.68912921, -0.32966585],
       [ 0.20044331, -0.91068215, -0.86609759, ...,  0.98528027,
         0.42300701, -0.84478533],
       [-1.32982667, -0.00533137, -1.20201911, ...,  0.92470439,
        -0.68912921,  1.1888168 ],
       ...,
       [ 0.20044331,  0.90001941,  1.68690591, ..., -0.10508549,
        -0.68912921,  1.42621969],
       [ 0.20044331, -0.91068215,  0.88069428, ..., -1.43775474,
        -0.68912921,  0.60650782],
       [-1.32982667,  0.90001941,  0.74632567, ..., -0.83199599,
        -0.68912921, -1.7496039 ]])

In [23]:
# train models
iteration = 0

In [31]:
from sklearn.metrics import accuracy_score
start_time = time.time()

# Initialize individual classifiers
rf_classifier = RandomForestClassifier(n_estimators=500, random_state=42)
catboost_classifier = CatBoostClassifier(iterations=500, random_state=42, verbose=0)
xgboost_classifier = XGBClassifier(n_estimators=500, random_state=42)
lgbm_classifier = LGBMClassifier(n_estimators=500, random_state=42)

# Combine models into a Voting Classifier
ensemble_classifier = VotingClassifier(
    estimators=[
        # ('random_forest', rf_classifier),
        ('catboost', catboost_classifier),
        ('xgboost', xgboost_classifier)
        # ('lightgbm', lgbm_classifier)
    ],
    voting='soft'  # 'soft' for probabilities, 'hard' for majority voting
)

# Train the ensemble model
ensemble_classifier.fit(X_train, y_train)

# Evaluate the ensemble model on training
accuracy = accuracy_score(y_train, ensemble_classifier.predict(X_train))
print('\nACCURACY: ', accuracy)

# Make predictions on the test set
y_pred = ensemble_classifier.predict_proba(X_test)

# Print the time
end_time = time.time() 
elapsed_time_seconds = end_time - start_time
print(f"\nElapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds")



ACCURACY:  0.9002014566868123

Elapsed Time: 0 minutes and 7.42572021484375 seconds


In [32]:
print(y_pred)

[[0.125321   0.874679  ]
 [0.22741577 0.77258423]
 [0.49771827 0.50228173]
 ...
 [0.0602338  0.9397662 ]
 [0.9377121  0.06228788]
 [0.07074818 0.92925182]]


In [33]:
# save the predictions of output = 1
probabilities = y_pred[:,1]

# add the probabilities back to the test dataframe 
test_df['DiagPeriodL90D'] = probabilities

display( test_df[['patient_id', 'DiagPeriodL90D']] )


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.874679
1,593679,0.772584
2,184532,0.502282
3,447383,0.791501
4,687972,0.900250
...,...,...
5787,977076,0.847511
5788,922960,0.855728
5789,759690,0.939766
5790,911717,0.062288


In [34]:
# export patient_id, DiagPeriodL90D to submission file
filename = 'simplified_submission_ensemble_' + str(iteration) + '.csv'
test_df[['patient_id', 'DiagPeriodL90D']].to_csv(filename, index=False)
print('successfully saved file: ' + filename)
iteration += 1


successfully saved file: simplified_submission_ensemble_1.csv


In [None]:
# this method does not break 0.79