In [9]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
import time
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder

import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
pd.set_option('display.max_columns', 100)

path = 'training.csv' # path to the file we're looking at. 
# in this case the file is in the same folder as my notebook, so I don't need to specify the full path
train_df = pd.read_csv(path)

path = 'test.csv'
test_df = pd.read_csv(path)

# want to keep info about the patient themselves also some specifc regional data that could have an impact on the patient's health
# i'm excluded the demographic info that's not a characteristic of the patients themselves
cols = [
    'patient_race',
    'payer_type',
    'patient_state',
    'patient_age',
    # 'patient_gender',
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
    'patient_zip3'
]

train_df_simple = train_df[cols]
test_df_simple = test_df[cols]

# fill in nulls
categorical_cols = train_df_simple.select_dtypes(include=['object']).columns.to_list() + ['patient_zip3']
numerical_cols = train_df_simple.select_dtypes(exclude=['object']).columns.to_list()

print('CATEGORICAL: ', categorical_cols)
print('NUMERICAL: ', numerical_cols)

# median for numerical - zip3 is technically categorical so skip it for now
for c in numerical_cols :
    if c == 'patient_zip3':
        continue
    train_df_simple[c].fillna(value=train_df_simple[c].median(), inplace=True)
    test_df_simple[c].fillna(value=test_df_simple[c].median(), inplace=True)
    
# 'Unknown' for categorical (the rest)
for c in categorical_cols:
    train_df_simple[c].fillna('Unknown', inplace=True)
    test_df_simple[c].fillna('Unknown', inplace=True)
    
# #### no preprocessing... let's see if catboost like it ####
# X_train, y_train, X_test = train_df_simple, train_df['DiagPeriodL90D'], test_df_simple

# encode categorical variables
# ordinal encoder - fit and transform on train df - ordinal encoder will label each category from 0 to n. This is the opposite of one-hot encoding, which will add a column for each category
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value = -1)
# for each categorical col, fit the encoder on train data, encode the train data, and encode the test data
for c in categorical_cols:
    train_df_simple[c] = encoder.fit_transform(train_df_simple[[c]])
    test_df_simple[c] = encoder.transform(test_df_simple[[c]])

# train_df_simple = pd.get_dummies(train_df_simple, columns=categorical_cols)  
# test_df_simple = pd.get_dummies(test_df_simple, columns=categorical_cols) 

# # get list of missing cols in test data 
# missing_cols = [x for x in train_df_simple.columns if x not in test_df_simple.columns and x != 'DiagPeriodL90D']

# # add these cols with default value = 0
# for c in missing_cols:
#     test_df_simple[c] = 0
    
# # reorder the columns to make sure the test and train are the same
# test_df_simple = test_df_simple[train_df_simple.columns]


print('\nAFTER PREPROCESSING:')
print(train_df_simple.shape)
print(test_df_simple.shape)

# standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df_simple)
X_test = scaler.transform(test_df_simple)

y_train = train_df['DiagPeriodL90D']



CATEGORICAL:  ['patient_race', 'payer_type', 'patient_state', 'breast_cancer_diagnosis_code', 'metastatic_cancer_diagnosis_code', 'patient_zip3']
NUMERICAL:  ['patient_age', 'patient_zip3']

AFTER PREPROCESSING:
(12906, 7)
(5792, 7)


In [10]:
iterations = 0

In [15]:
start_time = time.time() # start timer

# Define hyperparameters - this will depend on the model. you can refer to the docs or use ChatGPT to spit out different parameter tests
# these are arbitrary values I got from ChatGPT
param_grid = {
    'iterations': [500, 1000],
    'learning_rate':  [0.01, 0.05, 0.1],
    'depth': [2, 4, 6 ], # , 8],
    # 'l2_leaf_reg': [1, 5, 10],
    'loss_function': ['Logloss'], # 'border_count': [5, 10, 20, 50, 100],
    'random_strength': [0.1, 0.5, 1.0],
    # 'bagging_temperature': [0.2, 0.5, 0.8],
    'eval_metric' : ['AUC', 'Logloss'],
    # 'cat_features' : [categorical_cols] # comment if encoding categorical variables
}

model = CatBoostClassifier(silent=True)

# Number of iterations for RandomizedSearchCV
num_iterations = 5
test_predictions_list = []

# Perform multiple iterations of RandomizedSearchCV
for i in range(num_iterations):
    print("---------------- ITERATION %d --------------" % i)
    # RandomizedSearchCV to find the best hyperparameters
    random_search = RandomizedSearchCV(model, param_distributions=param_grid, n_iter=15, cv=3, random_state=None)
    random_search.fit(X_train, y_train)
    # get bets values
    best_model = random_search.best_estimator_
    test_predictions = best_model.predict_proba(X_test)[:,1]
    # print best parameters and accuracy
    print("\nBest Parameters: ", random_search.best_params_)
    print("\nBest Accuracy: {:.8f}\n".format(random_search.best_score_))
    # Append the test predictions to the list
    test_predictions_list.append(test_predictions)
    
    
    
end_time = time.time() # end the timer
elapsed_time_seconds = end_time - start_time
print(f"\nElapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds")

---------------- ITERATION 0 --------------

Best Parameters:  {'random_strength': 0.5, 'loss_function': 'Logloss', 'learning_rate': 0.05, 'iterations': 1000, 'eval_metric': 'AUC', 'depth': 2}

Best Accuracy: 0.81349760

---------------- ITERATION 1 --------------

Best Parameters:  {'random_strength': 1.0, 'loss_function': 'Logloss', 'learning_rate': 0.1, 'iterations': 500, 'eval_metric': 'AUC', 'depth': 2}

Best Accuracy: 0.81342011

---------------- ITERATION 2 --------------

Best Parameters:  {'random_strength': 0.5, 'loss_function': 'Logloss', 'learning_rate': 0.05, 'iterations': 1000, 'eval_metric': 'AUC', 'depth': 2}

Best Accuracy: 0.81349760

---------------- ITERATION 3 --------------

Best Parameters:  {'random_strength': 0.1, 'loss_function': 'Logloss', 'learning_rate': 0.01, 'iterations': 1000, 'eval_metric': 'Logloss', 'depth': 4}

Best Accuracy: 0.81318766

---------------- ITERATION 4 --------------

Best Parameters:  {'random_strength': 1.0, 'loss_function': 'Logloss'

In [16]:
# Best Parameters (standardizing):  {'random_strength': 0.5, 'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 200, 'eval_metric': 'AUC', 'depth': 4, 'border_count': 100, 'bagging_temperature': 0.5}
# Best Parameters:  {'random_strength': 0.5, 'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 200, 'eval_metric': 'AUC', 'depth': 4, 'border_count': 100, 'bagging_temperature': 0.2}


## ADDED BACK PATIENT_ZIP3

## added back all params in grid

## removed preprocessing steps

## fixed preprocessing steps - ordinal encoding. instead of None replaced with Unknown

## removed gender from features


In [17]:
# get the mean/med of test set predictions across iterations
mean_test_predictions =  np.median(test_predictions_list, axis=0)
print(mean_test_predictions)

# create df for submission
sub_df = test_df[['patient_id']]
sub_df['DiagPeriodL90D'] = mean_test_predictions

# save the submission
display(sub_df)

[0.81093585 0.79869847 0.76538146 ... 0.93318363 0.10393951 0.87487749]


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.810936
1,593679,0.798698
2,184532,0.765381
3,447383,0.802683
4,687972,0.795524
...,...,...
5787,977076,0.845578
5788,922960,0.866911
5789,759690,0.933184
5790,911717,0.103940


In [18]:
sub_df.to_csv('simplified_submission_cb_mean_' + str(iterations) + '.csv', index=False)
iterations += 1


In [91]:
# try different method of k-fold 
from sklearn.metrics import roc_auc_score
# Step 1: Divide train set into 5 folds and train CatBoost classifier
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=26)

# Initialize an array to store test predictions from each fold
test_predictions_list = []

# params 
params = {
    'silent': True,'random_strength': 0.1, 'loss_function': 'Logloss', 'learning_rate': 0.05, 'iterations': 1000, 'eval_metric': 'AUC', 'depth': 2
}


# Initialize and train CatBoost model
model = CatBoostClassifier(**params)

for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_val_fold = y_train[train_index], y_train[test_index]

    model.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold), use_best_model=True)
    
    # Step 2: Use test set to get predictions
    test_predictions_fold = model.predict_proba(X_test)[:,1]

    # get prediction accuracy of the validation fold
    fold_score = roc_auc_score(y_val_fold, model.predict(X_val_fold))
    train_score = roc_auc_score(y_train_fold, model.predict(X_train_fold)) 
    print(f"accuracy on fold: {fold_score} | accuracy on train: {train_score}")

    # Step 3: Append test predictions to the list
    test_predictions_list.append(test_predictions_fold)
    

# Step 4: Get the mean of all 5 test predictions
mean_test_predictions = np.mean(test_predictions_list, axis=0)
print(mean_test_predictions)



accuracy on fold: 0.7631865846450229 | accuracy on train: 0.7652966161665647
accuracy on fold: 0.7678979514944677 | accuracy on train: 0.7621671236612236
accuracy on fold: 0.7595383654150969 | accuracy on train: 0.7660074708981784
[0.80135131 0.77071885 0.78559735 ... 0.89862395 0.10440774 0.86427718]


In [90]:
# create df for submission
sub_df = test_df[['patient_id']]
sub_df['DiagPeriodL90D'] = mean_test_predictions

# save the submission
display(sub_df)


sub_df.to_csv('simplified_submission_cb_mean_kfold_' + str(iterations) + '.csv', index=False)
iterations += 1


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.802034
1,593679,0.775271
2,184532,0.778288
3,447383,0.793738
4,687972,0.823346
...,...,...
5787,977076,0.833914
5788,922960,0.871146
5789,759690,0.896733
5790,911717,0.106153


In [9]:
# will it help to use ordinal encoding cb_1 --> YES (less features works out better. however, it is dangerous to apply ordinal encoding to features such as race)

# use kfold on the best parameters i've seen --> eh it's kinda the same (or worse)

# will it help to bring back patient_zip3? --> it's the same. best i can do is 0.808

# will it help to NOT encode categorical variables (catboost can handle non-encoded vals) --> NO

# median? --> nah

# take best accuract amongst iterations?
