In [1]:
import pandas as pd 
import numpy as np
import time
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier


In [2]:
path = 'training.csv' # path to the file we're looking at. 
# in this case the file is in the same folder as my notebook, so I don't need to specify the full path
train_df = pd.read_csv(path)

FileNotFoundError: [Errno 2] No such file or directory: 'training.csv'

In [74]:
path = 'test.csv'
test_df = pd.read_csv(path)

In [75]:
# very basic preprocessing...let's just keep the columns we want to keep! 

In [91]:
# want to keep info about the patient themselves also some specifc regional data that could have an impact on the patient's health
# i'm excluded the demographic info that's not a characteristic of the patients themselves
cols = [
    'patient_race',
    'payer_type',
    'patient_state',
    'patient_age',
    'patient_gender',
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code',
    'patient_zip3'
]
# NOTE - IF USING PATIENT_ZIP3, TREAT IT LIKE A CATEGORICAL VARIABLE

train_df_simple = train_df[cols]
test_df_simple = test_df[cols]

In [92]:
# simple preprocessing 

In [93]:
# fill in nulls
categorical_cols = train_df_simple.select_dtypes(include=['object']).columns.to_list() + ['patient_zip3']
numerical_cols = train_df_simple.select_dtypes(exclude=['object']).columns.to_list()

print('CATEGORICAL: ', categorical_cols)
print('NUMERICAL: ', numerical_cols)

# median for numerical - zip3 is categorical so fill it in with 0 for now
for c in numerical_cols :
    if c == 'patient_zip3':
        train_df_simple[c].fillna(value=0, inplace=True)
        continue
    train_df_simple[c].fillna(value=train_df_simple[c].median(), inplace=True)
    test_df_simple[c].fillna(value=test_df_simple[c].median(), inplace=True)
    
# 'None' for categorical (the rest)
train_df_simple.replace(np.nan, 'None', regex=True,inplace=True)
test_df_simple.replace(np.nan, 'None', regex=True,inplace=True)

CATEGORICAL:  ['patient_race', 'payer_type', 'patient_state', 'patient_gender', 'breast_cancer_diagnosis_code', 'metastatic_cancer_diagnosis_code', 'patient_zip3']
NUMERICAL:  ['patient_age', 'patient_zip3']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_simple[c].fillna(value=train_df_simple[c].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_simple[c].fillna(value=test_df_simple[c].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_simple[c].fillna(value=0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu

In [94]:
# encode categorical variables
train_df_simple = pd.get_dummies(train_df_simple, columns=categorical_cols)  
test_df_simple = pd.get_dummies(test_df_simple, columns=categorical_cols) 

print(train_df_simple.shape)
print(test_df_simple.shape)

(12906, 895)
(5792, 820)


In [95]:
# get list of missing cols in test data 
missing_cols = [x for x in train_df_simple.columns if x not in test_df_simple.columns and x != 'DiagPeriodL90D']

# add these cols with default value = 0
for c in missing_cols:
    test_df_simple[c] = 0
    
# reorder the columns to make sure the test and train are the same
test_df_simple = test_df_simple[train_df_simple.columns]

print(train_df_simple.shape)
print(test_df_simple.shape)

(12906, 895)
(5792, 895)


  test_df_simple[c] = 0
  test_df_simple[c] = 0
  test_df_simple[c] = 0
  test_df_simple[c] = 0
  test_df_simple[c] = 0
  test_df_simple[c] = 0
  test_df_simple[c] = 0
  test_df_simple[c] = 0


In [96]:
# standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df_simple)
X_test = scaler.transform(test_df_simple)


In [97]:
# train models
iteration = 0
y_train = train_df['DiagPeriodL90D']

In [None]:
# hard code params

In [98]:
iteration2 = 0

In [None]:
param_grid = {
    
    'depth':2,
    'random_state': 42,
    'eval_metric': 'AUC',
    'verbose': False,
    'loss_function': 'Logloss',
    'learning_rate':0.3,
    'iterations':1000
}

model = CatBoostClassifier(**param_grid)
model.fit(X_train, y_train)

# get the probabilities using the test data
probabilities = model.predict_proba(X_test)

print (probabilities)

# create df for submission
sub_df = test_df[['patient_id']]
sub_df['DiagPeriodL90D'] = probabilities[:,1]

# save the submission
display(sub_df)

sub_df.to_csv('simplified_submission_cb_' + str(iteration2) + '.csv', index=False)


In [100]:
# cross validation and parameter tuning example - catboost
from sklearn.model_selection import GridSearchCV
start_time = time.time() # start timer


# param_grid = {'random_strength': 0.1, 'learning_rate': 0.05, 'l2_leaf_reg': 5, 
#               'iterations': 150, 'depth': 8, 'border_count': 100, 'bagging_temperature': 0.2,
#                'eval_metric':'AUC',
#              }
# random_search = CatBoostClassifier(**param_grid)


model = CatBoostClassifier(silent=True)

# Define hyperparameters - this will depend on the model. you can refer to the docs or use ChatGPT to spit out different parameter tests
# these are arbitrary values I got from ChatGPT
param_grid = {
    'iterations': [[50, 100, 150, 200],
    'learning_rate': [0.05, 0.1], # [0.01, 0.05, 0.1],
    'depth': [4, 6, 8 ],
    'l2_leaf_reg': [5], # [1, 5, 10],
    'border_count': [100], # [5, 10, 20, 50, 100],
    'random_strength': [0.1, 0.5], # [0.1, 0.5 1.0],
    'bagging_temperature': [0.2, 0.5, 0.8],
    'eval_metric' : ['AUC'],
}


# Use RandomizedSearchCV/GridSearchCV to search for the best hyperparameters - will use n_iter random combinations of  parameters
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                                   scoring='accuracy' , n_iter=20, cv=3, random_state=26)

# random_search = GridSearchCV(estimator=model, param_grid=param_grid,
#                              scoring='accuracy', cv=3)



random_search.fit(X_train, y_train)

end_time = time.time() # end the timer
elapsed_time_seconds = end_time - start_time

# print the elapsed time
print(f"\nElapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds")

# Print the best parameters and corresponding accuracy
print("\nBest Parameters: ", random_search.best_params_)
print("\nBest Accuracy: {:.2f}".format(random_search.best_score_))

iteration2 += 1



Elapsed Time: 6 minutes and 35.19725847244263 seconds

Best Parameters:  {'random_strength': 0.5, 'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 500, 'eval_metric': 'AUC', 'depth': 4, 'border_count': 100, 'bagging_temperature': 0.8}

Best Accuracy: 0.81


In [101]:
# create the cb model using the winning parameters
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

# get the probabilities using the test data
probabilities = random_search.predict_proba(X_test)

print (probabilities)

# create df for submission
sub_df = test_df[['patient_id']]
sub_df['DiagPeriodL90D'] = probabilities[:,1]

# save the submission
display(sub_df)

sub_df.to_csv('simplified_submission_cb_' + str(iteration2) + '.csv', index=False)

[[0.22423922 0.77576078]
 [0.32555508 0.67444492]
 [0.21687659 0.78312341]
 ...
 [0.10462079 0.89537921]
 [0.90161647 0.09838353]
 [0.1377912  0.8622088 ]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['DiagPeriodL90D'] = probabilities[:,1]


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.775761
1,593679,0.674445
2,184532,0.783123
3,447383,0.776386
4,687972,0.799344
...,...,...
5787,977076,0.843012
5788,922960,0.863509
5789,759690,0.895379
5790,911717,0.098384


In [102]:
# LOGGING 2/23/24 -- NOTE THAT ANY SUBMISSIONS NAMED _LBGM WERE ACTUALLY CATBOOST
# test 0 - random_state = 52, n_iter = 10
# test 1 - random_state = 26, n_iter = 15 --> WINNER 0.805
# Best Parameters:  {'random_strength': 0.1, 'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 150, 'depth': 8, 'border_count': 100, 'bagging_temperature': 0.2}
# test 2 - random_state = None, n_iter = 15
# test 3 - random_state = None, n_ter = 20 --> 0.804 (with the current set of params) 
# test 4 --> replaced randomsearch with gridsearch and specified more params. hit a stopping point. try something new?


# LOGGING 2/25/24 
# test 5 - random_state  = 26, n_iter = 20, eval_metric = 'logloss'
# test 6 - using the winning parameters from yesterday but doing eval_metric = AUC --> GAVE 0.805 AGAIN.
# {'random_strength': 0.1, 'learning_rate': 0.05, 'l2_leaf_reg': 5, 'iterations': 150, 'depth': 8, 'border_count': 100, 'bagging_temperature': 0.2}

# LOGGING 2/25/24 (REFRESHED THE NOTEBOOK - added patient_zip3 back to list of cols)
# test - random_state = 26, n_iter = 20 
# TODO - CHANGED SOME PREPROCESSING. TEST ON XGBOOST NOTEBOOK


In [48]:
# iteration3 = 0

In [49]:
# # cross validation and parameter tuning example - lgbm
# from sklearn.model_selection import GridSearchCV
# start_time = time.time() # start timer

# model = LGBMClassifier()

# # Define hyperparameters - this will depend on the model. you can refer to the docs or use ChatGPT to spit out different parameter tests
# # these are arbitrary values I got from ChatGPT
# param_grid = {
#     'boosting_type': ['gbdt', 'dart', 'goss'],
#     'num_leaves': [10, 20, 30, 40, 50],
#     'learning_rate': [0.01, 0.05, 0.1, 0.2],
#     'n_estimators': [50, 100, 150, 200],
#     'max_depth': [-1, 5, 10, 15, 20],
#     'min_child_samples': [5, 10, 15],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0],
# }

# # Use RandomizedSearchCV/GridSearchCV to search for the best hyperparameters - will use n_iter random combinations of  parameters
# random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
#                                    scoring='accuracy', n_iter=15, cv=3, random_state=52)

# # random_search = GridSearchCV(estimator=model, param_grid=param_grid,
# #                              scoring='accuracy', cv=3)

# random_search.fit(X_train, y_train)

# end_time = time.time() # end the timer
# elapsed_time_seconds = end_time - start_time

# # print the elapsed time
# print(f"\nElapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds")

# # Print the best parameters and corresponding accuracy
# print("\nBest Parameters: ", random_search.best_params_)
# print("\nBest Accuracy: {:.2f}".format(random_search.best_score_))

# iteration3 += 1


[LightGBM] [Info] Number of positive: 5373, number of negative: 3231
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 361
[LightGBM] [Info] Number of data points in the train set: 8604, number of used features: 99
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624477 -> initscore=0.508595
[LightGBM] [Info] Start training from score 0.508595
[LightGBM] [Info] Number of positive: 5373, number of negative: 3231
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004323 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 364
[LightGBM] [Info] Number of data points in the train set: 8604, number of used features: 10

In [50]:
# # create the lgbm model using the winning parameters
# best_model = random_search.best_estimator_
# best_model.fit(X_train, y_train)

# # get the probabilities using the test data
# probabilities = best_model.predict_proba(X_test)

# print (probabilities)

# # create df for submission
# sub_df = test_df[['patient_id']]
# sub_df['DiagPeriodL90D'] = probabilities[:,1]

# # save the submission
# display(sub_df)

# sub_df.to_csv('simplified_submission_lgbm_' + str(iteration3) + '.csv', index=False)

[LightGBM] [Info] Number of positive: 8060, number of negative: 4846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 383
[LightGBM] [Info] Number of data points in the train set: 12906, number of used features: 106
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624516 -> initscore=0.508760
[LightGBM] [Info] Start training from score 0.508760
[[0.22160413 0.77839587]
 [0.22109297 0.77890703]
 [0.21785651 0.78214349]
 ...
 [0.11660875 0.88339125]
 [0.91110947 0.08889053]
 [0.13328235 0.86671765]]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['DiagPeriodL90D'] = probabilities[:,1]


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.778396
1,593679,0.778907
2,184532,0.782143
3,447383,0.775616
4,687972,0.789174
...,...,...
5787,977076,0.803716
5788,922960,0.840161
5789,759690,0.883391
5790,911717,0.088891


In [None]:
# LOGGING 2/24/24 -- NOTE THAT ANY SUBMISSIONS NAMED _LBGM WERE ACTUALLY CATBOOST
# test 0 - random_state = None, n_iter = 20
# test 1 - random_state = 26, n_iter = 15
# test2 - random_state = 52, n_iter = 15

In [61]:
# iteration4 = 0

In [52]:
# # trying a combination of catboost and lgbm
# start_time = time.time() # start timer

# lgbm_classifier, catboost_classifier = LGBMClassifier(), CatBoostClassifier()

# # train both models
# lgbm_classifier.fit(X_train, y_train)
# catboost_classifier.fit(X_train, y_train)

# end_time = time.time() # end the timer
# elapsed_time_seconds = end_time - start_time

# # print the elapsed time
# print(f"\nElapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds")

# # print the score of the models
# print("CatBoost Score:", catboost_classifier.best_score_)
# print("LGBM Score:", lgbm_classifier.best_score_)


# iteration4 += 1 

[LightGBM] [Info] Number of positive: 8060, number of negative: 4846
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 371
[LightGBM] [Info] Number of data points in the train set: 12906, number of used features: 102
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624516 -> initscore=0.508760
[LightGBM] [Info] Start training from score 0.508760
Learning rate set to 0.030707
0:	learn: 0.6797426	total: 46.7ms	remaining: 46.7s
1:	learn: 0.6668742	total: 64.3ms	remaining: 32.1s
2:	learn: 0.6547937	total: 81.5ms	remaining: 27.1s
3:	learn: 0.6430810	total: 97.4ms	remaining: 24.2s
4:	learn: 0.6329168	total: 113ms	remaining: 22.5s
5:	learn: 0.6235054	total: 129ms	remaining: 21.3s
6:	learn: 0.6135758	total: 142ms	remaining: 20.2s
7:	learn: 0.6055002	total: 158ms	remaining: 19.6s
8:	learn: 0.

In [62]:

# # make predictions and average them
# lgbm_predictions = lgbm_classifier.predict_proba(X_test)
# catboost_predictions = catboost_classifier.predict_proba(X_test)

# ensemble_predictions = (lgbm_predictions[:,1] + catboost_predictions[:,1]) / 2

# print(ensemble_predictions)

# # create df for submission
# sub_df = test_df[['patient_id']]
# sub_df['DiagPeriodL90D'] = ensemble_predictions

# # save the submission
# display(sub_df)

# sub_df.to_csv('simplified_submission_ensemble_' + str(iteration4) + '.csv', index=False)

[0.78209194 0.78767978 0.74035985 ... 0.91637705 0.08289171 0.89772077]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['DiagPeriodL90D'] = ensemble_predictions


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.782092
1,593679,0.787680
2,184532,0.740360
3,447383,0.779405
4,687972,0.780880
...,...,...
5787,977076,0.803150
5788,922960,0.815225
5789,759690,0.916377
5790,911717,0.082892


In [None]:
# i think after everything, the catboost model is best. however, even this method gave me 0.803. what i just used the default params...

In [63]:
# # create df for submission
# sub_df = test_df[['patient_id']]
# sub_df['DiagPeriodL90D'] = lgbm_predictions[:,1]

# # save the submission
# display(sub_df)

# sub_df.to_csv('simplified_submission_lgmb_default_' + str(iteration4) + '.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['DiagPeriodL90D'] = lgbm_predictions[:,1]


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.786436
1,593679,0.811058
2,184532,0.710468
3,447383,0.785878
4,687972,0.752877
...,...,...
5787,977076,0.765654
5788,922960,0.793714
5789,759690,0.923354
5790,911717,0.081536


In [64]:
# # create df for submission
# sub_df = test_df[['patient_id']]
# sub_df['DiagPeriodL90D'] = catboost_predictions[:,1]

# # save the submission
# display(sub_df)

# sub_df.to_csv('simplified_submission_cb_default_' + str(iteration4) + '.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['DiagPeriodL90D'] = catboost_predictions[:,1]


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.777748
1,593679,0.764302
2,184532,0.770252
3,447383,0.772932
4,687972,0.808883
...,...,...
5787,977076,0.840646
5788,922960,0.836737
5789,759690,0.909400
5790,911717,0.084247


In [None]:
# catboost default gave 0.802. i think catboost is the best option. let's go back and keep training CB