In [16]:
import pandas as pd 
import numpy as np
import time
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

In [2]:
path = 'training.csv' # path to the file we're looking at. 
# in this case the file is in the same folder as my notebook, so I don't need to specify the full path
train_df = pd.read_csv(path)

In [3]:
path = 'test.csv'
test_df = pd.read_csv(path)

In [4]:
# very basic preprocessing...let's just keep the columns we want to keep! 

In [5]:
# want to keep info about the patient themselves also some specifc regional data that could have an impact on the patient's health
# i'm excluded the demographic info that's not a characteristic of the patients themselves
cols = [
    'patient_race',
    'payer_type',
    'patient_state',
    'patient_age',
    'patient_gender',
    'breast_cancer_diagnosis_code',
    'metastatic_cancer_diagnosis_code'
]
# NOTE - IF USING PATIENT_ZIP3, TREAT IT LIKE A CATEGORICAL VARIABLE

train_df_simple = train_df[cols]
test_df_simple = test_df[cols]

In [7]:
# simple preprocessing 

In [9]:
# fill in nulls
numerical_cols = train_df_simple.select_dtypes(exclude=['object']).columns
# median for numerical
for c in numerical_cols:
    train_df_simple[c].fillna(value=train_df_simple[c].median(), inplace=True)
    test_df_simple[c].fillna(value=test_df_simple[c].median(), inplace=True)
    
# 'None' for categorical (the rest)
train_df_simple.replace(np.nan, 'None', regex=True,inplace=True)
test_df_simple.replace(np.nan, 'None', regex=True,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_simple[c].fillna(value=train_df_simple[c].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_simple[c].fillna(value=test_df_simple[c].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_simple.replace(np.nan, 'None', regex=True,inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/in

In [10]:
# encode categorical variables
train_df_simple = pd.get_dummies(train_df_simple) 
test_df_simple = pd.get_dummies(test_df_simple) 

print(train_df_simple.shape)
print(test_df_simple.shape)

(12906, 156)
(5792, 141)


In [11]:
# get list of missing cols in test data 
missing_cols = [x for x in train_df_simple.columns if x not in test_df_simple.columns and x != 'DiagPeriodL90D']

# add these cols with default value = 0
for c in missing_cols:
    test_df_simple[c] = 0
    
# reorder the columns to make sure the test and train are the same
test_df_simple = test_df_simple[train_df_simple.columns]

print(train_df_simple.shape)
print(test_df_simple.shape)

(12906, 156)
(5792, 156)


In [12]:
# standardize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df_simple)
X_test = scaler.transform(test_df_simple)


In [25]:
# train models
iteration = 0

In [13]:
y_train = train_df['DiagPeriodL90D']

In [49]:
# call our model function
create_submission('xgb', xgb_model, X_train_scaled_final, y_train, X_test_scaled_final, test_df_final)

probabilities: 
 [[0.04152191 0.9584781 ]
 [0.16139752 0.8386025 ]
 [0.24576873 0.7542313 ]
 ...
 [0.0296272  0.9703728 ]
 [0.9868897  0.01311031]
 [0.10984433 0.8901557 ]] 

predictions of 1: 
 [0.9584781  0.8386025  0.7542313  ... 0.9703728  0.01311031 0.8901557 ] 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['DiagPeriodL90D'] = predictions


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.958478
1,593679,0.838602
2,184532,0.754231
3,447383,0.739523
4,687972,0.897104
...,...,...
5787,977076,0.831055
5788,922960,0.874637
5789,759690,0.970373
5790,911717,0.013110


Today's date: 2024-02-16
Submission saved - xgb_submission_2024-02-16.csv


In [50]:
xgb_model

In [51]:
# initialize the model you want to use
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier()

In [52]:
# call our model function
create_submission('knn', knn_model, X_train_scaled_final, y_train, X_test_scaled_final, test_df_final)

probabilities: 
 [[0.4 0.6]
 [0.2 0.8]
 [0.4 0.6]
 ...
 [0.2 0.8]
 [0.8 0.2]
 [0.4 0.6]] 

predictions of 1: 
 [0.6 0.8 0.6 ... 0.8 0.2 0.6] 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['DiagPeriodL90D'] = predictions


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.6
1,593679,0.8
2,184532,0.6
3,447383,0.2
4,687972,0.8
...,...,...
5787,977076,0.8
5788,922960,1.0
5789,759690,0.8
5790,911717,0.2


Today's date: 2024-02-16
Submission saved - knn_submission_2024-02-16.csv


We can use cross validation to divide up our training dataset and test the model using those chunks. From there we can pick out the model with the best accuracy.

Parameter tuning lets us manually select and test different values for a models' parameters to see which gives the best accuracy.

Note that accuracy in our case is nuanced, because the training dataset accuracy is either 0 or 1, but our test data accuracy is comparing actual probabilities. 

FYI, the time it takes to test different models will vary on the number of parameters you specify. It will create and train <i>every</i> combo of the model, so training time can add up

In [53]:
from sklearn.model_selection import cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.datasets import make_classification
# GridSearchCV - allows us to perform cross validation and parameter tuning/testing simultaneously

In [54]:
# cross validation and parameter tuning example - logistic regression
start_time = time.time() # start timer

lr_model = LogisticRegression()

# Define hyperparameters - this will depend on the model. you can refer to the docs or use ChatGPT to spit out different parameter tests
# these are arbitrary values I got from ChatGPT
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-3, 3, 7),
    'solver': ['liblinear']
}

# Use GridSearchCV to find the best hyperparameters - this method trains the model using the list of paramters we just defined
grid_search = GridSearchCV(lr_model, param_grid, cv=5, scoring='accuracy') # cv = 5 --> cross-validation = 5; performs the split automatically for us; divides our training data into 5 chunks and tests the models on each one
grid_search.fit(X_train_scaled_final, y_train)

end_time = time.time() # end timer
elapsed_time_seconds = end_time - start_time

# print the elapsed time
print(f"Elapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds\n")


# Print the best parameters and corresponding accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}".format(grid_search.best_score_))


Elapsed Time: 10 minutes and 20.200597286224365 seconds

Best Parameters:  {'C': 0.01, 'penalty': 'l1', 'solver': 'liblinear'}
Best Accuracy: 0.81


We can repeat this process and replace lr_model with any other model we choose. It gave us the parameters that yielded the best accuracy for the training dataset. 

We can create a new model using the best parameters and pass it into our <b>create_submission</b> function. Again, the accuracy using the training data may not translate to the testing data, since we are using the actual predictions rather than 0/1. 


In [55]:
# create the logistic regression model using the winning parameters
lr_best_model = grid_search.best_estimator_

# call the function- lr_cv = logistic regression cross validation
create_submission('lr_cv', lr_best_model, X_train_scaled_final, y_train, X_test_scaled_final, test_df_final)

probabilities: 
 [[0.24530027 0.75469973]
 [0.25104848 0.74895152]
 [0.22446853 0.77553147]
 ...
 [0.1899643  0.8100357 ]
 [0.90417674 0.09582326]
 [0.19865532 0.80134468]] 

predictions of 1: 
 [0.75469973 0.74895152 0.77553147 ... 0.8100357  0.09582326 0.80134468] 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['DiagPeriodL90D'] = predictions


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.754700
1,593679,0.748952
2,184532,0.775531
3,447383,0.779154
4,687972,0.756289
...,...,...
5787,977076,0.801713
5788,922960,0.813859
5789,759690,0.810036
5790,911717,0.095823


Today's date: 2024-02-16
Submission saved - lr_cv_submission_2024-02-16.csv


This result yielded a prediction score of <b>0.792</b> in Kaggle. It's a promising method! We can improve our chances by 
- adding more parameters to test
- training different model types

I'll repeat the method using XGBoost. In ChatGPT, I asked to give me an example of GridSearchCV for XGBoost.
Then I took whatever <b>param_grid</b> value it gave and replaced it below:

In [56]:
# cross validation and parameter tuning example - XGBoost
start_time = time.time() # start timer

xgb_model = xgb.XGBClassifier(objective='binary:logistic')

# Define hyperparameters - this will depend on the model. you can refer to the docs or use ChatGPT to spit out different parameter tests
# these are arbitrary values I got from ChatGPT
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 4, 5, 6],
    'subsample': [0.5, 0.75, 1.0],
}

# Use GridSearchCV to find the best hyperparameters - this method trains the model using the list of paramters we just defined
grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring='accuracy') # cv = 5 --> cross-validation = 5; performs the split automatically for us; divides our training data into 5 chunks and tests the models on each one
grid_search.fit(X_train_scaled_final, y_train)

end_time = time.time() # end the timer
elapsed_time_seconds = end_time - start_time

# print the elapsed time
print(f"\n\nElapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds\n")

# Print the best parameters and corresponding accuracy
print("Best Parameters: ", grid_search.best_params_)
print("Best Accuracy: {:.2f}".format(grid_search.best_score_))


Elapsed Time: 18 minutes and 23.35038185119629 seconds

Best Parameters:  {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best Accuracy: 0.81


In [58]:
# create the xgb model using the winning parameters
xgb_best_model = grid_search.best_estimator_

# call the function- xgb_cv = xgb cross validation
create_submission('xgb_cv', xgb_best_model, X_train_scaled_final, y_train, X_test_scaled_final, test_df_final)

probabilities: 
 [[0.2050128  0.7949872 ]
 [0.25278074 0.74721926]
 [0.2241211  0.7758789 ]
 ...
 [0.1459595  0.8540405 ]
 [0.91311795 0.08688204]
 [0.16204518 0.8379548 ]] 

predictions of 1: 
 [0.7949872  0.74721926 0.7758789  ... 0.8540405  0.08688204 0.8379548 ] 



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission['DiagPeriodL90D'] = predictions


Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.794987
1,593679,0.747219
2,184532,0.775879
3,447383,0.774715
4,687972,0.806570
...,...,...
5787,977076,0.779446
5788,922960,0.857029
5789,759690,0.854041
5790,911717,0.086882


Today's date: 2024-02-16
Submission saved - xgb_cv_submission_2024-02-16.csv


This result yielded a prediction score of <b>0.798</b> in Kaggle. Think this route may be better than just using the default version of models.

Try any classification model types, but be warned some can take a long time to run.


In [39]:
# cross validation and parameter tuning example - XGBoost
start_time = time.time() # start timer

model = xgb.XGBClassifier(objective='binary:logistic')

# Define hyperparameters - this will depend on the model. you can refer to the docs or use ChatGPT to spit out different parameter tests
# these are arbitrary values I got from ChatGPT
param_grid = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0],
    'min_child_weight': [1, 5, 10],
    'gamma': [0, 0.1, 0.2],
}

iteration += 1

# Use RandomizedSearchCV to search for the best hyperparameters - will use n_iter random combinations of  parameters
random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid,
                                   scoring='accuracy', n_iter=15, cv=3, random_state=26)
random_search.fit(X_train, y_train)

end_time = time.time() # end the timer
elapsed_time_seconds = end_time - start_time

# print the elapsed time
print(f"\nElapsed Time: {int(elapsed_time_seconds // 60)} minutes and {elapsed_time_seconds % 60} seconds")

# Print the best parameters and corresponding accuracy
print("\nBest Parameters: ", random_search.best_params_)
print("\nBest Accuracy: {:.2f}".format(random_search.best_score_))



Elapsed Time: 0 minutes and 14.35178256034851 seconds

Best Parameters:  {'subsample': 0.9, 'n_estimators': 100, 'min_child_weight': 1, 'max_depth': 5, 'learning_rate': 0.2, 'gamma': 0, 'colsample_bytree': 1.0}

Best Accuracy: 0.81


In [41]:
# create the xgb model using the winning parameters
best_model = random_search.best_estimator_
best_model.fit(X_train, y_train)

# get the probabilities using the test data
probabilities = best_model.predict_proba(X_test)

print (probabilities)

[[0.17566359 0.8243364 ]
 [0.17210567 0.82789433]
 [0.21334255 0.78665745]
 ...
 [0.07020581 0.9297942 ]
 [0.9212228  0.07877722]
 [0.07885414 0.92114586]]


In [42]:
# create df for submission
sub_df = test_df[['patient_id']]
sub_df['DiagPeriodL90D'] = probabilities[:,1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sub_df['DiagPeriodL90D'] = probabilities[:,1]


In [43]:
# save the submission
display(sub_df)

sub_df.to_csv('simplified_submission_' + str(iteration) + '.csv', index=False)

Unnamed: 0,patient_id,DiagPeriodL90D
0,573710,0.824336
1,593679,0.827894
2,184532,0.786657
3,447383,0.779635
4,687972,0.853074
...,...,...
5787,977076,0.848677
5788,922960,0.832114
5789,759690,0.929794
5790,911717,0.078777


In [44]:
# wowowowowow this is geting better results than almost anything else I did haha. got 0.802 with test 1

In [40]:
# LOGGING - 2/23/24
# test 0 - random_state = 42, n_iter = 15
# test 1 - random_state = 52, n_iter = 15
# test 2 - random_state = 1, n_iter = 20
# test 3 - random_state = 26, n_iter = 15