In [None]:
import numpy as np
import pandas as pd 
df = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')

# Some Basic EDA

In [None]:
df.head()

## Notes:
- Columns are anonymized, but there's a mix of numerical and binary features. The last column is the label and id should be dropped before training.

In [None]:
df.isna().any().any()

## Notes:
- Above shows no missing values were found

In [None]:
# dirty way of forcing kaggle to display all columns, transposed
df.iloc[:,1:287].describe(include = 'all').T.style

## Notes:
- Target is supposed to be a binary label and above shows target column has a "mean" close to 0.5, so it's a balanced set
- All features seem to have range 0-1, with a subset of the later features binary. This also means the numerical features do not need to be scaled

In [None]:
# take a quick look at features with the highest correlation with each other
# (positional columns 1..286, i.e. everything after the id column)
stg = df.iloc[:,1:287].corr().abs()

# corr() returns a symmetric matrix, so corr(a,b) and corr(b,a) are duplicates and the
# diagonal is each column against itself. unstack() flattens the matrix column by column
# (entry k = col_pos * n + row_pos), which is the same order as a row-major ravel of a
# square mask, so a strict upper-triangle mask keeps exactly one copy of every pair and
# drops the diagonal without building a Python set of ~n^2/2 label tuples.
keep_pair = np.triu(np.ones(stg.shape, dtype=bool), k=1).ravel()
stg = stg.unstack()[keep_pair].sort_values(ascending=False)

In [None]:
stg.head(100)

## Notes:
- The features are not very correlated, with the highest values less than 0.10
- Also interesting to see f22 is highly correlated with the target

# Modeling - Variations of Logistic Regression

In [None]:
# drop id column, and split into features x and target y
# positional column 0 is skipped; columns 1..285 become the features (end bound 286 is exclusive)
x = df.iloc[:,1:286]
# positional column 286 is used as the target
y = df.iloc[:,286]

In [None]:
# just to reduce mem usage
import gc
del df
gc.collect()

In [None]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# Since the competition explicitly scores by the Area Under ROC curve, we will use it as the performance metric for every model
from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve # for more custom plotting
from sklearn.metrics import roc_auc_score

# for saving and loading models
from joblib import dump, load

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [None]:
del x, y
gc.collect()

In [None]:
# first, we benchmark using a basic bare-bones logistic regression
# sklearn's base logistic regression uses L2 regularization by default
logreg = LogisticRegression(solver='liblinear', random_state = 42)
logreg.fit(X_train, y_train)

In [None]:
# keep track of the train and test scores for comparing later
model_brief_desc = []
train_scores = []
test_scores = []

In [None]:
# record the baseline model; model_brief_desc, train_scores and test_scores are kept index-aligned
model_brief_desc.append("logistic")
# train score
# column 1 of predict_proba's output is passed to roc_auc_score as the positive-class score
y_train_pred = logreg.predict_proba(X_train)
train_scores.append(roc_auc_score(y_train, y_train_pred[:,1]))
# test score
# same metric on the held-out split produced by train_test_split
y_pred = logreg.predict_proba(X_test)
test_scores.append(roc_auc_score(y_test, y_pred[:,1]))

In [None]:
# one "name : train AUC , test AUC" row for every model scored so far
for i, train_auc in enumerate(train_scores):
    print(model_brief_desc[i], " : ", train_auc, ", ", test_scores[i])

In [None]:
dump(logreg, 'logistic.sav')

In [None]:
plot_roc_curve(logreg, X_test, y_test, name = 'Logistic Regression')

## Notes:
- Try out a method for seeing if more training samples would improve score
- Doing this just for practice since outside of shifting the train_test_split, there isn't a way to get more training data

In [None]:
from sklearn.model_selection import learning_curve
# the following is a wrapper to learning curve

def plot_learning_curves_classification(model, features, labels, model_str_name,
                                        save_path="learning_curve.png"):
    """Plot training and cross-validated error (1 - ROC AUC) against training-set size.

    Wrapper around sklearn's ``learning_curve``: the estimator is evaluated with
    5-fold cross-validation at 10 evenly spaced training sizes, the fold scores
    are averaged per size, and both error curves are drawn on one figure that is
    saved and then shown.

    Parameters
    ----------
    model : estimator handed to ``learning_curve``.
    features, labels : data passed as ``X`` / ``y``; ``features.shape[0]`` sets
        the range of training sizes.
    model_str_name : str appended to the plot title.
    save_path : str, file the figure is written to. Defaults to
        ``"learning_curve.png"``, the path this function always used.
    """
    # Largest size is 70% of the rows; with cv=5 each fold trains on 80%, so this stays in range.
    largest_size = 0.7 * features.shape[0]
    # Start at 1000 rows, but never above the largest size (matters only for small inputs).
    smallest_size = min(1000, largest_size)
    train_sizes = np.round(np.linspace(smallest_size, largest_size, 10)).astype(int)

    train_sizes, fold_train_auc, fold_validation_auc = learning_curve(
        estimator = model,
        X = features,
        y = labels, train_sizes=train_sizes, cv=5,
        scoring='roc_auc')
    # average over the cross-validation folds (axis 1) to get one score per training size
    train_scores_mean = fold_train_auc.mean(axis=1)
    validation_scores_mean = fold_validation_auc.mean(axis=1)

    # matplotlib 3.6 renamed the 'seaborn' style to 'seaborn-v0_8' and newer releases no
    # longer ship the old name, so use whichever this installation provides
    style_name = 'seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8'
    plt.style.use(style_name)
    plt.plot(train_sizes, 1 - train_scores_mean, label='Training error')
    plt.plot(train_sizes, 1 - validation_scores_mean, label='Validation error')
    plt.xlabel("Training Set Size")
    plt.ylabel("Error (1 - ROC AUC)")
    plt.title("Learning Curve for " + model_str_name)
    plt.legend()
    plt.savefig(save_path)
    plt.show()

In [None]:
m = LogisticRegression(solver='liblinear', random_state = 42)
plot_learning_curves_classification(m, X_train, y_train, 'logistic')

## Notes:
- Above plot shows that the training and validation scores (or errors) converge as the training size approaches 70% of our total train set
- Since our train set is even higher than that, this suggests more training data will not improve the score significantly

# Trying out Dimensionality Reduction, Feature Selection Techniques
- Basic logistic regression (with L2 regularization) model produced similar train and validation scores
- This suggests that the model is generalizing well (i.e. it's not overfitting)
- Although from EDA we saw that the columns are not very correlated, want to try some dimensionality reduction and feature selection techniques
- We expect that these might lead to lower performance

In [None]:
# we first try PCA
# first do a quick plot of new features explained variance and decide how many I want to keep
from sklearn.decomposition import PCA
pca = PCA(n_components=None)

In [None]:
pca.fit(X_train)

In [None]:
plt.plot(np.cumsum(pca.explained_variance_ratio_ * 100))
plt.xlabel("Number of components (Dimensions)")
plt.ylabel("Explained variance (%)")

## Notes:
- choosing 100, which is after the inflection point and explains close to ~90% of the variance

In [None]:
# keep the first 100 components; the seed makes the fit repeatable in case sklearn's
# automatic solver choice picks a randomized SVD for this data size
pca = PCA(n_components=100, random_state=42)
pca.fit(X_train)

In [None]:
# pca was already fitted on X_train in the previous cell, so only project here;
# fit_transform would run the whole decomposition a second time
X_train_pca=pca.transform(X_train)
X_train_pca=pd.DataFrame(X_train_pca)
print(X_train_pca)

In [None]:
dump(pca, 'pca.sav')

In [None]:
logreg2 = LogisticRegression(solver='liblinear')
logreg2.fit(X_train_pca, y_train)

In [None]:
# project the test set with the PCA that was fitted on the training data; refitting on
# X_test would produce different components than the ones logreg2 was trained on
X_test_pca=pca.transform(X_test)

In [None]:
model_brief_desc.append("logistic(w/PCA)")
# train score
y_train_pred = logreg2.predict_proba(X_train_pca)
train_scores.append(roc_auc_score(y_train, y_train_pred[:,1]))
# test score
y_pred = logreg2.predict_proba(X_test_pca)
test_scores.append(roc_auc_score(y_test, y_pred[:,1]))

In [None]:
# let's see the scores again
for i in range(0, len(train_scores)):
    print(model_brief_desc[i], " : ", train_scores[i], ", ", test_scores[i])

In [None]:
dump(logreg2, 'logistic_pca.sav')

In [None]:
plot_roc_curve(logreg2, X_test_pca, y_test, name = 'Logistic Regression PCA')

## Notes:
- PCA before logistic regression has lowered the AUC score, suggesting that some of the information dropped came from features that, while highly correlated with others, are separately highly correlated with the target, so the model now performs worse
- try a different method for feature selection next : recursive feature elimination

In [None]:
# previously ran models taking up a lot of memory, so we drop them to try and free memory
del m, X_test_pca, X_train_pca, y_train_pred, y_pred, logreg, logreg2
gc.collect()

In [None]:
# fit simple logistic regression and recursively take away features
from sklearn.feature_selection import RFE
rfe_selector = RFE(estimator=LogisticRegression(solver='liblinear'),n_features_to_select = 50, step = 2)
rfe_selector.fit(X_train, y_train)

In [None]:
dump(rfe_selector, 'rfe.sav')
# brief look at which columns were selected
X_train.columns[rfe_selector.get_support()]

In [None]:
X_train_rfe = rfe_selector.transform(X_train)
logreg3 = LogisticRegression(solver='liblinear')
logreg3.fit(X_train_rfe, y_train)

In [None]:
model_brief_desc.append("logistic(w/RFE)")
# train score
y_train_pred = logreg3.predict_proba(X_train_rfe)
train_scores.append(roc_auc_score(y_train, y_train_pred[:,1]))

# test score
X_test_rfe = rfe_selector.transform(X_test)
y_pred = logreg3.predict_proba(X_test_rfe)
test_scores.append(roc_auc_score(y_test, y_pred[:,1]))

In [None]:
# let's see the scores again
for i in range(0, len(train_scores)):
    print(model_brief_desc[i], " : ", train_scores[i], ", ", test_scores[i])

In [None]:
dump(logreg3, 'logistic_rfe.sav')

In [None]:
plot_roc_curve(logreg3, X_test_rfe, y_test, name = 'RFE Logistic')

## Notes:
- RFE took much longer to run and performed much better on the test set than PCA, but it does not beat the baseline Logistic Regression w/ regularization
- As suspected might happen, dimensionality reduction and feature selection actually worsened performance by removing information (features) that were related to the target


# Modeling - DTEs
- Now try classification using common decision tree methods

In [None]:
# gc memory or Kaggle will reset
# Some of these names are not created by any earlier cell (X_submit_rfe, y_pred_rfe) and
# others only by the final submission cell (submit, y_submit, X_submit, test_df), so a plain
# `del` raises NameError on a top-to-bottom run. Popping from the namespace removes whatever
# exists and ignores the rest.
for _name in ('rfe_selector', 'submit', 'y_submit', 'X_submit', 'X_submit_rfe',
              'test_df', 'X_test_rfe', 'y_pred_rfe', 'logreg3'):
    globals().pop(_name, None)
gc.collect()

In [None]:
# run this cell in case kaggle restarts the notebook for memory or timeout reasons
# this simply repeats the short preprocessing required and imports all general libraries
import numpy as np
import pandas as pd 
df = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')

x = df.iloc[:,1:286]
y = df.iloc[:,286]

import gc
del df
gc.collect()

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import auc
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score

from joblib import dump, load

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

del x, y
gc.collect()

# we know the scores based on previous runs on the same seed
# NOTE(review): these are pasted constants, not recomputed in this cell; they go stale if the
# data, the split, or any of the models above change -- confirm against a fresh run
model_brief_desc = ['logistic','logistic(w/PCA)','logistic(w/RFE)']
train_scores = [0.8402472449462012, 0.8066444198959453, 0.8328801836407288]
test_scores = [0.8394841897582251, 0.7612871702176366, 0.8324074476989]

In [None]:
# we will try a simple catboost first
import catboost as catb

In [None]:
catboost = catb.CatBoostClassifier(loss_function='Logloss',verbose=False)

In [None]:
catboost.fit(X_train, y_train)

In [None]:
model_brief_desc.append("catboost")
# CatBoost has an inbuilt function for calculating AUC, but we can keep this for consistency/convenience
# train score
y_train_pred = catboost.predict_proba(X_train)
train_scores.append(roc_auc_score(y_train, y_train_pred[:,1]))

# test score
y_pred = catboost.predict_proba(X_test)
test_scores.append(roc_auc_score(y_test, y_pred[:,1]))

In [None]:
# let's see the scores again
for i in range(0, len(train_scores)):
    print(model_brief_desc[i], " : ", train_scores[i], ", ", test_scores[i])

In [None]:
dump(catboost, 'catboost.sav')

In [None]:
plot_roc_curve(catboost, X_test, y_test, name = 'catboost')

## Notes:
- CatBoost with mostly default parameters performs very well compared to any of the logistic regression models
- However, the validation score is worse than the train score, suggesting there might be a small amount of overfitting
- Next, will try different hyperparameter tuning methods
- In real setting, would want to try different methods on the same classifier but to cover as much as possible, will just do random search on catboost and leave bayesian search for xgboost later.
- Leaving out grid search as it is basically brute force and takes a long time, susceptible to Kaggle memory and timeout restarts

In [None]:
catboost_rs = catb.CatBoostClassifier(loss_function='Logloss',verbose=False)

In [None]:
# candidate values per hyperparameter, passed to catboost's randomized_search below
space = {'iterations' : [10, 100, 1000], # default is 1000
         'learning_rate': [0.03, 0.1, 0.3], # default generated 0.16
         'depth': [4, 6, 10], # 6 is default
         'l2_leaf_reg': [1, 3, 5, 7, 9]} # default is 3, with higher penalties we might reduce overfitting

In [None]:
# catboost has inbuilt randomized search function, that defaults to using a 3-fold cross-validation
search_results = catboost_rs.randomized_search(space, X_train, y_train)

In [None]:
model_brief_desc.append("catboost(w/RandomSearch)")
# according to catboost doc, model is already trained/fitted after running the search

# train score
y_train_pred = catboost_rs.predict_proba(X_train)
train_scores.append(roc_auc_score(y_train, y_train_pred[:,1]))

# test score
y_pred = catboost_rs.predict_proba(X_test)
test_scores.append(roc_auc_score(y_test, y_pred[:,1]))

In [None]:
# let's see the scores again
for i in range(0, len(train_scores)):
    print(model_brief_desc[i], " : ", train_scores[i], ", ", test_scores[i])

In [None]:
dump(catboost_rs, 'catboost_rs.sav')

In [None]:
plot_roc_curve(catboost_rs, X_test, y_test, name = 'catboost(w/RandomSearch)')

## Notes:
- In-built random search has lowered the train score but improved the test (validation) score slightly
- Now we will try a basic xgboost, as well as xgboost with bayesian search and see how they perform

In [None]:
# next we try xgboost, again gc for memory
# y_pred_proba is not created by any earlier cell, and X_submit / y_submit / submit / test_df
# only exist after the final submission cell has been run, so a plain `del` raises NameError
# on a top-to-bottom run. Popping from the namespace removes whatever exists and ignores the rest.
for _name in ('catboost', 'catboost_rs', 'search_results', 'space', 'y_pred_proba',
              'X_submit', 'y_submit', 'submit', 'test_df'):
    globals().pop(_name, None)
gc.collect()

In [None]:
# run this cell in case kaggle restarts the notebook for memory or timeout reasons
# this simply repeats the short preprocessing required and imports all general libraries
import numpy as np
import pandas as pd 
df = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')

x = df.iloc[:,1:286]
y = df.iloc[:,286]

import gc
del df
gc.collect()

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score

from joblib import dump, load

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

del x, y
gc.collect()

# we know the scores based on previous runs on the same seed
# NOTE(review): these are pasted constants, not recomputed in this cell; they go stale if the
# data, the split, or any of the models above change -- confirm against a fresh run
model_brief_desc = ['logistic','logistic(w/PCA)','logistic(w/RFE)', 'catboost', 'catboost(w/RandomSearch)']
train_scores = [0.8402472449462012, 0.8066444198959453, 0.8328801836407288, 0.8819991775525523, 0.8719932449630302]
test_scores = [0.8394841897582251, 0.7612871702176366, 0.8324074476989, 0.8545129516316827, 0.8557815686724553]

In [None]:
# start with basic xgboost classifier
# this is the sklearn wrapper version so we can pass parameters as arguments
import xgboost as xgb
xgboost = xgb.XGBClassifier(objective = 'binary:logistic')

In [None]:
xgboost.fit(X_train, y_train)

In [None]:
model_brief_desc.append("xgboost")
# train score
y_train_pred = xgboost.predict_proba(X_train)
train_scores.append(roc_auc_score(y_train, y_train_pred[:,1]))

# test score
y_pred = xgboost.predict_proba(X_test)
test_scores.append(roc_auc_score(y_test, y_pred[:,1]))

In [None]:
# let's see the scores again
for i in range(0, len(train_scores)):
    print(model_brief_desc[i], " : ", train_scores[i], ", ", test_scores[i])

In [None]:
dump(xgboost, 'xgb.sav')

In [None]:
plot_roc_curve(xgboost, X_test, y_test, name = 'xgboost')

## Notes:
- Basic XGBoost score was quite good considering the best scores from previous models
- Now let's try hyperparameter tuning using bayesian search

In [None]:
# this library is what I could find online for running a bayesian search
from hyperopt import hp
from hyperopt import fmin
from hyperopt import tpe

In [None]:
del xgboost
gc.collect()

In [None]:
# instead of interacting with the scikit-learn wrapper, we will use the xgboost python api directly,
# in order to send it a set of "params" found by the hyperopt fmin function later
# first, create a DMatrix which xgb accepts
train_dm = xgb.DMatrix(X_train, label = y_train)

# define hyperparameter space using dictionary
# for bayesian search we give a distribution
# NOTE(review): the "default" remarks below look like older sklearn-wrapper defaults; the native
# xgboost parameter docs list max_depth=6 and eta (learning_rate)=0.3 -- confirm for the installed version
# NOTE: for hp.choice entries, fmin reports the index of the chosen option, not the option itself
space = {
    'objective' : 'binary:logistic', # this is a required parameter even though we dont search over it
    'max_depth' : hp.choice('max_depth', np.arange(3, 14, dtype=int)), # integer depths 3..13; the default is 3
    'min_child_weight' : hp.uniform('min_child_weight', 5, 8), # the default is 1
    'subsample' : hp.uniform('subsample', 0.8, 1.0), # the default is 1
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0), # the default is 1
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.2)), # log-uniform over [0.01, 0.2]; the default is 0.1
}

In [None]:
def objective(params):
    """Hyperopt objective: cross-validated loss (1 - AUC) for one set of xgboost params.

    Parameters
    ----------
    params : dict
        One sample drawn from the hyperopt search space, passed straight to ``xgb.cv``.

    Returns
    -------
    float
        ``1 - best validation AUC``; hyperopt's ``fmin`` minimizes this value.
    """
    # Perform n_fold cross validation with hyperparameters
    # Use early stopping and evaluate based on ROC AUC
    cv_results = xgb.cv(params = params, dtrain = train_dm, nfold = 3, num_boost_round = 10,
                        early_stopping_rounds = 2, metrics = 'auc', seed = 42)

    # Score on the held-out folds. Selecting on 'train-auc-mean' would reward parameter
    # sets that simply fit the training folds harder, i.e. it would tune for overfitting.
    best_score = cv_results['test-auc-mean'].max()

    # Loss must be minimized
    return 1 - best_score

In [None]:
best = fmin(objective, space, algo=tpe.suggest, max_evals=20)

In [None]:
# build the classifier from the best hyperparameters found by the search
# fmin reports hp.choice parameters as an index into the option list, not the chosen value
# (max_depth index 0 means depth 3 here), so map the result back through the search space
# before handing it to the sklearn wrapper.
from hyperopt import space_eval
best_params = space_eval(space, best)
xgboost_bs = xgb.XGBClassifier(objective = 'binary:logistic',
                               colsample_bytree = best_params['colsample_bytree'],
                               learning_rate = best_params['learning_rate'],
                               max_depth = int(best_params['max_depth']),
                               min_child_weight = best_params['min_child_weight'],
                               subsample = best_params['subsample'])

In [None]:
dump(xgboost_bs, 'stg_xgb_bs.sav')

In [None]:
xgboost_bs.fit(X_train, y_train)

In [None]:
model_brief_desc.append("xgboost(w/BayesianSearch)")
# train score
y_train_pred = xgboost_bs.predict_proba(X_train)
train_scores.append(roc_auc_score(y_train, y_train_pred[:,1]))

# test score
y_pred = xgboost_bs.predict_proba(X_test)
test_scores.append(roc_auc_score(y_test, y_pred[:,1]))

In [None]:
# let's see the scores again
for i in range(0, len(train_scores)):
    print(model_brief_desc[i], " : ", train_scores[i], ", ", test_scores[i])

In [None]:
dump(xgboost_bs, 'xgb_bayesian.sav')

In [None]:
plot_roc_curve(xgboost_bs, X_test, y_test, name = 'xgboost(w/BayesianSearch)')

## Notes:
- it looks like hp tuning has increased the AUC train score, but it took much longer, and did not improve the test score
- this suggests the model is overfitting relative to the default xgboost model

# Other Considerations
- In a real-world setting, the tasks may not be as straightforward, and the features may require more cleaning (e.g. need to impute missing values or remove outliers), or are more interpretable, allowing for more feature engineering
- Another important step may be determining a good performance metric to evaluate models by
- In this section, briefly go through exercise of looking at ROC curve and confusion matrix

In [None]:
# first, let's reload libraries and models, if kaggle has restarted multiple times at this point
import numpy as np
import pandas as pd 
df = pd.read_csv('../input/tabular-playground-series-oct-2021/train.csv')

x = df.iloc[:,1:286]
y = df.iloc[:,286]

import gc
del df
gc.collect()

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import auc
from sklearn.metrics import roc_curve
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_auc_score

from joblib import dump, load

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

del x, y
gc.collect()

import catboost as catb
import xgboost as xgb

In [None]:
# change directory depending on if this is a reupload or from output
# NOTE: joblib.load unpickles the file, which can execute arbitrary code -- only load .sav files you produced yourself
# NOTE(review): logreg2 / logreg3 were trained on PCA- / RFE-transformed inputs; the fitted transformers
# (pca.sav, rfe.sav) are dumped earlier in the notebook but are not reloaded here
logreg = load('../input/playground-models/logistic.sav')
logreg2 = load('../input/playground-models/logistic_pca.sav')
logreg3 = load('../input/playground-models/logistic_rfe.sav')
catboost = load('../input/playground-models/catboost.sav')
catboost_rs = load('../input/playground-models/catboost_rs.sav')
xgboost = load('../input/playground-models/xgb.sav')
xgboost_bs = load('../input/playground-models/xgb_bayesian.sav')

## Comparing ROC curves and confusion matrices
- The ROC curve plots true and false positive rate of a model based on different probability thresholds
- In a real-world setting, we may care more about one or the other (or care about other things such as the F-1 score)
- For example, this dataset may be a predictor for a serious health condition (positive meaning having the condition) where early detection is crucial.
- In such a case, we would care much more about the (and would want a higher) true positive rate because we want to make sure to detect the condition if the patient has it. We may even want to measure performance based on the true positive rate entirely
- A false positive is less harmful because they are likely to do further testing/go through more consultations and realize later it was a false alarm, but it still bears cost because the patient may be needlessly taking up medical resources or their own money from future tests

In [None]:
# overlay the test-set ROC curves of the two strongest models on a single set of axes

y_pred1 = catboost_rs.predict_proba(X_test)
y_pred2 = xgboost.predict_proba(X_test)

# start from an empty figure 0 and label the axes before drawing
plt.figure(0).clf()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

# column 1 of each predict_proba output is the score handed to roc_curve
for curve_label, proba in (('catboost(w/RandomSearch)', y_pred1), ('xgboost', y_pred2)):
    fpr, tpr, thresh = roc_curve(y_test, proba[:, 1])
    plt.plot(fpr, tpr, label=curve_label)

plt.legend(loc=0)

## Notes:
- The models' ROC curves are not very distinguishable and seem to perform similarly at all probability thresholds (Catboost maybe edging out slightly)
- What may be important in real-world is adjusting the probability threshold for predicting a positive case depending on the cost/benefit of false positives/negatives and true positives/negatives respectively
- In the case outlined above, we might want to lower the probability threshold (toward the right of the ROC curve) so we make sure to catch all cases of the serious health condition
- Now quickly compare confusion matrices at 50% probability threshold (default)

In [None]:
from sklearn.metrics import confusion_matrix

# hard class predictions from each model's predict()
y_pred1 = catboost_rs.predict(X_test)
y_pred2 = xgboost.predict(X_test)

# ravel() flattens each confusion matrix into the order given by the header line
print('ref: tn, fp, fn, tp')
for header, hard_preds in (('catboost(w/RandomSearch)\n', y_pred1), ('xgboost\n', y_pred2)):
    print(header, confusion_matrix(y_test, hard_preds).ravel())

## Notes:
- The results are again very similar
- Given the example outlined earlier where the cost of a false negative was higher than the cost of a false positive, we would choose catboost(w/RandomSearch) if we didn't want to adjust the probability threshold

In [None]:
# run these only for competition
# test data and predict
test_df=pd.read_csv('../input/tabular-playground-series-oct-2021/test.csv')
# column 0 is the id; everything after it is fed to the model
X_submit = test_df.iloc[:,1:]
# classifier can be changed to submit different predictions
y_submit = catboost_rs.predict_proba(X_submit)
# Build the frame column by column: np.c_ would stack id and probability into a single float
# array, so integer ids would be written like 1000000.0 instead of 1000000.
submit = pd.DataFrame({'id': test_df.iloc[:,0].to_numpy(), 'target': y_submit[:,1]})
# index=False keeps the file to exactly the two columns id,target (no unnamed index column)
submit.to_csv('catb_rs.csv', index=False)