### PLEASE UPVOTE if you like this notebook. It will keep me motivated to update my notebook.

#### What are you trying to do in this notebook?
I'm predicting a categorical target based on a number of feature columns given in the data. The data is synthetically generated by a GAN that was trained on the data from the Forest Cover Type Prediction competition. This dataset is (a) much larger than the original, and (b) may or may not have the same relationship to the target as the original data.

#### Why are you trying it?
Practice my ML skills on this approachable dataset.


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
kaggle_input_walk = os.walk('/kaggle/input')
for folder, _, names in kaggle_input_walk:
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score # this is the metric used to score the competition
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, QuantileTransformer # scaling will be necessary for most models
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV # using a small validation + cv set may help
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.inspection import permutation_importance
from sklearn.decomposition import KernelPCA, PCA
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from skopt import BayesSearchCV # works similar to GridSearchCV but it doesn't try all param combinations, takes structure of search space into acc.

In [None]:
# Seed passed as `random_state` to the train/validation split, the permutation
# importance and the hyperparameter search further down.
RAND_STATE = 13

In [None]:
def prep_test_data(test_data):
    """Prepares Test Data for predictions, by applying all non-pipeline preproccessing steps.
        - Dropping 'id' feature
        - Downcasting float64 and int64 columns to save memory

    The input frame is left untouched: the work is done on the frame returned by
    ``drop``, so callers can keep reading ``test_data['id']`` afterwards.

    Args:
        test_data(pd.DataFrame): DataFrame holding an 'id' column plus the feature columns.
    Returns:
        test_data_prep(pd.DataFrame): New DataFrame without 'id', with 64-bit numeric
            columns downcast to the smallest dtype that holds their values.
    Raises:
        KeyError: If ``test_data`` has no 'id' column.
    """
    test_data_prep = test_data.drop(columns='id')

    for col in test_data_prep.columns:
        col_dtype = test_data_prep[col].dtype
        # Assign with frame[col] = ... so the column is *replaced* and the smaller
        # dtype sticks. `frame.loc[:, col] = ...` writes into the existing 64-bit
        # column on pandas >= 2.0, which silently undoes the downcast.
        if col_dtype == 'float64':
            test_data_prep[col] = pd.to_numeric(test_data_prep[col], downcast='float')
        elif col_dtype == 'int64':
            test_data_prep[col] = pd.to_numeric(test_data_prep[col], downcast='integer')

    return test_data_prep

In [None]:
# Read the competition's train and test tables into memory.
train_csv_path = '/kaggle/input/tabular-playground-series-nov-2021/train.csv'
test_csv_path = '/kaggle/input/tabular-playground-series-nov-2021/test.csv'
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_csv_path, test_csv_path))

In [None]:
# (rows, columns) of the training frame as loaded.
df_train.shape

In [None]:
# Shrink 64-bit numeric columns of the training frame to the smallest dtype that
# holds their values, to save memory.
for col in df_train.columns:
    col_dtype = df_train[col].dtype
    # Assign with df_train[col] = ... so the column is replaced and the new dtype
    # is kept. `df_train.loc[:, col] = ...` writes into the existing 64-bit column
    # on pandas >= 2.0, which silently undoes the downcast.
    if col_dtype == 'float64':
        df_train[col] = pd.to_numeric(df_train[col], downcast='float')
    elif col_dtype == 'int64':
        df_train[col] = pd.to_numeric(df_train[col], downcast='integer')

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Number of columns that contain at least one missing value.
df_train.isna().any().sum()

In [None]:
# Bar chart of how often each target value occurs in the training data.
target_column = df_train['target']
fig, ax = plt.subplots(figsize=(7, 7))
sns.countplot(x=target_column, ax=ax)
sns.despine()

In [None]:
# Summary statistics, transposed to one row per column; show a random sample of 20 of those rows.
df_train.describe().T.sample(20)

In [None]:
# fig, axes = plt.subplots(nrows = 10, ncols = 10, figsize = (20,20))
# axes = axes.flatten()

# df_plot = df_train.sample(frac = 0.33, random_state = RAND_STATE) #only use a third of the data to visualize
# for idx, axis in enumerate(axes):
#     sns.histplot(data = df_plot, x = f'f{idx}',
#                 ax = axis, hue = 'target', legend = False)
#     axis.set_ylabel('')    
#     axis.set_xlabel('')

In [None]:
# Pairwise correlations between all columns of the training frame, drawn as a heatmap.
corr_matrix = df_train.corr()
fig, ax = plt.subplots(figsize=(15, 13))
sns.heatmap(corr_matrix, ax=ax)

In [None]:
# Remove the 'id' column before the feature matrix is built. Rebinding the name
# (instead of inplace=True) together with errors='ignore' makes the cell safe to
# re-run: a second execution is a no-op rather than a KeyError.
df_train = df_train.drop(columns='id', errors='ignore')

In [None]:
# Separate the label from the predictors, then hold out 10% of the rows for validation.
target = df_train['target']
features = df_train.drop(columns='target')

split_parts = train_test_split(features, target, test_size=0.1, random_state=RAND_STATE)
features_train, features_val, target_train, target_val = split_parts

In [None]:
# Shapes of the training and validation feature frames.
features_train.shape, features_val.shape

In [None]:
# Shapes of the training and validation target vectors.
target_train.shape, target_val.shape

In [None]:
# base_dt = DecisionTreeClassifier()
# cross_val_score(base_dt, features_train, target_train, scoring = 'roc_auc', n_jobs = -1)

In [None]:
# One Gaussian Naive Bayes pipeline per candidate scaler, to compare how the
# preprocessing choice affects the model.
# NOTE: every pipeline holds the *same* nb_clf object. cross_val_score clones the
# estimator it is given, but calling .fit() on one pipeline directly refits the
# classifier seen by all of them.
nb_clf = GaussianNB()
pipe_standard = Pipeline([('standard_scaler', StandardScaler()), ('nb_model', nb_clf)])
pipe_minmax = Pipeline([('minmax_scaler', MinMaxScaler()), ('nb_model', nb_clf)])
pipe_robust = Pipeline([('robust_scaler', RobustScaler()), ('nb_model', nb_clf)])
# QuantileTransformer estimates its quantiles on a random subsample of large
# inputs; seeding it keeps the scores reproducible between runs.
pipe_quantile = Pipeline([('quantile_scaler', QuantileTransformer(random_state = RAND_STATE)),
                          ('nb_model', nb_clf)])
pipe_quantile_norm = Pipeline([('quantile_scaler', QuantileTransformer(output_distribution = 'normal',
                                                                       random_state = RAND_STATE)),
                          ('nb_model', nb_clf)])

In [None]:
# Cross-validated ROC AUC of the Naive Bayes model under each scaler.
pipes = dict(zip(
    ['Standard', 'MinMax', 'Robust', 'Quantile', 'Quantile Normal Dist'],
    [pipe_standard, pipe_minmax, pipe_robust, pipe_quantile, pipe_quantile_norm],
))

for key in pipes:
    pipe = pipes[key]
    print(key)
    cv_scores = cross_val_score(pipe, features_train, target_train,
                                scoring='roc_auc', n_jobs=-1)
    print(f"Mean ROC AUC: {np.mean(cv_scores)}")
    

In [None]:
# Fit the normal-output quantile + Naive Bayes pipeline on the training split and
# get class probabilities for the validation rows (Pipeline.fit returns the pipeline).
quant_nb_preds = (
    pipe_quantile_norm
    .fit(features_train, target_train)
    .predict_proba(features_val)
)

In [None]:
# Validation ROC AUC, scored on column 1 of the predicted probabilities.
roc_auc_score(y_true=target_val, y_score=quant_nb_preds[:, 1])

In [None]:
# pipe_quantile_norm.fit(features_train, target_train)
# features_test = prep_test_data(df_test)
# quant_nb_preds_sub = pipe_quantile_norm.predict_proba(features_test)
# df_sub_quant_nb = pd.DataFrame({'id': df_test.loc[:,'id'], 'target':quant_nb_preds_sub[:,1]})
# df_sub_quant_nb
#df_sub_quant_nb.to_csv('submission.csv', index = None)

In [None]:
# Logistic regression behind two alternative scalers.
log_clf = LogisticRegression(n_jobs = -1)
# QuantileTransformer estimates its quantiles on a random subsample of large
# inputs; seeding it keeps every pipeline that reuses quant_norm reproducible.
quant_norm = QuantileTransformer(output_distribution = 'normal', random_state = RAND_STATE)
pipe_robust_log = Pipeline([('robust_scaler', RobustScaler()), ('LogReg',log_clf)])
pipe_log = Pipeline([('Quantile Transformer', quant_norm),('LogReg',log_clf)])

In [None]:
# rf_clf = RandomForestClassifier(n_jobs = -1, random_state = RAND_STATE)
# pipe_rf = Pipeline([('Quantile Transformer', quant_norm),('RandomForest',rf_clf)])

In [None]:
# ada_clf = AdaBoostClassifier(random_state = RAND_STATE)
# pipe_ada = Pipeline([('Quantile Transformer', quant_norm),('AdaBoost',ada_clf)])

In [None]:
# Linear SVM (primal formulation, dual=False) behind the robust scaler and behind
# the shared normal-output quantile transformer.
svc_clf = LinearSVC(dual=False)
pipe_robust_svc = Pipeline(steps=[
    ('robust_scaler', RobustScaler()),
    ('LinearSVC', svc_clf),
])
pipe_svc = Pipeline(steps=[
    ('Quantile Transformer', quant_norm),
    ('LinearSVC', svc_clf),
])

In [None]:
# Linear SVM trained with stochastic gradient descent (hinge loss).
# SGD shuffles the training rows between epochs, so it is seeded here to make the
# cross-validation scores repeatable from run to run.
svcsgd_clf = SGDClassifier(loss = 'hinge', n_jobs = -1, random_state = RAND_STATE)
pipe_robust_svcsgd = Pipeline([('robust_scaler', RobustScaler()), ('LinearSGDSVC',svcsgd_clf)])
pipe_svcsgd = Pipeline([('Quantile Transformer', quant_norm),('LinearSGDSVC',svcsgd_clf)])


In [None]:
# XGB_clf = XGBClassifier(max_depth = 5,
#                                  learning_rate = 0.007,
#                                  n_estimators = 7000,
#                                  objective = 'binary:logistic',
#                                  booster = 'gbtree',
#                                  gamma = 1.5,
#                                  max_delta_step = 3,
#                                  min_child_weight = 10,
#                                  subsample = 0.6,
#                                  colsample_bytree = 0.8,
#                                  n_jobs = -1
#                                  )

# quant_scaler = QuantileTransformer()
# features_train_xgb = pd.DataFrame(quant_scaler.fit_transform(features_train))
# features_val_xgb = pd.DataFrame(quant_scaler.transform(features_val))

# xgb = XGB_clf.fit(features_train_xgb.values,
#                        target_train.values.ravel(),
#                        eval_set = [(features_train_xgb.values, target_train), (features_val_xgb.values, target_val)], 
#                        eval_metric = 'auc',
#                        early_stopping_rounds = 25,
#                        verbose = True)

In [None]:
# features_test = prep_test_data(df_test)
# features_test = pd.DataFrame(quant_scaler.transform(features_test))
# quant_xgb_preds_sub = xgb.predict_proba(features_test)
# df_sub_quant_xgb = pd.DataFrame({'id': df_test.loc[:,'id'], 'target':quant_xgb_preds_sub[:,1]})
# print(df_sub_quant_xgb.head())
# df_sub_quant_xgb.to_csv('df_sub_quant_xgb.csv', index = None)

In [None]:
#df_sub_quant_xgb.to_csv('submission_quant_xgb.csv', index = None)

In [None]:
# Cross-validated ROC AUC for every linear model / scaler combination.
pipes = dict([
    ('Quant LogReg', pipe_log),
    ('Robust LogReg', pipe_robust_log),
    # ('RandomForest', pipe_rf),
    # ('AdaBoost', pipe_ada),
    ('Quant LinearSVC', pipe_svc),
    ('Robust LinearSVC', pipe_robust_svc),
    ('Quant LinearSGDSVC', pipe_svcsgd),
    ('Robust LinearSGDSVC', pipe_robust_svcsgd),
])

for key in pipes:
    pipe = pipes[key]
    print(key)
    cv_scores = cross_val_score(pipe, features_train, target_train,
                                scoring='roc_auc', n_jobs=-1)
    print(f"Mean ROC AUC: {np.mean(cv_scores)}")

In [None]:
# Train the robust-scaled logistic regression on the training split and write a
# submission holding column 1 of its predicted probabilities for the test rows.
pipe_robust_log.fit(features_train, target_train)
features_test = prep_test_data(df_test)
quant_log_preds_sub = pipe_robust_log.predict_proba(features_test)
df_sub_quant_log = pd.DataFrame({
    'id': df_test['id'],
    'target': quant_log_preds_sub[:, 1],
})
df_sub_quant_log.to_csv('submission_robust_log.csv', index = None)

In [None]:
# pipe_robust_svc.fit(features_train, target_train)
# features_test = prep_test_data(df_test)
# quant_svc_preds_sub = pipe_robust_svc.predict(features_test)
# df_sub_quant_svc = pd.DataFrame({'id': df_test.loc[:,'id'], 'target':quant_svc_preds_sub})
# df_sub_quant_svc.to_csv('submission_robust_svc.csv', index = None)

In [None]:
# Permutation importance of every feature for the fitted robust LogReg pipeline,
# measured as the change in ROC AUC over 3 shuffles per feature.
# NOTE(review): this is evaluated on `features`/`target`, i.e. the full data
# including the rows the pipeline was fitted on — confirm that is intended.
perm_importance_kwargs = dict(scoring='roc_auc',
                              n_repeats=3,
                              n_jobs=-1,
                              random_state=RAND_STATE)
perm_feat_imp_logreg = permutation_importance(pipe_robust_log, features, target,
                                              **perm_importance_kwargs)

In [None]:
# Rank features by mean permutation importance (highest first) and plot them as
# horizontal bars.
mean_importances = perm_feat_imp_logreg['importances_mean']
perm_feat_imp_logreg_series = (
    pd.Series(mean_importances, index=features.columns)
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(15, 20))
sns.barplot(x=perm_feat_imp_logreg_series.values,
            y=perm_feat_imp_logreg_series.index,
            ax=ax)
sns.despine()

In [None]:
# Keep only the 40 features with the highest mean permutation importance.
top40_feature_names = perm_feat_imp_logreg_series.index[:40]
features_reduced = features_train.loc[:, top40_feature_names]

In [None]:
# Cross-validated ROC AUC of the robust LogReg pipeline on the reduced feature set.
cv_scores = cross_val_score(estimator=pipe_robust_log,
                            X=features_reduced,
                            y=target_train,
                            scoring='roc_auc',
                            n_jobs=-1)
print(f"Mean ROC AUC: {np.mean(cv_scores)}")

In [None]:
# features_reduced = features.loc[:,perm_feat_imp_logreg_series[:30].index]
# pipe_robust_log.fit(features_reduced, target)
# features_test = prep_test_data(df_test).loc[:,perm_feat_imp_logreg_series[:30].index]
# quant_log_preds_sub = pipe_robust_log.predict_proba(features_test)
# df_sub_quant_log = pd.DataFrame({'id': df_test.loc[:,'id'], 'target':quant_log_preds_sub[:,1]})
# df_sub_quant_log.to_csv('submission_robust_log.csv', index = None)


In [None]:
#df_sub_quant_log.to_csv('submission.csv', index = None)

In [None]:
# Univariate feature selection: score every feature with the ANOVA F-test and
# keep the 40 best.
selector_f = SelectKBest(f_classif, k=40)
selector_f.fit(X=features_train, y=target_train)


In [None]:
# Positions and names of the columns the selector kept.
col_index = selector_f.get_support(indices=True)
col_names = features_train.columns[col_index]
# scores_ holds one score per *input* feature, so it has to be indexed with the
# selected positions. Slicing [:40] would pair the scores of the first 40 columns
# with the names of the selected ones.
pd.Series(selector_f.scores_[col_index], index = col_names)

In [None]:
# Re-run the model comparison on the columns picked by the selector.
features_train_kbest = features_train.iloc[:, col_index]

pipes = dict([
    ('Quantile Uni NB', pipe_quantile),
    ('Quantile Normal NB', pipe_quantile_norm),
    ('Quant LogReg', pipe_log),
    ('Robust LogReg', pipe_robust_log),
    ('Quant LinearSVC', pipe_svc),
    ('Robust LinearSVC', pipe_robust_svc),
    ('Quant LinearSGDSVC', pipe_svcsgd),
    ('Robust LinearSGDSVC', pipe_robust_svcsgd),
])

for key in pipes:
    pipe = pipes[key]
    print(key)
    cv_scores = cross_val_score(pipe, features_train_kbest, target_train,
                                scoring='roc_auc', n_jobs=-1)
    print(f"Mean ROC AUC: {np.mean(cv_scores)}")

In [None]:
# Same comparison with a wider selection: keep the 70 best features by F-score.
selector_f70 = SelectKBest(f_classif, k=70)
selector_f70.fit(X=features_train, y=target_train)
col_index = selector_f70.get_support(indices=True)

features_train_kbest = features_train.iloc[:, col_index]

pipes = dict(zip(
    ['Quantile Uni NB', 'Quantile Normal NB',
     'Quant LogReg', 'Robust LogReg',
     'Quant LinearSVC', 'Robust LinearSVC',
     'Quant LinearSGDSVC', 'Robust LinearSGDSVC'],
    [pipe_quantile, pipe_quantile_norm,
     pipe_log, pipe_robust_log,
     pipe_svc, pipe_robust_svc,
     pipe_svcsgd, pipe_robust_svcsgd],
))

for key in pipes:
    pipe = pipes[key]
    print(key)
    cv_scores = cross_val_score(pipe, features_train_kbest, target_train,
                                scoring='roc_auc', n_jobs=-1)
    print(f"Mean ROC AUC: {np.mean(cv_scores)}")

In [None]:
# Refit the robust LogReg and LinearSVC pipelines on the columns chosen by the
# latest selector (col_index) and write one submission file per model.
selected_columns = features_train.columns[col_index]
features_reduced = features_train_kbest = features_train.loc[:, selected_columns]
pipe_robust_log.fit(features_reduced, target_train)
pipe_robust_svc.fit(features_reduced, target_train)
# Pick the test columns by *name*. col_index holds positions within the training
# features, which only line up with the test frame if both happen to share the
# same column order; a missing column now fails loudly with a KeyError.
features_test = prep_test_data(df_test).loc[:, selected_columns]
robust_log_70_preds_sub = pipe_robust_log.predict_proba(features_test)
# NOTE(review): LinearSVC.predict returns hard class labels, while the other
# submissions contain probabilities — confirm this is what should be submitted.
robust_svc_70_preds_sub = pipe_robust_svc.predict(features_test)
df_sub_robust_log_70 = pd.DataFrame({'id': df_test.loc[:,'id'], 'target':robust_log_70_preds_sub[:,1]})
df_sub_robust_svc_70 = pd.DataFrame({'id': df_test.loc[:,'id'], 'target':robust_svc_70_preds_sub})
df_sub_robust_log_70.to_csv('submission_robust_log_70best.csv', index = None)
df_sub_robust_svc_70.to_csv('submission_robust_svc_70best.csv', index = None)

In [None]:
# Scaler -> PCA -> linear model. A float n_components asks PCA to keep as many
# components as are needed to explain 95% of the variance.
pca = PCA(n_components = 0.95)

# QuantileTransformer estimates its quantiles on a random subsample of large
# inputs; seeding it keeps the scores reproducible between runs.
pipe_quant_pca_logreg = Pipeline([('QuantileTransformer', QuantileTransformer(random_state = RAND_STATE)),
                                  ('PCA',pca),
                                  ('LogReg',log_clf)])
pipe_robust_pca_logreg = Pipeline([('Scaler', RobustScaler()),
                                  ('PCA',pca),
                                  ('LogReg',log_clf)])
pipe_quant_pca_svc = Pipeline([('QuantileTransformer', QuantileTransformer(random_state = RAND_STATE)),
                                  ('PCA',pca),
                                  ('LinearSVC',svc_clf)])
pipe_robust_pca_svc = Pipeline([('Scaler', RobustScaler()),
                                  ('PCA',pca),
                                  ('LinearSVC',svc_clf)])

In [None]:
# Cross-validated ROC AUC of the PCA-based pipelines on all training features.
pipes = dict([
    ('Quant LogReg', pipe_quant_pca_logreg),
    ('Robust LogReg', pipe_robust_pca_logreg),
    ('Quant LinearSVC', pipe_quant_pca_svc),
    ('Robust LinearSVC', pipe_robust_pca_svc),
])

for key in pipes:
    pipe = pipes[key]
    print(key)
    cv_scores = cross_val_score(pipe, features_train, target_train,
                                scoring='roc_auc', n_jobs=-1)
    print(f"Mean ROC AUC: {np.mean(cv_scores)}")

In [None]:
def tune_hyperparams(pipeline, param_grid, n_iter = 50, iid = True,
                     cv = 3, n_jobs = 2, X = None, y = None):
    """Tunes an estimator's hyperparameters with BayesSearchCV, scored by ROC AUC.

    Runs the search, prints the best cross-validated ROC AUC and the winning
    estimator, and returns that estimator.

    Args:
        pipeline: Estimator (e.g. an sklearn Pipeline) whose parameters are tuned.
        param_grid: Search space, passed unchanged to BayesSearchCV.
        n_iter(int): Number of parameter settings the search evaluates.
        iid(bool): Ignored. Kept only so existing calls that pass it keep working.
        cv(int): Cross-validation setting forwarded to BayesSearchCV.
        n_jobs(int): Number of parallel jobs forwarded to BayesSearchCV.
        X: Features to search on. Defaults to the notebook-level ``features``.
        y: Targets to search on. Defaults to the notebook-level ``target``.
    Returns:
        best_estimator: The search's ``best_estimator_``.
    """
    # Fall back to the notebook's full feature/target frames when no data is given,
    # which is what the function always did before these parameters existed.
    if X is None:
        X = features
    if y is None:
        y = target

    bayes_search = BayesSearchCV(pipeline,
                                 param_grid,
                                 n_iter = n_iter,
                                 scoring = 'roc_auc',
                                 cv = cv,
                                 random_state = RAND_STATE,
                                 verbose = 1,
                                 n_jobs = n_jobs)
    bayes_search.fit(X, y)
    best_estimator = bayes_search.best_estimator_
    print(f'Best CV ROC-AUC {bayes_search.best_score_}\n')
    print(best_estimator)
    return best_estimator

In [None]:
# logreg_clf = LogisticRegression(solver = 'saga', 
#                                 random_state = RAND_STATE,
#                                 n_jobs = -1,
#                                 max_iter = 500,
#                                 penalty = 'elasticnet')

# pipe_robust_logreg = Pipeline([('scaler', RobustScaler()),
#                                ('logreg',logreg_clf)])

# robust_logreg_params = {
#                         'logreg__l1_ratio': np.arange(0, 1.1, 0.1),
#                         'logreg__C': np.geomspace(0.001, 100, 10)
#                        }

In [None]:
# best_robust_logreg = tune_hyperparams(pipe_robust_logreg,
#                                      robust_logreg_params, n_iter = 50)


In [None]:
# best_robust_logreg.fit(features,target)
# features_test = prep_test_data(df_test)
# robust_log_tuned_preds_sub = best_robust_logreg.predict_proba(features_test)
# df_sub_robust_log_tuned = pd.DataFrame({'id': df_test.loc[:,'id'], 'target':robust_log_tuned_preds_sub[:,1]})
# df_sub_robust_log_tuned.to_csv('submission.csv', index = None)

In [None]:
# logreg_clf = LogisticRegression(solver = 'saga', 
#                                 random_state = RAND_STATE,
#                                 n_jobs = -1,
#                                 max_iter = 500,
#                                 penalty = 'elasticnet')

# pipe_quant_logreg = Pipeline([('scaler', QuantileTransformer()),
#                                ('logreg',logreg_clf)])

# quant_logreg_params = {'scaler__output_distribution':['normal','uniform'],
#                         'logreg__l1_ratio': np.arange(0, 1.1, 0.1),
#                         'logreg__C': np.geomspace(0.001, 100, 10)
#                        }

# best_quant_logreg = tune_hyperparams(pipe_quant_logreg,
#                                      quant_logreg_params)

In [None]:
# Quantile-transform + elastic-net logistic regression with fixed, hard-coded
# hyperparameters (the search that could produce them is commented out above).
tuned_scaler = QuantileTransformer(output_distribution='normal')
tuned_logreg = LogisticRegression(penalty='elasticnet',
                                  solver='saga',
                                  C=0.001,
                                  l1_ratio=0.7000000000000001,
                                  max_iter=500,
                                  random_state=13,
                                  n_jobs=-1)
best_quant_logreg = Pipeline(steps=[('scaler', tuned_scaler),
                                    ('logreg', tuned_logreg)])

In [None]:
# Fit the tuned pipeline on all labelled data and write the submission file.
best_quant_logreg.fit(features, target)
features_test = prep_test_data(df_test)
quant_log_tuned_preds_sub = best_quant_logreg.predict_proba(features_test)
# Column 1 of predict_proba is submitted as the score for each test id.
class1_proba = quant_log_tuned_preds_sub[:, 1]
df_sub_quant_log_tuned = pd.DataFrame({'id': df_test['id'], 'target': class1_proba})
df_sub_quant_log_tuned.to_csv('submission_quant.csv', index = None)


#### The dataset used for this competition is synthetic, but based on a real dataset and generated using a CTGAN. This dataset is based on the original Forest Cover Type Prediction competition.