In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold, train_test_split, StratifiedKFold
from lightgbm import LGBMClassifier
from sklearn.metrics import RocCurveDisplay, auc
from sklearn.decomposition import PCA
from sklearn.metrics import roc_auc_score


from sklearn.pipeline import Pipeline
from copy import deepcopy


from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from eli5.sklearn import PermutationImportance
import eli5
from sklearn.inspection import PartialDependenceDisplay
from sklearn.preprocessing import StandardScaler

from path import Path
import warnings 
warnings.filterwarnings('ignore') # supress warnings


In [2]:
# loading datasets
path = Path("/kaggle/input/playground-series-s3e12")

train = pd.read_csv(path / "train.csv")
test = pd.read_csv(path / "test.csv")
sub = pd.read_csv(path / "sample_submission.csv")

# The `original` dataset is restricted to the columns of `train` (minus `id`),
# in the same order, so the two frames can be concatenated later.
original = pd.read_csv("/kaggle/input/kidney-stone-prediction-based-on-urine-analysis/kindey stone urine analysis.csv")
original = original[train.drop('id', axis=1).columns.tolist()]

# Rows labelled 5 and 32 are excluded; the reason is not recorded in this notebook -- TODO document.
# reset_index(drop=True) renumbers the remaining rows 0..n-1. The previous bare
# `.reindex()` call returned the frame unchanged, leaving gaps in the index.
original = original.drop([5, 32]).reset_index(drop=True)


train.head()

Unnamed: 0,id,gravity,ph,osmo,cond,urea,calc,target
0,0,1.013,6.19,443,14.8,124,1.45,0
1,1,1.025,5.4,703,23.6,394,4.18,0
2,2,1.009,6.13,371,24.5,159,9.04,0
3,3,1.021,4.91,442,20.8,398,6.63,1
4,4,1.021,5.53,874,17.8,385,2.21,1


In [3]:
# plot correlation
def plot_corr(df, features, target, sort=False, method='pearson', figsize=(13, 8), use_mask=True, mask_type="triu", **kwargs):
    """Draw a heatmap of the correlations between ``features``.

    The correlation matrix is rearranged so that ``target`` is the last
    column; the rows keep the order given in ``features``.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding every column named in ``features``.
    features : sequence of str
        Columns to correlate. Must contain ``target``. The sequence is not
        modified (the previous implementation removed ``target`` from the
        caller's list in place, so a second call with the same list failed).
    target : str
        Column moved to the right-hand edge of the heatmap.
    sort : bool
        When true, rows are sorted by their correlation with ``target``
        (ascending).
    method : str
        Forwarded to ``DataFrame.corr``.
    figsize : tuple
        Size of the matplotlib figure that is created.
    use_mask : bool
        When true, part of the matrix is hidden (see ``mask_type``).
    mask_type : str
        ``"triu"`` builds the mask with ``np.triu`` (upper triangle and
        diagonal are masked); any other value uses ``np.tril`` (lower
        triangle and diagonal are masked).
    **kwargs
        Forwarded to ``seaborn.heatmap``.
    """
    plt.figure(figsize=figsize)
    corr = df[features].corr(method=method)
    target_values = corr[target]

    # Work on a private copy so the caller's ``features`` list stays intact.
    other_features = list(features)
    other_features.remove(target)

    # Non-target columns first, target column last.
    corr = pd.concat([corr[other_features], target_values], axis=1)

    if sort:
        corr = corr.sort_values(by=target)

    if use_mask:
        # Non-zero mask cells are the ones seaborn leaves blank.
        build_mask = np.triu if mask_type == 'triu' else np.tril
        mask = build_mask(np.ones(corr.shape))
        sns.heatmap(corr, annot=True, mask=mask, **kwargs)
    else:
        sns.heatmap(corr, annot=True, **kwargs)

    plt.title('Correlation between features')

    
    
def submission_csv(_model, _train, _test):
    """Fit ``_model`` on the full training frame and build a submission frame.

    Parameters
    ----------
    _model : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)``.
    _train : pandas.DataFrame
        Training data with ``id`` and ``target`` columns; every other column
        is used as a feature.
    _test : pandas.DataFrame
        Test data with an ``id`` column; every other column is used as a
        feature.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``id`` (copied from ``_test``) and ``target`` (the second
        column of ``predict_proba``). Nothing is written to disk.
    """
    X_train = _train.drop(['id', 'target'], axis=1)
    y_train = _train['target']
    X_test = _test.drop('id', axis=1)

    _model.fit(X_train, y_train)

    preds = _model.predict_proba(X_test)[:, 1]

    df = pd.DataFrame()
    df['id'] = _test['id']
    df['target'] = preds

    # The original ended with a bare ``return`` and so discarded the frame.
    return df



def plot_importance(models, X_test, title=""):
    """Plot feature importances collected from several fitted models.

    Adapted from
    https://www.kaggle.com/code/shoabahamed/ps3e9-eda-and-gbdt-catboost-median-duplicatedata/edit

    Parameters
    ----------
    models : iterable
        Fitted estimators, each exposing ``feature_importances_``.
    X_test : pandas.DataFrame
        Only its column names are used, as the feature labels.
    title : str
        Prefix for the plot title.

    Returns
    -------
    pandas.DataFrame
        One row per (model, feature) with columns ``importance`` and
        ``features``, sorted by importance in descending order.

    Raises
    ------
    ValueError
        If ``models`` is empty.
    """
    features = X_test.columns.tolist()

    # Collect one frame per model and concatenate once, instead of growing a
    # frame inside the loop starting from an empty DataFrame.
    per_model = []
    for model in models:
        frame = pd.DataFrame({'importance': model.feature_importances_})
        # Labels are aligned by position, as before.
        frame['features'] = pd.Series(features)
        per_model.append(frame.sort_values(by='importance', ascending=False))

    if not per_model:
        raise ValueError("models must contain at least one fitted model")

    feature_importance = pd.concat(per_model).sort_values('importance', ascending=False)

    plt.figure(figsize=(16, 10))
    # One bar per feature; rows belonging to the same feature are aggregated
    # by seaborn, with the standard deviation drawn as the error bar.
    ax = sns.barplot(x='importance', y='features', data=feature_importance, color='skyblue', errorbar='sd')

    for container in ax.containers:
        ax.bar_label(container,)

    plt.xlabel('Importance', fontsize=14)
    plt.ylabel('Feature', fontsize=14)
    plt.title(f"{title} Feature Importances", fontsize=18)
    plt.grid(True, axis='x')
    plt.show()

    return feature_importance


class Splitter:
    """Generate train/validation splits of ``X`` and ``y``.

    With ``kfold=True`` (the default) ``split_data`` yields one
    ``(X_train, X_val, y_train, y_val, train_idx, val_idx)`` tuple per fold.
    With ``kfold=False`` it yields a single
    ``(X_train, X_val, y_train, y_val)`` tuple from ``train_test_split`` --
    note the shorter tuple: no index arrays are produced in that mode.

    Code from
    https://www.kaggle.com/code/tetsutani/ps3e9-eda-and-gbdt-catboost-median-duplicatedata
    with a little bit of modification.
    """

    def __init__(self, test_size=0.2, kfold=True, n_splits=5, use_loocv=False):
        self.test_size = test_size  # hold-out fraction, used only when kfold is False
        self.kfold = kfold  # True: cross-validation folds; False: one train/validation split
        self.n_splits = n_splits  # number of stratified folds (ignored for leave-one-out)
        self.use_loocv = use_loocv  # True: leave-one-out instead of stratified k-fold

    def split_data(self, X, y, random_state):
        """Yield the splits of ``X`` / ``y`` described in the class docstring.

        ``X`` and ``y`` are indexed positionally with ``.iloc``.
        ``random_state`` seeds the shuffled stratified folds and the single
        hold-out split; it is not used by leave-one-out.
        """
        if not self.kfold:
            X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=self.test_size, random_state=random_state)
            yield X_train, X_val, y_train, y_val
            return

        if self.use_loocv:
            # Imported here because the notebook's import cell does not bring
            # LeaveOneOut into scope; without this the branch raised NameError.
            from sklearn.model_selection import LeaveOneOut
            kf = LeaveOneOut()
        else:
            kf = StratifiedKFold(n_splits=self.n_splits, random_state=random_state, shuffle=True)

        for train_idx, val_idx in kf.split(X, y):
            X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
            yield X_train, X_val, y_train, y_val, train_idx, val_idx





def evaluate_model(model_name, model_pipeline, _X, _y, features, original_data=None, use_original=False, n_splits=5, random_state_list=[0, 5, 10], use_loocv=False):
    """Cross-validate ``model_pipeline`` once per seed and score it with ROC AUC.

    Parameters
    ----------
    model_name : str
        Used only in the progress banner.
    model_pipeline : estimator
        Object exposing ``fit`` and ``predict_proba``. It is refitted in place
        on every fold; a deep copy of each fitted state is kept.
    _X, _y : pandas.DataFrame, pandas.Series
        Features and binary target, split positionally.
    features : list
        Accepted for interface compatibility; not used by this function.
    original_data : pandas.DataFrame, optional
        Extra rows with a ``target`` column, appended to every training fold
        (never to validation folds) when ``use_original`` is true.
    use_original : bool
        Whether to append ``original_data`` to the training folds.
    n_splits : int
        Number of stratified folds per seed.
    random_state_list : list of int
        One full cross-validation run is done per seed. Only read, never
        mutated.
    use_loocv : bool
        Use leave-one-out instead of stratified k-fold.

    Returns
    -------
    tuple
        ``(model_pipelines, oof_preds_mean, mean_train_auc, oof_auc)`` where
        ``model_pipelines`` holds one fitted copy per fold per seed,
        ``oof_preds_mean`` is the out-of-fold probability averaged over
        seeds, ``mean_train_auc`` is the mean training-fold ROC AUC and
        ``oof_auc`` is the ROC AUC of ``oof_preds_mean`` against ``_y``.

    Raises
    ------
    ValueError
        If ``use_original`` is true but ``original_data`` is ``None``.
    """
    target = 'target'
    if use_original:
        if original_data is None:
            raise ValueError("use_original=True requires original_data to be provided")
        # Loop-invariant: split the extra data into features / target once.
        extra_X = original_data.drop(target, axis=1)
        extra_y = original_data[target]

    # One row of out-of-fold predictions per seed.
    oof_preds = np.zeros((len(random_state_list), len(_y)))
    scores_train = []
    model_pipelines = []

    for index, random_state in enumerate(random_state_list):
        print("#"*25)
        print("#"*15, f"traininng model {model_name} with seed {random_state}")
        print("#"*25)
        splitter = Splitter(n_splits=n_splits, use_loocv=use_loocv)
        for X_train, X_val, y_train, y_val, train_idx, val_idx in splitter.split_data(_X, _y, random_state):
            if use_original:  # the extra data is used for training only, never for validation
                X_train = pd.concat([X_train, extra_X])
                y_train = pd.concat([y_train, extra_y])

            model_pipeline.fit(X_train, y_train)
            oof_preds[index, val_idx] = model_pipeline.predict_proba(X_val)[:, 1]

            score_train = roc_auc_score(y_train, model_pipeline.predict_proba(X_train)[:, 1])
            scores_train.append(score_train)
            # Snapshot the fitted state; the next fold refits the same object.
            model_pipelines.append(deepcopy(model_pipeline))

    oof_preds_mean = oof_preds.mean(axis=0)

    return model_pipelines, oof_preds_mean, np.mean(scores_train), roc_auc_score(_y, oof_preds_mean)


def predict_test(model_pipeline, X_test):
    """Return the second column of ``model_pipeline.predict_proba(X_test)``."""
    probabilities = model_pipeline.predict_proba(X_test)
    return probabilities[:, 1]


# SVC

The working assumption is that each model performs best with its own subset of features. Here we only try to find the best features for SVC.

**Baseline SVC score**

In [4]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

In [5]:
# Cross-validation settings shared by every experiment below.
n_splits = 100
random_states = [0]

# Baseline: all raw features, standardised, then an SVC with default hyperparameters.
y = train['target']
X = train.drop(columns=['id', 'target'])
features = list(X.columns)

pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", SVC(random_state=0, probability=True)),
])

_pipes, _oof_preds, _train_score, _oof_score = evaluate_model(
    f"pipeline_svc",
    pipe,
    X,
    y,
    features,
    n_splits=n_splits,
    random_state_list=random_states,
    use_original=False,
    use_loocv=False,
)

#########################
############### traininng model pipeline_svc with seed 0
#########################


In [6]:
# out-of-fold ROC AUC of the baseline SVC (fourth value returned by evaluate_model)
_oof_score

0.7821361058601133

This is our baseline score. From here we rebuild the feature set step by step, starting from a single feature and adding more as we go.

In [7]:
# with only calc feature
train_temp = train.copy()
X_temp = train_temp[['calc']]
y_temp = train_temp['target']
# Describe the columns actually used in this experiment (previously this was
# taken from `X`, i.e. the full baseline feature list).
features_temp = X_temp.columns.tolist()


pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", SVC(probability=True, random_state=0))
])

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(f"pipeline_svc", pipe, X_temp, y_temp, features_temp,
                                         use_original=False,
                                         n_splits=n_splits, random_state_list=random_states, use_loocv=False)

#########################
############### traininng model pipeline_svc with seed 0
#########################


In [8]:
# out-of-fold ROC AUC with the calc feature only
_oof_score_temp

0.7856687145557657

In [9]:
# with only calc and gravity feature
train_temp = train.copy()
X_temp = train_temp[['calc', 'gravity']]
y_temp = train_temp['target']
# Describe the columns actually used in this experiment (previously this was
# taken from `X`, i.e. the full baseline feature list).
features_temp = X_temp.columns.tolist()


pipe = Pipeline(steps=[
    ("scale", StandardScaler()),
    ("model", SVC(probability=True, random_state=0))
])

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(f"pipeline_svc", pipe, X_temp, y_temp, features_temp,
                                         use_original=False,
                                         n_splits=n_splits, random_state_list=random_states, use_loocv=False)

#########################
############### traininng model pipeline_svc with seed 0
#########################


In [10]:
# out-of-fold ROC AUC with the calc and gravity features
_oof_score_temp

0.7890595463137996

In [11]:
from sklearn.preprocessing import PolynomialFeatures
# with only calc and gravity feature but polynomial features included
train_temp = train.copy()

X_temp = train_temp[['calc', 'gravity']]
y_temp = train_temp['target']
# Describe the columns actually used in this experiment (previously this was
# every column of `train_temp`, including `id` and `target`).
features_temp = X_temp.columns.tolist()


# Degree-2 expansion restricted to interaction terms (no squares, no bias column).
poly = ColumnTransformer(transformers=[("poly", PolynomialFeatures(degree=2, include_bias=False, interaction_only=True),
                                               ['calc', 'gravity'])],
                                         remainder='passthrough')

pipe = make_pipeline(poly, StandardScaler(), SVC(probability=True, random_state=0))

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(f"pipeline_svc", pipe, X_temp, y_temp, features_temp,
                                         use_original=False,
                                         n_splits=n_splits, random_state_list=random_states, use_loocv=False)

#########################
############### traininng model pipeline_svc with seed 0
#########################


In [12]:
# out-of-fold ROC AUC with calc and gravity plus polynomial features
# (interaction_only=True, include_bias=False)
_oof_score_temp

0.7926275992438563

In [13]:
# Features: calc and cond, plus their interaction term.
train_temp = train.copy()
y_temp = train['target']
X_temp = train.loc[:, ['calc', 'cond']]
features_temp = list(X_temp.columns)

poly = ColumnTransformer(
    [
        (
            "poly",
            PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
            ['calc', 'cond'],
        ),
    ],
    remainder='passthrough',
)

pipe = make_pipeline(poly, StandardScaler(), SVC(random_state=0, probability=True))

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(
    f"pipeline_svc",
    pipe,
    X_temp,
    y_temp,
    features_temp,
    n_splits=n_splits,
    random_state_list=random_states,
    use_original=False,
    use_loocv=False,
)

#########################
############### traininng model pipeline_svc with seed 0
#########################


In [14]:
# out-of-fold ROC AUC with calc and cond (plus their interaction term)
_oof_score_temp

0.7950259924385633

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
# train_temp = train.copy()
# # features calc, cond
# X_temp = train[['calc', 'cond', 'urea']]
# X_temp['urea'] = pd.qcut(train['urea'], q=10)
# y_temp = train['target']
# features_temp = X_temp.columns.tolist()

# transformers = ColumnTransformer(transformers=[("poly", make_pipeline(PolynomialFeatures(degree=2, include_bias=False, interaction_only=True), 
#                                                         StandardScaler()),['calc', 'cond']), 
#                                                    ('ohe', OneHotEncoder(sparse=False, drop='first'), ['urea'])],
#                                                  remainder='passthrough')

# pipe = make_pipeline(transformers, SVC(probability=True, random_state=0))


# _pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(f"pipeline_svc", pipe, X_temp, y_temp, features_temp,
#                                          use_original=False,
#                                          n_splits=n_splits, random_state_list=random_states, use_loocv=False)

**Hyperparameter Tuning for SVC**

In [17]:
# Features: calc and cond with a full degree-2 expansion (squares and the
# interaction term); SVC regularisation set to C=0.09.
train_temp = train.copy()
y_temp = train['target']
X_temp = train.loc[:, ['calc', 'cond']]
features_temp = list(X_temp.columns)

poly = ColumnTransformer(
    [
        (
            "poly",
            PolynomialFeatures(degree=2, interaction_only=False, include_bias=False),
            ['calc', 'cond'],
        ),
    ],
    remainder='passthrough',
)

pipe = make_pipeline(poly, StandardScaler(), SVC(C=0.09, random_state=0, probability=True))

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(
    f"pipeline_svc",
    pipe,
    X_temp,
    y_temp,
    features_temp,
    n_splits=n_splits,
    random_state_list=random_states,
    use_original=False,
    use_loocv=False,
)

_oof_score_temp

#########################
############### traininng model pipeline_svc with seed 0
#########################


0.7886578449905481

In [18]:
# Features: calc, cond, gravity and ph. Only calc and cond go through the
# interaction expansion; gravity and ph are passed through unchanged before
# scaling. RBF kernel with C=0.1.
train_temp = train.copy()
y_temp = train['target']
X_temp = train.loc[:, ['calc', 'cond', 'gravity', 'ph']]
features_temp = list(X_temp.columns)

poly = ColumnTransformer(
    [
        (
            "poly",
            PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
            ['calc', 'cond'],
        ),
    ],
    remainder='passthrough',
)

pipe = make_pipeline(
    poly,
    StandardScaler(),
    SVC(kernel='rbf', C=0.1, random_state=0, probability=True),
)

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(
    f"pipeline_svc",
    pipe,
    X_temp,
    y_temp,
    features_temp,
    n_splits=n_splits,
    random_state_list=random_states,
    use_original=False,
    use_loocv=False,
)

# best till now
_oof_score_temp

#########################
############### traininng model pipeline_svc with seed 0
#########################


0.8007088846880908

In [19]:
# Features: calc, cond, gravity and ph. Interaction terms are built from
# calc, cond and gravity; ph is passed through unchanged before scaling.
# RBF kernel with C=0.01.
train_temp = train.copy()
y_temp = train['target']
X_temp = train.loc[:, ['calc', 'cond', 'gravity', 'ph']]
features_temp = list(X_temp.columns)

poly = ColumnTransformer(
    [
        (
            "poly",
            PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
            ['calc', 'cond', 'gravity'],
        ),
    ],
    remainder='passthrough',
)

pipe = make_pipeline(
    poly,
    StandardScaler(),
    SVC(kernel='rbf', C=0.01, random_state=0, probability=True),
)

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(
    f"pipeline_svc",
    pipe,
    X_temp,
    y_temp,
    features_temp,
    n_splits=n_splits,
    random_state_list=random_states,
    use_original=False,
    use_loocv=False,
)

_oof_score_temp

#########################
############### traininng model pipeline_svc with seed 0
#########################


0.7968809073724008

In [20]:
# Features: calc, cond, urea, ph and gravity. calc/cond/gravity/ph get
# interaction terms followed by scaling; urea is only scaled. SVC with C=0.05.
train_temp = train.copy()
y_temp = train['target']
X_temp = train.loc[:, ['calc', 'cond', 'urea', 'ph', 'gravity']]
# Disabled experiment, kept for reference:
# X_temp['urea'] = pd.qcut(train['urea'], q=10)
features_temp = list(X_temp.columns)

transformers = ColumnTransformer(
    [
        (
            "poly",
            make_pipeline(
                PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
                StandardScaler(),
            ),
            ['calc', 'cond', 'gravity', 'ph'],
        ),
        ('scale', StandardScaler(), ['urea']),
    ],
    remainder='passthrough',
)

pipe = make_pipeline(transformers, SVC(C=0.05, random_state=0, probability=True))

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(
    f"pipeline_svc",
    pipe,
    X_temp,
    y_temp,
    features_temp,
    n_splits=n_splits,
    random_state_list=random_states,
    use_original=False,
    use_loocv=False,
)

# best till now
_oof_score_temp

#########################
############### traininng model pipeline_svc with seed 0
#########################


0.8027646502835538

In [21]:
# Features: calc, cond, urea, ph, gravity and osmo. calc/cond/gravity/ph get
# interaction terms followed by scaling; urea and osmo are only scaled.
# SVC with C=0.03.
train_temp = train.copy()
y_temp = train['target']
X_temp = train.loc[:, ['calc', 'cond', 'urea', 'ph', 'gravity', 'osmo']]
# Disabled experiment, kept for reference:
# X_temp['urea'] = pd.qcut(train['urea'], q=10)
features_temp = list(X_temp.columns)

transformers = ColumnTransformer(
    [
        (
            "poly",
            make_pipeline(
                PolynomialFeatures(degree=2, interaction_only=True, include_bias=False),
                StandardScaler(),
            ),
            ['calc', 'cond', 'gravity', 'ph'],
        ),
        ('scale', StandardScaler(), ['urea', 'osmo']),
    ],
    remainder='passthrough',
)

pipe = make_pipeline(transformers, SVC(C=0.03, random_state=0, probability=True))

_pipes_temp, _oof_preds_temp, _train_score_temp, _oof_score_temp = evaluate_model(
    f"pipeline_svc",
    pipe,
    X_temp,
    y_temp,
    features_temp,
    n_splits=n_splits,
    random_state_list=random_states,
    use_original=False,
    use_loocv=False,
)

_oof_score_temp

#########################
############### traininng model pipeline_svc with seed 0
#########################


0.7986767485822306