In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

The goal of this notebook is to illustrate and explain different cross-validation variations.

We will use a LogisticRegression model because it trains and evaluates faster than many other machine learning algorithms.

Hope you'll enjoy it!

In [None]:
# Load the Tabular Playground Series (Sep 2021) training set from the Kaggle input directory.
TRAIN_CSV = '../input/tabular-playground-series-sep-2021/train.csv'
train = pd.read_csv(TRAIN_CSV)


In [None]:
from sklearn.model_selection import train_test_split, KFold, GridSearchCV,StratifiedKFold,LeavePOut
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
train.head(n=5)

In [None]:
# Number of missing values in each column.
train.isna().sum()

In [None]:
from sklearn.impute import SimpleImputer
# Shared transformers: missing values are replaced by the column mean, then every
# column is standardised.
si = SimpleImputer(strategy='mean')
ss = StandardScaler()


def impute_N_scale(df):
    """Mean-impute missing values and standardise every column of ``df``.

    Both transformers work column-wise, so fitting them once on the whole frame
    yields the same per-column statistics as fitting them one column at a time,
    with two fits instead of two per column.

    The input frame is left untouched and a new DataFrame is returned. This matters
    when ``df`` is itself a slice of another frame: writing into it column by column
    is a chained assignment on a slice.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature columns, possibly containing NaN.
        NOTE(review): a column that is entirely NaN is presumably not supported
        (the imputer has no mean to fill it with) -- confirm before relying on it.

    Returns
    -------
    pandas.DataFrame
        Imputed and scaled values with the same index and column labels as ``df``.
    """
    # The imputer and the scaler are (re)fitted on exactly the data they transform.
    imputed = si.fit_transform(df)
    scaled = ss.fit_transform(imputed)
    return pd.DataFrame(scaled, index=df.index, columns=df.columns)

In [None]:
# Feature matrix: columns 1..118 of `train` (column 0 is skipped -- presumably a row id).
# .copy() detaches X from `train`, so later preprocessing never writes into a slice of
# the raw frame.
X = train.iloc[:, 1:119].copy()
# Target: the last column of `train`.
y = train.iloc[:, -1]

In [None]:
# Impute and standardise all features at once. The statistics are computed on the
# full dataset, i.e. before any of the train/validation splits performed below.
X=impute_N_scale(X)

# Train/Test Splits

The `train_test_split` function splits the dataset into train and test datasets. The train dataset is used for training the model, and the test dataset is used for evaluating the model's performance.

When the outcomes of the test set are unknown (because they are what we have to predict), or when the test set cannot be used for some other reason, we split the training dataset into train and validation datasets. That way we can train and evaluate the model we construct without using the test dataset.  

<img src= "https://algotrading101.com/learn/wp-content/uploads/2020/06/training-validation-test-data-set.png" alt ="train_test_split" style='width: 500px;'>

In [None]:
# Plain hold-out split (no stratify argument); random_state is fixed so the split is reproducible.
X_train,X_val,y_train,y_val=train_test_split(X,y, random_state=42)

In [None]:
# One-vs-rest logistic regression (binary classification problem) fitted with the
# SAGA solver, which supports both the l1 and the l2 penalty.
log_model = LogisticRegression(
    multi_class='ovr',
    solver='saga',
    random_state=42,
)
log_model.fit(X_train, y_train)
preds = log_model.predict(X_val)
# Root of the mean squared difference between predicted and true validation labels.
holdout_error = np.sqrt(mean_squared_error(preds, y_val))
print(f'Error for regular train/test splits: {holdout_error}')

# Stratifying 

By stratifying, we make sure that each split has the same proportions of outcome values.

In [None]:
# Hold-out split with the same seed as before, this time stratified on y.
X_train,X_val,y_train,y_val=train_test_split(X,y, random_state=42, stratify=y)

In [None]:
# Same model configuration as for the plain split, now fitted on the stratified split.
log_model = LogisticRegression(
    multi_class='ovr',
    solver='saga',
    random_state=42,
)
log_model.fit(X_train, y_train)
preds = log_model.predict(X_val)
# Root of the mean squared difference between predicted and true validation labels.
stratified_error = np.sqrt(mean_squared_error(preds, y_val))
print(f'Error for LogisticRegression Model: {stratified_error}')

# KFold Cross Validation

KFold Cross-Validation evaluates the model more reliably than a single split. We divide our training data into k splits (folds). In each iteration, one split is used for validation and the remaining k-1 splits are used for training.  

<img src= "https://miro.medium.com/max/3000/1*_Ygt7XSGmDvBIdXX-z6TWw.png" alt ="KFold" style='width: 500px;'>

The code below was influenced by 'mlanhenke'. You can find his code [here](https://www.kaggle.com/mlanhenke/tps-09-single-catboostclassifier).

In [None]:
# k=5: every fold serves exactly once as the validation set.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
err = []
for fold_no, (fit_rows, val_rows) in enumerate(kf.split(X), start=1):
    X_train, X_val = X.iloc[fit_rows], X.iloc[val_rows]
    y_train, y_val = y.iloc[fit_rows], y.iloc[val_rows]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    # Compute the fold error once and reuse it for both the log line and the average.
    fold_error = np.sqrt(mean_squared_error(preds, y_val))
    print(f'Error for KFold {fold_no}: {fold_error}')
    err.append(fold_error)
print('#' * 20)
print(f'Average error of 5 folds:  {np.mean(err)}')

# Stratified Cross Validation
It is similar to KFold. However, in this method the folds are stratified: each fold keeps approximately the same proportions of outcome values as the whole dataset.

In [None]:
# k=5: the folds are built from y, so the splitter needs the labels as well.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
err = []
for fold_no, (fit_rows, val_rows) in enumerate(skf.split(X, y), start=1):
    X_train, X_val = X.iloc[fit_rows], X.iloc[val_rows]
    y_train, y_val = y.iloc[fit_rows], y.iloc[val_rows]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    # Compute the fold error once and reuse it for both the log line and the average.
    fold_error = np.sqrt(mean_squared_error(preds, y_val))
    print(f'Error for KFold {fold_no}: {fold_error}')
    err.append(fold_error)
print('#' * 20)
print(f'Average error of 5 folds:  {np.mean(err)}')

# Leave One Out Cross Validation
This is just a special case of KFold Cross-Validation in which the number of folds is equal to the number of samples (k=n).
It gives a thorough evaluation; however, the model has to be trained n times, which requires a lot of computation time.


<img src= "https://i1.wp.com/dataaspirant.com/wp-content/uploads/2020/12/7-LOOCV-Leave-One-Out-Cross-Validation.png?ssl=1" alt ="Leave One Out" style='width: 500px;'>


In [None]:
# k=n
# For convenience only the first 20 samples are used. Normally all of them should be used:
# to do so, bind X_small / y_small to the full X and y instead of the slices.
X_small, y_small = X[:20], y[:20]
kf = KFold(n_splits=len(X_small), shuffle=True, random_state=42)
err = []
for fold_no, (fit_rows, val_rows) in enumerate(kf.split(X_small), start=1):
    X_train, X_val = X_small.iloc[fit_rows], X_small.iloc[val_rows]
    y_train, y_val = y_small.iloc[fit_rows], y_small.iloc[val_rows]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    # Compute the fold error once and reuse it for both the log line and the average.
    fold_error = np.sqrt(mean_squared_error(preds, y_val))
    print(f'Error for KFold {fold_no}: {fold_error}')
    err.append(fold_error)
print('#' * 20)
print(f'Average error of all folds:  {np.mean(err)}')

# Leave P Out Cross Validation
In this type of Cross-Validation, we hold out p samples from our dataset for validation and train on the rest of the data. We repeat this for every possible combination of p samples.

In [None]:
# p=2
# Leave-P-Out on the first 20 samples only: the number of splits grows combinatorially
# with the number of samples, so the full dataset is not used here.
X_small, y_small = X[:20], y[:20]
lpo = LeavePOut(2)
err = []
for split_no, (train_index, val_index) in enumerate(lpo.split(X_small), start=1):
    X_train, y_train = X_small.iloc[train_index], y_small.iloc[train_index]
    X_val, y_val = X_small.iloc[val_index], y_small.iloc[val_index]
    model = LogisticRegression()
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    # Compute the error once (documented argument order: y_true, y_pred) and reuse it.
    split_error = np.sqrt(mean_squared_error(y_val, preds))
    # These are Leave-P-Out splits, not KFold folds -- label them accordingly.
    print(f'Error for LeavePOut split {split_no}: {split_error}')
    err.append(split_error)
print('#' * 20)
print(f'Average error of all splits:  {np.mean(err)}')

# Nested Cross Validation
In nested cross-validation we use two loops. The inner loop is used for finding the best hyperparameters of the machine learning model, and the other one (the outer loop) is a regular KFold Cross-Validation that evaluates the model selected by the inner loop.

<img src= "https://www.researchgate.net/profile/Danilo-Bzdok/publication/324829283/figure/fig4/AS:631579718197312@1527591747794/A-diagram-of-the-nested-k-fold-cross-validation-with-model-selection.png" alt ="Nested" style='width: 700px;'>


In [None]:
# The nested cross-validation below is wrapped in a string literal, so it is NOT executed.
# Inner loop: GridSearchCV over penalty and C with a 2-fold KFold, scored by accuracy.
# Outer loop: 5-fold KFold; the best estimator of each grid search is scored on the outer fold.
# NOTE(review): if re-enabled, it splits whatever X_train / y_train the previously executed
# cell left behind -- confirm that this is the intended data before running it.
"""
from sklearn.model_selection import KFold
clf=LogisticRegression(multi_class='ovr',#binary classification problem
                       solver='saga',#supports both penalth l1 and l2
                      random_state=42)
p_grid = {"penalty": ['l1','l2'],
          "C": [1,100,10]}

inner_cv=KFold(n_splits=2,shuffle=True, random_state=42)



gcv=GridSearchCV(estimator=clf,
                param_grid=p_grid,scoring='accuracy',
                 n_jobs=-1,
                 cv=inner_cv)
outer_cv=KFold(n_splits=5, shuffle=True,random_state=42)

for train_idx,valid_idx in outer_cv.split(X_train,y_train):
    gcv.fit(X_train.iloc[train_idx],y_train.iloc[train_idx])
    print(f'Best Parameters: {gcv.best_params_}')
    print(f'Best Score: {gcv.best_score_*100}%')
    print(f'Best Estimator: {gcv.best_estimator_}')
    print(f'Accuracy on outer fold: {gcv.best_estimator_.score(X_train.iloc[valid_idx],y_train.iloc[valid_idx])*100}%')
    """
# The code does not work well on kaggle platform. So I will share my results below.

Best Parameters: {'C': 10, 'penalty': 'l1'}
Best Score: 52.78239934232418%
Best Estimator: LogisticRegression(C=10, multi_class='ovr', penalty='l1', random_state=42,
                   solver='saga')
Accuracy on outer fold: 52.86752616332176%

Best Parameters: {'C': 1, 'penalty': 'l1'}
Best Score: 52.78051893037035%
Best Estimator: LogisticRegression(C=1, multi_class='ovr', penalty='l1', random_state=42,
                   solver='saga')
Accuracy on outer fold: 52.77326495592659%

Best Parameters: {'C': 1, 'penalty': 'l2'}
Best Score: 52.79683031655961%
Best Estimator: LogisticRegression(C=1, multi_class='ovr', random_state=42, solver='saga')
Accuracy on outer fold: 53.040771986141834%

Best Parameters: {'C': 1, 'penalty': 'l2'}
Best Score: 52.834672757464595%
Best Estimator: LogisticRegression(C=1, multi_class='ovr', random_state=42, solver='saga')
Accuracy on outer fold: 52.890707066752796%

Best Parameters: {'C': 1, 'penalty': 'l1'}
Best Score: 52.80857451221069%
Best Estimator: LogisticRegression(C=1, multi_class='ovr', penalty='l1', random_state=42,
                   solver='saga')
Accuracy on outer fold: 52.71062916348595%

# Conclusion
In this notebook, we:

1. Split the train data into X and y (features and outcomes).
2. Filled the missing values with the mean of the corresponding feature columns and standardized the features.
3. Split X and y using the train_test_split function, then evaluated with LogisticRegression.
4. Split X and y using the train_test_split function again, this time stratified, then evaluated with LogisticRegression.
5. Used KFold CV, StratifiedKFold CV, Leave One Out CV, Leave P Out CV and Nested CV.

Thanks for your kind attention!