<center><em>Copyright IBM</em></center>
<center><em><b>Created By Trilok Nath</b></em></center>

# Cross Validation Implementation

In [2]:
# import necessary Libraries.
#Basic Libraries
import numpy as np
import pandas as pd
from scipy.special import comb

# sklearn Dataset
from sklearn.datasets import load_breast_cancer

#Machine Learning Model
from sklearn.linear_model import LogisticRegression

#Performance Metrics
from sklearn.metrics import accuracy_score

#Cross Validation
from sklearn.model_selection import (
    KFold,
    RepeatedKFold,
    LeaveOneOut,
    LeavePOut,
    StratifiedKFold,
    cross_validate,
    train_test_split,
)

In [7]:
# Load the breast-cancer dataset as a (features, target) pair
X, y = load_breast_cancer(return_X_y=True)

In [10]:
# Convert the loaded data into pandas data structures.
X = pd.DataFrame(X)

# Swap the label encoding (0 -> 1, 1 -> 0).
# NOTE: the mapping is its own inverse, so re-running this cell without
# reloading the data flips the labels back again.
y = pd.Series(y).map({0: 1, 1: 0})

In [11]:
# Preview the first five feature rows
X.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [13]:
# Summarise column dtypes and non-null counts of the feature frame
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       569 non-null    float64
 1   1       569 non-null    float64
 2   2       569 non-null    float64
 3   3       569 non-null    float64
 4   4       569 non-null    float64
 5   5       569 non-null    float64
 6   6       569 non-null    float64
 7   7       569 non-null    float64
 8   8       569 non-null    float64
 9   9       569 non-null    float64
 10  10      569 non-null    float64
 11  11      569 non-null    float64
 12  12      569 non-null    float64
 13  13      569 non-null    float64
 14  14      569 non-null    float64
 15  15      569 non-null    float64
 16  16      569 non-null    float64
 17  17      569 non-null    float64
 18  18      569 non-null    float64
 19  19      569 non-null    float64
 20  20      569 non-null    float64
 21  21      569 non-null    float64
 22  22

In [15]:
# Preview the first five (re-encoded) target labels
y.head(n=5)

0    1
1    1
2    1
3    1
4    1
dtype: int64

# Train/Test Split

In [18]:
# Hold out 30% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=101,
)

# Show the resulting shapes of the two feature partitions
X_train.shape, X_test.shape

((398, 30), (171, 30))

# K-Fold Cross Validation

In [21]:
# Estimator under evaluation: L1-penalised logistic regression (liblinear solver).
log = LogisticRegression(
    penalty='l1',
    C=10,
    solver='liblinear',
    random_state=4,
    max_iter=10000,
)

# Splitter: 5 shuffled folds with a fixed seed (5 to 10 folds is the usual range).
kfold = KFold(n_splits=5, shuffle=True, random_state=4)

# Fit and score the estimator once per fold, keeping both train and test accuracy.
clf = cross_validate(
    estimator=log,
    X=X_train,
    y=y_train,
    cv=kfold,
    scoring='accuracy',
    return_train_score=True,
)

In [22]:
# Report the accuracy averaged over the K-Fold splits.
print(f"The Mean Accuracy Of Test Data: {clf['test_score'].mean()}")
print(f"The Mean Accuracy Of Train Data: {clf['train_score'].mean()}")

The Mean Accuracy Of Test Data: 0.9648101265822785
The Mean Accuracy Of Train Data: 0.9811577058811931


# Repeated K-Fold Cross Validation

In [24]:
# Estimator under evaluation: L1-penalised logistic regression (liblinear solver).
log = LogisticRegression(
    penalty='l1',
    C=10,
    solver='liblinear',
    random_state=4,
    max_iter=10000,
)

# Splitter: 5-fold CV repeated 10 times (50 train/test splits in total), seeded.
rkfold = RepeatedKFold(n_splits=5, n_repeats=10, random_state=4)

# Fit and score the estimator on every split, keeping train and test accuracy.
clf = cross_validate(
    estimator=log,
    X=X_train,
    y=y_train,
    cv=rkfold,
    scoring='accuracy',
    return_train_score=True,
)

In [25]:
# Report the accuracy averaged over all repeated K-Fold splits.
print(f"The Mean Accuracy Of Test Data: {clf['test_score'].mean()}")
print(f"The Mean Accuracy Of Train Data: {clf['train_score'].mean()}")

The Mean Accuracy Of Test Data: 0.9660727848101266
The Mean Accuracy Of Train Data: 0.9818467695826187


# Leave One Out Cross Validation

In [26]:
# Estimator under evaluation: L2-penalised logistic regression (liblinear solver).
# NOTE(review): the other sections of this notebook use penalty='l1' -- confirm
# that 'l2' is intentional here.
log = LogisticRegression(
    penalty='l2',
    C=10,
    solver='liblinear',
    random_state=4,
    max_iter=10000,
)

# Splitter: each training row is held out exactly once as a single-sample test set.
loo = LeaveOneOut()

# Fit and score the estimator on every split, keeping train and test accuracy.
clf = cross_validate(
    estimator=log,
    X=X_train,
    y=y_train,
    cv=loo,
    scoring='accuracy',
    return_train_score=True,
)

In [27]:
# Report the accuracy averaged over all leave-one-out splits.
print(f"The Mean Accuracy Of Test Data: {clf['test_score'].mean()}")
print(f"The Mean Accuracy Of Train Data: {clf['train_score'].mean()}")

The Mean Accuracy Of Test Data: 0.9522613065326633
The Mean Accuracy Of Train Data: 0.9598432970899841


# Leave P Out Cross Validation

In [None]:
# Estimator under evaluation: L1-penalised logistic regression (liblinear solver).
log = LogisticRegression(
    penalty='l1',
    C=10,
    solver='liblinear',
    random_state=4,
    max_iter=10000,
)

# Splitter: every possible group of 3 rows is held out once as the test set.
lpo = LeavePOut(p=3)

# Work on the first 50 training rows only. The number of splits grows
# combinatorially (C(50, 3) = 19,600 here), and the full training set
# exhausted the author's machine memory.
X_train_small = X_train.iloc[:50]
y_train_small = y_train.iloc[:50]

# Estimate the generalization error: fit and score on every split,
# keeping train and test accuracy.
clf = cross_validate(
    estimator=log,
    X=X_train_small,
    y=y_train_small,
    cv=lpo,
    scoring='accuracy',
    return_train_score=True,
)

In [None]:
# Report the accuracy averaged over all leave-p-out splits.
print(f"The Mean Accuracy Of Test Data: {clf['test_score'].mean()}")
print(f"The Mean Accuracy Of Train Data: {clf['train_score'].mean()}")

# Stratified Cross Validation

In [None]:
# Estimator under evaluation: L1-penalised logistic regression (liblinear solver).
log = LogisticRegression(
    penalty='l1', C=10, solver='liblinear', random_state=4, max_iter=10000)

# Stratified K-Fold Cross-Validation: 5 shuffled folds with a fixed seed,
# each fold preserving the class proportions of y_train.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)

# Estimate the generalization error: fit and score on every fold,
# keeping train and test accuracy.
# The estimator passed in must be `log`, the model defined in this cell;
# the previous name `logit` was never defined and raised NameError.
clf = cross_validate(
    log,
    X_train,
    y_train,
    scoring='accuracy',
    return_train_score=True,
    cv=skf,  # stratified
)

In [None]:
# Report the accuracy averaged over the stratified K-Fold splits.
print(f"The Mean Accuracy Of Test Data: {clf['test_score'].mean()}")
print(f"The Mean Accuracy Of Train Data: {clf['train_score'].mean()}")

# End