# CH05_4_Implement Cross Validation using sklearn
- update : 2022.03.29.

# Previous Chapter Results

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import SGDClassifier 

## Verification Dataset Preparation

In [2]:
# -----------------------------------------------------
# 1. Dataset Preparation 
# -----------------------------------------------------
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

x = cancer.data 
y = cancer.target 

x_train_all, x_test, y_train_all, y_test = train_test_split(x, y, stratify=y, test_size=0.2, 
                                                            random_state=42)

# -----------------------------------------------------
# 2. Verificatioin set seperation 
# -----------------------------------------------------
x_train, x_val, y_train, y_val = train_test_split(x_train_all, y_train_all, 
                                                  stratify=y_train_all, test_size=0.2, random_state=42)

print (len(x_train), len(x_val))

# -----------------------------------------------------
# 3. Model Evaluation using Verification set 
# -----------------------------------------------------
sgd = SGDClassifier(loss ='log', random_state=42)
sgd.fit(x_train, y_train)
sgd.score(x_val, y_val)

# -----------------------------------------------------
# Model Trainning and Evaluating using support vector machine
# -----------------------------------------------------
from sklearn.linear_model import SGDClassifier 

sgd = SGDClassifier(loss ='hinge', random_state=42)
sgd.fit(x_train, y_train)
sgd.score(x_test, y_test)

364 91


0.9035087719298246

# Application of Regularization on Logistic Regression 

# Updated Generating Single layer Neural Network

In [3]:
class SingleLayer: 
    
    #--------------------------------------------
    # Add variables for recording verification losses
    #--------------------------------------------
    def __init__(self, learning_rate=0.1, l1=0, l2=0): 
        self.w = None 
        self.b = None 
        self.losses = []
        self.val_losses = []
        self.w_history = []
        self.lr = learning_rate   
        self.l1 = l1
        self.l2 = l2
        
    def forpass(self, x):
        z = np.sum(x * self.w) + self.b      # Calculating line function 
        return z 
    
    def backprop(self, x, err):
        w_grad = x * err                     # Calculating gradient on weight 
        b_grad = 1 * err                     # Calculating gradient on interscept 
        return w_grad, b_grad 
    
    #--------------------------------------------
    # Generating activation() method
    #--------------------------------------------
    def activation(self, z): 
        a = 1 / (1 + np.exp(-z))             # sigmoid calculation 
        return a     
    
    #--------------------------------------------
    # Generating fit() method 
    #--------------------------------------------  
    # mixing sample order of trainning set in each epoch 
    #--------------------------------------------  
    # 3. Recording weight and Managing update quantity 
    #--------------------------------------------  
    def fit(self, x, y, epochs=100, x_val=None, y_val=None):
        self.w = np.ones(x.shape[1])              # initializing weight 
        self.b = 0                                # initializing intercept
        self.w_history.append(self.w.copy())      # recording weight
        np.random.seed(42)                        # designating random seed

        for i in range(epochs):                   # repeate epoches times 
            loss = 0 

            # shuffle indices 
            indexes = np.random.permutation(np.arange(len(x))) 
            for i in indexes:                               # repeat for each sample 
                z = self.forpass(x[i])                      # forward propagation caculation 
                a = self.activation(z)                      # activation function application 
                err = -(y[i] - a)                           # error calculation  
                w_grad, b_grad = self.backprop(x[i], err)   # backward propagation calculation 

                # adding defferentiation of pannelty in Gradient
                w_grad += self.l1 * np.sign(self.w) + self.l2 * self.w 
                self.w -= self.lr * w_grad                  # updating weight 
                self.b -= b_grad                            # updating interscept 

                # recording weight 
                self.w_history.append(self.w.copy())

                # accumulating losses after clipping for safe log calculation 
                a = np.clip(a, 1e-10, 1-1e-10)
                loss += -(y[i]*np.log(a)+(1-y[i])*np.log(1-a))

            # saving average losses in each epoch 
            self.losses.append(loss/len(y))

            # calculating losses for verification set 
            self.update_val_loss(x_val, y_val)

    #--------------------------------------------
    # Add Pannelty on Logistic Loss Function Calculation    
    #--------------------------------------------
    def reg_loss(self): 
        return self.l1 * np.sum(np.abs(self.w)) + self.l2 / 2 * np.sum(self.w**2)

    #--------------------------------------------
    # Modifying update_val_loss() method to call reg_loss()
    #--------------------------------------------
    def update_val_loss(self, x_val, y_val): 
        if x_val is None: 
            return 
        val_loss = 0 

        for i in range(len(x_val)): 
            z = self.forpass(x_val[i])
            a = self.activation(z)
            a = np.clip(a, 1e-10, 1-1e-10)
            val_loss += -(y_val[i] * np.log(a) + (1-y_val[i]) * np.log(1-a))
        self.val_losses.append(val_loss/len(y_val) + self.reg_loss())
         
    #--------------------------------------------
    # Calculation of verification losses
    #--------------------------------------------
    def update_val_loss(self, x_val, y_val):
        if x_val is None: 
            return 
        val_loss = 0 
        for i in range(len(x_val)):
            z = self.forpass(x_val[i])               # forward propagation calculation      
            a = self.activation(z)                   # activation function application 
            a = np.clip(a, 1e-10, 1-1e-10)
            val_loss += -(y_val[i]*np.log(a) + (1-y_val[i])*np.log(1-a))
        self.val_losses.append(val_loss/len(y_val))
        
    #--------------------------------------------
    # Generating Predicting method 
    #--------------------------------------------
    # adding Score method()
    #--------------------------------------------
    def predict(self, x): 
        z = [self.forpass(x_i) for x_i in x]        # forward propogation calculation 
        return np.array(z) > 0                      # step function application    

    def score(self, x, y): 
        return np.mean(self.predict(x) ==y)            

# Implement K-fold Cross Validation 

## 1. Apply Tainning set

In [4]:
validation_scores = []

## 2. K-fold Cross Validation Implementation 

In [5]:
k = 10
bins = len(x_train) // k

for i in range(k):
    start = i * bins
    end = (i+1) * bins
    val_fold = x_train_all[start:end]
    val_target = y_train_all[start:end]
    
    train_index = list(range(0, start))+list(range(end, len(x_train)))
    train_fold = x_train_all[train_index]
    train_target = y_train_all[train_index]
    
    train_mean = np.mean(train_fold, axis = 0)
    train_std = np.std(train_fold, axis = 0)
    train_fold_scaled = (train_fold - train_mean) / train_std
    val_fold_scaled = (val_fold - train_mean) / train_std
    
    lyr = SingleLayer(l2 = 0.01)
    lyr.fit(train_fold_scaled, train_target, epochs = 50)
    score = lyr.score(val_fold_scaled, val_target)
    validation_scores.append(score)
    
print(np.mean(validation_scores))

0.9583333333333333


# Implementation of Cross Validation using sklearn

## 1. Cross Validation Score Calculation using cross_validate() function 

In [6]:
from sklearn.model_selection import cross_validate

sgd = SGDClassifier(loss = 'log', penalty = 'l2', alpha = 0.001, random_state = 42)
scores = cross_validate(sgd, x_train_all, y_train_all, cv = 10)

print(np.mean(scores['test_score']))

0.850096618357488


# Implement Cross Validation including preprocessing procedure

## 1. Implementing Cross Validation using Pipeline Class

In [7]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

pipe = make_pipeline(StandardScaler(), sgd)
scores = cross_validate(pipe, x_train_all, y_train_all, cv=10, return_train_score=True)
print(np.mean(scores['test_score']))

0.9694202898550724


In [8]:
print(np.mean(scores['train_score']))

0.9875478561631581
