# Overfitting Test

In [1]:
import sklearn
from sklearn.datasets import load_breast_cancer

## Load breast cancer data from sklearn

In [2]:
# Fetch the breast cancer dataset bundled with scikit-learn
data = load_breast_cancer()

In [3]:
# Pull the pieces we need out of the dataset bundle: the class names and
# per-sample class labels, then the feature names and the feature matrix.
# (The bundle exposes each key as an attribute as well.)
label_names = data.target_names
labels = data.target
feature_names = data.feature_names
features = data.data

In [4]:
# Sanity check: show the class names, then the first label, the first
# feature name and the first row of feature values, one per line.
for sample in (label_names, labels[0], feature_names[0], features[0]):
    print(sample)

['malignant' 'benign']
0
mean radius
[1.799e+01 1.038e+01 1.228e+02 1.001e+03 1.184e-01 2.776e-01 3.001e-01
 1.471e-01 2.419e-01 7.871e-02 1.095e+00 9.053e-01 8.589e+00 1.534e+02
 6.399e-03 4.904e-02 5.373e-02 1.587e-02 3.003e-02 6.193e-03 2.538e+01
 1.733e+01 1.846e+02 2.019e+03 1.622e-01 6.656e-01 7.119e-01 2.654e-01
 4.601e-01 1.189e-01]


## Split Data into training and test set

In [5]:
from sklearn.model_selection import train_test_split


# Hold out 33% of the samples for testing; the fixed random_state makes the
# split reproducible across runs.
split = train_test_split(features, labels, test_size=0.33, random_state=42)
train, test, train_labels, test_labels = split

## Train your model

In [9]:
from sklearn.naive_bayes import GaussianNB

# Create a Gaussian Naive Bayes classifier and fit it on the training split.
gnb = GaussianNB()
gnb.fit(train, train_labels)

# fit() returns the estimator itself, so `model` is the same fitted object.
model = gnb

In [10]:
# Predict a class for every held-out test sample and show the raw predictions
preds = model.predict(test)
print(preds)

[1 0 0 1 1 0 0 0 1 1 1 0 1 0 1 0 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0
 1 0 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 1 1 0 0 1 0
 1 1 1 1 1 1 0 1 1 0 0 0 0 0 1 1 1 1 1 1 1 1 0 0 1 0 0 1 0 0 1 1 1 0 1 1 0
 1 1 0 0 0 1 1 1 0 0 1 1 0 1 0 0 1 1 0 0 0 1 1 1 0 1 1 0 0 1 0 1 1 0 1 0 0
 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 1 1 0 0
 0 1 1]


## Evaluate the model

In [11]:
from sklearn.metrics import accuracy_score


# Fraction of test samples whose predicted class matches the true label
test_accuracy = accuracy_score(y_true=test_labels, y_pred=preds)
print(test_accuracy)

0.9414893617021277


## Check for overfitting

In [12]:
# Predict on the very samples the model was fitted on, so the training-set
# accuracy can be compared with the test-set accuracy
train_preds = model.predict(train)

In [13]:
# Show the raw training-set predictions
print(train_preds)

[0 1 0 1 1 1 1 0 1 1 0 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 0 1 0 0 1
 1 0 1 1 1 1 1 1 1 1 0 0 1 1 0 1 1 0 1 0 1 0 1 1 1 1 0 1 1 1 0 1 0 1 0 1 0
 1 0 0 1 1 1 1 0 1 1 1 0 1 1 0 0 1 0 1 1 1 1 1 1 1 0 1 1 1 0 1 0 1 1 1 0 1
 0 0 1 1 0 1 0 0 1 1 1 1 0 1 1 0 1 0 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 0
 1 1 0 0 0 0 1 1 1 0 1 0 0 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 1 1 0 1 1 0
 0 1 0 1 0 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 0 0 1 0 1 1 0 0 0 1
 1 1 1 1 1 1 0 1 1 1 0 1 1 0 1 1 0 1 0 0 1 1 0 1 0 0 1 0 1 1 1 0 1 1 1 1 0
 1 1 0 0 0 1 1 1 0 0 1 0 0 1 1 1 0 1 0 0 0 1 0 1 0 1 1 0 0 0 0 1 0 1 1 1 1
 1 1 1 0 0 0 0 1 1 1 1 0 1 0 1 1 1 1 1 0 0 0 1 1 0 1 1 0 0 0 0 1 1 1 0 1 1
 1 0 1 0 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0 0 1 1 1 0 0
 1 0 0 1 0 1 1 1 1 0 1]


In [14]:
# Training-set accuracy. accuracy_score expects (y_true, y_pred), so the
# ground-truth labels go first, consistent with the test-set evaluation cells.
print(accuracy_score(train_labels, train_preds))

0.9396325459317585


In [15]:
# Predict on the held-out test split (the same call that produced `preds`)
test_preds = model.predict(test)

In [16]:
# Test-set accuracy, for side-by-side comparison with the training-set accuracy
print(accuracy_score(test_labels, test_preds))

0.9414893617021277


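As you can see, the accuracy scores for the training data (0.939) and the test data (0.941) are very similar. Hence our model did not overfit. If you suspect that a model overfits, you can perform k-fold cross-validation: it does not remove overfitting by itself, but it gives a more reliable estimate of how well the model generalizes than a single train/test split, which helps you detect the issue and evaluate fixes. Let's implement k-fold cross-validation.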

## K-fold Cross Validation

In [26]:
from sklearn.model_selection import KFold
import pandas as pd
import numpy as np

# k-fold initialization. Without shuffle=True, KFold cuts the rows into k
# consecutive blocks in dataset order; random_state only matters when
# shuffling, so it is not passed here. Use shuffle=True together with a fixed
# random_state if randomized (yet reproducible) folds are wanted.
k = 5
kf = KFold(n_splits=k)

# we will load data differently: as a DataFrame whose last column is the target
cancer_data = load_breast_cancer()
df = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)
df['target'] = pd.Series(cancer_data.target)
X = df.iloc[:, :-1]   # every column except the trailing 'target'
y = df.iloc[:, -1]    # the 'target' column

# Initialize our classifier. It is intentionally not fitted here: each fold
# below calls fit() on that fold's training rows, so any earlier fit would be
# discarded (and would tie this cell to the hold-out split from another cell).
gnb = GaussianNB()
model = gnb

# Accuracy on the held-out part of each fold, in fold order
acc_score = []

for train_index, test_index in kf.split(X):
    # kf.split yields positional row indices, so use .iloc for both X and y
    X_train, X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    model.fit(X_train, y_train)
    pred_values = model.predict(X_test)

    # accuracy_score expects (y_true, y_pred)
    acc = accuracy_score(y_test, pred_values)
    acc_score.append(acc)

# Average over the folds that were actually scored
avg_acc_score = sum(acc_score) / len(acc_score)

print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.8771929824561403, 0.9210526315789473, 0.956140350877193, 0.9736842105263158, 0.9557522123893806]
Avg accuracy : 0.9367644775655954


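Now that the model has been trained and evaluated on several different folds of (nearly) equal size, each fold's score comes from data the model did not see during that round of training. A model that had merely memorized its training data, i.e. overfit, would score poorly on these held-out folds, so a consistently high average accuracy indicates that it generalizes.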