<a href="https://colab.research.google.com/github/shivam2003-dev/machine_learning/blob/main/k_fold_cross_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# k-Fold Cross Validation

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [21]:
# Read the training CSV: every column except the last is a feature,
# the last column is the target.
dataset = pd.read_csv('/content/train.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

## Splitting the dataset into the Training set and Test set

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
  X, y, test_size=0.25, random_state=0
)

## Feature Scaling

In [23]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits so the test data never influences them.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Training the Kernel SVM model on the Training set

In [24]:
from sklearn.svm import SVC

# Support vector classifier with an RBF kernel, fitted on the scaled training split.
classifier = SVC(random_state=0, kernel='rbf')
classifier.fit(X_train, y_train)

## Making the Confusion Matrix

In [25]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Evaluate the fitted classifier on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
# Last expression of the cell, so the notebook displays the test accuracy.
accuracy_score(y_true=y_test, y_pred=y_pred)

[[118   6   0   0]
 [  6  97   7   0]
 [  0  15 106   4]
 [  0   0  16 125]]


0.892

## Applying k-Fold Cross Validation

In [26]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training split; report the mean and the
# spread of the ten fold accuracies.
# NOTE(review): X_train was standardised on the whole training split before
# this call, so every validation fold contributed to the scaling statistics.
# Wrapping scaler + model in a Pipeline would give a leakage-free estimate.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 87.00 %
Standard Deviation: 2.72 %


### Repeated k-Fold example


In [29]:
from sklearn.model_selection import RepeatedKFold

# Four toy samples used only to visualise the index splits. They get their own
# name so the dataset's feature matrix `X` is not overwritten; otherwise
# re-running the train/test split cell would pair this toy array with the real `y`.
X_rkf_demo = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])

# 2 folds repeated 2 times -> 4 (train, test) index pairs; seeded for reproducibility.
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=42)
for train, test in rkf.split(X_rkf_demo):
  print("%s %s" % (train, test))


[0 2] [1 3]
[1 3] [0 2]
[0 2] [1 3]
[1 3] [0 2]


### Accuracy and standard deviation (repeated k-fold)


In [30]:
# Cross-validate with the repeated k-fold splitter `rkf` (2 splits x 2 repeats),
# so each fit is trained on only half of the training split.
accuracies = cross_val_score(classifier, X_train, y_train, cv=rkf)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 82.57 %
Standard Deviation: 1.14 %


### Leave one out k-fold example


In [31]:
from sklearn.model_selection import LeaveOneOut

# Four toy samples used only to visualise the splits. They get their own name
# so `X` is not rebound to a plain Python list, which would break any re-run
# of an earlier cell that expects `X` to be the dataset's feature matrix.
X_loo_demo = [1, 2, 3, 4]

# Leave-one-out: each sample is the test set exactly once.
loo = LeaveOneOut()
for train, test in loo.split(X_loo_demo):
  print("%s %s" % (train, test))

[1 2 3] [0]
[0 2 3] [1]
[0 1 3] [2]
[0 1 2] [3]


### Accuracy and standard deviation (leave-one-out)






In [33]:
# Leave-one-out cross-validation: one fit per training sample.
# Each fold scores a single sample, so every entry of `accuracies` is 0.0 or
# 1.0; the standard deviation below reflects that, not fold-to-fold variability.
accuracies = cross_val_score(classifier, X_train, y_train, cv=loo)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 87.07 %
Standard Deviation: 33.56 %


### Stratified k-Fold example


In [37]:
from sklearn.model_selection import StratifiedKFold, KFold
import numpy as np

# Imbalanced toy data: 50 identical samples labelled with 45 zeros then 5 ones.
# NOTE: this rebinds the notebook-level `X` and `y`; the plain KFold comparison
# cell that follows reads these toy arrays.
X = np.ones((50, 1))
y = np.hstack(([0] * 45, [1] * 5))

# Print the per-class counts of each train/test fold.
skf = StratifiedKFold(n_splits=3)
for train, test in skf.split(X, y):
  print('train - {} | test - {}'.format(np.bincount(y[train]),
                                        np.bincount(y[test])))

train - [30  3] | test - [15  2]
train - [30  3] | test - [15  2]
train - [30  4] | test - [15  1]


### Stratified k-fold vs K-Fold



In [38]:
# Same toy data split with plain (unstratified) KFold for comparison:
# print the per-class counts of each train/test fold.
kf = KFold(n_splits=3)
for train, test in kf.split(X, y):
  print('train - {} | test - {}'.format(np.bincount(y[train]),
                                        np.bincount(y[test])))

train - [28  5] | test - [17]
train - [28  5] | test - [17]
train - [34] | test - [11  5]


In [39]:
# Cross-validate on the real training split with the 3-split stratified
# splitter `skf`.
accuracies = cross_val_score(classifier, X_train, y_train, cv=skf)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 83.60 %
Standard Deviation: 0.98 %


### Grid Search 


In [41]:
# Exhaustive grid search over SVM hyper-parameters, scored by 10-fold CV accuracy.
from sklearn.model_selection import GridSearchCV

# Two sub-grids: a linear kernel (C only) and an RBF kernel (C x gamma).
# That is 4 + 4*9 = 40 candidates, each evaluated on 10 folds.
parameters = [
  {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
  {
    'C': [1, 10, 100, 1000],
    'kernel': ['rbf'],
    'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
  },
]

# Build the search (n_jobs=-1 uses all available cores) and fit it on the
# training split; fit() returns the fitted search object itself.
grid_search = GridSearchCV(
  estimator=classifier,
  param_grid=parameters,
  scoring='accuracy',
  cv=10,
  n_jobs=-1,
).fit(X_train, y_train)

In [42]:
# Mean cross-validated accuracy (over the 10 folds) of the winning candidate
best_accuracy = grid_search.best_score_
# Hyper-parameter combination that produced that score
best_parameters = grid_search.best_params_
# Report both
print("Best Accuracy: {:.2f} %".format(100 * best_accuracy))
print("Best Parameters:", best_parameters)

Best Accuracy: 97.27 %
Best Parameters: {'C': 100, 'kernel': 'linear'}


### Training XGBoost on the Training set


In [43]:
from xgboost import XGBClassifier
# NOTE: this rebinds `classifier` (previously the SVC), so every later cell that
# reads `classifier` now operates on the XGBoost model. Default hyper-parameters
# are used here.
classifier = XGBClassifier()
# Fit on the scaled training split.
classifier.fit(X_train, y_train)

In [45]:
# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)

# Show the matrix and the test accuracy, mirroring the SVM evaluation cell.
# Previously `cm` was computed but never displayed, so this cell produced no
# visible result.
print(cm)
accuracy_score(y_test, y_pred)

### K-Fold Cross Validation


In [46]:
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the current `classifier` on the training split;
# report the mean and spread of the ten fold accuracies.
accuracies = cross_val_score(classifier, X_train, y_train, cv=10)
print("Accuracy: {:.2f} %".format(100 * accuracies.mean()))
print("Standard Deviation: {:.2f} %".format(100 * accuracies.std()))

Accuracy: 90.20 %
Standard Deviation: 1.91 %


### Grid Search

In [51]:
# Exhaustive grid search over tree depth and number of estimators for the
# current `classifier`, scored by 10-fold CV accuracy.
from sklearn.model_selection import GridSearchCV

# 4 depths x 6 ensemble sizes = 24 candidates, each evaluated on 10 folds.
parameters = [
  {
    'max_depth': [3, 5, 7, 10],
    'n_estimators': [100, 200, 300, 400, 500, 700],
  },
]

# Build the search (n_jobs=-1 uses all available cores) and fit it on the
# training split; fit() returns the fitted search object itself.
grid_search = GridSearchCV(
  estimator=classifier,
  param_grid=parameters,
  scoring='accuracy',
  cv=10,
  n_jobs=-1,
).fit(X_train, y_train)

In [52]:
# Mean cross-validated accuracy (over the 10 folds) of the winning candidate
best_accuracy = grid_search.best_score_
# Hyper-parameter combination that produced that score
best_parameters = grid_search.best_params_
# Report both
print("Best Accuracy: {:.2f} %".format(100 * best_accuracy))
print("Best Parameters:", best_parameters)

Best Accuracy: 91.13 %
Best Parameters: {'max_depth': 3, 'n_estimators': 500}
