# <center style='color:red'>`K-fold cross validation` using Scikit-Learn</center>

# 1. Import required libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn import svm
from sklearn.naive_bayes import CategoricalNB

# 2. Load `car_evaluation` dataset

In [2]:
# Read the dataset from a CSV file located relative to the working directory.
df = pd.read_csv('car_evaluation.csv')
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# (rows, columns) of the loaded frame.
df.shape

(1728, 7)

In [4]:
# Column dtypes and non-null counts, to spot missing values and string-typed columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [5]:
# Distinct values of `doors` and how often each one occurs.
doors_column = df['doors']
doors_column.unique(), doors_column.value_counts()

(array(['2', '3', '4', '5more'], dtype=object),
 2        432
 3        432
 4        432
 5more    432
 Name: doors, dtype: int64)

In [6]:
# Distinct values of `persons` and how often each one occurs.
persons_column = df['persons']
persons_column.unique(), persons_column.value_counts()

(array(['2', '4', 'more'], dtype=object),
 2       576
 4       576
 more    576
 Name: persons, dtype: int64)

# 3. Perform preprocessing

In [7]:
# Lookup tables turning the string categories of `doors` and `persons` into integers.
# The open-ended categories have no exact number in the data: '5more' is stored as 5
# and 'more' as 5.
doors_dict = {'2': 2, '3': 3, '4': 4, '5more': 5}
persons_dict = {'2': 2, '4': 4, 'more': 5}

# Plain dictionary indexing raises KeyError for any value missing from the tables,
# so an unexpected category cannot slip through silently.
df['doors'] = [doors_dict[raw_value] for raw_value in df['doors']]
df['persons'] = [persons_dict[raw_value] for raw_value in df['persons']]

In [8]:
# Check `doors` after the integer mapping: distinct values and their counts.
doors_column = df['doors']
doors_column.unique(), doors_column.value_counts()

(array([2, 3, 4, 5]),
 2    432
 3    432
 4    432
 5    432
 Name: doors, dtype: int64)

In [9]:
# Check `persons` after the integer mapping: distinct values and their counts.
persons_column = df['persons']
persons_column.unique(), persons_column.value_counts()

(array([2, 4, 5]),
 2    576
 4    576
 5    576
 Name: persons, dtype: int64)

In [10]:
# Re-check the dtypes now that `doors` and `persons` have been mapped to integers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   int64 
 3   persons   1728 non-null   int64 
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: int64(2), object(5)
memory usage: 94.6+ KB


In [11]:
# Target labels and their frequencies (shows how balanced the classes are).
class_column = df['class']
class_column.unique(), class_column.value_counts()

(array(['unacc', 'acc', 'vgood', 'good'], dtype=object),
 unacc    1210
 acc       384
 good       69
 vgood      65
 Name: class, dtype: int64)

In [12]:
# Names of the columns still held as strings (object dtype); these still need encoding.
cat_features = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == 'object'
]
cat_features

['buying', 'maint', 'lug_boot', 'safety', 'class']

In [13]:
# Replace every string column (features and the target) with integer codes.
# One encoder object is refit for each column in turn, so once the loop ends it only
# holds the mapping of the last column processed.
# NOTE(review): the codes follow sorted label order (e.g. acc=0, good=1, unacc=2, vgood=3),
# not the natural low < med < high ordering — confirm this is acceptable for a linear model.
labelencoder = LabelEncoder()
for column_name in cat_features:
    encoded_values = labelencoder.fit_transform(df[column_name])
    df[column_name] = encoded_values
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,2,2,2,1,2
1,3,3,2,2,2,2,2
2,3,3,2,2,2,0,2
3,3,3,2,2,1,1,2
4,3,3,2,2,1,2,2


In [14]:
# Target codes and their frequencies after label encoding.
class_column = df['class']
class_column.unique(), class_column.value_counts()

(array([2, 0, 3, 1]),
 2    1210
 0     384
 1      69
 3      65
 Name: class, dtype: int64)

# 4. Separate features and classes

In [15]:
# Features are every column except the last one; the target is the last column.
X = df[df.columns[:-1]]
y = df[df.columns[-1]]

# 5. Split the dataset

In [16]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): no `stratify` argument is passed although the classes are imbalanced —
# confirm an unstratified hold-out split is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    train_size=0.75,
    random_state=42,
)
# Sizes of the four pieces, in the order they were unpacked above.
tuple(len(part) for part in (X_train, X_test, y_train, y_test))

(1296, 432, 1296, 432)

# 6. Apply `Support Vector Machine`

In [17]:
# Support vector classifier with a linear kernel and C=2.0, fitted on the training split only.
model = svm.SVC(kernel='linear', C=2.0, random_state=42)
model.fit(X_train, y_train)

In [18]:
# Accuracy of the fitted classifier on the held-out test split, shown to 4 decimals.
holdout_accuracy = model.score(X_test, y_test)
round(holdout_accuracy, 4)

0.706

# 7. K-fold cross validation using sklearn's `cross_val_score`

In [19]:
# 12-fold cross-validated accuracy of the SVC over the full dataset.
scores = cross_val_score(model, X, y, cv=12, scoring='accuracy')

# Report each fold (numbered from 1), iterating over the returned scores directly so
# the fold count is not repeated as a second literal.
for fold_number, fold_score in enumerate(scores, start=1):
    print(f'Accuracy for fold {fold_number}: {round(fold_score, 4)}')

Accuracy for fold 1: 0.7083
Accuracy for fold 2: 0.6806
Accuracy for fold 3: 0.7847
Accuracy for fold 4: 0.7778
Accuracy for fold 5: 0.7222
Accuracy for fold 6: 0.75
Accuracy for fold 7: 0.7431
Accuracy for fold 8: 0.7778
Accuracy for fold 9: 0.7014
Accuracy for fold 10: 0.7153
Accuracy for fold 11: 0.7153
Accuracy for fold 12: 0.6944


In [20]:
# Average accuracy across the folds, shown to 4 decimals.
mean_cv_accuracy = scores.mean()
round(mean_cv_accuracy, 4)

0.7309

# 8. K-fold cross validation using sklearn's `KFold`

In [21]:
# Plain K-fold splitter with 12 folds; no shuffling or seed is requested here.
# NOTE(review): per-fold accuracies from this splitter range from about 0.56 to 0.90,
# which suggests the row order matters — consider shuffle=True with a fixed random_state.
kf = KFold(n_splits=12)

In [22]:
# Manual K-fold evaluation: for every split produced by `kf`, fit a fresh linear SVC on
# the training rows and measure its accuracy on the held-out rows.
# Fold-scoped names are used so the earlier hold-out split (X_train, X_test, y_train,
# y_test) and the fitted `model` are not silently overwritten by the last fold.
scores_list = []
X_values = X.values  # hoisted: the array conversion does not depend on the fold

for fold_number, (train_index, test_index) in enumerate(kf.split(X_values, y), start=1):
    X_fold_train, X_fold_test = X_values[train_index], X_values[test_index]
    # The splitter yields positional indices, so the Series is sliced with .iloc;
    # plain y[...] is label-based and only agrees while the index is exactly 0..n-1.
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    fold_model = svm.SVC(kernel='linear', C=2.0, random_state=42)
    fold_model.fit(X_fold_train, y_fold_train)

    # Score once and reuse the value for both the report and the running list.
    fold_accuracy = fold_model.score(X_fold_test, y_fold_test)
    print(f'Accuracy for fold {fold_number}: {round(fold_accuracy, 4)}')
    # Keep full precision here; rounding belongs to presentation, not to the average.
    scores_list.append(fold_accuracy)

Accuracy for fold 1: 0.8958
Accuracy for fold 2: 0.7708
Accuracy for fold 3: 0.6944
Accuracy for fold 4: 0.8333
Accuracy for fold 5: 0.8056
Accuracy for fold 6: 0.7778
Accuracy for fold 7: 0.75
Accuracy for fold 8: 0.6667
Accuracy for fold 9: 0.5556
Accuracy for fold 10: 0.7153
Accuracy for fold 11: 0.5556
Accuracy for fold 12: 0.5556


In [23]:
# Mean accuracy over the manually evaluated K-fold splits, shown to 4 decimals.
mean_kfold_accuracy = np.mean(scores_list)
round(mean_kfold_accuracy, 4)

0.7147

In [24]:
# Same evaluation through cross_val_score, passing the KFold splitter explicitly;
# the mean should agree with the manual loop.
kfold_cv_scores = cross_val_score(model, X, y, cv=kf, scoring='accuracy')
round(kfold_cv_scores.mean(), 4)

0.7147

# 9. K-fold cross validation using sklearn's `StratifiedKFold`

In [25]:
# Stratified splitter with 12 folds; the split call receives `y` so folds can be built
# per class (see the scikit-learn StratifiedKFold documentation for the exact guarantee).
skf = StratifiedKFold(n_splits=12)

In [26]:
# Manual stratified K-fold evaluation: for every split produced by `skf`, fit a fresh
# linear SVC on the training rows and measure its accuracy on the held-out rows.
# Fold-scoped names are used so the earlier hold-out split (X_train, X_test, y_train,
# y_test) and the fitted `model` are not silently overwritten by the last fold.
scores_list_2 = []
X_values = X.values  # hoisted: the array conversion does not depend on the fold

for fold_number, (train_index, test_index) in enumerate(skf.split(X_values, y), start=1):
    X_fold_train, X_fold_test = X_values[train_index], X_values[test_index]
    # The splitter yields positional indices, so the Series is sliced with .iloc;
    # plain y[...] is label-based and only agrees while the index is exactly 0..n-1.
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]

    fold_model = svm.SVC(kernel='linear', C=2.0, random_state=42)
    fold_model.fit(X_fold_train, y_fold_train)

    # Score once and reuse the value for both the report and the running list.
    fold_accuracy = fold_model.score(X_fold_test, y_fold_test)
    print(f'Accuracy for fold {fold_number}: {round(fold_accuracy, 4)}')
    # Keep full precision here; rounding belongs to presentation, not to the average.
    scores_list_2.append(fold_accuracy)

Accuracy for fold 1: 0.7083
Accuracy for fold 2: 0.6806
Accuracy for fold 3: 0.7847
Accuracy for fold 4: 0.7778
Accuracy for fold 5: 0.7222
Accuracy for fold 6: 0.75
Accuracy for fold 7: 0.7431
Accuracy for fold 8: 0.7778
Accuracy for fold 9: 0.7014
Accuracy for fold 10: 0.7153
Accuracy for fold 11: 0.7153
Accuracy for fold 12: 0.6944


In [27]:
# Mean accuracy over the manually evaluated stratified splits, shown to 4 decimals.
mean_stratified_accuracy = np.mean(scores_list_2)
round(mean_stratified_accuracy, 4)

0.7309

In [28]:
# Same evaluation through cross_val_score, passing the stratified splitter explicitly.
stratified_cv_scores = cross_val_score(model, X, y, cv=skf, scoring='accuracy')
round(stratified_cv_scores.mean(), 4)

0.7309

In [29]:
# Passing the fold count as an integer instead of a splitter object; compare the
# result with the explicit-splitter runs.
int_cv_scores = cross_val_score(model, X, y, cv=12, scoring='accuracy')
round(int_cv_scores.mean(), 4)

0.7309

# 10. Apply `Categorical Naive Bayes`

In [30]:
# 12-fold cross-validated accuracy of a Categorical Naive Bayes classifier with default
# settings, on the same features and target as the SVC.
clf = CategoricalNB()
nb_cv_scores = cross_val_score(clf, X, y, cv=12, scoring='accuracy')
round(nb_cv_scores.mean(), 4)

0.776

## Conclusion: with 12-fold cross-validation, `Categorical Naive Bayes` achieves a higher mean accuracy (0.776) than the linear `Support Vector Machine` (0.7309).