# 5-Fold Cross Validation

In [1]:
# import libraries
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# The raw file has no header row, so the column names are supplied here.
_headers = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
    'car',
]

# Location of the cars dataset.
_car_data_url = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter07/Dataset/car.data'

# Load the file, labelling the columns with _headers; index_col=None means no
# column of the file is used as the row index.
df = pd.read_csv(
    _car_data_url,
    names=_headers,
    index_col=None,
)

# Report the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
buying      1728 non-null object
maint       1728 non-null object
doors       1728 non-null object
persons     1728 non-null object
lug_boot    1728 non-null object
safety      1728 non-null object
car         1728 non-null object
dtypes: object(7)
memory usage: 94.6+ KB


In [3]:
# Split the data: 80% of the rows go to training_df, the remaining rows to eval_df.
# random_state=1 seeds the shuffle so the same split is produced on every run.
training_df, eval_df = train_test_split(df, train_size=0.8, random_state=1)

In [4]:
# Preview the first five rows of the training split.
training_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car
1579,low,med,4,4,med,med,good
634,high,high,5more,4,med,med,acc
299,vhigh,med,5more,2,small,high,unacc
1085,med,med,2,2,med,high,unacc
1659,low,low,3,4,med,low,unacc


In [5]:
# Preview the first five rows of the evaluation split.
eval_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,car
1233,med,low,3,more,small,low,unacc
592,high,high,3,more,big,med,acc
625,high,high,5more,2,med,med,unacc
1546,low,med,3,2,big,med,unacc
730,high,med,5more,2,small,med,unacc


## KFold

In [6]:
from sklearn.model_selection import KFold

In [7]:
# Number of folds to use for K-fold cross-validation.
n_splits = 5

In [8]:
# Create a KFold splitter that partitions the rows into n_splits folds.
# NOTE(review): shuffle is left at its default, and the printed validation folds
# cover contiguous index ranges (0-345, 346-691, ...). If the rows of the file are
# ordered, consider KFold(n_splits=n_splits, shuffle=True, random_state=...) -- confirm.
_kf = KFold(n_splits=n_splits)

In [9]:
# Create the fold splits. _kf.split returns a one-shot iterator of
# (training indices, validation indices) pairs, so re-run this cell before
# re-running any cell that consumes _indices.
# NOTE(review): the folds are drawn from the full df rather than training_df, so
# rows of eval_df also appear in the folds -- confirm this is intended.
_indices = _kf.split(df)

In [10]:
# Accumulators for the per-fold DataFrames: _t collects the training DataFrame of
# each fold, _v the matching validation DataFrame.
_t, _v = [], []

In [11]:
# Materialise every fold as a (training, validation) pair of DataFrames.
# KFold.split yields *positional* row indices, so the rows are selected with
# .iloc; df.drop(idx) would treat them as index labels, which is only correct
# while df keeps its default 0..n-1 index.
for train_idx, val_idx in _indices:
    # rows of df that make up this fold's training part
    _train_df = df.iloc[train_idx]
    _t.append(_train_df)
    # rows of df held out for validation in this fold
    _val_df = df.iloc[val_idx]
    _v.append(_val_df)

In [12]:
# Summarise every training fold.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
for d in _t:
    d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 346 to 1727
Data columns (total 7 columns):
buying      1382 non-null object
maint       1382 non-null object
doors       1382 non-null object
persons     1382 non-null object
lug_boot    1382 non-null object
safety      1382 non-null object
car         1382 non-null object
dtypes: object(7)
memory usage: 86.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 0 to 1727
Data columns (total 7 columns):
buying      1382 non-null object
maint       1382 non-null object
doors       1382 non-null object
persons     1382 non-null object
lug_boot    1382 non-null object
safety      1382 non-null object
car         1382 non-null object
dtypes: object(7)
memory usage: 86.4+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1382 entries, 0 to 1727
Data columns (total 7 columns):
buying      1382 non-null object
maint       1382 non-null object
doors       1382 non-null object
persons     1382 non-null o

In [13]:
# Summarise every validation fold.
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line after each report.
for d in _v:
    d.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 0 to 345
Data columns (total 7 columns):
buying      346 non-null object
maint       346 non-null object
doors       346 non-null object
persons     346 non-null object
lug_boot    346 non-null object
safety      346 non-null object
car         346 non-null object
dtypes: object(7)
memory usage: 21.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 346 to 691
Data columns (total 7 columns):
buying      346 non-null object
maint       346 non-null object
doors       346 non-null object
persons     346 non-null object
lug_boot    346 non-null object
safety      346 non-null object
car         346 non-null object
dtypes: object(7)
memory usage: 21.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
Int64Index: 346 entries, 692 to 1037
Data columns (total 7 columns):
buying      346 non-null object
maint       346 non-null object
doors       346 non-null object
persons     346 non-null object
lug_boot    346