In [1]:
from sklearn import svm, datasets
import pandas as pd
import itertools
import time
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# Fetch the iris dataset bundled with scikit-learn.
iris = datasets.load_iris()

# Hyper-parameter search space for the SVM: every (kernel, C) combination
# drawn from these values is evaluated later in the notebook.
parameters = dict(kernel=('linear', 'rbf'), C=[1, 10])

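## Goal: replace scikit-learn's built-in grid search with a manual implementation
The cells below reproduce, by hand, what these two calls would do:
1. `clf = GridSearchCV(svc, parameters, cv=5)`
2. `clf.fit(iris.data, iris.target)`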

In [4]:
# Build a single frame: one column per iris feature, plus the class label
# in a trailing 'target' column.
df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)

In [5]:
# Preview the first five rows to sanity-check the column layout.
df.head(n=5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
# Groundwork for stratified 5-fold cross-validation: check how many samples
# each class has, so every fold can draw the same number of rows per class.
df.loc[:, 'target'].value_counts()

2    50
1    50
0    50
Name: target, dtype: int64

In [7]:
# Peek at the first ten rows belonging to class 1.
df.loc[df['target'] == 1].head(10)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
50,7.0,3.2,4.7,1.4,1
51,6.4,3.2,4.5,1.5,1
52,6.9,3.1,4.9,1.5,1
53,5.5,2.3,4.0,1.3,1
54,6.5,2.8,4.6,1.5,1
55,5.7,2.8,4.5,1.3,1
56,6.3,3.3,4.7,1.6,1
57,4.9,2.4,3.3,1.0,1
58,6.6,2.9,4.6,1.3,1
59,5.2,2.7,3.9,1.4,1


In [8]:
# Results container for the manual grid search.
#   'params'         - every combination of the values in `parameters`
#   'mean_fit_times' - one entry per combination, filled in later
#   'fold_<i>_scores'- one list per fold (0-4), one entry per combination
#   'mean_scores' / 'scores_std' - aggregates over the five folds
cv_result_dict = {'params': list(itertools.product(*parameters.values())),
                  'mean_fit_times': []}
cv_result_dict.update(('fold_{}_scores'.format(fold_idx), []) for fold_idx in range(5))
cv_result_dict.update([('mean_scores', []), ('scores_std', [])])

In [9]:
# Display the freshly initialised results dict: 'params' holds the parameter
# combinations, and every other entry is still an empty list.
cv_result_dict

{'params': [('linear', 1), ('linear', 10), ('rbf', 1), ('rbf', 10)],
 'mean_fit_times': [],
 'fold_0_scores': [],
 'fold_1_scores': [],
 'fold_2_scores': [],
 'fold_3_scores': [],
 'fold_4_scores': [],
 'mean_scores': [],
 'scores_std': []}

In [10]:
# Manual grid search with stratified 5-fold cross-validation.
#
# For every parameter combination in cv_result_dict['params'] an SVC is fit
# on four fifths of the data and scored (accuracy) on the held-out fifth.
# Each test fold takes the same number of consecutive rows from every class,
# so the class balance is identical across folds. Rows are taken in dataset
# order; no shuffling is performed.
N_FOLDS = 5
ROWS_PER_CLASS_PER_FOLD = 10   # 50 rows per class / 5 folds
CLASS_LABELS = (0, 1, 2)

# Clear anything recorded by a previous run of this cell so that re-running
# it does not append duplicate entries next to the fixed 'params' list.
for result_key, recorded_values in cv_result_dict.items():
    if result_key != 'params':
        recorded_values.clear()

for param in cv_result_dict['params']:
    # param is a (kernel, C) tuple, in the key order of `parameters`.
    this_model = svm.SVC(kernel=param[0], C=param[1])
    this_param_fit_times = []
    this_param_scores = []
    for fold in range(N_FOLDS):
        fold_start = fold * ROWS_PER_CLASS_PER_FOLD
        fold_stop = (fold + 1) * ROWS_PER_CLASS_PER_FOLD

        # Test split: the fold-th slice of each class, stacked in label order.
        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        curr_split_test_df = pd.concat(
            [df[df['target'] == label][fold_start:fold_stop] for label in CLASS_LABELS])
        # relative complement: df \ curr_split_test_df
        curr_split_train_df = df[~df.index.isin(curr_split_test_df.index)]

        # time.clock() was removed in Python 3.8; perf_counter() is the
        # recommended high-resolution timer for measuring short durations.
        start = time.perf_counter()
        this_model.fit(curr_split_train_df.drop(columns=['target']),
                       curr_split_train_df['target'])
        end = time.perf_counter()
        this_param_fit_times.append(end - start)
        pred = this_model.predict(curr_split_test_df.drop(columns=['target']))

        # append score to the respective parameter-fold records
        # NOTE: the fold score is rounded to 2 d.p. before it is aggregated,
        # so the mean/std below are computed from the rounded values.
        score = round(accuracy_score(curr_split_test_df['target'], pred), 2)
        this_param_scores.append(score)
        cv_result_dict['fold_{}_scores'.format(fold)].append(score)

    cv_result_dict['mean_fit_times'].append(round(np.array(this_param_fit_times).mean(), 5))
    cv_result_dict['mean_scores'].append(round(np.array(this_param_scores).mean(), 3))
    cv_result_dict['scores_std'].append(round(np.array(this_param_scores).std(), 3))

In [11]:
# Show the results dict after the grid-search loop has filled in the mean fit
# times, per-fold scores, mean scores and score standard deviations.
cv_result_dict

{'params': [('linear', 1), ('linear', 10), ('rbf', 1), ('rbf', 10)],
 'mean_fit_times': [0.00289, 0.0017, 0.00179, 0.00166],
 'fold_0_scores': [0.97, 1.0, 0.97, 0.97],
 'fold_1_scores': [1.0, 1.0, 1.0, 1.0],
 'fold_2_scores': [0.97, 0.9, 0.97, 0.97],
 'fold_3_scores': [0.97, 0.97, 0.97, 0.97],
 'fold_4_scores': [1.0, 1.0, 1.0, 1.0],
 'mean_scores': [0.982, 0.974, 0.982, 0.982],
 'scores_std': [0.015, 0.039, 0.015, 0.015]}