# K Fold Cross Validation
## (Evaluating model Performance)

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits

In [2]:
digits = load_digits()

In [3]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3)

### Creating the classifiers

In [4]:
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9592592592592593

In [5]:
svm = SVC()
svm.fit(x_train, y_train)
svm.score(x_test, y_test)

0.9851851851851852

In [6]:
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
rf.score(x_test, y_test)

0.9685185185185186

### Using K fold cross validation

In [7]:
from sklearn.model_selection import KFold
kf = KFold(n_splits = 3)
kf

KFold(n_splits=3, random_state=None, shuffle=False)

In [8]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)

[3 4 5 6 7 8] [0 1 2]
[0 1 2 6 7 8] [3 4 5]
[0 1 2 3 4 5] [6 7 8]


In [9]:
def get_score(model, x_train, x_test, y_train, y_test):
    model.fit(x_train, y_train)
    return model.score(x_test, y_test)

In [10]:
get_score(LogisticRegression(), x_train, x_test, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.9592592592592593

In [11]:
get_score(SVC(), x_train, x_test, y_train, y_test)

0.9851851851851852

In [12]:
get_score(RandomForestClassifier(), x_train, x_test, y_train, y_test)

0.9777777777777777

In [13]:
# Now using K fold on digits dataset
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits = 3)

In [14]:
scores_lr = []
scores_svm = []
scores_rf = []

In [15]:
for train_index, test_index in kf.split(digits.data):
    x_train, x_test, y_train, y_test = digits.data[train_index], digits.data[test_index], digits.target[train_index], digits.target[test_index]
    
    scores_lr.append(get_score(LogisticRegression(max_iter=10000), x_train, x_test, y_train, y_test))
    scores_svm.append(get_score(SVC(), x_train, x_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), x_train, x_test, y_train, y_test))    

In [16]:
scores_lr

[0.9282136894824707, 0.9415692821368948, 0.9165275459098498]

In [17]:
scores_svm

[0.9666110183639399, 0.9816360601001669, 0.9549248747913188]

In [18]:
scores_rf

[0.9298831385642737, 0.9565943238731218, 0.9232053422370617]

In [19]:
from sklearn.model_selection import cross_val_score

In [20]:
cross_val_score(LogisticRegression(max_iter=3000), digits.data, digits.target)

array([0.925     , 0.875     , 0.93871866, 0.93314763, 0.89693593])

In [21]:
cross_val_score(SVC(), digits.data, digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [22]:
cross_val_score(RandomForestClassifier(n_estimators=40), digits.data, digits.target)

array([0.925     , 0.90833333, 0.9637883 , 0.96100279, 0.91364903])

In [23]:
cross_val_score(RandomForestClassifier(n_estimators=15), digits.data, digits.target)

array([0.90277778, 0.83611111, 0.94150418, 0.95264624, 0.90250696])

In [24]:
# Hence SVC classifier model performs much better than others

### Another Task

Use iris flower dataset from sklearn library and use cross_val_score against following models to measure the performance of each. In the end figure out the model with best performance,

1. Logistic Regression
2. SVM
3. Decision Tree
4. Random Forest

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np

In [26]:
from sklearn.datasets import load_iris
iris = load_iris()

In [27]:
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [28]:
from sklearn.model_selection import cross_val_score

#### Logistic Regression

In [29]:
lr_scores = cross_val_score(LogisticRegression(max_iter=5000), iris.data, iris.target)
lr_scores

array([0.96666667, 1.        , 0.93333333, 0.96666667, 1.        ])

In [30]:
np.average(lr_scores)

0.9733333333333334

#### Support Vector Machine (SVM)

In [31]:
svm_scores = cross_val_score(SVC(), iris.data, iris.target)
svm_scores

array([0.96666667, 0.96666667, 0.96666667, 0.93333333, 1.        ])

In [32]:
np.average(svm_scores)

0.9666666666666666

#### Decision Tree

In [33]:
d_scores = cross_val_score(DecisionTreeClassifier(), iris.data, iris.target)
d_scores

array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ])

In [34]:
np.average(d_scores)

0.9533333333333334

#### Random Forest

In [35]:
rf_scores = cross_val_score(RandomForestClassifier(n_estimators=30), iris.data, iris.target)
rf_scores

array([0.96666667, 0.96666667, 0.93333333, 0.93333333, 1.        ])

In [36]:
np.average(rf_scores)

0.96

#### Hence the best score so far is from Logistic Regression: 0.9733333333333334