<h1 style="text-align: center; font-size: 60px;">Assignment 10</h1>
<h3 style="text-align: center; font-size: 20px;">(submitted on 17th Sept 2025)</h3>

# K-Fold Cross Validation 

In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.datasets import load_digits
import matplotlib.pyplot as plt

digits = load_digits()

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.3)

#### Logistic Regression

In [47]:
lr = LogisticRegression(max_iter=500)
lr.fit(X_train, y_train)
lr.score(X_test, y_test)

0.9165275459098498

#### SVM

In [24]:
svm = SVC(gamma='auto')
svm.fit(X_train, y_train)
svm.score(X_test, y_test)

0.4666666666666667

#### Random Forest

In [25]:
rf = RandomForestClassifier(n_estimators=40)
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

0.9611111111111111

### KFold cross validation

##### Basic example

In [26]:
from sklearn.model_selection import KFold
kf = KFold()
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [28]:
for train_index, test_index in kf.split([1,2,3,4,5,6,7,8,9]):
    print(train_index, test_index)
# shows how the data is split into train and test.

[2 3 4 5 6 7 8] [0 1]
[0 1 4 5 6 7 8] [2 3]
[0 1 2 3 6 7 8] [4 5]
[0 1 2 3 4 5 8] [6 7]
[0 1 2 3 4 5 6 7] [8]


##### Use KFold for our digits example

In [30]:
# A common method to check score of any model.

def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return model.score(X_test, y_test)

In [46]:
# Example:
print(get_score(LogisticRegression(max_iter=500), X_train, X_test, y_train, y_test))
print(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
print(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))

0.9165275459098498
0.5125208681135225
0.9148580968280468


In [40]:
from sklearn.model_selection import StratifiedKFold
folds = StratifiedKFold(n_splits=3)

In [48]:
scores_logistic = []
scores_svm = []
scores_rf = []

for train_index, test_index in folds.split(digits.data,digits.target):
    X_train, X_test, y_train, y_test = digits.data[train_index], digits.data[test_index], \
                                       digits.target[train_index], digits.target[test_index]
    scores_logistic.append(get_score(LogisticRegression(max_iter=500), X_train, X_test, y_train, y_test))  
    scores_svm.append(get_score(SVC(gamma='auto'), X_train, X_test, y_train, y_test))
    scores_rf.append(get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test))

In [49]:
scores_logistic

[0.9198664440734557, 0.9415692821368948, 0.9165275459098498]

In [50]:
scores_svm

[0.3806343906510851, 0.41068447412353926, 0.5125208681135225]

In [52]:
scores_rf

[0.9248747913188647, 0.9565943238731218, 0.9265442404006677]

## cross_val_score function

In [59]:
from sklearn.model_selection import cross_val_score

#Logistic regression model performance using cross_val_score
cross_val_score(LogisticRegression(max_iter = 500), digits.data, digits.target, cv=3)

array([0.91986644, 0.94156928, 0.91652755])

In [60]:
# svm model performance using cross_val_score
cross_val_score(SVC(gamma='auto'), digits.data, digits.target, cv=3)

array([0.38063439, 0.41068447, 0.51252087])

In [61]:
# random forest performance using cross_val_score
cross_val_score(RandomForestClassifier(n_estimators=40),digits.data, digits.target, cv=3)

array([0.93322204, 0.94657763, 0.92821369])

## Parameter tunning using K-Fold cross validation

In [62]:
scores1 = cross_val_score(RandomForestClassifier(n_estimators=5),digits.data, digits.target, cv=10)
np.average(scores1)

np.float64(0.8759310986964618)

In [63]:
scores2 = cross_val_score(RandomForestClassifier(n_estimators=20),digits.data, digits.target, cv=10)
np.average(scores2)

np.float64(0.9309807572936064)

In [64]:
scores3 = cross_val_score(RandomForestClassifier(n_estimators=30),digits.data, digits.target, cv=10)
np.average(scores3)

np.float64(0.9443544382371197)

In [65]:
scores4 = cross_val_score(RandomForestClassifier(n_estimators=40),digits.data, digits.target, cv=10)
np.average(scores4)

np.float64(0.9488050900062073)

In [66]:
# CONCLUSION:
# --> Here we used cross_val_score to fine tune our random forest classifier
#     and figured that having around 40 trees in random forest gives best result.

## EXERCISE

In [67]:
'''
Use iris flower dataset from sklearn library and use cross_val_score against 
following models to measure the performance of each. 
In the end figure out the model with best performance,

1. Logistic Regression
2. SVM
3. Decision Tree
4. Random Forest
'''

'\nUse iris flower dataset from sklearn library and use cross_val_score against \nfollowing models to measure the performance of each. \nIn the end figure out the model with best performance,\n\n1. Logistic Regression\n2. SVM\n3. Decision Tree\n4. Random Forest\n'

In [69]:
from sklearn.datasets import load_iris
iris = load_iris()

In [70]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3)

In [71]:
# creating a common function to check scores of any model given.

def get_score(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    return np.average(model.score(X_test, y_test))

### Working Out

#### Logistic Regression 

In [78]:
get_score(LogisticRegression(max_iter = 500), X_train, X_test, y_train, y_test )

np.float64(0.9111111111111111)

#### SVM

In [74]:
get_score(SVC(), X_train, X_test, y_train, y_test )

np.float64(0.9333333333333333)

#### Decision Tree Classifier

In [75]:
get_score(RandomForestClassifier(n_estimators=40), X_train, X_test, y_train, y_test )

np.float64(0.8888888888888888)

## Using cross_val_score function

In [93]:
# Logistic Regression
l_scores = cross_val_score(LogisticRegression(max_iter = 200), iris.data, iris.target)
np.average(l_scores)

np.float64(0.9733333333333334)

In [94]:
# Decision Tree
from sklearn.tree import DecisionTreeClassifier
d_scores = cross_val_score(DecisionTreeClassifier(), iris.data, iris.target)
np.average(d_scores)

np.float64(0.9600000000000002)

In [95]:
# Support Vector Machine (SVM)
s_scores = cross_val_score(SVC(), iris.data, iris.target)
np.average(s_scores)

np.float64(0.9666666666666666)

In [96]:
# Random Forest Classifer
r_scores = cross_val_score(RandomForestClassifier(n_estimators=40), iris.data, iris.target)
np.average(r_scores)

np.float64(0.9466666666666667)

#### Best score so far is from Logistic Regression: 0.9733333333333334

# END OF ASSIGNMENT 10