In [1]:
### Author : Prasad Meesala
# Importing the necessary modules

import numpy as np
import pandas as pd
from sklearn import ensemble, linear_model, svm, model_selection, datasets

In [2]:
# Collecting the data: load the scikit-learn digits dataset

data = datasets.load_digits()

# Feature columns come from the dataset's own feature names; the class
# label is appended as the last column, `target`.
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


In [16]:
# Fixed seed so the train/test split and the random forest are reproducible:
# without it every kernel restart produces a different split and different
# scores, and the printed results cannot be regenerated.
SEED = 42

# Feature matrix (every column except the label) and label vector.
X = df.drop(columns='target').values
y = df['target'].values

# Hold out 20% of the rows for testing.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.2, random_state=SEED
)

# Using different classifiers to evaluate the model performance

lg = linear_model.LogisticRegression(max_iter=10000)
svmc = svm.SVC(kernel="linear")
rf = ensemble.RandomForestClassifier(n_estimators=50, random_state=SEED)

def get_score(model, X_train, X_test, y_train, y_test):
    """Fit `model` on the training split and score it on the test split.

    `model.fit` is called on the object that was passed in, so the caller's
    estimator is the one that gets trained. The value produced by
    `model.score(X_test, y_test)` is returned unchanged.
    """
    model.fit(X_train, y_train)
    held_out_score = model.score(X_test, y_test)
    return held_out_score

# Hold-out score of each classifier, one per line, in the order:
# logistic regression, linear SVM, random forest.
for clf in (lg, svmc, rf):
    print(get_score(clf, X_train, X_test, y_train, y_test))

0.9555555555555556
0.9611111111111111
0.975


In [14]:
# Evaluating the model performance by creating folds 

def avg(x):
    """Return the arithmetic mean of `x`: the sum of its items divided by its length."""
    total = sum(x)
    count = len(x)
    return total / count

# Mean cross-validated score of each classifier, one per line, in the order:
# logistic regression, linear SVM, random forest. cross_val_score is called
# with its default fold settings.
for estimator in (lg, svmc, rf):
    print(avg(model_selection.cross_val_score(estimator, X, y)))

0.9148731043020737
0.9476973073351903
0.9282311977715878


In [15]:
# Checking the accuracy of each fold manually:
# build the splitter that yields train/test index pairs for 3 stratified folds.

folds = model_selection.StratifiedKFold(
    n_splits=3,
)

def get_scores(model, X_train, X_test, y_train, y_test):
    """Train `model` on the given training rows and score it on the test rows.

    Same contract as `get_score` from the earlier cell: `model.fit` is called
    on the object passed in, and the result of `model.score(X_test, y_test)`
    is returned as-is.
    """
    model.fit(X_train, y_train)
    fold_score = model.score(X_test, y_test)
    return fold_score

# One score list per classifier; each receives one entry per fold.
lg_scores, svm_scores, rf_scores = [], [], []

for train_index, test_index in folds.split(X, y):
    # Slice this fold's rows once and reuse the same arrays for all three models.
    fold_data = (X[train_index], X[test_index], y[train_index], y[test_index])

    # Call get_scores, the helper defined in this cell, rather than the
    # identically-behaving get_score from an earlier cell: the cell then no
    # longer depends on that other cell and the local helper is not dead code.
    lg_scores.append(get_scores(lg, *fold_data))
    svm_scores.append(get_scores(svmc, *fold_data))
    rf_scores.append(get_scores(rf, *fold_data))

# Mean per-fold score for logistic regression, linear SVM and random forest.
print(avg(lg_scores), avg(svm_scores), avg(rf_scores))

0.9293266555370061 0.9437952142459656 0.9360044518642181
