In [84]:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.datasets import load_digits

In [85]:
digits = load_digits()

In [86]:
dir(digits)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [87]:
file = pd.DataFrame(digits.data, columns = digits.feature_names)

In [88]:
file.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_6,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [89]:
file["Target"] = digits.target

In [90]:
file.head()
file.to_csv("Datasets/digits.csv")

In [91]:
def get_score(model, train_X, test_X, train_Y, test_Y):
    model.fit(train_X, train_Y)
    return model.score(test_X, test_Y)

In [92]:
from sklearn.model_selection import train_test_split

In [93]:
X = digits.data
X

array([[ 0.,  0.,  5., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ..., 10.,  0.,  0.],
       [ 0.,  0.,  0., ..., 16.,  9.,  0.],
       ...,
       [ 0.,  0.,  1., ...,  6.,  0.,  0.],
       [ 0.,  0.,  2., ..., 12.,  0.,  0.],
       [ 0.,  0., 10., ..., 12.,  1.,  0.]])

In [94]:
Y = digits.target
Y

array([0, 1, 2, ..., 8, 9, 8])

In [95]:
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.3)

In [96]:
len(train_X)

1257

In [97]:
len(test_X)

540

In [98]:
# svc_model = SVC()

In [99]:
get_score(SVC(), train_X, test_X, train_Y, test_Y)

0.9851851851851852

In [100]:
lr_model = LogisticRegression()

In [149]:
get_score(LogisticRegression(solver='liblinear',multi_class='ovr'), train_X, test_X, train_Y, test_Y)

0.9098497495826378

In [150]:
rf_model = RandomForestClassifier()

In [151]:
get_score(RandomForestClassifier(), train_X, test_X, train_Y, test_Y)

0.9248747913188647

The main thing to notice about tain_test_split method is here we have given the set of data as input only once. Hence whenever we get output then it is for the respective training that model has achieved using training data. But if we keep changing input then the performance of the algorithm is seen to be changed a bit.
Hence to overcome this drawback of algorithm we can use K fold algorithm. Here the input data is folded into folds that we provide as parameter and then that data is used for the purpose of training and testing.
It is iterable

In [152]:
from sklearn.model_selection import StratifiedKFold

In [153]:
skf = StratifiedKFold(n_splits=3)

In [159]:
lr_score = []
rf_score = []
svc_score = []

for train_index, test_index in skf.split(digits.data, digits.target):
    train_X, test_X, train_Y, test_Y = digits.data[train_index], digits.data[test_index], digits.target[train_index], digits.target[test_index]
    rf_score.append(get_score(RandomForestClassifier(), train_X, test_X, train_Y, test_Y))
    svc_score.append(get_score(SVC(), train_X, test_X, train_Y, test_Y))
    lr_score.append(get_score(LogisticRegression(solver='liblinear',multi_class='ovr'), train_X, test_X, train_Y, test_Y))

In [160]:
rf_score

[0.9332220367278798, 0.9482470784641068, 0.9298831385642737]

In [161]:
svc_score

[0.9649415692821369, 0.9799666110183639, 0.9649415692821369]

In [162]:
lr_score

[0.8948247078464107, 0.9532554257095158, 0.9098497495826378]

increasing the max_iter value to a very large number may work as a temporary solution, but it is not always the best solution. It is important to understand the underlying cause of the convergence issue and try different approaches to resolve it.

# Same this using cross val score

In [163]:
from sklearn.model_selection import cross_val_score

In [164]:
cross_val_score(SVC(), digits.data, digits.target)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [166]:
cross_val_score(LogisticRegression(solver='liblinear',multi_class='ovr'), digits.data, digits.target)

array([0.92222222, 0.88333333, 0.95264624, 0.95821727, 0.89415042])

In [167]:
cross_val_score(RandomForestClassifier(), digits.data, digits.target)

array([0.93888889, 0.89444444, 0.96100279, 0.96657382, 0.93036212])

In [168]:
#These models as shown here can be used for comparing differnt models in the same way this can be used to test the same model 
# using different parameters