Подготовим данные:

In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from crop_letters import contour_letters_cut_28x28

In [2]:
X, y = [], []

def get_letter(dir_name):
    prefix = ['big_', 'punct_', 'small_', 'большая_', 'маленькая_']
    dict_for_bad_windows = {
        'backslash': '\\', 'slash': '/', 'colon': ':', 'asterisk': '*', 'question': '?',
        'quotation': '"', 'less': '<', 'greater': '>', 'pipe': '|', 'point': '.'
    }
    letter_name = dir_name
    for pref in prefix:
        letter_name = letter_name.replace(pref, '')
    if letter_name in dict_for_bad_windows:
        letter_name = dict_for_bad_windows[letter_name]
    return letter_name

root_dir = 'C:\\Users\\Ася\\Desktop\\Папки\\Университет\\5 семестр\\Проект\\letters_dataset'
dir_array = os.listdir(root_dir)
for curr_dir in dir_array:
    path = f'{root_dir}\\{curr_dir}'
    curr_letter = get_letter(curr_dir)
    path_files = os.listdir(path)
    curr_letter = get_letter(curr_dir)
    for i in range(len(path_files)):
        #возможно, функция поменяется!!!
        f = open(f'letters_dataset\\{curr_dir}\\{path_files[i]}', "rb")
        chunk = f.read()
        chunk_arr = np.frombuffer(chunk, dtype=np.uint8)
        img = cv2.imdecode(chunk_arr, cv2.IMREAD_COLOR)
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        picture = contour_letters_cut_28x28(gray)
        if picture is None or np.array(picture).shape[0] == 0:
            continue
        X.append(picture[0].flatten())
        y.append(curr_letter)

In [3]:
X = np.array(X)
y = np.array(y)
y

array(['A', 'A', 'A', ..., 'я', 'я', 'я'], dtype='<U1')

In [4]:
import random

indexes = np.arange(y.shape[0])
random.shuffle(indexes)
X = X[indexes]
y = y[indexes]

In [5]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(y)
class_arr = le.classes_
label_arr = np.arange(class_arr.shape[0])
from_label_get_class = dict(zip(label_arr, class_arr))
z = le.transform(y)

In [6]:
train_ind = (4 * y.shape[0]) // 5
X_train, X_test = X[:train_ind, :], X[train_ind:, :]
z_train, z_test = z[:train_ind], z[train_ind:]
print(X_train.shape, X_test.shape)
print(z_train.shape, z_test.shape)

(56972, 784) (14243, 784)
(56972,) (14243,)


In [7]:
print(from_label_get_class)

{0: '!', 1: '"', 2: '#', 3: '$', 4: '%', 5: '&', 6: "'", 7: '(', 8: ')', 9: '*', 10: '+', 11: ',', 12: '-', 13: '.', 14: '/', 15: ':', 16: ';', 17: '<', 18: '=', 19: '>', 20: '?', 21: '@', 22: 'A', 23: 'B', 24: 'C', 25: 'D', 26: 'E', 27: 'F', 28: 'G', 29: 'H', 30: 'I', 31: 'J', 32: 'K', 33: 'L', 34: 'M', 35: 'N', 36: 'O', 37: 'P', 38: 'Q', 39: 'R', 40: 'S', 41: 'T', 42: 'U', 43: 'V', 44: 'W', 45: 'X', 46: 'Y', 47: 'Z', 48: '[', 49: '\\', 50: ']', 51: '^', 52: '_', 53: '`', 54: 'a', 55: 'b', 56: 'c', 57: 'd', 58: 'e', 59: 'f', 60: 'g', 61: 'h', 62: 'i', 63: 'j', 64: 'k', 65: 'l', 66: 'm', 67: 'n', 68: 'o', 69: 'p', 70: 'q', 71: 'r', 72: 's', 73: 't', 74: 'u', 75: 'v', 76: 'w', 77: 'x', 78: 'y', 79: 'z', 80: '{', 81: '|', 82: '}', 83: '~', 84: 'Ё', 85: 'А', 86: 'Б', 87: 'В', 88: 'Г', 89: 'Д', 90: 'Е', 91: 'Ж', 92: 'З', 93: 'И', 94: 'Й', 95: 'К', 96: 'Л', 97: 'М', 98: 'Н', 99: 'О', 100: 'П', 101: 'Р', 102: 'С', 103: 'Т', 104: 'У', 105: 'Ф', 106: 'Х', 107: 'Ц', 108: 'Ч', 109: 'Ш', 110: 'Щ'

Теперь обучим несколько вариантов классификаторов и сравним их между собой.

1) SVM с квадратичным ядром

In [8]:
from sklearn import svm

clf = svm.SVC(kernel = "poly", degree = 2)
clf.fit(X_train, z_train)

SVC(degree=2, kernel='poly')

In [9]:
z_pred = clf.predict(X_test)

In [10]:
from sklearn import metrics

print(f'accuracy on test is {metrics.accuracy_score(z_test, z_pred)}')

accuracy on test is 0.7459804816401039


In [13]:
import pickle

with open('svm_classifier_deg2.pickle', 'wb') as file:
    pickle.dump(clf, file)

2) SVM с кубическим ядром

In [14]:
clf = svm.SVC(kernel = "poly", degree = 3)
clf.fit(X_train, z_train)

SVC(kernel='poly')

In [15]:
z_pred = clf.predict(X_test)

In [16]:
from sklearn import metrics

print(f'accuracy on test is {metrics.accuracy_score(z_test, z_pred)}')

accuracy on test is 0.7322193358140842


In [17]:
with open('svm_classifier_deg3.pickle', 'wb') as file:
    pickle.dump(clf, file)

3) kNN при k = 10

In [18]:
from sklearn import neighbors

clf = neighbors.KNeighborsClassifier(n_neighbors = 10, weights='uniform')
clf.fit(X_train, z_train)

KNeighborsClassifier(n_neighbors=10)

In [19]:
z_pred = clf.predict(X_test)

In [20]:
print(f'accuracy on test is {metrics.accuracy_score(z_test, z_pred)}')

accuracy on test is 0.7251983430457066


In [21]:
with open('knn_classifier_k10.pickle', 'wb') as file:
    pickle.dump(clf, file)

4) случайный лес

In [35]:
from sklearn.ensemble import RandomForestClassifier

rc = RandomForestClassifier(n_estimators = 150)
rc.fit(X_train.reshape((-1, 784)), z_train)
z_pred = rc.predict(X_test.reshape((-1, 784)))
print(f'accuracy on test is {metrics.accuracy_score(z_test, z_pred)}')

accuracy on test is 0.7462964263146808
