In [None]:
# Train the character-recognition models (训练识别模型)

In [2]:
import cv2 as cv
import numpy as np
import matplotlib.pyplot as plt

In [3]:
import os

X = []
y = []
skipped_files = []  # paths OpenCV could not decode

# Each sub-directory of train/en whose name is a single character holds the
# sample images for that character; the label is the character's code point.
for root, dirs, files in os.walk("train/en"):
    # os.walk yields entries in filesystem order. Sorting dirs in place makes
    # the traversal, and therefore the sample order fed to the seeded
    # train/test split, repeatable across machines.
    dirs.sort()

    class_name = os.path.basename(root)
    # Only single-character names are classes. This skips the top-level
    # directory and also an empty basename, on which ord() would raise.
    if len(class_name) != 1:
        continue

    char = ord(class_name)

    for filename in sorted(files):
        filepath = os.path.join(root, filename)
        digit_img = cv.imread(filepath)
        # cv.imread returns None instead of raising when the file cannot be
        # decoded (stray non-image files); cvtColor would then fail.
        if digit_img is None:
            skipped_files.append(filepath)
            continue
        digit_img = cv.cvtColor(digit_img, cv.COLOR_BGR2GRAY)

        X.append(digit_img)
        y.append(char)

if skipped_files:
    print("skipped %d unreadable file(s), e.g. %s" % (len(skipped_files), skipped_files[0]))

X = np.array(X)
y = np.array(y)

In [4]:
# Sanity check: stacked grayscale sample images, one entry per loaded file.
X.shape

(13163, 20, 20)

In [5]:
# One label per image: the code point (ord) of the directory's character.
y.shape

(13163,)

In [6]:
# Flatten every image into one row of raw pixel values (baseline features).
X2 = X.reshape(len(X), -1)
X2.shape

(13163, 400)

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 15% of the samples for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.15, random_state=1)

In [9]:
from sklearn.svm import SVC

In [10]:
# Support-vector classifier with explicit C and gamma; the kernel is left at SVC's default.
svc = SVC(C=1, gamma=0.5) 

In [11]:
%%time
# Baseline: fit on the flattened raw-pixel training split.
svc.fit(X_train, y_train)

Wall time: 3min 12s


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [12]:
# Mean accuracy on the held-out split for the raw-pixel baseline.
# NOTE(review): the recorded score (~0.37) is low, presumably because the
# unscaled 0-255 pixel values suit gamma=0.5 poorly — confirm before reuse.
svc.score(X_test, y_test)

0.36658227848101266

In [13]:
from numpy.linalg import norm
def preprocess_hog(digits, bin_n=16):
    """Compute a HOG-style descriptor for each image in ``digits``.

    Every image is split into four cells at row/column 10 (four 10x10
    quadrants for the 20x20 samples used in this notebook). For each cell a
    histogram of gradient orientations, weighted by gradient magnitude, is
    built with ``bin_n`` bins; the four histograms are concatenated and then
    normalised (L1, square root, L2).

    Parameters
    ----------
    digits : iterable of 2-D arrays
        Grayscale images, as passed in this notebook.
    bin_n : int, optional
        Number of orientation bins per cell. Defaults to 16, the value the
        notebook's models are trained with.

    Returns
    -------
    numpy.ndarray
        float32 array with one row of ``4 * bin_n`` values per image. An
        empty input yields an empty array.
    """
    samples = []
    for img in digits:
        # First-order Sobel derivatives along x and y, as 32-bit floats.
        gx = cv.Sobel(img, cv.CV_32F, 1, 0)
        gy = cv.Sobel(img, cv.CV_32F, 0, 1)
        mag, ang = cv.cartToPolar(gx, gy)

        # Quantise the angle (radians) into bin indices. Clamp to the last
        # bin: an angle that rounds up to 2*pi would otherwise give index
        # bin_n, np.bincount would return bin_n + 1 entries for that cell, and
        # the sample's feature vector would differ in length from the others.
        bins = np.minimum(np.int32(bin_n * ang / (2 * np.pi)), bin_n - 1)

        bin_cells = bins[:10, :10], bins[10:, :10], bins[:10, 10:], bins[10:, 10:]
        mag_cells = mag[:10, :10], mag[10:, :10], mag[:10, 10:], mag[10:, 10:]
        hists = [np.bincount(b.ravel(), m.ravel(), bin_n) for b, m in zip(bin_cells, mag_cells)]
        hist = np.hstack(hists)

        # transform to Hellinger kernel; eps guards against division by zero
        # for an image with no gradients at all.
        eps = 1e-7
        hist /= hist.sum() + eps
        hist = np.sqrt(hist)
        hist /= norm(hist) + eps

        samples.append(hist)
    return np.float32(samples)


In [14]:
# Replace the raw-pixel features with HOG descriptors (one row per image).
X2 = preprocess_hog(X)
X2.shape

(13163, 64)

In [15]:
# Same 15% hold-out and seed as the raw-pixel split, now applied to the HOG features.
X_train, X_test, y_train, y_test = train_test_split(X2, y, test_size=0.15, random_state=1)

In [16]:
%%time
# Refit the same SVC instance, this time on the HOG training split.
svc.fit(X_train, y_train)

Wall time: 2.55 s


SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [17]:
# Mean accuracy on the held-out split with HOG features.
svc.score(X_test, y_test) 

0.9954430379746836

In [18]:
# Shape of the HOG feature matrix the final model is trained on.
X2.shape

(13163, 64)

In [19]:
# Labels are character code points (ord of each class directory name).
y

array([48, 48, 48, ..., 90, 90, 90])

In [20]:
# Final model: a fresh SVC with the same hyper-parameters, trained on all samples (no hold-out).
svc = SVC(C=1, gamma=0.5)
svc.fit(X2, y)

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [21]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package; fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the trained model, then reload it to check the round trip.
# joblib files are pickle-based: only load files from a trusted source.
joblib.dump(svc, 'plate.en')
model = joblib.load('plate.en')

In [22]:
# Display the reloaded estimator.
model

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.5, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
# Province abbreviations (partial list).
# Flat list of alternating entries: a training sub-directory name ("zh_*")
# followed by the Chinese character it stands for. The class label used for
# training is the position of the character, i.e. index(directory name) + 1,
# so the order of the entries must not change.
provinces = [
    "zh_cuan", "川",
    "zh_e", "鄂",
    "zh_gan", "赣",
    "zh_gan1", "甘",
    "zh_gui", "贵",
    "zh_gui1", "桂",
    "zh_hei", "黑",
    "zh_hu", "沪",
    "zh_ji", "冀",
    "zh_jin", "津",
    "zh_jing", "京",
    "zh_jl", "吉",
    "zh_liao", "辽",
    "zh_lu", "鲁",
    "zh_meng", "蒙",
    "zh_min", "闽",
    "zh_ning", "宁",
    # Standard form U+9752; the original used the variant glyph U+9751.
    "zh_qing", "青",
    "zh_qiong", "琼",
    "zh_shan", "陕",
    "zh_su", "苏",
    "zh_sx", "晋",
    "zh_wan", "皖",
    "zh_xiang", "湘",
    "zh_xin", "新",
    "zh_yu", "豫",
    "zh_yu1", "渝",
    "zh_yue", "粤",
    "zh_yun", "云",
    "zh_zang", "藏",
    "zh_zhe", "浙"
]

In [None]:
import os

X = []
y = []
skipped_files = []  # paths OpenCV could not decode

# Walk the sample directories; only those named "zh_*" hold province characters.
for root, dirs, files in os.walk("train/zh"):
    # os.walk yields entries in filesystem order; sorting in place keeps the
    # traversal and the resulting sample order repeatable.
    dirs.sort()

    pinyin = os.path.basename(root)
    if not pinyin.startswith("zh_"):
        continue

    # Same exception type as list.index would raise, with a message that
    # names the offending directory.
    if pinyin not in provinces:
        raise ValueError("directory %r has no entry in provinces" % root)

    # provinces alternates directory name and character, so the slot right
    # after the name is the matching Chinese character; that position is the label.
    index = provinces.index(pinyin) + 1

    for filename in sorted(files):
        filepath = os.path.join(root, filename)

        digit_img = cv.imread(filepath)
        # cv.imread returns None instead of raising when the file cannot be
        # decoded (stray non-image files); cvtColor would then fail.
        if digit_img is None:
            skipped_files.append(filepath)
            continue
        digit_img = cv.cvtColor(digit_img, cv.COLOR_BGR2GRAY)
        X.append(digit_img)
        y.append(index)

if skipped_files:
    print("skipped %d unreadable file(s), e.g. %s" % (len(skipped_files), skipped_files[0]))

X = np.array(X)
y = np.array(y)

In [None]:
# Sanity check: stacked grayscale province-character images.
X.shape

In [None]:
# Labels are positions in the flat provinces list (index of the directory name + 1).
y

In [None]:
# HOG descriptors for the province-character images (one row per image).
X2 = preprocess_hog(X)
X2.shape

In [None]:
# Province classifier: fresh SVC with the same hyper-parameters as the alphanumeric model, trained on all samples.
svc = SVC(C=1, gamma=0.5) 
svc.fit(X2, y)

In [None]:
# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package; fall back for old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the trained province classifier.
joblib.dump(svc, 'plate.zh')

In [None]:
# Reload the saved province model from disk and display it.
# joblib files are pickle-based: only load files from a trusted source.
model = joblib.load('plate.zh')
model