In [50]:
%matplotlib inline
""" handling files support packages """
from glob import glob

""" logic support packages """
import numpy as np
import pandas as pd

""" plot support packages """
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from IPython.display import Image
#import pydotplus

""" image trasformation packages """
from PIL import Image
import skimage.io as skio

""" statistical data visualization packages"""
import seaborn as sns

""" machine learning functions """
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

""" seaborn configurations """
# Global plot appearance: plain white background, larger "talk" sized
# elements, and a wide 20x10 default figure for every plot in the notebook.
sns.set_style('white')
sns.set_context('talk')
plt.rcParams.update({'figure.figsize': (20, 10)})

In [51]:
# Load the image index; the first row of the file holds the column names.
data = pd.read_csv(
    "../letras.csv",
    header=0,
    sep=",",
)
# Show the available columns as a plain Python list.
data.columns.tolist()

['path', 'rotulo', 'caixa_alta_baixa']

In [52]:
# Distinct labels found in the 'rotulo' column, in sorted order.
classes = sorted(set(data.rotulo))
print("Temos {0} classes: {1}".format(len(classes), classes))

Temos 35 classes: ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [53]:
feat_transform = 'lbp' # ['pca', 'lbp']

# LBP works on 2-D image matrices, while the PCA path needs one flat
# vector of pixel values per image.
dim2 = feat_transform == 'lbp'


def _load_image(fname, as_matrix):
    """Read one image file and return its pixel data as a NumPy array.

    Parameters
    ----------
    fname : str
        Path of the image file to open.
    as_matrix : bool
        If True, keep the image as a matrix (``np.array(img)``);
        otherwise return the flattened pixel sequence from ``getdata()``.

    Returns
    -------
    numpy.ndarray
        The pixel data in the requested layout.
    """
    # The context manager closes the underlying file as soon as the pixels
    # have been copied, instead of leaving one open handle per image.
    with Image.open(fname) as img:
        if as_matrix:
            return np.array(img)
        return np.array(img.getdata())


X = np.array([_load_image(fname, dim2) for fname in data['path']])
y = np.array(data['rotulo'])

# Unsupervised Learning: Dimensionality Reduction

In [54]:
# Report how many images were loaded and the size of each sample:
# rows x columns for matrices, or vector length for flattened images.
n_images = len(X)
first = X[0]
if not dim2:
    print("temos {0} imagens cada uma com {1} dimensoes".format(n_images, len(first)))
else:
    rows, cols = first.shape[0], first.shape[1]
    print("temos {0} imagens cada uma com {1}x{2} dimensoes".format(n_images, rows, cols))

temos 1140 imagens cada uma com 50x35 dimensoes


# Divisão da base em treino e teste e aplicação de algoritmos de aprendizado

In [55]:
# Build the train/test split and the feature representation selected by
# `feat_transform`. Both paths use a 70/30 split with a fixed seed.
if feat_transform == 'pca':

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

    from sklearn.decomposition import PCA
    # PCA cannot extract more components than min(n_samples, n_features).
    # Cap the requested 1000 so the fit does not raise ValueError when the
    # training split has fewer rows (or columns) than that.
    n_components = min(1000, X_train.shape[0], X_train.shape[1])
    pca = PCA(n_components=n_components, svd_solver='randomized',
              whiten=True).fit(X_train)
    X_train = pca.transform(X_train)
    X_test = pca.transform(X_test)
    # Persist the fitted projection so it can be reapplied to new images.
    joblib.dump(pca, 'pca_image.pkl')

elif feat_transform == 'lbp':
    from skimage.feature import local_binary_pattern
    # settings for LBP
    METHOD = 'uniform'
    radius = 3
    n_points = 8 * radius

    X_lbp = [local_binary_pattern(img, n_points, radius, METHOD) for img in X]
    # Flatten each LBP map row by row into a single feature vector.
    # NOTE: X is overwritten with the flattened features, so the image
    # loading cell must be re-run before executing this cell again.
    X = [lbp_img.ravel() for lbp_img in X_lbp]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Treinamento de classificadores

In [56]:
%%time
if False:
    from sklearn.neural_network import MLPClassifier

    MLPparams = {'hidden_layer_sizes':(100, 20, 50),
                 'activation' : ('identity', 'logistic', 'tanh', 'relu'),
                 'solver' : ('lbfgs', 'sgd', 'adam'),
                 'learning_rate': ('constant', 'invscaling', 'adaptative')
                }

    svr = MLPClassifier()
    clfMLP = GridSearchCV(svr, MLPparams)
    clfMLP = MLPClassifier()
    clfMLP.fit( X = X_train, y = y_train )
    print( "Score: {0}".format(clfMLP.score(X_test, y_test)) )
    print( "Best parameters: " + str(clfMLP.get_params) )

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 21.7 µs


In [57]:
%%time
if True:
    from sklearn.ensemble import RandomForestClassifier
    clfRF_balanced = RandomForestClassifier()


    RFparams = {'n_estimators' : (1000, 800, 900, 700, 600),
               'criterion' : ('gini', 'entropy'),
               'min_samples_split' : (2, 10, 40),
               'min_samples_leaf' : (1, 5, 10, 40)}

    svr = RandomForestClassifier()
    clfRF_balanced = GridSearchCV(svr, RFparams)
    clfRF_balanced.fit( X = X_train, y = y_train )
    print( "Score: {0}".format(clfRF_balanced.score(X_test, y_test)) )
    print( "Best parameters: " + str(clfRF_balanced.best_params_) )

Score: 0.4239766081871345
Best parameters: {'criterion': 'entropy', 'min_samples_leaf': 1, 'n_estimators': 900, 'min_samples_split': 2}
CPU times: user 1h 54min 35s, sys: 1min 23s, total: 1h 55min 58s
Wall time: 1h 59min 29s


In [58]:
%%time
if True:
    from sklearn.svm import SVC
    clf = SVC() # kernel = 'poly', degree = 2, max_iter = 100000

    SVMparams = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
                         'C': [1, 10, 100, 1000]},
                {'kernel': ['linear'], 'C': [1, 10, 100, 1000]},
                {'kernel' : ['poly'], 'degree': [1, 2, 3],
                         'C': [1, 10, 100, 1000]}]

    svr = SVC()
    clf = GridSearchCV(svr, SVMparams)

    clf.fit(X_train, y_train)
    print( "Score: {0}".format(clf.score(X_test, y_test)) )
    print( "Best parameters: " + str(clf.best_params_) )

Score: 0.3304093567251462
Best parameters: {'C': 1, 'degree': 2, 'kernel': 'poly'}
CPU times: user 7min 43s, sys: 11 s, total: 7min 54s
Wall time: 7min 56s


In [None]:
#import tensorflow as tf

#image_size = 35 * 50
#n_classes = len(list(set(data.rotulo)))
#x = tf.placeholder(tf.float32, [None, image_size])
#W = tf.Variable(tf.zeros([image_size, n_classes]))
#b = tf.Variable(tf.zeros([n_classes]))

In [None]:
#imgs = skio.imread_collection(list(data['path']))
#imgA = crop_char(imgs[0], 0)

In [None]:
#print(clf.predict([item for sublist in imgA.tolist() for item in sublist]))

In [59]:
%%time
save = True
if save:    
    joblib.dump(clfRF_balanced.best_estimator_, 'classifier_image.pkl')

CPU times: user 2.78 s, sys: 1.15 s, total: 3.93 s
Wall time: 1min 43s
