In [1]:
%matplotlib inline
""" handling files support packages """
from glob import glob

""" logic support packages """
import numpy as np
import pandas as pd

""" plot support packages """
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from IPython.display import Image
#import pydotplus

""" image trasformation packages """
from PIL import Image
import skimage.io as skio

""" statistical data visualization packages"""
import seaborn as sns

""" machine learning functions """
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV

""" seaborn configurations """
sns.set_style('white')
sns.set_context('talk')
plt.rcParams['figure.figsize'] = 20, 10

In [2]:
from funcoes import extract_stats

# Tentativa de com base equilibrada p%

In [3]:
p = 30

In [4]:
combinacoes_equilibradas = pd.read_csv("../combinacoes"+str(p)+".txt", header = 0, sep=",")

In [5]:
X = combinacoes_equilibradas[["mse", "iss", "mse_centro", "iss_centro",
            "mse_canny", "iss_canny", "mse_canny_centro", "iss_canny_centro",
            "mse_skeleton", "iss_skeleton", "mse_skeleton_centro", "iss_skeleton_centro",
            "imgA_mean", "imgB_mean", "imgA_var", "imgB_var",
            #imgA_contraste, imgB_contraste,
            #imgA_angular_momentum, imgB_angular_momentum,
            "imgA_entropy", "imgB_entropy"]]

y = combinacoes_equilibradas["resposta"]

In [6]:
#from sklearn.manifold import Isomap
#iso = Isomap(n_components = 2)
#data_projected = iso.fit_transform(X)

In [7]:
#data_projected.shape

In [8]:
#plt.scatter(data_projected[:, 0], data_projected[:, 1], c=y,
#            edgecolor='none', alpha=0.5, cmap=plt.cm.get_cmap('nipy_spectral'))
#plt.colorbar(label='digita label', ticks=range(10))
#plt.clim(-0.5, 9.5)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## GaussianNB

In [10]:
%%time
from sklearn.naive_bayes import GaussianNB

clf_C = GaussianNB()

clf_C.fit(X_train, y_train)

print( "Score: {0}".format(clf_C.score(X_test, y_test)) )

Score: 0.6938461538461539
CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 5.62 ms


## Decision Tree

In [11]:
%%time
from sklearn import tree

DTparams = { 'criterion' : ('gini', 'entropy'),
           'min_samples_split' : (2, 10, 40),
           'min_samples_leaf' : (1, 3, 7, 40)}

svr = tree.DecisionTreeClassifier()
clf1 = GridSearchCV(svr, DTparams) #criterion = "gini", min_samples_split = 80, min_samples_leaf = 3

clf1.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clf1.score(X_test, y_test)) )
print( "Best parameters: " + str(clf1.best_params_) )

Score: 0.7557264957264958
Best parameters: {'criterion': 'entropy', 'min_samples_split': 40, 'min_samples_leaf': 7}
CPU times: user 11.6 s, sys: 4 ms, total: 11.6 s
Wall time: 11.6 s


# Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier
clfRF_balanced = RandomForestClassifier()

In [13]:
%%time

RFparams = {'n_estimators' : (10, 2, 3, 5, 7),
           'criterion' : ('gini', 'entropy'),
           'min_samples_split' : (2, 10, 40),
           'min_samples_leaf' : (1, 5, 10, 40)}

svr = RandomForestClassifier()
clfRF_balanced = GridSearchCV(svr, RFparams)
clfRF_balanced.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clfRF_balanced.score(X_test, y_test)) )
print( "Best parameters: " + str(clfRF_balanced.best_params_) )

Score: 0.8029059829059829
Best parameters: {'criterion': 'entropy', 'min_samples_split': 10, 'n_estimators': 10, 'min_samples_leaf': 1}
CPU times: user 48.5 s, sys: 16 ms, total: 48.5 s
Wall time: 48.5 s


In [14]:
%%time

from sklearn.ensemble import RandomForestClassifier
clfRF_balanced = RandomForestClassifier(criterion = 'entropy', n_estimators=10, min_samples_leaf=5,
                                       min_samples_split=10)
clfRF_balanced.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clfRF_balanced.score(X_test, y_test)) )

Score: 0.795042735042735
CPU times: user 484 ms, sys: 0 ns, total: 484 ms
Wall time: 483 ms


## Logistic Regression

In [15]:
%%time
from sklearn.linear_model import LogisticRegression

LRparams = {'solver' : ('newton-cg', 'lbfgs', 'liblinear', 'sag')}

svr = LogisticRegression()
clfLR = GridSearchCV(svr, LRparams)
clfLR.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clfLR.score(X_test, y_test)) )
print( "Best parameters: " + str(clfLR.best_params_) )

Score: 0.7507692307692307
Best parameters: {'solver': 'newton-cg'}
CPU times: user 6.44 s, sys: 8.62 s, total: 15.1 s
Wall time: 4.13 s


# Multi-layer Perceptron classifier

In [16]:
%%time
from sklearn.neural_network import MLPClassifier

MLPparams = {'hidden_layer_sizes':(100, 20, 50),
            'activation' : ('identity', 'logistic', 'tanh', 'relu'),
            'solver' : ('lbfgs', 'sgd', 'adam')}

svr = MLPClassifier()
clfMLP = GridSearchCV(svr, MLPparams)
clfMLP = MLPClassifier()
clfMLP.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clfMLP.score(X_test, y_test)) )
print( "Best parameters: " + str(clfMLP.get_params) )

Score: 0.7056410256410256
Best parameters: <bound method BaseEstimator.get_params of MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)>
CPU times: user 544 ms, sys: 604 ms, total: 1.15 s
Wall time: 289 ms


# Tentativa com base de p = 50%

In [17]:
p = 50

In [18]:
combinacoes_equilibradas = pd.read_csv("../combinacoes"+str(p)+".txt", header = 0, sep=",")

In [19]:
X = combinacoes_equilibradas[["mse", "iss", "mse_centro", "iss_centro",
            "mse_canny", "iss_canny", "mse_canny_centro", "iss_canny_centro",
            "mse_skeleton", "iss_skeleton", "mse_skeleton_centro", "iss_skeleton_centro",
            "imgA_mean", "imgB_mean", "imgA_var", "imgB_var",
            #imgA_contraste, imgB_contraste,
            #imgA_angular_momentum, imgB_angular_momentum,
            "imgA_entropy", "imgB_entropy"]]

y = combinacoes_equilibradas["resposta"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## GaussianNB

In [21]:
%%time
from sklearn.naive_bayes import GaussianNB

clf_C = GaussianNB()

clf_C.fit(X_train, y_train)

print( "Score: {0}".format(clf_C.score(X_test, y_test)) )

Score: 0.5988603988603989
CPU times: user 0 ns, sys: 8 ms, total: 8 ms
Wall time: 2.92 ms


## Decision Tree

In [22]:
%%time
from sklearn import tree

DTparams = { 'criterion' : ('gini', 'entropy'),
           'min_samples_split' : (2, 10, 40),
           'min_samples_leaf' : (1, 3, 7, 40)}

svr = tree.DecisionTreeClassifier()
clf1 = GridSearchCV(svr, DTparams) #criterion = "gini", min_samples_split = 80, min_samples_leaf = 3

clf1.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clf1.score(X_test, y_test)) )
print( "Best parameters: " + str(clf1.best_params_) )

Score: 0.7170940170940171
Best parameters: {'criterion': 'gini', 'min_samples_split': 40, 'min_samples_leaf': 3}
CPU times: user 6.02 s, sys: 4 ms, total: 6.02 s
Wall time: 6.02 s


# Random Forest

In [23]:
from sklearn.ensemble import RandomForestClassifier
clfRF_balanced = RandomForestClassifier()

In [24]:
%%time

RFparams = {'n_estimators' : (10, 2, 3, 5, 7),
           'criterion' : ('gini', 'entropy'),
           'min_samples_split' : (2, 10, 40),
           'min_samples_leaf' : (1, 5, 10, 40)}

svr = RandomForestClassifier()
clfRF_balanced = GridSearchCV(svr, RFparams)
clfRF_balanced.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clfRF_balanced.score(X_test, y_test)) )
print( "Best parameters: " + str(clfRF_balanced.best_params_) )

Score: 0.7336182336182336
Best parameters: {'criterion': 'gini', 'min_samples_split': 10, 'n_estimators': 10, 'min_samples_leaf': 5}
CPU times: user 25.4 s, sys: 28 ms, total: 25.4 s
Wall time: 25.4 s


## Logistic Regression

In [25]:
%%time
from sklearn.linear_model import LogisticRegression

LRparams = {'solver' : ('newton-cg', 'lbfgs', 'liblinear', 'sag')}

svr = LogisticRegression()
clfLR = GridSearchCV(svr, LRparams)
clfLR.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clfLR.score(X_test, y_test)) )
print( "Best parameters: " + str(clfLR.best_params_) )

Score: 0.6373219373219373
Best parameters: {'solver': 'newton-cg'}
CPU times: user 3.18 s, sys: 4.38 s, total: 7.56 s
Wall time: 2 s


# Multi-layer Perceptron classifier

In [26]:
%%time
from sklearn.neural_network import MLPClassifier

MLPparams = {'hidden_layer_sizes':(100, 20, 50),
            'activation' : ('identity', 'logistic', 'tanh', 'relu'),
            'solver' : ('lbfgs', 'sgd', 'adam')}

svr = MLPClassifier()
clfMLP = GridSearchCV(svr, MLPparams)
clfMLP = MLPClassifier()
clfMLP.fit( X = X_train, y = y_train )
print( "Score: {0}".format(clfMLP.score(X_test, y_test)) )
print( "Best parameters: " + str(clfMLP.get_params) )

Score: 0.49202279202279203
Best parameters: <bound method BaseEstimator.get_params of MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)>
CPU times: user 200 ms, sys: 148 ms, total: 348 ms
Wall time: 86.8 ms


# Binarizando o melhor modelo

In [27]:
#imgA = skio.imread("../letras/caracter1.png")
#imgB = skio.imread("../letras/caracter15.png")

#mse, iss, mse_centro, iss_centro,\
#			mse_canny, iss_canny, mse_canny_centro, iss_canny_centro,\
#			mse_skeleton, iss_skeleton, mse_skeleton_centro, iss_skeleton_centro = extract_stats(imgA, imgB)
        
#print(clfRF_balanced.predict( [[mse, iss, mse_centro, iss_centro,
#			mse_canny, iss_canny, mse_canny_centro, iss_canny_centro,
#			mse_skeleton, iss_skeleton, mse_skeleton_centro, iss_skeleton_centro]]))

In [28]:
#joblib.dump(clfRF_balanced, 'classifier_balanced.pkl') 