In [1]:
import time
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelBinarizer

In [3]:
# Width, in columns, of one block of features in the HOG table; used as the
# stride when slicing column groups out of `ds` and inside ColumnSelector.
# (presumably the HOG descriptor length for one image — TODO confirm)
shift = 4320
# Width, in columns, of one block of features in the LBP table (`ds_lbp`).
shift_lbp = 26

In [4]:
# Glob patterns matching the HOG and the LBP feature CSV files.
# NOTE(review): absolute, machine-specific Windows paths — the notebook only
# runs on a machine where this directory exists.
hog_features = 'C:\\Users\\rafae\\Desktop\\Coleta\\features\\features_hog_*.csv'
lbp_features = 'C:\\Users\\rafae\\Desktop\\Coleta\\features\\features_lbp_*.csv'

In [5]:
# glob.glob returns matches in arbitrary (filesystem-dependent) order. The HOG
# and LBP tables are later joined column-wise, so both lists are sorted to get
# a deterministic order in which the i-th HOG file and the i-th LBP file line
# up (assuming the two sets of file names differ only in the hog/lbp part of
# the name — TODO confirm).
hog_files_list = sorted(glob.glob(hog_features))
print(len(hog_files_list))

lbp_files_list = sorted(glob.glob(lbp_features))
print(len(lbp_files_list))

# A column-wise join of two different-sized file sets cannot be row-aligned;
# fail here with a clear message instead of further down the notebook.
assert len(hog_files_list) == len(lbp_files_list), \
    "HOG and LBP feature file counts differ"

235
235


In [None]:
# Load every HOG feature file into its own DataFrame; the first row of each
# CSV is taken as the column header.
frames = [
    pd.read_csv(hog_path, header=0)
    for hog_path in hog_files_list
]
len(frames)

# Same for the LBP feature files.
lbp = [
    pd.read_csv(lbp_path, header=0)
    for lbp_path in lbp_files_list
]
len(lbp)

In [None]:
# Stack the per-file frames row-wise into one table per descriptor type.
# ignore_index is not passed, so every file keeps its own row labels and the
# same label may occur more than once in the stacked table.
ds = pd.concat(frames)
ds_lbp = pd.concat(lbp)

In [None]:
# Sanity check: (rows, columns) of the stacked HOG and LBP tables.
print(ds.shape, ds_lbp.shape)

In [None]:
# Preview the first rows of the HOG table.
ds.head()

In [None]:
# Preview the first rows of the LBP table.
ds_lbp.head()

In [None]:
# Target labels: the column named '0' of the HOG table.
yData = ds['0']

In [None]:
# Feature matrix, assembled column-wise (axis=1) from two slices:
#   * LBP table, columns [2, 2 + 2*shift_lbp)
#   * HOG table, columns [2 + shift, 2 + 3*shift)
# The first two columns of each table and the first `shift`-wide HOG block
# are left out.
# NOTE(review): an axis=1 concat lines rows up by their labels, so both tables
# must hold the same samples in the same order — confirm the HOG and LBP files
# are read in matching order.
# Earlier variant that used all three HOG blocks and no LBP columns:
#xData = ds.iloc[:,2:2+shift+shift+shift]
xData = pd.concat([ds_lbp.iloc[:,2:2+shift_lbp+shift_lbp], ds.iloc[:,2+shift:2+shift+shift+shift]], axis=1)

In [None]:
# Spot-check the label stored at row position 20.
yData.iloc[20]

In [None]:
# Confirm the shapes and container types of the labels and the feature matrix.
print(yData.shape, xData.shape, type(yData), type(xData))

In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Square matrix to display; rows are labelled as true classes and
        columns as predicted classes.
    title : str
        Title placed above the image.
    cmap : matplotlib colormap
        Colour map used for the cells.

    The function only draws; creating the figure and calling ``plt.show()``
    is left to the caller.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per class, taken from the matrix itself so that matrices with
    # any number of classes (not only two) get a tick on every row/column.
    tick_marks = np.arange(len(cm))
    plt.xticks(tick_marks, rotation=45)
    plt.yticks(tick_marks)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout pass last so the axis labels are taken into account.
    plt.tight_layout()

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
# `sklearn.cross_validation` was removed in scikit-learn 0.20 in favour of
# `sklearn.model_selection`, which this notebook already imports from.
# Nothing in this cell uses the module, so tolerate its absence instead of
# letting the ImportError abort the cell before plot_confusion is defined.
try:
    from sklearn import cross_validation
except ImportError:
    cross_validation = None

def plot_confusion(yTest, yTestPred, name):
    """Print a classification report and show the row-normalised confusion matrix.

    Parameters
    ----------
    yTest : array-like
        True labels.
    yTestPred : array-like
        Predicted labels, same length as ``yTest``.
    name : str
        Tag inserted into the plot title.

    Side effects: sets numpy's global print precision to 2, prints the report
    and the normalised matrix, and opens and shows a 5x5 inch figure.
    """
    cm = confusion_matrix(yTest, yTestPred)
    np.set_printoptions(precision=2)

    # Normalize the confusion matrix by row (i.e. by the number of true samples
    # in each class) and express it in percent. A class that was only ever
    # predicted has an all-zero row; divide that row by 1 so it stays 0
    # instead of turning into NaN with a divide-by-zero warning.
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    safe_totals = np.where(row_totals == 0, 1, row_totals)
    cm_normalized = (cm.astype('float') / safe_totals) * 100

    print('Classification report')
    print(classification_report(yTest, yTestPred))
    print('Normalized confusion matrix')
    print(cm_normalized)

    # Plot the normalised matrix on its own figure.
    plt.figure(figsize=(5, 5))
    plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix (%s)' % (name))

    plt.show()

In [None]:
# Learn the mapping from raw class labels to integer codes.
le = preprocessing.LabelEncoder().fit(yData)
# Original labels in code order: position i is the label encoded as i.
print(le.classes_)
# Integer-encoded targets used by every model below.
yDataBin = le.transform(yData)

In [None]:
# Hold out 30% of the samples for testing and train on the remaining 70%
# (test_size=0.3); random_state=0 fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(xData, yDataBin, test_size=0.3, random_state=0)

# Report the size of each part of the split.
# NOTE(review): type(<x>.shape) is always `tuple`; type(X_train) etc. was
# probably what was meant to be printed — confirm.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape,
      type(X_train.shape), type(X_test.shape), type(y_train.shape), type(y_test.shape))

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    A feature selector for scikit-learn's Pipeline class that returns one
    named, contiguous range of columns from a 2-D feature matrix.

    Parameters
    ----------
    key : str
        Name of the column group to keep; must be one of the keys of ``data``.

    Accepts both positional-indexable arrays (e.g. numpy arrays) and objects
    exposing ``.iloc`` (e.g. pandas DataFrames); the result has the same
    container type as the input.
    """
    # Column ranges [start, stop) into the combined feature matrix, one per
    # group. Computed once, when the class is defined, from the module-level
    # `shift` / `shift_lbp` widths.
    data = {'ir1' : [shift_lbp+shift_lbp, shift_lbp+shift_lbp+shift],
            'ir2' : [shift_lbp+shift_lbp+shift, shift_lbp+shift_lbp+shift+shift],
            'dep' : [0,shift_lbp+shift_lbp],
            # NOTE(review): single-element entry, not a [start, stop) pair, so
            # this group cannot be selected — confirm the intended range.
            'rgb' : [1]}

    def __init__(self, key):
        print(type(key), key)
        self.key = key

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned; logs the input shape and returns ``self``."""
        print('fit', X.shape)
        return self

    def transform(self, X, y=None, **fit_params):
        """Return the columns of ``X`` that belong to the group ``self.key``.

        Raises
        ------
        KeyError
            If ``self.key`` is not a key of ``data``.
        IndexError
            If the entry for ``self.key`` is not a ``[start, stop]`` pair.
        """
        if self.key not in self.data:
            raise KeyError('unknown column group %r; expected one of %r'
                           % (self.key, sorted(self.data)))
        bounds = self.data[self.key]
        if len(bounds) != 2:
            raise IndexError('column group %r is defined as %r, not a [start, stop] pair'
                             % (self.key, bounds))
        start, stop = bounds
        # DataFrames do not support X[:, a:b]; use positional .iloc for them.
        if hasattr(X, 'iloc'):
            x = X.iloc[:, start:stop]
        else:
            x = X[:, start:stop]
        print('trans', X.shape, x.shape, self.data[self.key])
        return x

In [None]:
from sklearn.metrics import make_scorer,precision_recall_fscore_support
from sklearn.pipeline import Pipeline
from sklearn.ensemble import VotingClassifier

def search(X_train, X_test, y_train, y_test):
    """Grid-search a soft-voting ensemble of three SVMs and report on the test split.

    Each SVM is fed one column group ('ir1', 'dep', 'ir2') picked by a
    ColumnSelector; the three are combined by soft voting with weights 2:1:2
    behind a shared StandardScaler. Only ``C`` of the 'dep' SVM is searched
    (1, 10, 100, 1000) with 2-fold CV; the model is refitted on accuracy.

    Parameters
    ----------
    X_train, y_train
        Development data used for the grid search.
    X_test, y_test
        Held-out data used for the final report and confusion-matrix plot.

    Prints its findings and returns nothing.
    """
    def modality_branch(key, svm):
        # Column selection followed by the SVM that only sees that group.
        return Pipeline([('sel', ColumnSelector(key=key)), ('clf', svm)])

    # Branches are built in the order ir1, dep, ir2.
    ir1_branch = modality_branch(
        'ir1', SVC(probability=True, C=100, gamma=0.001, kernel='rbf'))
    dep_branch = modality_branch(
        'dep', SVC(probability=True))
    ir2_branch = modality_branch(
        'ir2', SVC(probability=True, C=100, gamma=0.001, kernel='rbf'))

    voter = VotingClassifier(
        estimators=[('ir1', ir1_branch), ('dep', dep_branch), ('ir2', ir2_branch)],
        voting='soft',
        weights=[2, 1, 2],
    )
    ensemble = Pipeline([
        ('scaler', preprocessing.StandardScaler()),
        ('eclf', voter),
    ])

    # Only the depth branch is tuned; the ir1/ir2 SVMs keep their fixed C.
    param_grid = {'eclf__dep__clf__C': (1.0, 10, 100, 1000)}

    grid = GridSearchCV(
        ensemble,
        param_grid,
        n_jobs=4,
        cv=2,
        scoring=['f1_micro', 'accuracy'],
        refit='accuracy',
        verbose=10,
    )
    grid.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(grid.best_params_)
    print()
    print("Grid scores on development set:")
    print(grid.best_score_)
    print()

    results = grid.cv_results_
    print(results.keys())
    # One line per parameter combination: mean accuracy and a 2-sigma band.
    per_candidate = zip(results['mean_test_accuracy'],
                        results['std_test_accuracy'],
                        results['params'])
    for mean, std, params in per_candidate:
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    yTrue = y_test
    yPred = grid.predict(X_test)
    print(classification_report(yTrue, yPred))
    plot_confusion(yTrue, yPred, "test")
    print()

In [None]:
# Run the ensemble grid search and print/plot its evaluation on the held-out split.
search(X_train, X_test, y_train, y_test)

In [None]:
from sklearn.decomposition import PCA

# Fit a PCA with default settings on every column of `nds` from the third
# onward, then project those same columns into the fitted component space.
# NOTE(review): `nds` is not defined in any visible cell of this notebook, so
# this cell raises NameError after a fresh Restart & Run All, and `nnds` is
# not used by any other visible cell — confirm the intended input table.
pca = PCA()
pca.fit(nds.iloc[:,2:])
nnds = pca.transform(nds.iloc[:,2:])

In [None]:
from sklearn.metrics import make_scorer,precision_recall_fscore_support

def search2(X_train, X_test, y_train, y_test):
    """Grid-search stand-alone classifiers one by one and report each on the test split.

    Candidates for kNN, SVC, random forest, gradient boosting and a hard-voting
    ensemble (SVC + random forest + Gaussian NB) are defined together with
    their grids, but only the ones named in ``enabled`` are actually searched.
    Every search uses 2-fold CV with four weighted metrics and refits on
    weighted F1.

    Parameters
    ----------
    X_train, y_train
        Development data used for each grid search.
    X_test, y_test
        Held-out data used for each final report and confusion-matrix plot.

    Prints its findings and returns nothing.
    """
    # NOTE(review): this message is printed but no scaling is applied in this
    # function; the features reach the classifiers as they were passed in.
    print("Normalizing data!")

    print("Grid Search Classifiers!")

    voting = VotingClassifier(
        estimators=[('svc', SVC()),
                    ('rf', RandomForestClassifier(random_state=1)),
                    ('gnb', GaussianNB())],
        voting='hard')

    # display name -> (estimator, parameter grid)
    candidates = {
        'kNN': (
            KNeighborsClassifier(),
            [{'n_neighbors': [3, 5, 7],
              'weights': ['uniform', 'distance'],
              'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']}],
        ),
        'Support Vector': (
            SVC(),
            [{'kernel': ['rbf', 'linear'],
              'gamma': [1e-3, 1e-4],
              'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 1, 10, 100, 1000]}],
        ),
        'Random Forest': (
            RandomForestClassifier(),
            [{'n_estimators': [10, 20, 50, 100, 200, 300],
              'max_depth': [None, 1, 10, 100],
              'bootstrap': [True, False],
              'criterion': ["gini", "entropy"]}],
        ),
        'Gradient Boosting': (
            GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
                                       max_depth=1, random_state=0),
            [{'n_estimators': [50, 100],
              'learning_rate': [0.1, 1.0, 10],
              'max_depth': [3, 5, 10]}],
        ),
        'Vooting': (
            voting,
            [{'svc__C': [1.0, 100.0],
              'rf__n_estimators': [20, 200]}],
        ),
    }

    # Only these candidates are searched, in this order.
    enabled = ('Gradient Boosting', 'Vooting')
    metrics = ['f1_weighted', 'accuracy', 'precision_weighted', 'recall_weighted']

    for name in enabled:
        classifier, params = candidates[name]
        print(name)
        grid = GridSearchCV(classifier, params, cv=2, scoring=metrics,
                            refit='f1_weighted', verbose=10)
        grid.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(grid.best_params_)
        print()
        print("Grid scores on development set:")
        print(grid.best_score_)
        print()
        # Only the available result keys are listed; no per-combination
        # score breakdown is printed here.
        print(grid.cv_results_.keys())

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        yTrue = y_test
        yPred = grid.predict(X_test)
        print(classification_report(yTrue, yPred))
        plot_confusion(yTrue, yPred, name)
        print()