In [None]:
import time
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import BernoulliRBM
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.preprocessing import LabelBinarizer

In [None]:
# Column-window sizes used when the feature matrix is assembled:
#   shift_lbp - number of columns taken from the LBP table (starting at column 2)
#   shift     - both the extra start offset and the number of columns taken
#               from the HOG table (window is [2 + shift, 2 + 2 * shift))
shift_lbp = 26
shift = 4320

In [None]:
# Glob patterns for the HOG and LBP feature CSV files (raw strings, so the
# Windows path separators need no escaping).
hog_features = r'C:\Users\rafae\Desktop\Coleta\features\features_hog_*.csv'
lbp_features = r'C:\Users\rafae\Desktop\Coleta\features\features_lbp_*.csv'

In [None]:
# glob.glob() returns paths in arbitrary, OS-dependent order. Sort both lists
# so the row order of the concatenated tables is reproducible and the HOG and
# LBP files are visited in the same name order (the two tables are later
# joined column-wise by row position).
# NOTE(review): assumes HOG and LBP files share the same name suffixes - confirm.
hog_files_list = sorted(glob.glob(hog_features))
print(len(hog_files_list))

lbp_files_list = sorted(glob.glob(lbp_features))
print(len(lbp_files_list))

In [None]:
# Read every HOG feature file into its own DataFrame (first CSV row = header).
frames = [pd.read_csv(path, header=0) for path in hog_files_list]

# Read every LBP feature file the same way.
lbp = [pd.read_csv(path, header=0) for path in lbp_files_list]

# Only the last bare expression of a cell is displayed, so the original
# mid-cell `len(frames)` never showed anything; print both counts explicitly.
print(len(frames), len(lbp))

In [None]:
# Every CSV was read with its own default 0..k-1 index, so a plain concat
# leaves duplicated index labels. The two tables are later joined column-wise,
# which aligns on the index; give both a fresh unique 0..n-1 index so that
# alignment is strictly row-by-row.
ds = pd.concat(frames, ignore_index=True)
ds_lbp = pd.concat(lbp, ignore_index=True)

# A row-count mismatch would silently pad the joined feature matrix with NaN.
if len(ds) != len(ds_lbp):
    raise ValueError(
        "HOG and LBP tables have different row counts: %d vs %d"
        % (len(ds), len(ds_lbp)))

In [None]:
# Print the (rows, columns) shape of the concatenated HOG and LBP tables.
print(ds.shape, ds_lbp.shape)

In [None]:
# Preview the first rows of the concatenated HOG table.
ds.head()

In [None]:
# Preview the first rows of the concatenated LBP table.
ds_lbp.head()

In [None]:
# Convert the point id in column '0' to a binary "eyes ON / OFF road" flag:
# 1 when the point is one of the on-road points, 0 otherwise.
on_road_points = [13, 14, 16, 17]
is_on_road = ds['0'].isin(on_road_points)
on_off_road_bit = np.where(is_on_road, 1, 0)
# Class balance: how many rows fall on (True) and off (False) the road.
print(is_on_road.value_counts())

In [None]:
# Binary target vector: 1 where column '0' is one of on_road_points, else 0.
yData = on_off_road_bit

In [None]:
# Feature matrix = column '1' + a `shift`-wide window of HOG columns starting
# at position 2 + shift + the first `shift_lbp` LBP columns after position 2.
hog_start = 2 + shift
feature_parts = [
    ds['1'],
    ds.iloc[:, hog_start:hog_start + shift],
    ds_lbp.iloc[:, 2:2 + shift_lbp],
]
xData = pd.concat(feature_parts, axis=1)

In [None]:
# (rows, columns) of the assembled feature matrix.
xData.shape

In [None]:
# Spot-check the binary label at position 20.
yData[20]

In [None]:
# Print the shapes and container types of the target vector and feature matrix.
print(yData.shape, xData.shape, type(yData), type(xData))

In [None]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, title='Confusion matrix', cmap=plt.cm.Blues):
    """Draw a confusion matrix as a colour-mapped image on the current figure.

    Parameters
    ----------
    cm : 2-D array-like
        Matrix to draw. The y axis is labelled "True label" and the x axis
        "Predicted label".
    title : str
        Title placed above the image.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The function only draws; the caller is responsible for creating the
        figure and calling ``plt.show()``.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    # One tick per class, derived from the matrix itself instead of the
    # previously hard-coded two classes.
    tick_marks = np.arange(len(cm))
    plt.xticks(tick_marks, rotation=45)
    plt.yticks(tick_marks)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score
from sklearn import cross_validation

# Compute confusion matrix
def plot_confusion(yTest, yTestPred, name):
    """Print a classification report and plot the row-normalised confusion matrix.

    Parameters
    ----------
    yTest : array-like
        True labels.
    yTestPred : array-like
        Predicted labels, same length as ``yTest``.
    name : str
        Classifier name, inserted into the plot title.

    Returns
    -------
    None
        Output is printed and shown with ``plt.show()``.

    Notes
    -----
    Sets NumPy's global print precision to 2 as a side effect.
    """
    cm = confusion_matrix(yTest, yTestPred)
    np.set_printoptions(precision=2)

    # Normalise each row by the number of true samples of that class and
    # express it as a percentage. A class that occurs only in the predictions
    # has a row total of 0; leave that row at 0 instead of dividing by zero
    # (which would produce NaN and a RuntimeWarning).
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    cm_normalized = np.divide(cm.astype('float'), row_totals,
                              out=np.zeros(cm.shape, dtype=float),
                              where=row_totals != 0) * 100
    print('Classification report')
    print(classification_report(yTest, yTestPred))
    print('Normalized confusion matrix')
    print(cm_normalized)
    plt.figure(figsize=(5, 5))
    plot_confusion_matrix(cm_normalized, title='Normalized confusion matrix (%s)' % (name))

    plt.show()

In [None]:
from sklearn.metrics import make_scorer,precision_recall_fscore_support

def search(X_train, X_test, y_train, y_test, group_data_train=None):
    """Grid-search several classifiers and report each one on the test split.

    Features are standardised with statistics fitted on ``X_train`` only.
    Every classifier is tuned with ``GridSearchCV`` (several scorers, refit on
    weighted F1); the refitted best model is then evaluated on ``X_test`` with
    a classification report and a normalised confusion-matrix plot.

    Parameters
    ----------
    X_train, X_test : array-like of shape (n_samples, n_features)
        Training and evaluation feature matrices.
    y_train, y_test : array-like of shape (n_samples,)
        Training and evaluation labels.
    group_data_train : array-like of shape (n_train_samples,), optional
        Group id of every training row. When given (and at least two distinct
        groups exist) the inner cross-validation uses ``GroupKFold`` so a group
        never appears in both the fitting and the validation fold. When omitted
        a plain 5-fold split is used.

    Returns
    -------
    None
        All results are printed / plotted.
    """
    # Local import keeps this cell self-contained.
    from sklearn.model_selection import GroupKFold

    # normalize data
    print("Normalizing data!")
    stdScale = preprocessing.StandardScaler().fit(X_train)
    xTrain = stdScale.transform(X_train)
    xTest = stdScale.transform(X_test)

    # An integer `cv` ignores the `groups` passed to fit(); a group-aware
    # splitter is required for the groups to have any effect.
    cv = 5
    if group_data_train is not None:
        n_groups = len(np.unique(group_data_train))
        if n_groups >= 2:
            # GroupKFold needs n_splits <= number of distinct groups.
            cv = GroupKFold(n_splits=min(5, n_groups))

    print("Grid Search Classifiers!")

    knc = KNeighborsClassifier()
    svc = SVC()
    rfc = RandomForestClassifier()
    gb = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    clf1 = SVC()
    clf2 = RandomForestClassifier(random_state=1)
    clf3 = GaussianNB()
    vt = VotingClassifier(estimators=[('svc', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

    kncp = [{'n_neighbors': [3, 5, 7, 10],
             'weights': ['uniform', 'distance'],
             'algorithm': ['auto', 'kd_tree']}]
    # The original C list contained both 1.0 and 1, which only duplicated
    # candidates; the duplicate is dropped.
    svcp = [{'kernel': ['rbf'],
             'class_weight': ['balanced'],
             'gamma': ['auto', 0.1, 0.001, 0.0001],
             'C': [0.001, 0.01, 0.1, 1.0, 10, 50, 100]}]
    # 'sqrt' is what 'auto' meant for these classifiers; newer scikit-learn
    # releases no longer accept 'auto'.
    rfcp = [{'n_estimators': [10, 20, 50, 100, 200],
             'max_features': ['sqrt', 'log2'],
             'max_depth': [None],
             'bootstrap': [True, False],
             'criterion': ["gini", "entropy"]}]
    gbp = [{'n_estimators': [10, 50, 100, 200],
            'learning_rate': [0.001, 0.01, 0.1, 1.0, 10],
            'max_features': ['sqrt', 'log2'],
            'max_depth': [3, 5, 10, 100]}]
    vtp = [{'svc__C': [1.0, 100.0],
            'rf__n_estimators': [20, 200]}]

    classifiers = [('kNN', knc, kncp),
                   ('Support Vector', svc, svcp),
                   ('Random Forest', rfc, rfcp),
                   ('Gradient Boosting', gb, gbp),
                   ('Vooting', vt, vtp)]

    for name, classifier, params in classifiers:
        print(name)
        clf = GridSearchCV(classifier, params, n_jobs=4, cv=cv,
                           scoring=['f1_weighted', 'accuracy', 'precision_weighted', 'recall_weighted'],
                           refit='f1_weighted', verbose=10)

        # Fit on the standardised features; the original fitted on the raw
        # X_train, leaving the scaler output unused.
        clf.fit(xTrain, y_train, groups=group_data_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print(clf.best_score_)
        print()
        print(clf.cv_results_.keys())
        means = clf.cv_results_['mean_test_accuracy']
        stds = clf.cv_results_['std_test_accuracy']
        # `candidate` (not `params`) so the outer loop variable is not shadowed.
        for mean, std, candidate in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, candidate))
        print()

        print("Detailed classification report:")
        print()
        print("The model is trained on the full development set.")
        print("The scores are computed on the full evaluation set.")
        print()
        # Predict on the test features scaled with the training statistics.
        yTrue, yPred = y_test, clf.predict(xTest)
        print(classification_report(yTrue, yPred))
        plot_confusion(yTrue, yPred, name)
        print()

In [None]:
# Fit a label encoder on the target and keep it, so the original class values
# stay available through `le.classes_`.
le = preprocessing.LabelEncoder().fit(yData)
print(le.classes_)
# Integer-encoded target used by every split / training cell below.
yDataBin = le.transform(yData)

In [None]:
from sklearn.model_selection import GroupShuffleSplit

# Two random group-wise 70/30 splits: all rows sharing a value in column '1'
# land on the same side of the split.
gss = GroupShuffleSplit(n_splits=2, test_size=0.3, random_state=0)

point_col = ds.iloc[:, 0]   # first column of the HOG table
group_col = ds.iloc[:, 1]   # second column of the HOG table

for train_index, test_index in gss.split(xData, yDataBin, groups=ds['1']):
    print(type(train_index), train_index.shape, test_index.shape)
    print("TRAIN:", train_index, "TEST:", test_index)

    # Distinct second-column values on each side of the split.
    for row_idx in (train_index, test_index):
        print(np.unique(group_col.iloc[row_idx]))

    # Summary statistics of the first column on each side of the split.
    for row_idx in (train_index, test_index):
        print(point_col.iloc[row_idx].describe())

    search(xData.iloc[train_index, :],
           xData.iloc[test_index, :],
           yDataBin[train_index],
           yDataBin[test_index],
           group_col.iloc[train_index])



In [None]:
# NOTE(review): `r` is not assigned in any cell visible in this notebook, so on
# a fresh kernel this line raises NameError - TODO confirm where `r` comes from.
print(type(r), len(r))

In [None]:
# Hold out 30% of the rows for testing (a 70/30 split, not two equal halves).
# `stratify` needs one entry per sample. The original passed a Python list of
# two Series (2 entries of n_samples each), which scikit-learn rejects as an
# inconsistent number of samples. Build a single per-row key from columns '1'
# and '0' so the split is stratified on each distinct combination of the two.
strata = ds['1'].astype(str) + '_' + ds['0'].astype(str)
X_train, X_test, y_train, y_test = train_test_split(
    xData, yDataBin, test_size=0.3, random_state=0, stratify=strata)

# Report the container types themselves; type(<obj>.shape) is always `tuple`.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape,
      type(X_train), type(X_test), type(y_train), type(y_test))

In [None]:
# Distinct values of column '1' present in the training split.
X_train['1'].unique()

In [None]:
# Distinct values of column '1' present in the test split.
X_test['1'].unique()

In [None]:
# Supply the fifth argument (group_data_train) that search() takes: the
# training rows' values of column '1', the same column used as the grouping
# key in the GroupShuffleSplit cell.
search(X_train, X_test, y_train, y_test, X_train['1'])

In [None]:
import itertools
import numpy as np
import matplotlib.pyplot as plt

from sklearn import svm, datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

class_names = le.classes_
# Train one RBF-kernel SVM with fixed hyper-parameters on the training split,
# then predict the held-out split.
classifier = svm.SVC(kernel='rbf', C=10, gamma=0.01)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)



def plot_confusion_matrix(cm, classes=None,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix on the current pyplot figure.

    Parameters
    ----------
    cm : numpy.ndarray of shape (n_classes, n_classes)
        Confusion matrix to show.
    classes : sequence, optional
        Tick labels, one per class. Defaults to ``0..n_classes-1`` so the
        function can also be called as ``plot_confusion_matrix(cm, title=...)``,
        the form used with the earlier definition of the same name.
    normalize : bool
        When True every row is divided by its sum before printing/plotting.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap passed to ``plt.imshow``.

    Returns
    -------
    None
        The caller creates the figure and calls ``plt.show()``.
    """
    if classes is None:
        classes = np.arange(cm.shape[0])

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)

    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write every cell value on top of the image; switch to white text on
    # cells darker than half of the maximum so the number stays readable.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Confusion matrix of the fixed SVM on the held-out split.
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Draw the raw-count matrix first, then the row-normalised one, each in its
# own figure.
for normalize, title in [(False, 'Confusion matrix, without normalization'),
                         (True, 'Normalized confusion matrix')]:
    plt.figure()
    plot_confusion_matrix(cnf_matrix, classes=class_names,
                          normalize=normalize, title=title)

plt.show()

In [None]:
# Time a single-row prediction. time.clock() was removed in Python 3.8;
# time.perf_counter() is the high-resolution replacement for measuring
# elapsed intervals.
tic = time.perf_counter()
y = classifier.predict(X_test[-1:])
print(time.perf_counter() - tic)

In [None]:
# Predicted label for the last test row.
y

In [None]:
# True label of the last test row, for comparison with the prediction.
y_test[-1:]