In [61]:
import numpy as np
import matplotlib.pyplot as plt

import cv2
import skimage.io
import skimage.transform
import skimage.feature

from sklearn.externals import joblib

from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import os

import pandas as pd

### Hyper-Parameter


In [62]:
IMG_SIZE = 64
feature_type='raw_hog_colorhist_fft'

## Training

### Reading images and labels¶

In [63]:
training_data_path = 'train'
m_tr_imgs = len(img_names)
img_buff = np.zeros((m_tr_imgs, IMG_SIZE, IMG_SIZE, 3))
target = []

for i in range(m_tr_imgs):
    img_file_path = os.path.join(training_data_path, img_names[i])
    img = cv2.imread(img_file_path)
    img = cv2.resize(img,(IMG_SIZE, IMG_SIZE))
    img_buff[i, :, :, :] = img/255.
    if np.mod(i, 1000) == 1:
        print('reading images: ' + str(i) + ' / ' + str(m_tr_imgs))

reading images: 1 / 10222
reading images: 1001 / 10222
reading images: 2001 / 10222
reading images: 3001 / 10222
reading images: 4001 / 10222
reading images: 5001 / 10222
reading images: 6001 / 10222
reading images: 7001 / 10222
reading images: 8001 / 10222
reading images: 9001 / 10222
reading images: 10001 / 10222


In [64]:
labels_csv = pd.read_csv('labels.csv',usecols=[1])
target = np.array(labels_csv).reshape((10222,)).tolist()
print(target[:5])

['boston_bull', 'dingo', 'pekinese', 'bluetick', 'golden_retriever']


### Feature extraction

In [65]:
def cpt_features(img, feat_type):
    
    def _raw_feat(img):
        feat = cv2.resize(img,(8,8))
        return np.reshape(feat, (-1))
    
    def _hog_feat(img):
        feat = skimage.feature.hog(
            np.mean(img, axis=-1), orientations=9, pixels_per_cell=(8, 8), 
            cells_per_block=(1, 1))
        return np.reshape(feat, (-1))
    
    def _colorhist_feat(img):
        hist_r, _ = np.histogram(img[:,:,0], bins=20, range=(0.0, 1.0), density=True)
        hist_g, _ = np.histogram(img[:,:,1], bins=20, range=(0.0, 1.0), density=True)
        hist_b, _ = np.histogram(img[:,:,2], bins=20, range=(0.0, 1.0), density=True)
        feat = np.concatenate((hist_r, hist_g, hist_b), axis=0)
        return np.reshape(feat, (-1))
        
    def _fft_feat(img):
        feat = np.abs(np.fft.fft2(np.mean(img, axis=-1)))
        return np.reshape(feat[0:10, 0:10], (-1))

    if feat_type is 'raw':
        return _raw_feat(img)
        
    if feat_type is 'hog':
        return _hog_feat(img)
            
    if feat_type is 'colorhist':
        return _colorhist_feat(img)
        
    if feat_type is 'fft':
        return _fft_feat(img)
            
    if feat_type is 'raw_hog_colorhist_fft':
        
        return np.concatenate(
            (_raw_feat(img), _hog_feat(img), _colorhist_feat(img), _fft_feat(img)),
            axis=0)

In [66]:
dims = cpt_features(img_buff[0, :, :, :], feature_type).size
data = np.zeros((m_tr_imgs, dims))
for i in range(m_tr_imgs):
    img = img_buff[i, :, :, :]
    feat = cpt_features(img, feature_type)
    data[i, :] = feat
    
    if np.mod(i, 1000) == 1:
        print('extracting features: ' + str(i) + ' / ' + str(m_tr_imgs))

C:\Rhaegal\Anaconda3\envs\tf\lib\site-packages\skimage\feature\_hog.py:119: skimage_deprecation: Default value of `block_norm`==`L1` is deprecated and will be changed to `L2-Hys` in v0.15
  'be changed to `L2-Hys` in v0.15', skimage_deprecation)


extracting features: 1 / 10222
extracting features: 1001 / 10222
extracting features: 2001 / 10222
extracting features: 3001 / 10222
extracting features: 4001 / 10222
extracting features: 5001 / 10222
extracting features: 6001 / 10222
extracting features: 7001 / 10222
extracting features: 8001 / 10222
extracting features: 9001 / 10222
extracting features: 10001 / 10222


### Train a classifier

In [73]:
data_tr, data_val, target_tr, target_val = train_test_split(data, target, test_size=0.2)


clf = RandomForestClassifier(n_estimators=200, max_depth=10, criterion='entropy')

clf.fit(data_tr, target_tr)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [74]:
pred_cls = clf.predict(data_tr)
acc = accuracy_score(target_tr, pred_cls)
print('model: %s, training acc = %s %%' % ("RF", acc*100))
pred_cls = clf.predict(data_val)
acc = accuracy_score(target_val, pred_cls)
print('model: %s, testing acc = %s %%' % ("RF", acc*100))

model: RF, training acc = 99.95108230402347 %
model: RF, testing acc = 4.596577017114915 %
