In [9]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
import cv2
from tqdm import tqdm
import imutils
import numpy as np
import time

In [10]:
df_dict = {
    'train': pd.read_csv('/home/stevie/datasets/chess_vision/256x256/dataframes/train.csv'),
    'test': pd.read_csv('/home/stevie/datasets/chess_vision/256x256/dataframes/test.csv'),
}

In [11]:
# grab the list of images that we'll be describing
print("[INFO] describing images...")
# initialize the raw pixel intensities matrix, the features matrix,
# and labels list
images = {}
features = {}
labels = {}
for set_str in 'train', 'test':
    images[set_str] = []
    features[set_str] = []
    labels[set_str] = []

[INFO] describing images...


In [12]:
def image_to_feature_vector(image, size=(32, 32)):
	# resize the image to a fixed size, then flatten the image into
	# a list of raw pixel intensities
	return cv2.resize(image, size).flatten()

In [13]:
def extract_color_histogram(image, bins=(8, 8, 8)):
	# extract a 3D color histogram from the HSV color space using
	# the supplied number of `bins` per channel
	hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
	hist = cv2.calcHist([hsv], [0, 1, 2], None, bins,
		[0, 180, 0, 256, 0, 256])
	# handle normalizing the histogram if we are using OpenCV 2.4.X
	if imutils.is_cv2():
		hist = cv2.normalize(hist)
	# otherwise, perform "in place" normalization in OpenCV 3 (I
	# personally hate the way this is done
	else:
		cv2.normalize(hist, hist)
	# return the flattened histogram as the feature vector
	return hist.flatten()

In [14]:
# loop over the input images
for set_str, df in df_dict.items():
	print(len(df))
	for i, row in tqdm(df.iterrows()):
		# load the image and extract the class label (assuming that our
		# path as the format: /path/to/dataset/{class}.{image_num}.jpg
		image = cv2.imread(row['path'])
		label = row['label']
		# extract raw pixel intensity "features", followed by a color
		# histogram to characterize the color distribution of the pixels
		# in the image
		pixels = image_to_feature_vector(image)
		hist = extract_color_histogram(image)
		# update the raw images, features, and labels matricies,
		# respectively
		images[set_str].append(pixels)
		features[set_str].append(hist)
		labels[set_str].append(label)

42573


42573it [00:51, 830.43it/s]


12027


12027it [00:15, 773.31it/s]


In [15]:
for set_str in 'train', 'test':
    images[set_str] = np.array(images[set_str])
    features[set_str] = np.array(features[set_str])
    labels[set_str] = np.array(labels[set_str])

    print(set_str)
    print("[INFO] pixels matrix: {:.2f}MB".format(images[set_str].nbytes / (1024 * 1000.0)))
    print("[INFO] features matrix: {:.2f}MB".format(features[set_str].nbytes / (1024 * 1000.0)))

train
[INFO] pixels matrix: 127.72MB
[INFO] features matrix: 85.15MB
test
[INFO] pixels matrix: 36.08MB
[INFO] features matrix: 24.05MB


In [16]:
data = {'input_type': [], 'criterion': [], 'fit_time': [], 'eval_time': [], 'accuracy': []}
for criterion in tqdm(('entropy', 'gini', 'log_loss')):
    print(f"criterion: {criterion}")
    for input_label, input_data in ('images', images), ('features', features):
        # Create Decision Tree classifer object
        clf = DecisionTreeClassifier(criterion=criterion)

        # Train Decision Tree Classifer
        t0 = time.time()
        clf = clf.fit(input_data['train'], labels['train'])
        t1 = time.time()
        fit_time = (t1 - t0) / len(input_data['train'])

        #Predict the response for test dataset
        t0 = time.time()
        y_pred = clf.predict(input_data['test'])
        t1 = time.time()
        eval_time = (t1 - t0) / len(input_data['test'])

        # Model Accuracy, how often is the classifier correct?
        acc = metrics.accuracy_score(labels['test'], y_pred)

        print(f"\tAccuracy: {acc}")
        print(f"\tFit Time: {fit_time}")
        print(f"\tEval Time: {eval_time}")

        data['input_type'].append(input_label)
        data['criterion'].append(criterion)
        data['fit_time'].append(fit_time)
        data['eval_time'].append(eval_time)
        data['accuracy'].append(acc)

        df = pd.DataFrame.from_dict(data)
        df.to_csv('results/data.csv', index=False)

  0%|          | 0/3 [00:00<?, ?it/s]

criterion: entropy
	Accuracy: 0.04099110335079405
	Fit Time: 0.004668886626304096
	Eval Time: 5.507574242388626e-06


 33%|███▎      | 1/3 [03:44<07:28, 224.43s/it]

	Accuracy: 0.051134946370666
	Fit Time: 0.0006003168581112985
	Eval Time: 4.913282184278894e-07
criterion: gini
	Accuracy: 0.046229317369252516
	Fit Time: 0.0040142859632597665
	Eval Time: 4.381117325545923e-06


 67%|██████▋   | 2/3 [06:46<03:19, 199.73s/it]

	Accuracy: 0.02619107009229234
	Fit Time: 0.0002696654919413379
	Eval Time: 4.993171339827185e-07
criterion: log_loss
	Accuracy: 0.036002328095119315
	Fit Time: 0.004674521482226304
	Eval Time: 4.225700209045029e-06


100%|██████████| 3/3 [10:31<00:00, 210.47s/it]

	Accuracy: 0.03924503201130789
	Fit Time: 0.0005978720274328404
	Eval Time: 4.6268309937893647e-07



