# <center>Dog vs Cat classification</center>

## Importing Libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
%matplotlib inline
import random
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from PIL import Image
from pickle import dump
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from sklearn.tree import plot_tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Defining constants for directories

In [None]:
# Names of the two datasets; each is used both as the zip archive stem and as
# the directory the images are read from.
TEST1_DIR = 'test1'
TRAIN_DIR = 'train'
# Directory that holds the dataset zip archives.
INPUT_DIR = '../input/dogs-vs-cats'
# Directory the notebook switches to before writing the model and submission.
OUTPUT_DIR = '/kaggle/working/'

## Extraction of the images from the zip files.

In [None]:
from zipfile import ZipFile

# Unpack each dataset archive into the current working directory.
files_to_extract = [TEST1_DIR, TRAIN_DIR]

for Dataset in files_to_extract:
    archive_path = f'{INPUT_DIR}/{Dataset}.zip'
    with ZipFile(archive_path, 'r') as z:
        z.extractall(path='.')

Listing the current directories in the workspace.

In [None]:
from subprocess import check_output

# Run `ls` on the current directory and print its decoded output.
workspace_listing = check_output(['ls', '.'])
print(workspace_listing.decode('utf8'))

## Training
### Data preparation
Loading the list of image file names.

In [None]:
# os.listdir() returns entries in arbitrary order; sorting makes the sample
# order (and everything derived from it) the same on every run.
filenames = sorted(os.listdir(TRAIN_DIR))
filenames[:5]

creating the list of expected outputs of the image classification (**0 = Cat**, **1 = Dog**).

In [None]:
# Encode the class from the file name prefix: 1 for names starting with
# 'dog', 0 for everything else.
labels = [int(name.startswith('dog')) for name in filenames]

labels[:5]

creating training dataset.

In [None]:
# One row per training image: its file name and its numeric class label.
df = pd.DataFrame({'filename': filenames}).assign(category=labels)

df.head()

Plotting the number of cats and dogs images.

In [None]:
# value_counts() orders classes by frequency, so hard-coded tick labels can be
# attached to the wrong bar. Sorting by class value fixes the bar order and
# the labels are then derived from the index itself.
class_names = {0: 'Cat', 1: 'Dog'}
class_counts = df.category.value_counts().sort_index()
ax = class_counts.plot.bar(color=['dodgerblue', 'slategray'])
plt.title('Dogs and Cats images count')
plt.xlabel('Dog = 1 - Cat = 0')
plt.ylabel('samples count')
ax.set_xticklabels([class_names[c] for c in class_counts.index], rotation=0, fontsize=11)
plt.show()

Loading a random image as an example.

In [None]:
# Choose one training file at random and read its pixels into an array.
random_filename = f'{TRAIN_DIR}/{random.choice(filenames)}'
random_image = mpimg.imread(random_filename)

showing the image.

In [None]:
# Render the randomly chosen training image.
plt.imshow(random_image)
plt.show()

Helper that loads a list of image files from a directory and resizes each one to a resolution of 64x64.

In [None]:
def load_images_from_dir(dir_location, filenames, size=(64, 64)):
    """Load image files from a directory as equally-shaped RGB arrays.

    Args:
        dir_location: Directory that contains the image files.
        filenames: Names of the files inside ``dir_location`` to load.
        size: ``(width, height)`` every image is resized to. Defaults to
            ``(64, 64)``.

    Returns:
        A list with one array per file name, in the same order as
        ``filenames``. Each array has shape ``(height, width, 3)``.
    """
    loaded = []
    for filename in filenames:
        # The context manager closes the file as soon as the pixels are
        # copied; the original never closed the handles it opened.
        with Image.open('{}/{}'.format(dir_location, filename)) as img:
            # Force three channels so grayscale/RGBA/palette files do not
            # yield arrays whose shape differs from the rest of the batch.
            loaded.append(np.array(img.convert('RGB').resize(size)))
    return loaded
    

Loading all the images.

In [None]:
# Load every training image (resized by the helper) and stack them into one array.
images = np.array(load_images_from_dir(TRAIN_DIR, filenames))
images.shape

Store the image dimensions in variables and flatten the array from four dimensions (samples, height, width, channels) to two dimensions (samples, features).

In [None]:
# Height, width and channel count of a single image; later cells reuse these
# to flatten individual test images.
h, w, d = images[0].shape
# Flatten every image into one feature row with a single reshape instead of a
# Python-level loop that rebuilds the whole array image by image.
images_resized = images.reshape(len(images), w * h * d)
images_resized.shape

## Comparing classification models.
### MLP (Multi Layer Perceptron).
creating model instance.

In [None]:
# Fix the seed so weight initialisation and shuffling are the same on every
# run and the reported scores are reproducible.
mlp = MLPClassifier(random_state=42)

training the model using the processed data.

In [None]:
# Fit the MLP on the flattened images; `labels` holds 0 for cat and 1 for dog.
# NOTE(review): the pixel values are passed in without any scaling — consider
# normalising them and confirm the effect on training.
mlp.fit(images_resized, labels)

Accuracy on the training data using the default configuration (this is not a held-out validation score).

In [None]:
# Scored on the same samples the model was fitted on, so this is a training
# score, not an estimate of performance on unseen images.
mlp.score(images_resized, labels)

### Decision Tree
creating model instance.

In [None]:
# Fix the seed so split tie-breaking is deterministic and the fitted tree is
# the same on every run.
tree = DecisionTreeClassifier(random_state=42)

training the model using the processed data.

In [None]:
# Fit the decision tree on the flattened images; `labels` holds 0 for cat and 1 for dog.
tree.fit(images_resized, labels)

Accuracy on the training data using the default configuration (this is not a held-out validation score).

In [None]:
# Scored on the same samples the tree was fitted on, so this is a training
# score, not an estimate of performance on unseen images.
tree.score(images_resized, labels)

Plotting the generated Decision Tree.

In [None]:
# The tree was created without a depth limit, so drawing all of it is slow
# and unreadable; show only the top levels on a figure large enough to read.
plt.figure(figsize=(16, 8))
# class_names follow ascending label order: 0 = Cat, 1 = Dog.
plot_tree(tree, max_depth=3, filled=True, class_names=['Cat', 'Dog'])
plt.show()

### Random Forest
creating model instance.

In [None]:
# Fix the seed so bootstrap sampling and feature selection are the same on
# every run and the saved model is reproducible.
forest = RandomForestClassifier(random_state=42)

training the model using the processed data.

In [None]:
# Fit the random forest on the flattened images; `labels` holds 0 for cat and 1 for dog.
forest.fit(images_resized, labels)

Accuracy on the training data using the default configuration (this is not a held-out validation score).

In [None]:
# Scored on the same samples the forest was fitted on, so this is a training
# score, not an estimate of performance on unseen images.
forest.score(images_resized, labels)

## Testing Models.
### Data preparation for testing with a random example.
Loading the list of image file names.

In [None]:
# os.listdir() returns entries in arbitrary order; sorting keeps the order of
# the test samples (and of the previews built from them) stable across runs.
test_filenames = sorted(os.listdir(TEST1_DIR))
test_filenames[:5]

Loading a random image as an example.

In [None]:
# Choose one test file at random, read its pixels and show the array shape.
random_filename = f'{TEST1_DIR}/{random.choice(test_filenames)}'
random_image = mpimg.imread(random_filename)
random_image.shape

In [None]:
# Render the randomly chosen test image.
plt.imshow(random_image)
plt.show()

resizing the selected image to make the prediction.

In [None]:
# Wrap the pixel array in a PIL image, shrink it to 64x64 and convert it back
# to an array.
resized_pil_image = Image.fromarray(random_image).resize((64, 64))
random_image_resized = np.array(resized_pil_image)
random_image_resized.shape

Flatten the image from three dimensions (height, width, channels) into a one-dimensional feature vector.

In [None]:
# Flatten the resized image into a single vector of w*h*d values.
random_image_resized = random_image_resized.reshape(w * h * d)

MLP prediction output.

In [None]:
# predict() expects a 2-D batch, so present the single vector as one row.
mlp_prediction = mlp.predict(random_image_resized.reshape(1, -1))
mlp_prediction

Decision Tree prediction output.

In [None]:
# predict() expects a 2-D batch, so present the single vector as one row.
tree_prediction = tree.predict(random_image_resized.reshape(1, -1))
tree_prediction

Random Forest prediction output.

In [None]:
# predict() expects a 2-D batch, so present the single vector as one row.
forest_prediction = forest.predict(random_image_resized.reshape(1, -1))
forest_prediction

### Data preparation for testing with more examples.

Loading all the images.

In [None]:
# Load every test image (resized by the helper) and stack them into one array.
test_images = np.array(load_images_from_dir(TEST1_DIR, test_filenames))
test_images.shape

Flatten the array from four dimensions (samples, height, width, channels) to two dimensions (samples, features).

In [None]:
# Flatten every test image into one feature row with a single reshape instead
# of a Python-level loop; w, h and d come from the training images so both
# sets end up with the same number of features.
test_images_resized = test_images.reshape(len(test_images), w * h * d)
test_images_resized.shape

### Comparing the output of the predictions.
Creating the list of predictions with the MLP.

In [None]:
# Predict a class (0 = cat, 1 = dog) for every flattened test image with the MLP.
mlp_predictions = mlp.predict(test_images_resized)
mlp_predictions[:5]

Creating the list of predictions with the Decision Tree.

In [None]:
# Predict a class (0 = cat, 1 = dog) for every flattened test image with the decision tree.
tree_predictions = tree.predict(test_images_resized)
tree_predictions[:5]

Creating the list of predictions with the Random Forest.

In [None]:
# Predict a class (0 = cat, 1 = dog) for every flattened test image with the random forest.
forest_predictions = forest.predict(test_images_resized)
forest_predictions[:5]

Creating MLP dataset.

In [None]:
# Pair every test file name with the class the MLP predicted for it.
df_test_mlp = pd.DataFrame({'filename': test_filenames}).assign(category=mlp_predictions)

df_test_mlp.head()

Creating Decision Tree dataset.

In [None]:
# Pair every test file name with the class the decision tree predicted for it.
df_test_tree = pd.DataFrame({'filename': test_filenames}).assign(category=tree_predictions)

df_test_tree.head()

Creating Random Forest dataset.

In [None]:
# Pair every test file name with the class the random forest predicted for it.
df_test_forest = pd.DataFrame({'filename': test_filenames}).assign(category=forest_predictions)

df_test_forest.head()

In [None]:
def autolabel(ax):
    """Attach a text label above each bar displaying its height.

    Args:
        ax: Axes-like object whose ``patches`` are the bars to annotate.

    The label is centred horizontally on the bar and anchored at its top.
    Whole-number heights are printed without a trailing ``.0``.
    """
    for p in ax.patches:
        height = p.get_height()
        # Bar heights may arrive as floats; show counts such as 10.0 as "10".
        label = str(int(height)) if float(height).is_integer() else str(height)
        # Centre on the bar. Scaling get_x() by a factor (as before) left the
        # text at the bar's left edge and moved it the wrong way for x < 0.
        ax.annotate(
            label,
            (p.get_x() + p.get_width() / 2, height),
            ha='center',
            va='bottom',
        )

### Plotting the predictions of the counts.

In [None]:
# value_counts() orders classes by frequency, so hard-coded tick labels end up
# on the wrong bars whenever cats are predicted more often than dogs. Sort by
# class value and derive the labels from the index instead.
class_names = {0: 'Cat', 1: 'Dog'}
class_counts = df_test_mlp.category.value_counts().sort_index()
ax = class_counts.plot.bar(color=['dodgerblue', 'slategray'])
plt.title('Dogs and Cats images count MLP')
plt.xlabel('Dog = 1 - Cat = 0')
plt.ylabel('samples count')
ax.set_xticklabels([class_names[c] for c in class_counts.index], rotation=0, fontsize=11)
autolabel(ax)
plt.show()

In [None]:
# value_counts() orders classes by frequency, so hard-coded tick labels end up
# on the wrong bars whenever cats are predicted more often than dogs. Sort by
# class value and derive the labels from the index instead.
class_names = {0: 'Cat', 1: 'Dog'}
class_counts = df_test_tree.category.value_counts().sort_index()
ax = class_counts.plot.bar(color=['dodgerblue', 'slategray'])
plt.title('Dogs and Cats images count Desicion Tree')
plt.xlabel('Dog = 1 - Cat = 0')
plt.ylabel('samples count')
ax.set_xticklabels([class_names[c] for c in class_counts.index], rotation=0, fontsize=11)
autolabel(ax)
plt.show()

In [None]:
# value_counts() orders classes by frequency, so hard-coded tick labels end up
# on the wrong bars whenever cats are predicted more often than dogs. Sort by
# class value and derive the labels from the index instead.
class_names = {0: 'Cat', 1: 'Dog'}
class_counts = df_test_forest.category.value_counts().sort_index()
ax = class_counts.plot.bar(color=['dodgerblue', 'slategray'])
plt.title('Dogs and Cats images count Random Forest')
plt.xlabel('Dog = 1 - Cat = 0')
plt.ylabel('samples count')
ax.set_xticklabels([class_names[c] for c in class_counts.index], rotation=0, fontsize=11)
autolabel(ax)
plt.show()

### Plotting the first samples to compare the prediction with the image.
MLP 13 hits - 5 wrong.

In [None]:
IMAGE_SIZE = (w, h)
# Show the first 18 test images in a 6x3 grid, each captioned with its file
# name and the class the MLP predicted.
sample_test = df_test_mlp.head(18)
plt.figure(figsize=(12, 24))
# enumerate() supplies the subplot slot, so the grid does not depend on the
# DataFrame carrying a default 0..n-1 index.
for position, (_, row) in enumerate(sample_test.iterrows(), start=1):
    filename = row['filename']
    category = row['category']
    # Read from TEST1_DIR, the directory the predictions were made on, rather
    # than a hard-coded './test1/' path.
    img = mpimg.imread(os.path.join(TEST1_DIR, filename))
    plt.subplot(6, 3, position)
    plt.imshow(img)
    plt.xlabel('{} ({})'.format(filename, category))
plt.tight_layout()
plt.show()

Decision Tree 8 hits - 10 wrong

In [None]:
# Show the first 18 test images in a 6x3 grid, each captioned with its file
# name and the class the decision tree predicted.
sample_test = df_test_tree.head(18)
plt.figure(figsize=(12, 24))
# enumerate() supplies the subplot slot, so the grid does not depend on the
# DataFrame carrying a default 0..n-1 index.
for position, (_, row) in enumerate(sample_test.iterrows(), start=1):
    filename = row['filename']
    category = row['category']
    # Read from TEST1_DIR, the directory the predictions were made on, rather
    # than a hard-coded './test1/' path.
    img = mpimg.imread(os.path.join(TEST1_DIR, filename))
    plt.subplot(6, 3, position)
    plt.imshow(img)
    plt.xlabel('{} ({})'.format(filename, category))
plt.tight_layout()
plt.show()

Random Forest 14 hits - 4 wrong

In [None]:
# Show the first 18 test images in a 6x3 grid, each captioned with its file
# name and the class the random forest predicted.
sample_test = df_test_forest.head(18)
plt.figure(figsize=(12, 24))
# enumerate() supplies the subplot slot, so the grid does not depend on the
# DataFrame carrying a default 0..n-1 index.
for position, (_, row) in enumerate(sample_test.iterrows(), start=1):
    filename = row['filename']
    category = row['category']
    # Read from TEST1_DIR, the directory the predictions were made on, rather
    # than a hard-coded './test1/' path.
    img = mpimg.imread(os.path.join(TEST1_DIR, filename))
    plt.subplot(6, 3, position)
    plt.imshow(img)
    plt.xlabel('{} ({})'.format(filename, category))
plt.tight_layout()
plt.show()

Changing to the working directory to put the output data.

In [None]:
# Switch the process working directory so the files written below land in
# OUTPUT_DIR. This also changes how the relative TRAIN_DIR / TEST1_DIR paths
# resolve if earlier cells are re-run afterwards.
os.chdir(OUTPUT_DIR)

Saving the best successful model configuration.

In [None]:
FILE_NAME = 'forest_model.sav'
# Write through a context manager so the file is flushed and closed as soon
# as the model is pickled; the bare open() call never closed its handle.
with open(FILE_NAME, 'wb') as model_file:
    dump(forest, model_file)
# To reuse the model later, open FILE_NAME in 'rb' mode and pass the handle to
# pickle.load(). Only unpickle files you trust: loading a pickle can execute
# arbitrary code.

In [None]:
# from IPython.display import FileLink
# FileLink(FILE_NAME)

Saving the random forest dataset.

In [None]:
# Build the submission table directly: `id` is the file name up to its first
# dot and `label` is the class predicted by the random forest.
submission_df = pd.DataFrame({
    'id': df_test_forest['filename'].str.split('.').str[0],
    'label': df_test_forest['category'],
})
submission_df.to_csv('submission.csv', index=False)