<div class="alert alert-block alert-info">
    
# cats vs dogs
## Renana Rimon
    
</div>

In [3]:
import os, cv2
import numpy as np
import pandas as pd
from matplotlib import pyplot
import matplotlib.pyplot as plt
from matplotlib.image import imread
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Preview the first nine dog pictures of the 'dogs vs cats' training set in a 3x3 grid
folder = 'train/'

for idx in range(9):
    # subplot slots are 1-based and filled row by row
    pyplot.subplot(3, 3, idx + 1)
    filename = f'{folder}dog.{idx}.jpg'
    # read the raw pixel array from disk and draw it unchanged
    image = imread(filename)
    pyplot.imshow(image)
# render the assembled figure
pyplot.show()

In [None]:
# Preview the first nine cat pictures of the 'dogs vs cats' training set in a 3x3 grid
folder = 'train/'

for idx in range(9):
    # subplot slots are 1-based and filled row by row
    pyplot.subplot(3, 3, idx + 1)
    filename = f'{folder}cat.{idx}.jpg'
    # read the raw pixel array from disk and draw it unchanged
    image = imread(filename)
    pyplot.imshow(image)
# render the assembled figure
pyplot.show()

<div class="alert alert-block alert-warning">


### Define two functions for reading the images into a DataFrame
   * **readImg_gray**: 
       1. takes a file path and returns the image resized to 128×128 in grayscale
       2. converting color to grayscale reduces the number of features per image
   * **pre**:  
       1. inserts the flattened images into a DataFrame (one row per image)
       2. adds a label column (**cat = 0, dog = 1**)
    
</div>

In [4]:
def readImg_gray(file_path):
    """Load an image file and return it as a 128x128 grayscale array.

    Parameters
    ----------
    file_path : str
        Path of the image to read (passed to matplotlib's ``imread``).

    Returns
    -------
    numpy.ndarray
        2-D array of shape (128, 128) holding the resized grayscale image.
    """
    img = imread(file_path)
    if img.ndim == 2:
        # already single-channel: cvtColor would reject it, so use it as-is
        gray_img = img
    else:
        # matplotlib's imread returns channels in RGB(A) order, not OpenCV's BGR,
        # so the RGB conversion codes are needed to weight red and blue correctly
        code = cv2.COLOR_RGBA2GRAY if img.shape[2] == 4 else cv2.COLOR_RGB2GRAY
        gray_img = cv2.cvtColor(img, code)
    resized_img = cv2.resize(gray_img, (128,128))
    return resized_img

In [5]:
def pre(folder, name, num, m=12500):
    """Read ``m`` images of one class into a DataFrame of flattened pixels.

    Files are expected at ``folder + name + '.' + str(i) + '.jpg'`` for
    ``i`` in ``0..m-1``.

    Parameters
    ----------
    folder : str
        Directory prefix, including the trailing separator (e.g. ``'train/'``).
    name : str
        Class prefix of the file names (e.g. ``'cat'`` or ``'dog'``).
    num : int or float
        Label value written to the ``'label'`` column for every row.
    m : int, optional
        Number of images to read (default 12500).

    Returns
    -------
    pandas.DataFrame
        ``m`` rows; 128*128 uint8 pixel columns followed by a float ``'label'`` column.
    """
    size = 128*128 #image size
    X = np.empty((m, size), dtype=np.uint8)
    for i in range(m):
        img = readImg_gray(folder + name + '.' + str(i) + '.jpg')
        X[i, :] = img.reshape(size)

    # build the frame once, after all rows are filled (not on every iteration)
    df = pd.DataFrame(X)
    df['label'] = np.full(m, num, dtype=float)
    return df

In [6]:
# Training frames: one per class, labelled cat = 0 and dog = 1
df_cat = pre(folder='train/', name='cat', num=0)
df_dog = pre(folder='train/', name='dog', num=1)

In [None]:
# Final-test frames: one per class, labelled cat = 0 and dog = 1
# NOTE(review): assumes 'test1/' holds files named cat.N.jpg / dog.N.jpg — confirm
df_cat_test = pre(folder='test1/', name='cat', num=0)
df_dog_test = pre(folder='test1/', name='dog', num=1)

<div class="alert alert-block alert-warning">


**Concatenate both DataFrames into one**
    
</div>

In [7]:
# Stack the cat rows on top of the dog rows (each frame keeps its own row index)
df = pd.concat(objs=[df_cat, df_dog], axis=0)
df.shape

(25000, 16385)

In [None]:
# Stack the cat test rows on top of the dog test rows (each frame keeps its own row index)
df_test = pd.concat(objs=[df_cat_test, df_dog_test], axis=0)
df_test.shape

In [8]:
# Training data: label Series and pixel matrix (every column except 'label')
y = df['label']
X = df.drop(columns=['label']).to_numpy()

# Final test data, split the same way
y_test_f = df_test['label']
X_test_f = df_test.drop(columns=['label']).to_numpy()

<div class="alert alert-block alert-warning">


## PCA - dimensionality reduction
   * **original image:** 16,384 features (128 × 128 grayscale pixels)
   * **image after PCA:** 15 features (16 in the 4 × 4 visualization below)
    
</div>

In [None]:
# Visualization: original image (128x128 = 16384 features) vs. the same image after PCA (16 features)
pca = PCA(n_components=16)
# Fitted on X (all training rows): the train/validation split is only created in a later cell,
# so X_train / X_test do not exist yet when the notebook is run top to bottom.
# Note: fitting PCA on the full pixel matrix is memory- and time-intensive.
X_train_reduced = pca.fit_transform(X)

for i in range(3):
    plt.figure(figsize=(6,4))
    plt.subplot(2, 2, 1)
    # each row of X is a flattened 128x128 grayscale image
    plt.imshow(X[i].reshape(128,128), cmap="gist_yarg")
    plt.title("Original Image", color = "green")
    plt.axis("off")

    plt.subplot(2, 2, 2)
    # the 16 principal-component scores of the same image, laid out as a 4x4 tile
    plt.imshow(X_train_reduced[i].reshape(4,4), cmap="gist_yarg")
    plt.title("image after pca", color = "Darkred")
    plt.axis("off")
    plt.show()

<div class="alert alert-block alert-warning">
    
## Selecting a Model and Tuning Hyperparameters
   1. scaling
   2. PCA (n_components=15)
   3. randomized search (RandomizedSearchCV): find the best parameters for each model
   4. compare models
    
</div>

In [9]:
# Unfitted preprocessing objects. Nothing is fitted on the full data set here:
# fitting the scaler on all of X before the split would use validation rows and
# materialize a float64 copy of the whole pixel matrix that no later cell reads.
scaler = StandardScaler()
pca = PCA(n_components=15)

# 70/30 train/validation split of the labelled training data (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42, shuffle = True)

# 5-fold stratified cross-validation used by the hyperparameter search
Kfolds = StratifiedKFold(n_splits=5,shuffle=True, random_state=7)

In [10]:
# Candidate models and the hyperparameter values to search for each one.
# Keys: display name -> {'model': estimator instance, 'params': search space}.
model_params = {
    'random_forest':{
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 150],
            'max_depth': [10,20,50,80,100]
        }
    },
    
    'logistic_regression':{
        # multi_class is left at its default ('auto'); passing it explicitly is deprecated
        'model': LogisticRegression(),
        'params': {
            'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
            'C': [0.1, 0.5, 1.0]
        }
    },
    
    'xgb':{
        # cat vs dog is a binary task, so use the binary log-loss metric
        # ('mlogloss' is the multiclass variant)
        'model': XGBClassifier(eval_metric='logloss'),
        'params':{
            'n_estimators': [100, 150],
            'learning_rate': [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3]            
        }
    },
    
    'knn':{
        'model': KNeighborsClassifier(),
        'params': {
            'n_neighbors': [5,7,10, 15],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
        }
    }
}

In [11]:
from sklearn.model_selection import RandomizedSearchCV

# One record per model: name, best cross-validated score, best hyperparameters
scores = []

for model_name, mp in model_params.items():
    # Search over the same scaler -> PCA(15) -> model chain that is used for the final
    # models, so preprocessing is fitted per fold and the search runs on 15 features
    # rather than on the raw 16384 pixel columns.
    search_pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=15)),
        ('mod', mp['model']),
    ])
    # Pipeline parameters are addressed as '<step>__<param>'
    param_distributions = {'mod__' + key: values for key, values in mp['params'].items()}

    rs = RandomizedSearchCV(search_pipe, param_distributions, cv = Kfolds,
                            return_train_score= False, n_iter= 6, random_state=42)
    rs.fit(X_train, y_train)
    scores.append({
        'model': model_name,
        'best score': rs.best_score_,
        # strip the 'mod__' prefix so the table shows the estimator's own parameter names
        'best params': {key.split('__', 1)[1]: value for key, value in rs.best_params_.items()}
    })

KeyboardInterrupt: 

In [None]:
# Tabulate the search results: one row per model
score_df = pd.DataFrame(data=scores)
score_df

**After finding the best parameters, we use them to get the best score.<br> 'xgb' and 'random_forest' got the best scores, so we will use both.**

<div class="alert alert-block alert-warning">
    
### Pipeline: 
   * scale with StandardScaler
   * dimensionality reduction with PCA
   * fit
   * score
</div>
    
    
**First test on the training data (split into train & test),<br> and after choosing the best model, test on the final test data.**

In [None]:
def pipe(model):
    """Fit scaler -> PCA(15) -> ``model`` on the training split and score it.

    Uses the notebook-level ``X_train``/``y_train`` for fitting and
    ``X_test``/``y_test`` for scoring.

    Returns
    -------
    float
        The pipeline's ``score`` on the validation split, rounded to 4 decimals.
    """
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=15)),
        ('mod', model),
    ]
    fitted = Pipeline(steps).fit(X_train, y_train)
    accuracy = fitted.score(X_test, y_test)
    return round(accuracy, 4)

In [None]:
# Validation score of XGBoost with the selected hyperparameters
xgb_model = XGBClassifier(n_estimators=150, learning_rate=0.2)
xgb = pipe(xgb_model)
xgb

In [None]:
# Validation score of the random forest with the selected hyperparameters
rf_model = RandomForestClassifier(n_estimators=150, max_depth=80)
randomForest = pipe(rf_model)
randomForest

In [None]:
#stacking
# Each base learner must be its own (name, estimator) pair; the 'xgb' pair was
# previously nested inside the 'knn' tuple by a misplaced parenthesis.
stack_members = [
    ('rf', RandomForestClassifier(max_depth=80, n_estimators=150)),
    ('knn', KNeighborsClassifier(n_neighbors=7, algorithm='auto')),
    # binary task -> binary log-loss metric ('mlogloss' is the multiclass variant)
    ('xgb', XGBClassifier(eval_metric='logloss', learning_rate=0.2, n_estimators=150)),
]

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=15)),
    ('stack', StackingClassifier(stack_members)),
])

pipeline.fit(X_train, y_train)
# score on the validation split, rounded to 4 decimals
round(pipeline.score(X_test, y_test),4)

In [None]:
#voting
# Soft-voting ensemble of the three tuned models
voting_members = [
    ('rf', RandomForestClassifier(max_depth=80, n_estimators=150)),
    ('knn', KNeighborsClassifier(n_neighbors=7, algorithm='auto')),
    ('xgb', XGBClassifier(learning_rate=0.2, n_estimators=150)),
]

pipeline = Pipeline(steps=[
    ('scaler1', StandardScaler()),
    ('pca1', PCA(n_components=15)),
    ('clf', VotingClassifier(estimators=voting_members, voting='soft')),
])

pipeline.fit(X_train, y_train)
# predictions and score on the validation split
y_pred = pipeline.predict(X_test)
round(pipeline.score(X_test, y_test), 4)

<div class="alert alert-block alert-success">

## *#1_conclusions:*

*after dimensionality reduction with PCA, from **16,384** features (128 × 128 pixels) to **15** features.*

### accuracy:

   * RandomForestClassifier: 
   * xgb: 
   * stacking: 
   * voting: 
   
   
### best model:
    
    
</div>

<div class="alert alert-block alert-warning">

    
### Test 'voting' model on final Test Data

    
</div>

In [None]:
from sklearn.metrics import classification_report

# Predict on the final test set. The earlier y_pred came from the validation split
# (X_test), so it cannot be compared against y_test_f.
y_pred = pipeline.predict(X_test_f)

# Per-class precision / recall / F1 as a table
report = classification_report(y_test_f, y_pred, output_dict=True)
df_report = pd.DataFrame(report)
df_report

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Predictions must come from the final test set to be comparable with y_test_f
y_pred = pipeline.predict(X_test_f)

plt.figure(figsize=(8,5))
cnf_matrix = confusion_matrix(y_test_f, y_pred)

# create seaborn heatmap with the two class labels (label 0 = cat, label 1 = dog)
axis_labels = ['cat', 'dog']
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True,cmap="Blues" , fmt='g', xticklabels=axis_labels, yticklabels=axis_labels, linewidths=0.1)

# confusion_matrix puts true labels on rows and predicted labels on columns
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion matrix voting\n',fontsize=20)
plt.tight_layout()

<div class="alert alert-block alert-success">
    
## *#2_conclusions:*
    
## voting: 85% accuracy with 15 features
    
### NOTE: the model is evaluated only once on the final test data, to keep that evaluation unbiased!

</div>