## Using graphlab(https://turi.com/products/create/docs/index.html) for handling images

In [29]:
import numpy as np         # dealing with arrays
import os                  # dealing with directories
from random import shuffle # mixing up or currently ordered data that might lead our network astray in training.
import pandas as pd
from sklearn import preprocessing
import graphlab
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from PIL import Image

### Loading the directories for images

In [2]:
TRAIN_DIR = './train'
TEST_DIR = './test'

### Loading the labels file 

In [3]:
labels = pd.read_csv("./labels.csv", sep=",")
print(labels.head())

                                 id             breed
0  000bec180eb18c7604dcecc8fe0dba07       boston_bull
1  001513dfcb2ffafc82cccf4d8bbaba97             dingo
2  001cdf01b096e06d78e9e5112d419397          pekinese
3  00214f311d5d2247d5dfe4fe24b2303d          bluetick
4  0021f9ceb3235effd7fcde7f7538ed62  golden_retriever


## Data Analysis
### Finding the top most categories to which most of the dogs belong

In [4]:
top_labels = labels
top_breeds = sorted(list(top_labels['breed'].value_counts().head(16).index))
labels_top = top_labels[top_labels['breed'].isin(top_breeds)]
print(labels_top.breed.value_counts())

scottish_deerhound      126
maltese_dog             117
afghan_hound            116
entlebucher             115
bernese_mountain_dog    114
shih-tzu                112
pomeranian              111
great_pyrenees          111
basenji                 110
samoyed                 109
airedale                107
tibetan_terrier         107
leonberg                106
cairn                   106
japanese_spaniel        105
beagle                  105
Name: breed, dtype: int64


## Data Processing
### Label conversion

In [5]:
le = preprocessing.LabelEncoder()
le.fit(list(labels['breed']))

LabelEncoder()

### Label Assignment

In [6]:
def label_img(img):
    for index, row in labels.iterrows():
        if row['id'] == img.split('.')[0]:
            word_label = le.transform(row['breed'])
            return word_label

In [7]:
label_assign = []
for img in os.listdir(TRAIN_DIR):
    n = label_img(img)
    label_assign.append(n)

In [8]:
print(len(label_assign))
print(label_assign[0:10])

10222
[19, 37, 85, 15, 49, 10, 10, 18, 7, 97]


### Generating train data for training

In [9]:
def create_train_data():
    data = graphlab.image_analysis.load_images(TRAIN_DIR)
    data['label'] = label_assign
    return data

graphlab.canvas.set_target('ipynb')
data = create_train_data()
data.show()

       

This non-commercial license of GraphLab Create for academic use is assigned to vartikanarang.hcst.cs14@sgei.org and will expire on October 01, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\Ridhima\AppData\Local\Temp\graphlab_server_1519690818.log.0


### Resizing images

In [44]:
size = 128, 128
for img in os.listdir(TRAIN_DIR):
    DIR = TRAIN_DIR + '/'
    img  = DIR+str(img)
    im = Image.open(img)
    im.resize(size)

In [27]:
print(data.head())
data[0:3]['image'].show()

+-------------------------------+------------------------+-------+
|              path             |         image          | label |
+-------------------------------+------------------------+-------+
| F:/coursera pdfs/kaggle/Do... | Height: 375 Width: 500 |   19  |
| F:/coursera pdfs/kaggle/Do... | Height: 329 Width: 500 |   37  |
| F:/coursera pdfs/kaggle/Do... | Height: 300 Width: 400 |   85  |
| F:/coursera pdfs/kaggle/Do... | Height: 332 Width: 500 |   15  |
| F:/coursera pdfs/kaggle/Do... | Height: 231 Width: 288 |   49  |
| F:/coursera pdfs/kaggle/Do... | Height: 577 Width: 800 |   10  |
| F:/coursera pdfs/kaggle/Do... | Height: 184 Width: 266 |   10  |
| F:/coursera pdfs/kaggle/Do... | Height: 500 Width: 375 |   18  |
| F:/coursera pdfs/kaggle/Do... | Height: 500 Width: 474 |   7   |
| F:/coursera pdfs/kaggle/Do... | Height: 235 Width: 200 |   97  |
+-------------------------------+------------------------+-------+
+-------------------------------+
|      deep_features.image  

### Feature extraction model using graphlab

In [10]:
extractor = graphlab.feature_engineering.DeepFeatureExtractor(features = 'image',
                                                        model='auto')

### Feature extraction for training data 

In [11]:
extractor_train = extractor.fit(data)
extracted_model = extractor_train['model']
data = extractor_train.transform(data)
print data
print data['label']

+-------------------------------+------------------------+-------+
|              path             |         image          | label |
+-------------------------------+------------------------+-------+
| F:/coursera pdfs/kaggle/Do... | Height: 375 Width: 500 |   19  |
| F:/coursera pdfs/kaggle/Do... | Height: 329 Width: 500 |   37  |
| F:/coursera pdfs/kaggle/Do... | Height: 300 Width: 400 |   85  |
| F:/coursera pdfs/kaggle/Do... | Height: 332 Width: 500 |   15  |
| F:/coursera pdfs/kaggle/Do... | Height: 231 Width: 288 |   49  |
| F:/coursera pdfs/kaggle/Do... | Height: 577 Width: 800 |   10  |
| F:/coursera pdfs/kaggle/Do... | Height: 184 Width: 266 |   10  |
| F:/coursera pdfs/kaggle/Do... | Height: 500 Width: 375 |   18  |
| F:/coursera pdfs/kaggle/Do... | Height: 500 Width: 474 |   7   |
| F:/coursera pdfs/kaggle/Do... | Height: 235 Width: 200 |   97  |
+-------------------------------+------------------------+-------+
+-------------------------------+
|      deep_features.image  

In [12]:
print(len(data['deep_features.image'][1]))

4096


### Generating test data and feature extraction of test data

In [13]:
test_data = graphlab.image_analysis.load_images(TEST_DIR)
u_i = []
for img in os.listdir(TEST_DIR):
    i = img.split('.')[0]
    u_i.append(i)
test_data['id'] = u_i
print(test_data)

+-------------------------------+-------------------------+
|              path             |          image          |
+-------------------------------+-------------------------+
| F:/coursera pdfs/kaggle/Do... |  Height: 338 Width: 450 |
| F:/coursera pdfs/kaggle/Do... |  Height: 296 Width: 334 |
| F:/coursera pdfs/kaggle/Do... |  Height: 375 Width: 500 |
| F:/coursera pdfs/kaggle/Do... | Height: 811 Width: 1025 |
| F:/coursera pdfs/kaggle/Do... |  Height: 375 Width: 500 |
| F:/coursera pdfs/kaggle/Do... |  Height: 375 Width: 500 |
| F:/coursera pdfs/kaggle/Do... |  Height: 500 Width: 333 |
| F:/coursera pdfs/kaggle/Do... |  Height: 328 Width: 350 |
| F:/coursera pdfs/kaggle/Do... |  Height: 375 Width: 500 |
| F:/coursera pdfs/kaggle/Do... |  Height: 300 Width: 400 |
+-------------------------------+-------------------------+
+-------------------------------+
|               id              |
+-------------------------------+
| 000621fb3cbb32d8935728e486... |
| 00102ee9d8eb9081235068

In [14]:
extractor_test = extractor.fit(test_data)
extracted_model = extractor_test['model']
test_data = extractor_test.transform(test_data)

In [17]:
print(len(test_data['deep_features.image'][0]))
print(test_data.head())

4096
+-------------------------------+-------------------------+
|              path             |          image          |
+-------------------------------+-------------------------+
| F:/coursera pdfs/kaggle/Do... |  Height: 338 Width: 450 |
| F:/coursera pdfs/kaggle/Do... |  Height: 296 Width: 334 |
| F:/coursera pdfs/kaggle/Do... |  Height: 375 Width: 500 |
| F:/coursera pdfs/kaggle/Do... | Height: 811 Width: 1025 |
| F:/coursera pdfs/kaggle/Do... |  Height: 375 Width: 500 |
| F:/coursera pdfs/kaggle/Do... |  Height: 375 Width: 500 |
| F:/coursera pdfs/kaggle/Do... |  Height: 500 Width: 333 |
| F:/coursera pdfs/kaggle/Do... |  Height: 328 Width: 350 |
| F:/coursera pdfs/kaggle/Do... |  Height: 375 Width: 500 |
| F:/coursera pdfs/kaggle/Do... |  Height: 300 Width: 400 |
+-------------------------------+-------------------------+
+-------------------------------+-------------------------------+
|               id              |      deep_features.image      |
+----------------------

### Splitting data into train and test for cross validation

In [18]:
train, test = data.random_split(0.8, seed=1)

### Training the model 

In [19]:

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(np.array(train['deep_features.image']), np.array(train['label']))
predict = neigh.predict_proba(np.array(test['deep_features.image']))
print(predict)
print(predict.shape)
neigh.score(np.array(test['deep_features.image']),np.array(test['label']))


[[ 0.   0.   0.  ...,  0.   0.   0. ]
 [ 0.   0.2  0.  ...,  0.   0.   0. ]
 [ 0.   0.   0.  ...,  0.   0.   0. ]
 ..., 
 [ 0.   0.   0.  ...,  0.   0.2  0. ]
 [ 0.   0.   0.  ...,  0.   0.   0. ]
 [ 0.   0.   0.  ...,  0.   0.   0. ]]
(2040L, 120L)


0.005392156862745098

### Prediction for our test data

In [20]:
prediction = neigh.predict_proba(np.array(test_data['deep_features.image']))

In [21]:
labels_ = le.classes_
final_test_data = pd.DataFrame()
final_test_data['id'] = test_data['id']
for i in range(len(labels_)):
    final_test_data[str(labels_[i])] = prediction[:,i]
print(final_test_data)

                                     id  affenpinscher  afghan_hound  \
0      000621fb3cbb32d8935728e48679680e            0.0           0.0   
1      00102ee9d8eb90812350685311fe5890            0.0           0.0   
2      0012a730dfa437f5f3613fb75efcd4ce            0.0           0.0   
3      001510bc8570bbeee98c8d80c8a95ec1            0.0           0.2   
4      001a5f3114548acdefa3d4da05474c2e            0.0           0.0   
5      00225dcd3e4d2410dd53239f95c0352f            0.0           0.0   
6      002c2a3117c2193b4d26400ce431eebd            0.0           0.0   
7      002c58d413a521ae8d1a5daeb35fc803            0.0           0.0   
8      002f80396f1e3db687c5932d7978b196            0.0           0.2   
9      0036c6bcec6031be9e62a257b1c3c442            0.0           0.0   
10     0041940322116ae58c38130f5a6f71f9            0.0           0.0   
11     0042d6bf3e5f3700865886db32689436            0.0           0.0   
12     004476c96f575879af4af471af65cae8            0.0          

In [22]:
final_test_data.to_csv('dog_breed_submissions.csv', sep=',',index=False)