# Load data + setup

In [2]:
import graphlab
# Load the training and test SFrames from their on-disk directories.
image_train, image_test = (
    graphlab.SFrame(data_dir)
    for data_dir in ('image_train_data/', 'image_test_data/')
)
# Send GraphLab Canvas output to the notebook ('ipynb' target).
graphlab.canvas.set_target('ipynb')
# Preview the first rows of the training data.
image_train.head()

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1547186107.log
INFO:graphlab.cython.cy_server:GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1547186107.log


This non-commercial license of GraphLab Create for academic use is assigned to xaero@y7mail.com and will expire on October 26, 2019.


id,image,label,deep_features,image_array
24,Height: 32 Width: 32,bird,"[0.242871761322, 1.09545373917, 0.0, ...","[73.0, 77.0, 58.0, 71.0, 68.0, 50.0, 77.0, 69.0, ..."
33,Height: 32 Width: 32,cat,"[0.525087952614, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[7.0, 5.0, 8.0, 7.0, 5.0, 8.0, 5.0, 4.0, 6.0, 7.0, ..."
36,Height: 32 Width: 32,cat,"[0.566015958786, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[169.0, 122.0, 65.0, 131.0, 108.0, 75.0, ..."
70,Height: 32 Width: 32,dog,"[1.12979578972, 0.0, 0.0, 0.778194487095, 0.0, ...","[154.0, 179.0, 152.0, 159.0, 183.0, 157.0, ..."
90,Height: 32 Width: 32,bird,"[1.71786928177, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[216.0, 195.0, 180.0, 201.0, 178.0, 160.0, ..."
97,Height: 32 Width: 32,automobile,"[1.57818555832, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[33.0, 44.0, 27.0, 29.0, 44.0, 31.0, 32.0, 45.0, ..."
107,Height: 32 Width: 32,dog,"[0.0, 0.0, 0.220677852631, 0.0, ...","[97.0, 51.0, 31.0, 104.0, 58.0, 38.0, 107.0, 61.0, ..."
121,Height: 32 Width: 32,bird,"[0.0, 0.23753464222, 0.0, 0.0, 0.0, 0.0, ...","[93.0, 96.0, 88.0, 102.0, 106.0, 97.0, 117.0, ..."
136,Height: 32 Width: 32,automobile,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.5737862587, 0.0, ...","[35.0, 59.0, 53.0, 36.0, 56.0, 56.0, 42.0, 62.0, ..."
138,Height: 32 Width: 32,bird,"[0.658935725689, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[205.0, 193.0, 195.0, 200.0, 187.0, 193.0, ..."


#### 1. Computing summary statistics of the data: 
Sketch summaries are techniques for computing summary statistics of data very quickly. In GraphLab Create, SFrames and SArrays include a method:

.sketch_summary()

which computes such summary statistics. Using the training data, compute the sketch summary of the ‘label’ column and interpret the results. What’s the least common category in the training data? Save this result to answer the quiz at the end.

In [3]:
# Pull out the 'label' column. Its sketch summary is computed by
# labels.sketch_summary() in the following cell, so the duplicate
# graphlab.Sketch(labels) call (whose result was immediately overwritten)
# is not needed here.
labels = image_train['label']

In [4]:
# Compute summary statistics for the label column (length, missing values,
# unique-value estimate and per-label counts).
sketch = labels.sketch_summary()

In [5]:
# Display the summary; the smallest count under "Most frequent items"
# identifies the least common label in the training data.
sketch


+------------------+-------+----------+
|       item       | value | is exact |
+------------------+-------+----------+
|      Length      |  2005 |   Yes    |
| # Missing Values |   0   |   Yes    |
| # unique values  |   4   |    No    |
+------------------+-------+----------+

Most frequent items:
+-------+------------+-----+-----+------+
| value | automobile | cat | dog | bird |
+-------+------------+-----+-----+------+
| count |    509     | 509 | 509 | 478  |
+-------+------------+-----+-----+------+


#### 2. Creating category-specific image retrieval models: 
In most retrieval tasks, the data we have is unlabeled, thus we call these unsupervised learning problems. However, we have labels in this image dataset, and will use these to create one model for each of the 4 image categories: {‘dog’, ‘cat’, ‘automobile’, ‘bird’}.

In [6]:
# Split the training data into one SFrame per label, in the same order as the
# names on the left-hand side.
dogs, cats, automobiles, birds = (
    image_train.filter_by(category, 'label')
    for category in ('dog', 'cat', 'automobile', 'bird')
)

In [7]:
# Nearest-neighbours model over the dog training images only, built on the
# 'deep_features' column; results are labelled with each image's 'id'.
dog_model = graphlab.nearest_neighbors.create(dogs, features=['deep_features'], label='id')

In [8]:
# Nearest-neighbours model over the cat training images only, built on the
# 'deep_features' column; results are labelled with each image's 'id'.
cat_model = graphlab.nearest_neighbors.create(cats, features=['deep_features'], label='id')

In [9]:
# Nearest-neighbours model over the automobile training images only, built on
# the 'deep_features' column; results are labelled with each image's 'id'.
automobile_model = graphlab.nearest_neighbors.create(automobiles, features=['deep_features'], label='id')

In [10]:
# Nearest-neighbours model over the bird training images only, built on the
# 'deep_features' column; results are labelled with each image's 'id'.
bird_model = graphlab.nearest_neighbors.create(birds, features=['deep_features'], label='id')

In [11]:
# Use the first row of the test data as the query image.
query = image_test[0:1]

In [12]:
# Find the query image's nearest neighbours among the cat training images
# and show the closest ones (lowest distance / rank first).
cat_neighbour = cat_model.query(query)
cat_neighbour.head()

query_label,reference_label,distance,rank
0,16289,34.623719208,1
0,45646,36.0068799284,2
0,32139,36.5200813436,3
0,25713,36.7548502521,4
0,331,36.8731228168,5


In [13]:
# Find the query image's nearest neighbours among the dog training images
# and show the closest ones (lowest distance / rank first).
dog_neighbour = dog_model.query(query)
dog_neighbour.head()

query_label,reference_label,distance,rank
0,16976,37.4642628784,1
0,13387,37.5666832169,2
0,35867,37.6047267079,3
0,44603,37.7065585153,4
0,6094,38.5113254907,5


In [14]:
def get_images_from_ids(query_result):
    """Return the rows of `image_train` whose 'id' appears in `query_result`.

    `query_result` may be either:
      * the output of a nearest-neighbours `model.query(...)`, in which case
        the training-image IDs are read from its 'reference_label' column, or
      * any SFrame that carries the IDs in an 'id' column (e.g. a filtered
        subset of the training data).
    """
    # Query results name the matched training image 'reference_label';
    # fall back to 'id' for plain subsets of the training data.
    if 'reference_label' in query_result.column_names():
        id_column = 'reference_label'
    else:
        id_column = 'id'
    return image_train.filter_by(query_result[id_column], 'id')

### What is the nearest ‘cat’ labeled image in the training data to the cat image above (the first image in the test data)? Save this result.

In [15]:
# Read the ID of the rank-1 neighbour from the query result instead of
# hard-coding a value copied from an earlier output, so the cell stays correct
# if the data or the model changes.
nearest_cat_id = cat_neighbour[cat_neighbour['rank'] == 1]['reference_label'][0]
nearest_cat = cats[cats['id'] == nearest_cat_id]
get_images_from_ids(nearest_cat)['image'].show()

### What is the nearest ‘dog’ labeled image in the training data to the cat image above (the first image in the test data)? Save this result.

In [16]:
# Read the ID of the rank-1 neighbour from the query result instead of
# hard-coding a value copied from an earlier output, so the cell stays correct
# if the data or the model changes.
nearest_dog_id = dog_neighbour[dog_neighbour['rank'] == 1]['reference_label'][0]
nearest_dog = dogs[dogs['id'] == nearest_dog_id]
get_images_from_ids(nearest_dog)['image'].show()


#### 3. A simple example of nearest-neighbors classification: 
When we queried a nearest neighbors model, the ‘distance’ column in the table above shows the computed distance between the input and each of the retrieved neighbors. In this question, you will use these distances to perform a classification task, using the idea of a nearest-neighbors classifier.

In [17]:
# Re-display the cat neighbours; the 'distance' column is what gets averaged below.
cat_neighbour.head()

query_label,reference_label,distance,rank
0,16289,34.623719208,1
0,45646,36.0068799284,2
0,32139,36.5200813436,3
0,25713,36.7548502521,4
0,331,36.8731228168,5


In [18]:
# Mean distance from the query image to its retrieved cat neighbours.
cat_mean = cat_neighbour['distance'].mean()

In [19]:
# Show the mean cat-neighbour distance.
cat_mean

36.15573070978294

In [20]:
# Mean distance from the query image to its retrieved dog neighbours.
dog_mean = dog_neighbour['distance'].mean()

In [21]:
# Show the mean dog-neighbour distance.
dog_mean

37.77071136184157

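### On average, the query image (image_test[0:1]) is nearer to its cat neighbours (mean distance ≈ 36.16) than to its dog neighbours (mean distance ≈ 37.77)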

#### 4. [Challenging Question] Computing nearest neighbors accuracy using SFrame operations: 
A nearest neighbor classifier predicts the label of a point as the most common label of its nearest neighbors. In this question, we will measure the accuracy of a 1-nearest-neighbor classifier, i.e., predict the output as the label of the nearest neighbor in the training data. Although there are simpler ways of computing this result, we will go step-by-step here to introduce you to more concepts in nearest neighbors and SFrames, which will be useful later in this Specialization.

In [22]:
# Split the test data into one SFrame per label, in the same order as the
# names on the left-hand side.
image_test_dog, image_test_cat, image_test_automobile, image_test_bird = (
    image_test.filter_by(category, 'label')
    for category in ('dog', 'cat', 'automobile', 'bird')
)


In [68]:
# For every dog test image, find its single nearest neighbour (k=1) in each
# category-specific model. Variables are named <test label>_<model>_neighbours.
dog_dog_neighbours = dog_model.query(image_test_dog, k=1)
dog_cat_neighbours = cat_model.query(image_test_dog, k=1)
dog_automobile_neighbours = automobile_model.query(image_test_dog, k=1)
dog_bird_neighbours = bird_model.query(image_test_dog, k=1)

# Same for every cat test image.
cat_cat_neighbours = cat_model.query(image_test_cat, k=1)
cat_dog_neighbours = dog_model.query(image_test_cat, k=1)
cat_automobile_neighbours = automobile_model.query(image_test_cat, k=1)
cat_bird_neighbours = bird_model.query(image_test_cat, k=1)


### See contents of dog_cat_neighbours

In [69]:
# Nearest cat training image for each dog test image. The variable is spelled
# 'neighbours' where it is defined; the 'neighbors' spelling is undefined on a
# fresh kernel and raises NameError.
dog_cat_neighbours

query_label,reference_label,distance,rank
0,33,36.4196077068,1
1,30606,38.8353268874,1
2,5545,36.9763410854,1
3,19631,34.5750072914,1
4,7493,34.778824791,1
5,47044,35.1171578292,1
6,13918,40.6095830913,1
7,10981,39.9036867306,1
8,45456,38.0674700168,1
9,44673,42.7258732951,1


In [70]:
# One row per dog test image; each column holds the distance to that image's
# nearest neighbour in the corresponding category model.
# assumes the four query results are row-aligned (same test image per row) — TODO confirm
dog_distances = graphlab.SFrame({'dog_dog': dog_dog_neighbours['distance'],
                                   'dog_cat': dog_cat_neighbours['distance'],
                                   'dog_automobile': dog_automobile_neighbours['distance'],
                                   'dog_bird': dog_bird_neighbours['distance']})

In [71]:
# Preview the per-category nearest-neighbour distances for the dog test images.
dog_distances.head()

dog_automobile,dog_bird,dog_cat,dog_dog
41.9579761457,41.7538647304,36.4196077068,33.4773590373
46.0021331807,41.3382958925,38.8353268874,32.8458495684
42.9462290692,38.6157590853,36.9763410854,35.0397073189
41.6866060048,37.0892269954,34.5750072914,33.9010327697
39.2269664935,38.272288694,34.778824791,37.4849250909
40.5845117698,39.1462089236,35.1171578292,34.945165344
45.1067352961,40.523040106,40.6095830913,39.0957278345
41.3221140974,38.1947918393,39.9036867306,37.7696131032
41.8244654995,40.1567131661,38.0674700168,35.1089144603
45.4976929401,45.5597962603,42.7258732951,43.2422832585


In [72]:
# One row per cat test image; each column holds the distance to that image's
# nearest neighbour in the corresponding category model.
# assumes the four query results are row-aligned (same test image per row) — TODO confirm
cat_distances = graphlab.SFrame({'cat_cat': cat_cat_neighbours['distance'],
                                   'cat_dog': cat_dog_neighbours['distance'],
                                   'cat_automobile': cat_automobile_neighbours['distance'],
                                   'cat_bird': cat_bird_neighbours['distance']})

In [73]:
def is_dog_correct(row):
    """Return 1 if the dog model gives the strictly smallest distance in `row`, else 0.

    `row` maps 'dog_dog', 'dog_cat', 'dog_automobile' and 'dog_bird' to
    distances. A tie with any other category counts as incorrect (0).
    """
    own_distance = row['dog_dog']
    rival_columns = ('dog_cat', 'dog_automobile', 'dog_bird')
    # Compare against each rival in turn, stopping at the first one that is
    # not strictly farther away.
    return int(all(own_distance < row[column] for column in rival_columns))

In [74]:
def is_cat_correct(row):
    """Return 1 if the cat model gives the strictly smallest distance in `row`, else 0.

    `row` maps 'cat_cat', 'cat_dog', 'cat_automobile' and 'cat_bird' to
    distances. A tie with any other category counts as incorrect (0).
    """
    own_distance = row['cat_cat']
    rival_columns = ('cat_dog', 'cat_automobile', 'cat_bird')
    # Compare against each rival in turn, stopping at the first one that is
    # not strictly farther away.
    return int(all(own_distance < row[column] for column in rival_columns))

In [75]:
# Apply is_dog_correct to every row: 1 where the dog model gave the smallest
# distance, 0 otherwise.
dog_result = dog_distances.apply(is_dog_correct)

In [76]:
# Number of dog test images whose nearest neighbour came from the dog model.
dog_result.sum()

678

In [77]:
# Apply is_cat_correct to every row: 1 where the cat model gave the smallest
# distance, 0 otherwise.
cat_result = cat_distances.apply(is_cat_correct)

In [78]:
# Number of cat test images whose nearest neighbour came from the cat model.
cat_result.sum()

548

### Accuracy for dog images using nearest neighbours

In [85]:
# Numerator: dog test images classified as 'dog' by the 1-nearest-neighbour rule.
correct_predictions = dog_result.sum()
# Denominator: one row of dog_distances per dog test image.
total_images = len(dog_distances)


In [91]:
# float() forces true division so the ratio is not truncated to an integer.
accuracy = float(correct_predictions) / total_images

In [95]:
# Fraction of dog test images correctly classified as 'dog'.
accuracy

0.678