# DSCI 572 Lab 4

In [None]:
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from scipy.signal import convolve2d

import matplotlib.pyplot as plt
%matplotlib inline

from skimage.color import rgb2gray
from skimage.transform import resize

import tensorflow as tf

from tensorflow.keras.datasets import mnist
from tensorflow.keras.layers import Dense, Dropout, Flatten, Activation, Conv2D, MaxPooling2D, GlobalAveragePooling2D
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from tensorflow.keras import utils
from tensorflow.keras.applications.inception_v3 import InceptionV3

# se imports
import time
from sklearn.dummy import DummyClassifier

In [None]:
# Use a larger default font size for all figure text in this notebook.
plt.rcParams.update({'font.size': 16})

## Exercise 2. Convolutional networks for MNIST

Sorry to continue with MNIST so long. It's just _THE_ classic data set for this stuff.


In [None]:
# define a simple CNN model
def build_mnist_CNN(num_classes=10, input_shape=(28, 28, 1)):
    """Build and compile a small convolutional classifier.

    The network is: one 5x5 convolution with 32 filters (ReLU), 2x2 max
    pooling, dropout (rate 0.2), a 128-unit ReLU dense layer, and a softmax
    output layer.

    Parameters
    ----------
    num_classes : int, default 10
        Number of output units in the softmax layer. It is an explicit
        argument so the function does not depend on a global variable that
        is only created in a later cell.
    input_shape : tuple of int, default (28, 28, 1)
        Shape of a single input image, passed to the first Conv2D layer.

    Returns
    -------
    Sequential
        The model, compiled with categorical cross-entropy loss, the Adam
        optimizer and accuracy as the tracked metric.
    """
    mnist_model = Sequential([
        Conv2D(32, (5, 5), input_shape=input_shape, activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

    # Compile model
    mnist_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return mnist_model

> Run with 10,000 training samples

In [None]:
# Load the MNIST train and test splits.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Add a trailing channel axis so each array is (samples, 28, 28, 1), cast to
# float32, and rescale the inputs from 0-255 to 0-1.
X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255
X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255

# One-hot encode the labels; the encoded width gives the number of classes.
y_train, y_test = utils.to_categorical(y_train), utils.to_categorical(y_test)
num_classes = y_test.shape[1]

# Keep only the first `subset_size` training examples for speed.
subset_size = 10000
X_train, y_train = X_train[:subset_size], y_train[:subset_size]

mnist_model = build_mnist_CNN()

# Train, then score on the test set. The timer is stopped only after the
# evaluation (and the error printout), so it covers both steps.
start_time = time.time()
mnist_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=256)
scores = mnist_model.evaluate(X_test, y_test, verbose=0)
print("CNN Error: %.2f%%" % (100-scores[1]*100))
elapsed_time = time.time()-start_time
print("---Running Time: %s seconds ---" % elapsed_time)

mnist_model.summary()

> Run with full 60,000 samples:

In [None]:
# Load the MNIST train and test splits.
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Add a trailing channel axis so each array is (samples, 28, 28, 1), cast to
# float32, and rescale the inputs from 0-255 to 0-1.
X_train = X_train.reshape(-1, 28, 28, 1).astype('float32') / 255
X_test = X_test.reshape(-1, 28, 28, 1).astype('float32') / 255

# One-hot encode the labels; the encoded width gives the number of classes.
y_train, y_test = utils.to_categorical(y_train), utils.to_categorical(y_test)
num_classes = y_test.shape[1]

# No subsetting in this run: the model is trained on every training example
# returned by mnist.load_data().

mnist_model = build_mnist_CNN()

# Train, then score on the test set. The timer is stopped only after the
# evaluation (and the error printout), so it covers both steps.
start_time = time.time()
mnist_model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, batch_size=256)
scores = mnist_model.evaluate(X_test, y_test, verbose=0)
print("CNN Error: %.2f%%" % (100-scores[1]*100))
elapsed_time = time.time()-start_time
print("---Running Time: %s seconds ---" % elapsed_time)

mnist_model.summary()

# Exercise 3: Transfer learning

In [None]:
# Load the breed labels and keep only the first 2000 rows.
# `.copy()` makes the subset an independent DataFrame, so adding a column to it
# below is an unambiguous write (no SettingWithCopyWarning from assigning to a slice).
data = pd.read_csv('../input/dog-breed-identification/labels.csv')
data = data.iloc[:2000].copy()

# Derive each image's file path from its id: <train dir>/<id>.jpg.
# Mapping over the single `id` column avoids a row-wise DataFrame.apply.
train_dir = "../input/dog-breed-identification/train"
data['image_path'] = data['id'].map(lambda image_id: os.path.join(train_dir, image_id + ".jpg"))
data.head()

In [None]:
# The breed column is the classification target; count its distinct values.
target_labels = data['breed']
unique_breeds = set(target_labels)
total_classes = len(unique_breeds)
print("number of dog breeds:", total_classes)

In [None]:
# Load every listed image at a 256x256 target size and stack the results
# into a single array.
image_paths = data['image_path'].tolist()
images = np.array([
    img_to_array(load_img(path, target_size=(256, 256)))
    for path in image_paths
])

In [None]:
images.shape

In [None]:
# Cast to float32 and divide by 255.
# NOTE: this cell overwrites `images` with a rescaled copy of itself, so running it
# twice without re-loading the images first divides by 255 a second time.
images = images.astype(np.float32) / 255.0

In [None]:
# Show the first image with its breed as the title and no axis ticks.
first_image = images[0]
first_breed = target_labels[0]

plt.imshow(first_image)
plt.grid(True)
plt.xticks([])
plt.yticks([])
plt.title("Breed = " + first_breed);

In [None]:
# Split the images and labels into training and validation parts, stratifying
# on the breed labels and fixing the random seed so the split is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    images,
    target_labels,
    stratify=np.array(target_labels),
    random_state=42,
)

# Report the shape of each image array.
for split in (X_train, X_valid):
    print(split.shape)

#### 3(a)
rubric={reasoning:10}

Before we start, do some EDA to assess whether there is serious class imbalance in the training data. What training accuracy would you get with `DummyClassifier`? Briefly discuss your results.

In [None]:
y_train.shape

In [None]:
y_train.value_counts()

In [None]:
# Histogram of class sizes: the x-axis is the number of training images a breed
# has, the y-axis is how many breeds have that many images.
# Labels and a title let the figure stand alone; the trailing `;` hides the repr.
ax = y_train.value_counts().hist()
ax.set(xlabel="Training images per breed",
       ylabel="Number of breeds",
       title="Class sizes in the training set");

> There is some class imbalance. The histogram shows the distribution of class sizes. The distribution is very roughly Gaussian. For some breeds we have many observations (20), and for some breeds we only have a few.
>
> Using the dummy classifier with the `most_frequent` strategy, we would expect a very high error rate. The code below demonstrates an error rate of ~ 98.6%. This makes sense since the model is just predicting the most common occurrence every time.
>
> It looks like `bernese_mountain_dog` is the most common class with 20 occurrences. Based on this, we would expect the model to only have an accuracy of 20/1500.

In [None]:
# Expected training error of always predicting the most frequent breed:
# 20 is the count of the most common breed and 1500 the number of training
# examples (both figures taken from the EDA discussion above).
1 - 20/1500

In [None]:
# Baseline that always predicts the most frequent training label.
dummy = DummyClassifier(strategy="most_frequent")
dummy.fit(X_train, y_train)

# Error is reported as 1 - accuracy on each split.
train_error = 1 - dummy.score(X_train, y_train)
valid_error = 1 - dummy.score(X_valid, y_valid)
print(f"Train error: {train_error}")
print(f"Valid error: {valid_error}")

#### 3(b)
rubric={reasoning:5}

How many training examples do we have per breed of dog, roughly? In the context of other classification tasks we've done in MDS, do you consider this to be a lot or a little?

> The histogram above demonstrates how many training samples we have roughly per breed of dog. Most breeds only have 8 to 14 training samples. This does not seem like very many, and is much less than what we are used to for most MDS problems we have tackled to date.

In [None]:
# OHE
#
# Encoding each split with its own pd.get_dummies call only produces matching
# columns when every breed occurs in both splits. Encoding both splits against
# one shared, sorted list of breeds guarantees that column k refers to the same
# breed in Y_train and Y_valid, and that both arrays have one column per breed
# even if a breed is absent from one of the splits.
breed_categories = sorted(set(y_train) | set(y_valid))

Y_train = pd.get_dummies(pd.Categorical(y_train, categories=breed_categories)).values
Y_valid = pd.get_dummies(pd.Categorical(y_valid, categories=breed_categories)).values

print(Y_train.shape)
print(Y_valid.shape)

### Approach 1

Now, we try Approach 1, which is training an end-to-end CNN on the dog breed classification task.

In [None]:
# End-to-end CNN: three convolution -> ReLU -> max-pool stages, followed by a
# small dense classifier with dropout and a softmax over the dog breeds.
# Activations are added as separate Activation layers, which is just a
# different syntax for specifying the activation function.
model = Sequential([
    # stage 1
    Conv2D(32, (3, 3), input_shape=(256, 256, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    # stage 2
    Conv2D(32, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    # stage 3
    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),
    # classifier head
    Flatten(),
    Dense(64),
    Activation('relu'),
    Dropout(0.5),
    Dense(total_classes),
    Activation('softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

history = model.fit(X_train, Y_train, epochs=10, validation_data=(X_valid, Y_valid))

# FYI: saving the weights during or after training is often a good idea,
# although it is not required here, e.g.:
# model.save_weights('my_conv_net.h5')

In [None]:
# Score the from-scratch CNN on the validation set: the loss plus the
# accuracy metric configured in model.compile above.
model.evaluate(X_valid, Y_valid, verbose=0)

#### 3(c)
rubric={reasoning:1}

What do you think of the results? Are you impressed? 

> I am not very impressed with the results. The validation accuracy of ~ 0.02 is not very good. My assumption is that our small number of training examples for each class is hurting our results.

### Approach 2

Here we load a pre-trained model and add some layers on top. The syntax is not what you're used to - that's OK, don't worry about it.

In [None]:
# Get the InceptionV3 model trained on the ImageNet data set
# (include_top=False drops its original classification layers).
base_inception = InceptionV3(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

# New classification head stacked on the pre-trained base.
top_block = base_inception.output
top_block = GlobalAveragePooling2D()(top_block) # pool over height/width to reduce number of parameters
top_block = Dense(256, activation='relu')(top_block) # add a Dense layer
predictions = Dense(total_classes, activation='softmax')(top_block) # one output unit per breed

model_transfer = Model(inputs=base_inception.input, outputs=predictions)

# Freeze every layer of the pre-trained base so that only the new head is trained.
for layer in base_inception.layers:
    layer.trainable = False

# `learning_rate` is the supported argument name; the older `lr` alias is
# deprecated in tf.keras and rejected by newer Keras releases.
model_transfer.compile(optimizer=Adam(learning_rate=.001), loss='categorical_crossentropy', metrics=['accuracy'])
model_transfer.summary() # run me if you dare
history = model_transfer.fit(X_train, Y_train, validation_data=(X_valid, Y_valid), epochs=10)

In [None]:
# Score the transfer-learning model (frozen base, trained head) on the
# validation set: the loss plus the accuracy metric configured in compile.
model_transfer.evaluate(X_valid, Y_valid, verbose=0)

#### 3(d)
rubric={reasoning:1}

How does this result compare to the "from scratch" CNN?

> The above code made some major improvements compared to the "from scratch" CNN. The validation accuracy has increased to ~ 75%.

### Approach 3

Below, we un-freeze the last "15" layers, which is really only the last one or two layers, since the list of Keras layer objects doesn't really correspond to our idea of a layer (see `model.summary()`).

In [None]:
# Un-freeze exactly the last `n_unfrozen` entries of model_transfer.layers.
# Slicing from the end makes the count explicit; an enumerate loop that breaks
# on `i > 15` only after setting the flag would un-freeze 17 entries, not 15.
n_unfrozen = 15
for layer in model_transfer.layers[-n_unfrozen:]:
    layer.trainable = True

# Re-compile so the changed `trainable` flags take effect, using an SGD/momentum
# optimizer with a very slow learning rate.
# `learning_rate` is the supported argument name; the older `lr` alias is deprecated.
model_transfer.compile(loss='categorical_crossentropy',
              optimizer=SGD(learning_rate=1e-4, momentum=0.9),
              metrics=['accuracy'])

# fine-tune the unfrozen layers (training continues from the current weights)
history = model_transfer.fit(X_train, Y_train, validation_data=(X_valid, Y_valid), epochs=10)

In [None]:
# Score the fine-tuned model on the validation set: the loss plus the
# accuracy metric configured in compile.
model_transfer.evaluate(X_valid, Y_valid, verbose=0)

#### (optional) 3(e)
rubric={reasoning:1}

Un-freezing some of the layers seems to have a small effect here. Was it actually useful at all, or could we have achieved the same results by just training our top layers for more epochs?

> Unfreezing additional layers does not seem useful in this case. The bump in validation accuracy was very small. Assuming that the original layers were already well trained on their original training data, it is probably best not to re-train them, and to just run more epochs on the top layers.

#### 3(f)
rubric={reasoning:5}

In Lab 3 we noticed that unlike scikit-learn's `fit`, Keras's `fit` doesn't re-initialize the weights, but rather continues on from where you were. In the above code, we benefitted from this. Briefly describe how/why this behaviour was useful to us.

> This is useful because it means that hopefully we are starting with weights that have already been optimized. Best case, the weights were optimized by someone who knows what they are doing, and had access to a really powerful computer. This saves us a lot of compute time and power if we are starting with meaningful weights instead of randomly initializing them. It should take fewer epochs and iterations to converge to a minimum.

#### 3(g)
rubric={reasoning:10}

Brainstorm 3 other applications of this type of transfer learning, where you use a pre-trained network plus some modifications. In each case, what is the original task and what is the new task? (It's OK if you don't actually have access to a pre-trained network to do the original task; we're just brainstorming here.)

> **(1) Self Driving cars and existing computer vision**
> - There are many existing models for image classification
> - Training self-driving cars could be a time-consuming endeavor, because you would need to rack up many millions of miles of real people driving to get good training data
> - Self driving car models could "shortcut" some of this by using transfer learning from existing image classifications. These existing image classifications may already be able to identify useful things (such as stop signs), and also may be able to extract useful features that are relevant to this task (where a road ends, where water is, etc.)
>
> **(2) Sentiment Classification**
> - Imagine that 20 years from now, everyday language will probably have changed a little bit (new sayings, new slang terms, etc.).
> - Old sentiment classifiers may not work as well as they used to.
> - Instead of building completely new models, you could start with an existing model as a base, then continue to train it on new data.
> - This will likely reduce the number of new training samples you need because the model will already be good at identifying features that lead to positive or negative sentiment.
>
> **(3) Writing code in a new language**
> - There are many existing AI tools that can write code, or provide recommendations for code.
> - Imagine a new language is created.
> - It may require a lot of training data (that may not exist yet) to train an AI tool that can write code or provide code completion recommendations.
> - Instead of starting from scratch you could use an existing model, say for Python. It will likely be good at identifying structures that are common across many languages, such as for loops.