# Setup

In [None]:
#***Importing libraries***

#Math tools
import numpy as np

#Data science tools
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

#OS tools
import os

#Image processing tools
import cv2 

#Plotting tools
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Random seeds
# Seeds NumPy's global RNG so that the np.random.permutation calls used later
# (class/figure selection and dataset shuffling) are repeatable when the
# notebook is run top to bottom from a fresh kernel.
np.random.seed(1)

# Custom Methods

In [None]:
### Error definition ###
def error_fn(y_true, y_pred):
    """Count the misclassified samples.

    Parameters
    ----------
    y_true : sequence
        Ground-truth labels.
    y_pred : sequence
        Predicted labels, aligned element by element with ``y_true``.

    Returns
    -------
    int
        Number of positions at which ``y_true`` and ``y_pred`` differ.

    Raises
    ------
    ValueError
        If the two sequences do not have the same length. Without this check
        surplus predictions would be ignored silently.
    """
    if len(y_true) != len(y_pred):
        raise ValueError(
            'y_true and y_pred must have the same length (got %d and %d)'
            % (len(y_true), len(y_pred))
        )
    # One unit per mismatching pair; works for lists and 1-D numpy arrays alike
    return sum(1 for truth, guess in zip(y_true, y_pred) if truth != guess)

# Reading the images

In [None]:
#Number of classes and images per class to use in this run
n_classes, n_figs = 10, 700

#Total number of classes and images per class available in the dataset
N_classes, N_figs = 45, 700

#Random subset of classes and figures to analyze. Both draws come from NumPy's
#global RNG, so re-executing this cell without resetting the random seed
#produces a different selection.
class_idx = np.random.permutation(N_classes)[:n_classes]
fig_idx = np.random.permutation(N_figs)[:n_figs]

In [None]:
#Specifying the path
PROJECT_ROOT_DIR = '..'
DATASET_FOLDER = 'training_data'
DATASET_PATH = os.path.join(PROJECT_ROOT_DIR, DATASET_FOLDER)


#Reading the folders tree
classes_all = sorted(os.listdir(DATASET_PATH))
if 'summary' in classes_all:
    #Removes the summary file; guarded so that a dataset without it still loads
    classes_all.remove('summary')
N_classes = len(classes_all)

#Validating the request before any image is read. A ValueError is raised
#instead of SystemExit(0), whose exit status 0 would signal success.
if not (0 < n_classes <= N_classes):
    raise ValueError('Wrong number of classes requested')
if not (0 < n_figs <= N_figs):
    raise ValueError('Wrong number of figures requested')
#class_idx was drawn before the folder tree was read, so it may point past
#the classes that actually exist on disk
if class_idx.max() >= N_classes:
    raise ValueError('class_idx refers to class %d but only %d classes were found in %s'
                     % (class_idx.max(), N_classes, DATASET_PATH))
classes = [classes_all[idx] for idx in class_idx]

#Reading the figures
figs = []
for folder in classes:
    CLASS_PATH = os.path.join(DATASET_PATH, folder)
    figs_names_all = sorted(os.listdir(CLASS_PATH))
    if fig_idx.max() >= len(figs_names_all):
        raise ValueError('fig_idx refers to image %d but %s only holds %d files'
                         % (fig_idx.max(), CLASS_PATH, len(figs_names_all)))
    figs_names = [figs_names_all[idx] for idx in fig_idx]
    FIGS_PATHS = [os.path.join(CLASS_PATH, fig) for fig in figs_names]
    figs_i = [cv2.imread(PATH, 0) for PATH in FIGS_PATHS]  #cv2.imread with flag 0 reads the image in grayscale
    #cv2.imread reports an unreadable file by returning None instead of raising
    unreadable = [PATH for PATH, img in zip(FIGS_PATHS, figs_i) if img is None]
    if unreadable:
        raise IOError('Could not read image: ' + unreadable[0])
    figs.append(figs_i)

#Converting into a numpy array
figs = np.array(figs)
print('figs.shape')
print(figs.shape)

##***Output***
# figs: Multidimensional numpy array containing all read images. First dimension corresponds to class and second dimension to figures.
#Each figure is a multidimensional array itself of 256x256 pixels and only the greyscale channel: int(0,255)

In [None]:
#Lists each chosen class name next to the index it occupies along the first axis of figs
print('Chosen classes:')
print([(position, name) for position, name in enumerate(classes)])

In [None]:
#Sanity check: display the first image of the first chosen class
fig_test = figs[0][0]
plt.imshow(fig_test, cmap=mpl.cm.binary)
plt.axis("off");

# Preparing the data set

In [None]:
#Flattening the (class, figure) axes into a single sample axis
X = figs.reshape([n_classes*n_figs, 256, 256])  #Inputs
#Labels: every figure of class i gets label i. The class-major ordering
#matches the row order produced by the reshape above.
y = np.repeat(np.arange(n_classes), n_figs)

In [None]:
#Applying one common random permutation to images and labels so they stay aligned
idx_random = np.random.permutation(len(X))
X, y = X[idx_random], y[idx_random]

In [None]:
#Spot check after shuffling: the image, its numeric label and its class name should agree
idx = 5
X_sample = X[idx]
plt.imshow(X_sample, cmap=mpl.cm.binary)
plt.axis("off");
print(y[idx])
print(classes[y[idx]])

In [None]:
#Reshaping the dataset to a 2D array
# Each 256x256 image becomes one row of 256*256 pixel features, the
# (n_samples, n_features) layout used by train_test_split, PCA and SVC below.
X = X.reshape(-1, 256*256);
# Displayed to confirm the flattened shape
X.shape

In [None]:
#Holding out 25% of the samples for testing; random_state pins the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.75, test_size=0.25, random_state=0
)
#Sample counts, reported next to the error counts further down
N_train, N_test = len(X_train), len(X_test)

In [None]:
# Shape of the training matrix: (number of training samples, pixel features per image)
X_train.shape

# Training the SVM classifier

In [None]:
#Setting up the models
C = 1  # SVM regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.SVC(kernel='poly', degree=2, gamma='auto', C=C),
          svm.SVC(kernel='poly', degree=3, gamma='auto', C=C),
          svm.SVC(kernel='rbf', gamma=0.7, C=C))
# titles[k] describes models[k]; the evaluation cells rely on this ordering
titles = ('SVM with linear kernel',
          'SVM with polynomial (degree 2) kernel',
          'SVM with polynomial (degree 3) kernel',
          'SVM with RBF kernel')
#Training
# The fitted estimators are materialized in a tuple. A bare generator
# expression would postpone fitting until first iterated and would be
# exhausted after one pass, leaving the generalization-error loop with no
# models to evaluate (it would then report all-zero errors).
models = tuple(clf.fit(X_train, y_train) for clf in models)

## Training error

In [None]:
#Counting the misclassified training samples for every model
error = np.zeros(len(titles))
for i, clf in enumerate(models):
    y_pred = clf.predict(X_train)  #left in the namespace so a later cell can display it
    error[i] = error_fn(y_train, y_pred)

print('*** Training error ***')
print('N_train: '+str(N_train))
for title, n_wrong in zip(titles, error):
    print(title+': '+str(n_wrong))

In [None]:
# Ground-truth training labels, displayed for comparison with the predictions
y_train

In [None]:
# Training-set predictions of the last model evaluated in the loop above
# (y_pred is overwritten on every iteration)
y_pred

## Generalization error

In [None]:
### Computing the generalization error ###
#Counting the misclassified held-out samples for every model
error = np.zeros(len(titles))
for i, clf in enumerate(models):
    y_pred_test = clf.predict(X_test)  #left in the namespace so a later cell can display it
    error[i] = error_fn(y_test, y_pred_test)

print('*** Generalization error ***')
print('N_test: '+str(N_test))
for title, n_wrong in zip(titles, error):
    print(title+': '+str(n_wrong))

In [None]:
# Ground-truth test labels, displayed for comparison with the predictions
y_test

In [None]:
# Test-set predictions of the last model evaluated in the loop above
# (y_pred_test is overwritten on every iteration)
y_pred_test

> ***The model is clearly overfitted!!!!***

# Best k-rank approximation through PCA

**Key questions**
- How to efficiently apply SVD to a batch of images and make decision about the number of modes to use? 
- Do we apply SVD to each input image, each class, or to the training dataset as a whole?
- What could be the optimum number of principal components to use? Which metric do we use to compare them?
- Training the model with the reduced components turned out to be much more computationally expensive than training with the raw images. Why?

In [None]:
#Fitting a PCA that keeps every component, to inspect how variance accumulates
pca = PCA().fit(X_train)
#Note that the max number of singular values is the number of images in X_train
cumsum = pca.explained_variance_ratio_.cumsum()
#Smallest number of components whose cumulative explained variance reaches 95%
d = 1 + np.argmax(cumsum >= 0.95)

In [None]:
# Number of principal components at which the cumulative explained variance first reaches 95%
d

In [None]:
#Cumulative explained variance versus number of components, with the 95% point marked
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(cumsum, linewidth=3)
#ax.axis([0, 400, 0, 1])
ax.set_xlabel("Dimensions")
ax.set_ylabel("Explained Variance")
#Dotted guide lines and a dot at (d, 0.95)
ax.plot([d, d], [0, 0.95], "k:")
ax.plot([0, d], [0.95, 0.95], "k:")
ax.plot(d, 0.95, "ko")
#ax.annotate("Elbow", xy=(65, 0.85), xytext=(70, 0.7),
#            arrowprops=dict(arrowstyle="->"), fontsize=16)
ax.grid(True)
plt.show()

In [None]:
# A float n_components in (0, 1) asks scikit-learn to keep as many components
# as needed to explain that fraction of the variance (here 80%).
# NOTE: this rebinds `pca`, replacing the full PCA fitted earlier.
pca = PCA(n_components=0.8)
# Training images expressed in the reduced principal-component space
X_train_reduced = pca.fit_transform(X_train)
# Back-projection to pixel space, used below to inspect a reconstructed image
X_train_recovered = pca.inverse_transform(X_train_reduced)

In [None]:
# Number of components the PCA actually retained for the 0.8 variance target
pca.n_components_

In [None]:
# Total fraction of variance explained by the retained components
np.sum(pca.explained_variance_ratio_)

In [None]:
#Displaying one training image after the PCA round trip (reduce, then inverse transform)
idx = 100
X_sample = X_train_recovered[idx].reshape(256, 256)
plt.imshow(X_sample, cmap=mpl.cm.binary)
plt.axis("off");

In [None]:
### Projecting the rest of the data onto the PCs of the training data ###
#Centers the data with the mean learned from the TRAINING set (pca.mean_).
#Subtracting the test set's own mean would shift the test points relative to
#the training points, so the two would no longer share one coordinate system.
X_test_centered = X_test - pca.mean_ # Test dataset
#Extract the normal vectors of the principal components
V_r = np.transpose(pca.components_)
#Projects the data (same result as pca.transform(X_test) for a non-whitened PCA)
X_test_reduced = np.matmul(X_test_centered, V_r)
#One row per test sample, one column per retained component
assert X_test_reduced.shape == (len(X_test), pca.n_components_)

In [None]:
# Shape of the projected test set: (test samples, retained principal components)
X_test_reduced.shape

# Training the SVM classifier on the reduced space

In [None]:
#Setting up the models
C = 1  # SVM regularization parameter
models = (svm.SVC(kernel='linear', C=C),
          svm.SVC(kernel='poly', degree=2, gamma='auto', C=C),
          svm.SVC(kernel='poly', degree=3, gamma='auto', C=C),
          svm.SVC(kernel='rbf', gamma=0.7, C=C))
# titles[k] describes models[k]; the evaluation cells rely on this ordering
titles = ('SVM with linear kernel',
          'SVM with polynomial (degree 2) kernel',
          'SVM with polynomial (degree 3) kernel',
          'SVM with RBF kernel')
#Training
# Fit on the PCA-reduced training set: the evaluation cells of this section
# predict on X_train_reduced / X_test_reduced, so the models must be trained
# on the same feature space (fitting on raw X_train gives a feature-count
# mismatch at predict time).
# The fitted estimators are materialized in a tuple because a generator
# expression would be exhausted after the first evaluation loop.
models = tuple(clf.fit(X_train_reduced, y_train) for clf in models)

## Training error

In [None]:
#Counting the misclassified training samples for every model, in the reduced space
error = np.zeros(len(titles))
for i, clf in enumerate(models):
    y_pred = clf.predict(X_train_reduced)  #left in the namespace for later inspection
    error[i] = error_fn(y_train, y_pred)

print('*** Training error ***')
print('N_train: '+str(N_train))
for title, n_wrong in zip(titles, error):
    print(title+': '+str(n_wrong))

In [None]:
# Ground-truth training labels, displayed for comparison with the predictions
y_train

In [None]:
# Training-set predictions of the last model evaluated in the loop above,
# shown for comparison with the ground-truth labels y_train
y_pred

## Generalization error

In [None]:
### Computing the generalization error ###
#Counting the misclassified held-out samples for every model, in the reduced space
error = np.zeros(len(titles))
for i, clf in enumerate(models):
    y_pred_test = clf.predict(X_test_reduced)  #left in the namespace for later inspection
    error[i] = error_fn(y_test, y_pred_test)

print('*** Generalization error ***')
print('N_test: '+str(N_test))
for title, n_wrong in zip(titles, error):
    print(title+': '+str(n_wrong))

In [None]:
# Ground-truth test labels, displayed for comparison with the predictions
y_test

In [None]:
# Test-set predictions of the last model evaluated in the loop above
# (y_pred_test is overwritten on every iteration)
y_pred_test

# Reduced input space through random projection (maybe?)

**Key questions**
- What kind of information/improvements would we get this way compared to PCA?

In [None]:
<code>

# Cool ways to visualize the performance of the classifier

**Ideas**
- Manually add noise/perturbations to the images to test the classifier under extreme scenarios
- We could augment our training data set this way too! Do we need more data or are 700 images per class enough?
- Could we extend the trained model to be interactive, so that we can upload a picture and have it classified instantly?

In [None]:
<code>