### mount google drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


### copy leaf data to local runtime for faster processing

In [6]:
!cp -r /content/drive/My\ Drive/ECE542_sp20_CompetitionData ./

cp: cannot open '/content/drive/My Drive/ECE542_sp20_CompetitionData/Results-C1/Results-C1.gsheet' for reading: Operation not supported
cp: cannot open '/content/drive/My Drive/ECE542_sp20_CompetitionData/Scoreboard-C2/Results.gsheet' for reading: Operation not supported


### import libraries

In [0]:
import numpy as np
import pandas as pd
import os, random, math, glob, cv2, pickle
from datetime import datetime

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.utils import shuffle
from sklearn.model_selection import  train_test_split
from keras.utils import to_categorical
from sklearn.decomposition import PCA
from sklearn.svm import LinearSVC
from sklearn.metrics import confusion_matrix, classification_report

### define data paths

In [0]:
train_data_path = '/content/drive/My Drive/ECE542_sp20_CompetitionData/TrainData-C1/'
test_data_path = '/content/drive/My Drive/ECE542_sp20_CompetitionData/TestData/'
train_annotations_path = '/content/drive/My Drive/ECE542_sp20_CompetitionData/TrainData-C1/TrainAnnotations.csv'

### load annotations

In [0]:
df=pd.read_csv(train_annotations_path)
df.head()

# count of all class images
c=5
for i in range(c):
    print(df[df['annotation']==i].shape[0])

488
185
130
131
91


### vectorize image and generate data and label vectors

In [0]:
i=0
data_dir=train_data_path
file_nm = ''

x = []
y = []

for filenm in glob.glob(data_dir+'*.jpg'):
  
    fnm = filenm.split('/')[-1]
    vec=np.array(cv2.imread(data_dir+fnm, cv2.IMREAD_GRAYSCALE)).flatten()
    x.append(vec)
    y.append(df[df['file_name']==fnm]['annotation'].iloc[0])

### split into train and val data - stratified; equal percentage from all classes

In [0]:
x = np.array(x)
y = np.array(y)

test_per=0.2
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = test_per, shuffle = True, stratify = y)

print(x_train.shape)
print(y_train.shape)
print(x_val.shape)
print(y_val.shape)

(820, 307200)
(820,)
(205, 307200)
(205,)


### pca decomposition

In [0]:
n_comp = 800
pca = PCA(n_components=n_comp)
x_train_pca = pca.fit_transform(x_train)
x_val_pca = pca.transform(x_val)

print(x_train_pca.shape, x_val_pca.shape)

(820, 800) (205, 800)


### train model on training data

In [0]:
clf = LinearSVC(max_iter=100000)
clf.fit(x_train_pca, y_train)
print('\nAccuracy: %f'%clf.score(x_val_pca,y_val))


Accuracy: 0.741463


### confusion matrix and classification report on validation data

In [0]:
ypred = clf.predict(x_val_pca)

print(confusion_matrix(y_val,ypred))
print(classification_report(y_val,ypred))

[[65 26  6  0  1]
 [ 4 30  2  1  0]
 [ 0  2 22  1  1]
 [ 0  1  3 22  0]
 [ 0  0  1  4 13]]
              precision    recall  f1-score   support

           0       0.94      0.66      0.78        98
           1       0.51      0.81      0.62        37
           2       0.65      0.85      0.73        26
           3       0.79      0.85      0.81        26
           4       0.87      0.72      0.79        18

    accuracy                           0.74       205
   macro avg       0.75      0.78      0.75       205
weighted avg       0.80      0.74      0.75       205



### read test filenames

In [0]:
test_list = []
test_dir = test_data_path
test_file_list=[]

for img_name in os.listdir(test_dir):
    test_file_list.append(img_name)

test_file_list=sorted(test_file_list)
print(len(test_file_list))

200


### vectorize test images and pca 

In [0]:
x_test = []

for filenm in test_file_list:
    vec=np.array(cv2.imread(test_dir+filenm, cv2.IMREAD_GRAYSCALE)).flatten()
    x_test.append(vec)

x_test = np.array(x_test)
x_test_pca = pca.transform(x_test)

### generate final predictions

In [0]:
nclass=5
final_pred = clf.predict(x_test_pca)
final_pred_cat = to_categorical(final_pred, num_classes=nclass)

print(final_pred_cat.shape)

(200, 5)


### class distribution in test

In [0]:
np.bincount(final_pred)

array([101,  60,  32,   7])

### save final predictions

In [0]:
curr_dt_string = datetime.now().strftime('%d_%m_%y__%H_%M_%S')

np.savetxt('predictions_'+curr_dt_string+'.csv', final_pred_cat, delimiter=',')
np.save('final_pred_'+curr_dt_string+'.npy', final_pred)

### save classifier as pickle model

In [0]:
with open('clf_'+curr_dt_string+'.pkl', 'wb') as f:
    pickle.dump(clf, f)

### load classifier from pickle file

In [0]:
with open('clf_'+curr_dt_string+'.pkl', 'rb') as f:
    clf1 = pickle.load(f)