In [None]:
!pip install numpy pandas matplotlib tensorflow keras opencv-python graphviz scikit-learn scikit-image sqlalchemy

In [None]:
import os
import cv2
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Activation, Dropout, Flatten, Dense
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from skimage.feature import peak_local_max, corner_peaks
import sqlite3
from adspy_shared_utilities import plot_class_regions_for_classifier, plot_feature_importances, plot_decision_tree

%matplotlib notebook

In [None]:
# Same backend magic that already ends the imports cell; this cell re-issues it on its own.
%matplotlib notebook

### Traditional programming vs. Machine Learning (ML)
<img src="imgs/tradit_vs_ML.png" width="640" height="320">

### AI / ML / DL
<img src="imgs/AI_ML_DL.jpg" width="640" height="320">

### Features vs. Labels / Targets
<img src="imgs/feat_labels.jpg" width="500" height="250">

### ML vs. DL
<img src="imgs/ML_DL.jpg" width="600" height="300">

### Supervised vs. Unsupervised learning
<img src="imgs/super_unsuper.jpg" width="600" height="300">

### Classification vs. Regression
<img src="imgs/class_regr.png" width="600" height="300">

In [None]:
# Peek at one sample: load the first .npy file listed in the 'usable' folder and show its shape.
path = 'AAC_dataset/usable/'
files = list(filter(lambda name: name.endswith('.npy'), os.listdir(path)))
image = np.load(path + files[0])
image.shape

In [None]:
# Load every .npy image of every class folder, resize it to a common size and collect
# the images (X), integer class ids (y) and the timestamps parsed from the file names.
new_height = 300
new_width = 300
data_path = 'AAC_dataset/'
class_mappings = {}  # class id -> class folder name

# Only real directories are class folders. A stray file next to them would otherwise be
# enumerated as a class and make the os.listdir call inside the loop fail.
subfolders = [folder for folder in os.listdir(data_path)
              if not folder.startswith('.') and os.path.isdir(os.path.join(data_path, folder))]
X, y, timestamps = [], [], []
for i, subfolder in enumerate(subfolders):
    class_mappings[i] = subfolder
    class_dir = os.path.join(data_path, subfolder)
    files = [file for file in os.listdir(class_dir) if file.endswith('.npy')]
    for file in files:
        image = np.load(os.path.join(class_dir, file))
        # cv2.resize expects the target size as (width, height).
        image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_AREA)
        X.append(image)
        y.append(i)
        # The timestamp is the file name without extension, minus its first 4 characters.
        parsed_ts = pd.to_datetime(file.split('.')[0][4:], format='%Y-%m-%d_%H-%M-%S')
        timestamps.append(parsed_ts)

X = np.array(X)
y = np.array(y)

In [None]:
# Bar chart of how many images each class contains.
class_ids, counts = np.unique(y, return_counts=True)
class_labels = [class_mappings[cid] for cid in class_ids]
fig, ax = plt.subplots(figsize=(5, 3))
ax.bar(class_labels, counts)
ax.set_ylabel('Number of images')
plt.show()

In [None]:
# visualize classes - raw images
# One row per class: a randomly picked image, with each of its 4 last-axis slices in its own column.
fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(9.5, 6))

for class_id, class_name in class_mappings.items():
    rnd_idx = np.random.choice(np.flatnonzero(y == class_id))
    for j, ax in enumerate(axs[class_id, :4]):
        im = ax.imshow(X[rnd_idx, :, :, j], cmap='gray')
        fig.colorbar(im, ax=ax)
        ax.set_title('_'.join([class_name, str(rnd_idx), str(j)]))
        ax.axis('off')
plt.tight_layout()

In [None]:
# visualize classes - log images
# Same grid as the raw view (one random image per class, 4 slices), shown as log(1 + value).
fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(9.5, 6))

for class_id, class_name in class_mappings.items():
    candidates = np.where(y == class_id)[0]
    rnd_idx = np.random.choice(candidates)
    row_axes = axs[class_id]

    for j in range(4):
        panel = row_axes[j]
        log_slice = np.log(1 + X[rnd_idx, :, :, j])
        im = panel.imshow(log_slice, cmap='gray')
        fig.colorbar(im, ax=panel)
        panel.set_title(f'{class_name}_{rnd_idx}_{j}')
        panel.axis('off')
plt.tight_layout()

In [None]:
# visualize classes - quantile images
# Same grid again, with the grey scale limited to the 50th..99th percentile of each slice.
fig, axs = plt.subplots(nrows=3, ncols=4, figsize=(9.5, 6))

for class_id, class_name in class_mappings.items():
    class_subset = np.where(y == class_id)[0]
    rnd_idx = np.random.choice(class_subset)

    for j in range(4):
        target_ax = axs[class_id, j]
        X_slice = X[rnd_idx, :, :, j]
        lower, upper = np.percentile(X_slice, [50, 99])
        im = target_ax.imshow(X_slice, cmap='gray', vmin=lower, vmax=upper)
        fig.colorbar(im, ax=target_ax)
        target_ax.set_title(class_name + '_' + str(rnd_idx) + '_' + str(j))
        target_ax.axis('off')
plt.tight_layout()

### Model 1

In [None]:
# Per-image summary features: overall max/mean plus max/mean of each of the 4 last-axis slices.
feats = pd.DataFrame()
feats['timestamp'] = timestamps
feats['max_all'] = [img_set.max() for img_set in X]
feats['mean_all'] = [img_set.mean() for img_set in X]
for i in range(4):
    feats['max_' + str(i)] = [img_set[:, :, i].max() for img_set in X]
    feats['mean_' + str(i)] = [img_set[:, :, i].mean() for img_set in X]

# One 0/1 indicator column per class, named after the folder that class id was assigned to.
# Class ids follow the os.listdir order of the class folders, so hard-coding which id means
# which class could silently mislabel these columns.
y_onehot = to_categorical(y)
for class_id, class_name in class_mappings.items():
    feats['y_' + class_name] = y_onehot[:, class_id].astype(int)
feats

In [None]:
# Pairwise correlations of the numeric columns. The datetime 'timestamp' column is removed
# explicitly because pandas >= 2.0 no longer drops non-numeric columns silently in corr().
feats.drop(columns=['timestamp']).corr()

In [None]:
# Contrast-stretch each of the 4 slices of every image: clamp values to the slice's
# [50th, 99th] percentile range, then rescale that range to [0, 1].
X_transf = []
for img_set in X:
    X_transf_1 = img_set.astype(float)  # astype returns a copy, so X itself is not modified
    for j in range(4):
        X_slice = X_transf_1[:, :, j]  # a view: the in-place operations below write into X_transf_1
        p1, p2 = np.percentile(X_slice, [50, 99])
        X_slice[X_slice <= p1] = p1
        X_slice[X_slice >= p2] = p2
        X_slice -= X_slice.min()
        peak = X_slice.max()
        # A slice whose 50th and 99th percentiles coincide is all zeros at this point;
        # dividing by its maximum would be 0/0 and fill the slice with NaN.
        if peak > 0:
            X_slice /= peak
    X_transf.append(X_transf_1)
X_transf = np.array(X_transf)

In [None]:
# Try the peak detector on a single normalized slice and overlay the detected points in red.
test_img = X_transf[130, :, :, 3]
# Commented-out alternative: peak_local_max(test_img, min_distance=10, threshold_abs=0.3)
coordinates = corner_peaks(test_img, min_distance=5, threshold_abs=0.3)
print(len(coordinates))

# coordinates holds (row, col) pairs, so columns go on the x axis and rows on the y axis.
peak_rows, peak_cols = coordinates[:, 0], coordinates[:, 1]
fig, ax = plt.subplots()
ax.imshow(test_img, cmap='gray')
ax.plot(peak_cols, peak_rows, 'r.', markersize=2)
ax.axis('off')
plt.show()

In [None]:
# Feature: number of detected peaks, summed over the 4 slices of each normalized image.
nb_peaks = [
    sum(len(corner_peaks(img_set[:, :, j], min_distance=5, threshold_abs=0.3)) for j in range(4))
    for img_set in X_transf
]

feats['nb_peaks'] = nb_peaks
feats

In [None]:
# Correlations again, now including 'nb_peaks'. The datetime 'timestamp' column is removed
# explicitly because pandas >= 2.0 no longer drops non-numeric columns silently in corr().
feats.drop(columns=['timestamp']).corr()

In [None]:
### Task
# NOTE(review): the next cell selects a 'black_pixs' column from `feats`, but no earlier cell
# creates it — presumably it is meant to be added to `feats` here; TODO confirm and implement.

In [None]:
# Keep the timestamp plus the modelling features, attach the class ids and order rows by time.
# NOTE(review): 'black_pixs' is not created by any earlier cell, so this selection raises
# KeyError on a fresh run until that feature has been added to `feats`.
selected_cols = ['timestamp', 'max_all', 'mean_all', 'nb_peaks', 'black_pixs']
feats_final = feats.loc[:, selected_cols].copy()
feats_final['y'] = y
feats_final = feats_final.sort_values('timestamp')
feats_final

In [None]:
# train/test split
# Rows are already sorted by timestamp: the first 180 rows train, the remaining rows test.
# Columns 1..4 are the features, column 5 is the label.
n_train = 180
train_rows = feats_final.iloc[:n_train]
test_rows = feats_final.iloc[n_train:]
X_train, y_train = train_rows.iloc[:, 1:5].copy(), train_rows.iloc[:, 5].copy()
X_test, y_test = test_rows.iloc[:, 1:5].copy(), test_rows.iloc[:, 5].copy()

In [None]:
# normalize features
# Min-max scaling: offset and range come from the training set only and are reused for the test set.
for col in X_train.columns:
    train_col = X_train[col]
    intercept = train_col.min()
    shifted = train_col - intercept
    scale = shifted.max()
    X_train[col] = shifted / scale
    X_test[col] = (X_test[col] - intercept) / scale

#### Logistic regression

In [None]:
# Logistic regression on all features; prints the train accuracy, then the test accuracy.
lr = LogisticRegression(C=10)
lr.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(lr.score(features, labels))

In [None]:
# Refit logistic regression on two features only (column positions 2 and 3);
# the next cell plots the class regions of this two-feature model.
feat_subset = [2, 3]
feature_names = X_train.columns[feat_subset].tolist()
X_subset_tr = X_train.values[:, feat_subset].copy()
X_subset_te = X_test.values[:, feat_subset].copy()

lr = LogisticRegression(C=10)
lr.fit(X_subset_tr, y_train)
train_acc = lr.score(X_subset_tr, y_train)
test_acc = lr.score(X_subset_te, y_test)
print(train_acc)
print(test_acc)

In [None]:
# Plot the class regions of the two-feature logistic regression, passing both the training
# and the test points to the helper imported from adspy_shared_utilities.
plot_class_regions_for_classifier(lr, 
                                  X_subset_tr, 
                                  y_train.values, 
                                  X_test=X_subset_te, 
                                  y_test=y_test.values, 
                                  title='Logistic regression',
                                  target_names=list(class_mappings.values()),
                                  feature_names=feature_names)

#### K nearest neighbours

In [None]:
# 1-nearest-neighbour baseline on all features; prints the train accuracy, then the test accuracy.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(knn.score(features, labels))

In [None]:
# Sweep the number of neighbours from 1 to 10 and print the train/test accuracy for each value.
for k in range(1, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print(f'======== {k} ========')
    print(knn.score(X_train, y_train))
    print(knn.score(X_test, y_test))

In [None]:
# Refit a 2-nearest-neighbour model on two features only (column positions 0 and 3);
# the next cell plots the class regions of this two-feature model.
feat_subset = [0, 3]
feature_names = X_train.columns[feat_subset].tolist()
X_subset_tr = X_train.values[:, feat_subset].copy()
X_subset_te = X_test.values[:, feat_subset].copy()

knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_subset_tr, y_train)
train_acc = knn.score(X_subset_tr, y_train)
test_acc = knn.score(X_subset_te, y_test)
print(train_acc)
print(test_acc)

In [None]:
# Plot the class regions of the two-feature k-nearest-neighbours model, passing both the
# training and the test points to the helper imported from adspy_shared_utilities.
plot_class_regions_for_classifier(knn, 
                                  X_subset_tr, 
                                  y_train.values, 
                                  X_test=X_subset_te, 
                                  y_test=y_test.values, 
                                  title='K nearest neighbours',
                                  target_names=list(class_mappings.values()),
                                  feature_names=feature_names)

#### Trees

In [None]:
# Decision tree limited to depth 5 on all features; prints train accuracy, then test accuracy.
# (The variable keeps the name dt4 because later cells refer to it.)
dt4 = DecisionTreeClassifier(max_depth=5)
dt4.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(dt4.score(features, labels))

In [None]:
# Visualise the fitted tree via the adspy_shared_utilities helper, passing the training
# column names and the class folder names.
plot_decision_tree(dt4, X_train.columns, list(class_mappings.values()))

In [None]:
# Plot the feature importances of the fitted tree via the adspy_shared_utilities helper.
plot_feature_importances(dt4, X_train.columns)

In [None]:
# Refit a depth-5 decision tree on two features only (column positions 0 and 3);
# the next cell plots the class regions of this two-feature model.
feat_subset = [0, 3]
feature_names = X_train.columns[feat_subset].tolist()
X_subset_tr = X_train.values[:, feat_subset].copy()
X_subset_te = X_test.values[:, feat_subset].copy()

dt = DecisionTreeClassifier(max_depth=5)
dt.fit(X_subset_tr, y_train)
train_acc = dt.score(X_subset_tr, y_train)
test_acc = dt.score(X_subset_te, y_test)
print(train_acc)
print(test_acc)

In [None]:
# Plot the class regions of the two-feature decision tree, passing both the training and the
# test points to the helper imported from adspy_shared_utilities.
# The title names the decision tree: this plot shows `dt`, not the k-nearest-neighbours model.
plot_class_regions_for_classifier(dt,
                                  X_subset_tr,
                                  y_train.values,
                                  X_test=X_subset_te,
                                  y_test=y_test.values,
                                  title='Decision tree',
                                  target_names=list(class_mappings.values()),
                                  feature_names=feature_names)

In [None]:
# Random forest (200 trees, depth <= 4) on all features; prints train accuracy, then test accuracy.
rf = RandomForestClassifier(n_estimators=200, max_depth=4)
rf.fit(X_train, y_train)
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(rf.score(features, labels))

### Model 3

In [None]:
# with datetime train/test separation (more fair)
# separate images for train/test, normalize images

# Timestamp of row 180 of the time-sorted feature table: images strictly older than it go to
# the training set, all others to the test set.
break_date = feats_final.iloc[180, 0]

train_idx = [k for k, ts in enumerate(timestamps) if ts < break_date]
test_idx = [k for k, ts in enumerate(timestamps) if not ts < break_date]

X_train2 = np.array([X_transf[k] for k in train_idx])
X_test2 = np.array([X_transf[k] for k in test_idx])
y_train2 = np.array([y_onehot[k] for k in train_idx])
y_test2 = np.array([y_onehot[k] for k in test_idx])

In [None]:
# Show one slice (index 2) of training image 100 with a colour bar as a sanity check.
fig, ax = plt.subplots()
im = ax.imshow(X_train2[100, :, :, 2], cmap='gray')
fig.colorbar(im, ax=ax)
plt.show()

In [None]:
def create_model_arch(drop_rate=0):
    """Build the (uncompiled) CNN used to classify 300x300x4 images into 3 classes.

    The network is three Conv2D(3x3) -> ReLU -> 2x2 max-pooling stages with 32, 32 and 64
    filters, followed by Flatten, Dense(32) + ReLU, Dropout and a 3-unit softmax layer.

    Args:
        drop_rate: Dropout rate applied after the hidden dense layer. The default 0
            disables dropout.

    Returns:
        The assembled ``Sequential`` model; the caller is expected to compile it.
    """
    model = Sequential()
    model.add(Conv2D(32, (3, 3), input_shape=(300, 300, 4)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(32, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Conv2D(64, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))

    model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
    model.add(Dense(32))
    model.add(Activation('relu'))
    # Honour the requested rate instead of a hard-coded 0.0, which ignored the parameter.
    model.add(Dropout(drop_rate))
    model.add(Dense(3))
    model.add(Activation('softmax'))

    return model

In [None]:
# Instantiate the CNN with a dropout rate of 0.0 and print its layer-by-layer summary.
model3 = create_model_arch(drop_rate=0.0)
model3.summary()

In [None]:
# Training configuration, model compilation and the data generators.
batch_size = 16
# Named `learning_rate` rather than `lr`: `lr` already holds the fitted LogisticRegression
# model from the "Logistic regression" section, and rebinding it to a float would break
# re-running those cells.
learning_rate = 1e-3

model3.compile(loss='categorical_crossentropy',
               optimizer=tf.keras.optimizers.Adam(learning_rate),
               metrics=['accuracy'])

# Only the training images are augmented (random horizontal flips, rotations up to 45 degrees).
train_datagen = ImageDataGenerator(horizontal_flip=True,
                                   rotation_range=45)

# Test images are fed through a generator without any augmentation.
test_datagen = ImageDataGenerator()
train_generator = train_datagen.flow(
        X_train2,
        y=y_train2,
        batch_size=batch_size)

validation_generator = test_datagen.flow(
        X_test2,
        y=y_test2,
        batch_size=batch_size)

In [None]:
# Save the model to 'im_quality_pred.h5' whenever the validation loss reaches a new minimum.
mcp_save = ModelCheckpoint('im_quality_pred.h5', save_best_only=True, monitor='val_loss', mode='min')

# validation_steps is deliberately not set, so each epoch validates on every batch of the
# validation generator. A floor division such as len(X_test2) // batch_size would skip the
# final partial batch and evaluates to 0 steps when there are fewer test images than batch_size.
history = model3.fit(train_generator,
                     steps_per_epoch=len(X_train2) // batch_size,
                     epochs=50,
                     validation_data=validation_generator,
                     callbacks=[mcp_save])

In [None]:
def plot_results(history):
    """Plot the training curves stored in a Keras ``History`` object.

    Draws two side-by-side panels, loss on the left and accuracy on the right. Each panel
    shows the training curve and the corresponding ``val_`` curve per epoch.

    Args:
        history: Object whose ``history`` dict holds the 'loss', 'val_loss', 'accuracy'
            and 'val_accuracy' series.
    """
    # (metric key, panel title, legend position) for each panel, left to right.
    panels = (
        ('loss', 'Model loss', 'upper right'),
        ('accuracy', 'Model accuracy', 'upper left'),
    )
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(9.5, 4))

    for ax, (metric, title, legend_loc) in zip(axs, panels):
        ax.plot(history.history[metric])
        ax.plot(history.history['val_' + metric])
        ax.set_title(title)
        ax.set_ylabel(metric)
        ax.set_xlabel('epoch')
        ax.legend(['train', 'test'], loc=legend_loc)

    plt.tight_layout()
    plt.show()

plot_results(history)

### Save to a database 

In [None]:
# Open the SQLite database file and read the whole 'aac' table into a DataFrame.
db_file = "ae_test.db"
con = sqlite3.connect(db_file)
aac_query = 'select * from aac'
aac = pd.read_sql(aac_query, con)
aac

In [None]:
# Parse the table's timestamps and stack the normalized train and test features into one frame.
aac = aac.assign(timestamp=pd.to_datetime(aac['timestamp']))
X_norm = pd.concat((X_train, X_test), axis=0)
X_norm

In [None]:
# Predict with the decision tree on exactly the columns it was trained on. Passing the whole
# X_norm would fail on a re-run, once 'y_pred' (and later 'timestamp') have been added to it.
y_pred = dt4.predict(X_norm[X_train.columns])
X_norm['y_pred'] = y_pred
X_norm

In [None]:
# Attach each sample's timestamp by row index. A 'timestamp' column left over from a previous
# run of this cell is dropped first, so re-running does not create timestamp_x / timestamp_y.
X_norm = pd.merge(X_norm.drop(columns=['timestamp'], errors='ignore'),
                  feats['timestamp'],
                  left_index=True, right_index=True, how='inner')
X_norm

In [None]:
# Right join on timestamp: every row of the 'aac' table is kept and receives its prediction
# where a matching timestamp exists.
predictions = X_norm[['timestamp', 'y_pred']]
final_df = predictions.merge(aac, on='timestamp', how='right')
final_df

In [None]:
# Replace the stored 'class' column with the predicted one, keep the three output columns and
# store timestamps as text.
final_df = final_df.drop(columns=['class'])
final_df = final_df.rename(columns={'y_pred': 'class'})[['timestamp', 'image_name', 'class']]
final_df['timestamp'] = final_df['timestamp'].astype(str)
# if_exists='replace' overwrites the table, so a second run of the notebook does not fail
# on an already existing 'aac2' table.
final_df.to_sql('aac2', con, index=False, if_exists='replace')