<a href="https://colab.research.google.com/github/rgukhui/cm4709/blob/main/extra.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CM4709 Computer Vision
# On-campus Day Extra Lab

## Aim
1. To load images from a CSV file of engineering drawing symbols.
1. To use a CNN to classify these symbols.

## Uploading File and Mounting Google Drive

The symbol drawings are in a CSV file.
Upload it to a folder in Google Drive.
Then mount your Google Drive in the runtime.

In [52]:
# Attach Google Drive to the Colab runtime; the dataset is read from under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/gdrive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [53]:
# Standard library
import itertools
import os
from collections import Counter
from random import randint

# Array handling, data frames and plotting
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Machine learning
import tensorflow as tf
from sklearn.metrics import confusion_matrix
# Use the public Keras namespace. The original pulled these names from the
# private `tensorflow.python.keras` package, which is not a supported API and
# should not be mixed with the `tf.keras` objects used in the rest of the notebook.
from tensorflow.keras.layers import Input, Dense


## Loading the CSV File

We can read the CSV file as a dataframe using Pandas.
The format of the CSV:
1. Each row/line in the CSV file is a data instance, which contains a 100x100 bitmap and a class label.
1. Each data instance has 10000 columns, followed by a label/class column. The 10000 columns represent the 100x100 image.

In [None]:
# Location of the symbol dataset inside the mounted Google Drive.
csv_path = '/content/gdrive/MyDrive/RGU/cm4709/on-campus day/extra lab/Symbols_pixel.csv'

# NOTE(review): read_csv treats the first line of the file as a header row by
# default. If the CSV has no header line, the first symbol is consumed as
# column names -- confirm, and pass header=None if that is the case.
df = pd.read_csv(csv_path)

# Quick look at what was loaded: a preview, the column index and the dimensions.
for banner, view in (
    ('===first 5 rows===', df.head()),
    ('===columns===', df.columns),
    ('===shape===', df.shape),
):
    print(banner)
    print(view)


## Generating Header Names

Optionally, we can put in the column names. We will name the first 10000 columns "p_1" to "p_10000" and the last one "label".

In [None]:
# Total number of columns in the frame (pixel columns plus the trailing class column).
num_cols = df.shape[1]

# Names p_1 ... p_(num_cols-1) for the leading columns, then 'label' for the last one.
new_cols = [f'p_{i}' for i in range(1, num_cols)] + ['label']
print('===new column names===')
print(new_cols)

# Rename the frame's columns in place; the slice guarantees the list is never
# longer than the number of columns being renamed.
df.columns = new_cols[:num_cols]

# Preview the frame with its new column names.
print('===first 5 rows===')
print(df.head())


## Split into X and Y

We then divide the dataframe into X (input features) and Y (output label).

In [None]:
# Column count of the frame; the final column holds the class label.
n = df.shape[1]

# y: the 'label' column.
labels = df.loc[:, 'label']

# x: every column before the last one.
features = df.iloc[:, :n - 1]

for banner, part in (('===features===', features), ('===labels===', labels)):
    print(banner)
    print(part)

In [None]:
# Dimensions of the inputs and of the label vector. The pandas .shape of a
# frame/series equals the shape of its underlying array, so no copy is needed.
print('x shape: ', features.shape)
print('y shape: ', labels.shape)

## Show Random Symbols

Even though each bitmap is stored as a 1-dimensional array, we can use NumPy's `reshape` to restore its 100x100 layout and display it with Matplotlib.

In [None]:
# Draw an 8x8 grid of randomly chosen symbols as a visual sanity check.
fig, ax = plt.subplots(8, 8, figsize=(8, 8))

# Pixel data as an array: one flattened bitmap per row.
fx = features.values

for axi in ax.flat:
  # randint() includes BOTH endpoints, so the upper bound has to be the last
  # valid row index; passing the row count itself can raise IndexError.
  value = randint(0, fx.shape[0] - 1)

  # Restore the flat pixel vector to its 100x100 layout and draw it without ticks.
  axi.imshow(fx[value].reshape(100, 100), cmap='gray')
  axi.set(xticks=[], yticks=[])
plt.show()

## Show Class Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# The labels are categorical, so count them explicitly and draw one bar per
# class. A histogram with a hard-coded bin count only lines up with the classes
# when the number of classes equals that constant, and its bars are not centred
# on the class tick labels.
class_counts = labels.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(15, 7))
ax.bar(class_counts.index.astype(str), class_counts.values)
ax.set(title='Symbols Distribution', xlabel='Symbol Class', ylabel='Count')
ax.tick_params(axis='x', rotation=90)
plt.show()

In [None]:
# Number of instances of each symbol class.
label_counts = df['label'].value_counts()
label_counts

## Remove Infrequent Symbols

In [None]:
# Symbol classes that are dropped from the dataset. The original comment
# describes them as the classes with fewer than 7 instances.
# NOTE(review): the list is hard-coded -- confirm it against
# df['label'].value_counts() whenever the CSV changes.
rare_symbols = [
    'Ultrasonic Flow Meter', 'Barred Tee', 'Temporary Strainer',
    'Control Valve Angle Choke', 'Line Blindspacer',
    'Vessel', 'Valve Gate Through Conduit', 'Deluge', 'Control Valve',
]
is_rare = df['label'].isin(rare_symbols)
df = df[~is_rare]

# Report what is left after the filter.
print('shape: ', df.values.shape)
print(df['label'].value_counts())

In [None]:
# Materialise the filtered frame as a single array.
dataset = df.values

# Inputs (X) are every column but the last; the target (y) is the last column.
x, y = dataset[:, :-1], dataset[:, -1]

print('x shape: ', x.shape)
print('y shape', y.shape)

# First instance: its pixel row and its class.
print('===Sample===')
print(x[0])
print(y[0])

In [None]:
# Distinct class names remaining in the frame.
unique_symbols = df['label'].unique()
print(f'There are {len(unique_symbols)} Unique Symbol in the dataset')

## Examine Datasets

In [None]:
from sklearn.model_selection import train_test_split
# Helper that reports the shapes of all components of a train/test split.

def data_summary(x_train, y_train, x_test, y_test):
  """Print the shape of each split component, one per line.

  Every argument only needs a ``.shape`` attribute (e.g. a NumPy array).
  Nothing is returned.
  """
  named_parts = (
      ('Train images shape:', x_train),
      ('Train labels shape:', y_train),
      ('Test images shape:', x_test),
      ('Test labels shape:', y_test),
  )
  for caption, part in named_parts:
    print(caption, part.shape)

# Hold out 20% of the samples as the test set and report the resulting shapes.
# train_test_split(...) returns numpy arrays.
#
# A fixed random_state makes the split repeatable across kernel restarts;
# without it every run trains and evaluates on a different partition, so the
# reported test metrics cannot be compared between runs.
# NOTE(review): stratify=y would additionally keep every class present in both
# sets, but it raises if any class has fewer than 2 samples -- confirm the
# class counts before enabling it.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
data_summary(x_train, y_train, x_test, y_test)

## Reshape Data

We need to reshape each 1D pixel array into a 100x100x1 image (height, width, channel) for the CNN.

In [None]:
# Shapes as produced by the train/test split.
print('===BEFORE RESHAPE===')
for tag, arr in (('X_train ', x_train), ('y_train ', y_train),
                 ('X_test ', x_test), ('y_test ', y_test)):
    print(tag, arr.shape)

# Turn every row into a 100x100 single-channel image stored as float32.
# NOTE(review): the pixel values are not rescaled here; if the bitmaps are
# 0-255 rather than 0/1, scaling to [0, 1] is usually advisable -- confirm.
x_train = x_train.reshape(len(x_train), 100, 100, 1).astype('float32')
x_test = x_test.reshape(len(x_test), 100, 100, 1).astype('float32')

print('===AFTER RESHAPE===')
for tag, arr in (('x_train ', x_train), ('y_train ', y_train),
                 ('x_test ', x_test), ('y_test ', y_test)):
    print(tag, arr.shape)

# First training instance: its image array and its label.
print('==SAMPLES')
print(x_train[0])
print(y_train[0])

## Prepare Data for Model


In [None]:
# Encode target variables
# prepare target
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# Keep references to the label arrays as they are before y_train / y_test are
# rebound to their encoded form.
y_train_cat, y_test_cat = y_train, y_test

# prepare target
def prepare_targets(y_train, y_test):
    """Encode both label arrays with a LabelEncoder fitted on ``y_train``.

    The encoder is fitted on the training labels only and then applied to both
    arrays, so a given label gets the same code in each of them.
    NOTE(review): a label that occurs in ``y_test`` but not in ``y_train`` is
    unknown to the encoder; LabelEncoder.transform is expected to raise in that
    case -- confirm.

    Returns:
        A pair ``(encoded y_train, encoded y_test)``.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

# Replace the labels with their encoded form.
y_train, y_test = prepare_targets(y_train, y_test)

for caption, codes in (('y_train shape: ', y_train), ('y_test shape:', y_test)):
    print(caption, codes.shape)

# Encoded value of the first training instance.
print('===Sample===')
print(y_train[0])


## Encode Label Data

In [None]:
#from tensorflow.keras.utils import to_categorical
#from keras.utils import np_utils

from tensorflow.keras import utils

# One-hot encode the integer class codes.
#
# The number of classes is computed once and passed to both calls. When
# num_classes is omitted, to_categorical() infers the width from the largest
# code present in the array it is given, so a test split that happens to lack
# the highest-numbered class would get fewer columns than y_train, and
# num_classes (which sizes the model's output layer) would be too small.
num_classes = int(max(y_train.max(), y_test.max())) + 1
y_train = utils.to_categorical(y_train, num_classes=num_classes)
y_test = utils.to_categorical(y_test, num_classes=num_classes)

print('y_train shape: ',y_train.shape)
print('y_test shape: ',y_test.shape)
print('===Sample===')
print(y_train[0])

## Validation Dataset

In [68]:
# Shuffle the training set once. (Drawing several permutations in a loop and
# keeping only the last one is equivalent to drawing a single permutation.)
indexes = np.random.permutation(len(x_train))
x_train = x_train[indexes]
y_train = y_train[indexes]

# Keep 10% of the training data for validation
val_perc = 0.10
val_count = int(val_perc * len(x_train))

# Validation set (val_data): the first val_count shuffled samples.
val_data = x_train[:val_count]
val_labels_cat = y_train[:val_count]

# Remove the validation samples from the training set. If they stay in
# x_train the model is fitted on them too, and the validation metrics merely
# repeat the training metrics instead of measuring generalisation.
# NOTE: this rebinding is not idempotent -- re-running this cell carves a
# further 10% out of x_train; re-run from the train/test split cell instead.
x_train = x_train[val_count:]
y_train = y_train[val_count:]

## Build Model

In [None]:
def build_modelU():
  """Create and compile the CNN used to classify the symbols.

  Layer stack, in order: Conv(32) -> MaxPool -> Conv(64) x2 -> MaxPool ->
  Conv(128) x2 -> MaxPool -> Flatten -> Dense(64) -> Dense(num_classes, softmax).
  All convolutions use 3x3 kernels, ReLU and 'same' padding; all pools are 2x2.
  The first layer declares an input shape of (100, 100, 1).

  The output width is read from the notebook-level ``num_classes``, so that
  name must be defined before this function is called.

  Returns:
    The compiled ``tf.keras.Sequential`` model (adam optimizer,
    categorical_crossentropy loss, accuracy metric).
  """
  layers = tf.keras.layers
  conv_args = dict(kernel_size=(3, 3), activation='relu', padding='same')
  pool_args = dict(pool_size=(2, 2))

  model = tf.keras.Sequential([
      # Convolutional feature extractor
      layers.Conv2D(filters=32, input_shape=(100, 100, 1), **conv_args),
      layers.MaxPooling2D(**pool_args),
      layers.Conv2D(filters=64, **conv_args),
      layers.Conv2D(filters=64, **conv_args),
      layers.MaxPooling2D(**pool_args),
      layers.Conv2D(filters=128, **conv_args),
      layers.Conv2D(filters=128, **conv_args),
      layers.MaxPooling2D(**pool_args),
      layers.Flatten(),
      # Classifier head. The original also carried a disabled Dense(128) layer
      # and disabled Dropout layers as commented-out code; they are not part of the model.
      layers.Dense(64, activation='relu'),
      layers.Dense(num_classes, activation='softmax'),
  ])

  model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
  return model

# Build the network and show its layer table. summary() prints the table
# itself and returns None, so wrapping it in print() only appends a stray "None".
model = build_modelU()
model.summary()

## Train Model

In [None]:
# Number of passes over the training data.
nEpochs = 25

# Fit the model and keep the returned object; the cells further down read
# `results.history` from it. The validation pair is evaluated alongside training.
results = model.fit(
    x=x_train,
    y=y_train,
    batch_size=64,
    epochs=nEpochs,
    validation_data=(val_data, val_labels_cat),
)

## Plot Training History

In [None]:
# List which series the training run recorded in its history.
history_dict = results.history
recorded_series = history_dict.keys()
print(recorded_series)

## Evaluate Model

In [None]:
# accuracy, test
test_loss, test_accuracy = model.evaluate(x_test, y_test, batch_size=64)
print('Test loss: %.4f accuracy: %.4f' % (test_loss, test_accuracy))

In [None]:
# Recompute test accuracy by hand from the predicted class indices.
predictions = model.predict(x_test)
preds = predictions.argmax(axis=1)   # most probable class per sample
actuals = y_test.argmax(axis=1)      # position of the 1 in each one-hot row
n_correct = sum(preds == actuals)
print('Accuracy: ', n_correct / x_test.shape[0])

In [None]:
# Test-set accuracy as reported by Keras.
score = model.evaluate(x_test, y_test, verbose=0)
print('Test accuracy:', score[1])

# Class indices: predicted vs. one-hot ground truth.
predictions = model.predict(x_test)
preds = predictions.argmax(axis=1)
actuals = y_test.argmax(axis=1)
#print(confusion_matrix(preds,actuals))

# Number of test symbols the model got wrong.
n_wrong = (preds != actuals).sum()
print('Incorrectly Identified Symbols: ', n_wrong)

In [None]:
# Training-set loss and accuracy as reported by Keras.
score = model.evaluate(x_train, y_train, verbose=0)
train_loss, train_accuracy = score[0], score[1]
print('Train loss:', train_loss)
print('Train accuracy:', train_accuracy)

# Class indices: predicted vs. one-hot ground truth.
predictions = model.predict(x_train)
preds = predictions.argmax(axis=1)
actuals = y_train.argmax(axis=1)
#print(confusion_matrix(preds,actuals))

# Number of training symbols the model got wrong.
n_wrong = (preds != actuals).sum()
print('Incorrectly Identified Symbols: ', n_wrong)

In [None]:
# Plot the loss and accuracy curves for training and validation
sns.set_style('whitegrid')
fig, ax = plt.subplots(1,2, figsize=(14, 7))
ax[0].plot(results.history['loss'], color='b', label="Training loss")
ax[0].plot(results.history['val_loss'], color='r', label="validation loss",axes=ax[0])
legend = ax[0].legend(loc='best', shadow=True)
ax[1].plot(results.history['accuracy'], color='b', label="Training accuracy")
ax[1].plot(results.history['val_accuracy'], color='r',label="Validation accuracy")
legend = ax[1].legend(loc='best', shadow=True)

## Show Some Predictions

In [None]:
# Class probabilities for every test image.
y_pred = model.predict(x_test)

# Drop the trailing channel axis so each image is a plain 100x100 array for imshow.
x_test__ = x_test.reshape(len(x_test), 100, 100)

# Show the first 16 test images, titled with the actual and predicted class index.
fig, axis = plt.subplots(4, 4, figsize=(8, 12))
for idx, panel in enumerate(axis.flat):
  actual_class = y_test[idx].argmax()
  predicted_class = y_pred[idx].argmax()
  panel.imshow(x_test__[idx], cmap='binary')
  panel.set(title=f"Actual Symbol {actual_class}\nPredicted is {predicted_class}");