# COVID-19 Classification using Transfer Learning

In this notebook, we have implemented baseline models of extracting last layer embeddings from VGG16 and use logistic regression.

In [10]:
! pip install opencv-python



In [11]:
import numpy as np
import os
import cv2
import shutil
from glob import glob
from sklearn import preprocessing
from keras.preprocessing import image
from keras.applications.vgg16 import VGG16
from keras.applications import xception
from keras.applications import inception_v3
from sklearn.metrics import precision_score, \
    recall_score, confusion_matrix, classification_report, \
    accuracy_score, f1_score

In [12]:
BASE_DATA_FOLDER = "./data"
TRAin_DATA_FOLDER = os.path.join(BASE_DATA_FOLDER, "train")

In [13]:
nrows = 6
ncols = 4
pic_index = 0
images = []
labels = []

for class_folder_name in os.listdir(TRAin_DATA_FOLDER):
    class_folder_path = os.path.join(TRAin_DATA_FOLDER, class_folder_name)
    for image_path in glob(os.path.join(class_folder_path, "*.jpeg")):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.resize(image, (224,224))
        image = np.stack((image,)*3, axis=-1)
        images.append(image)
        labels.append(class_folder_name)
    for image_path in glob(os.path.join(class_folder_path, "*.png")):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.resize(image, (224,224))
        image = np.stack((image,)*3, axis=-1)    
        images.append(image)
        labels.append(class_folder_name)
    for image_path in glob(os.path.join(class_folder_path, "*.jpg")):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        image = cv2.resize(image, (224,224))
        image = np.stack((image,)*3, axis=-1)
        images.append(image)
        labels.append(class_folder_name)

In [14]:
images = np.array(images)
labels = np.array(labels)
pre = preprocessing.LabelEncoder()
pre.fit(labels)
labels_numeric = pre.transform(labels)
Num_Class = 3

def OneHotEncoded(y_train):
    y_t=np.zeros((len(y_train),Num_Class), dtype=int)
    for i,x in enumerate(y_train):
        y_t[i][int(x)-1]=1
    return y_t

labels = OneHotEncoded(labels_numeric)

X_train, X_test= train_test_split(images, test_size=0.2, random_state=42)
y_train, y_test= train_test_split(labels, test_size=0.2, random_state=42)
X_train = X_train.reshape(200, 224, 224, 3)
X_test = X_test.reshape(51, 224, 224, 3)

## the next step is to define the model that will be trained to recognize covid, normal or Viral Pneumonia from these images

In [15]:
train_vgg_bf = X_train.reshape(X_train.shape[0],-1)
valid_vgg_bf = X_test.reshape(X_test.shape[0],-1)

In [16]:
POOLING = 'avg'
vgg_bottleneck = VGG16(weights='imagenet', include_top=False, pooling=POOLING)
train_vgg_bf = vgg_bottleneck.predict(X_train, batch_size=32, verbose=1)
valid_vgg_bf = vgg_bottleneck.predict(X_test, batch_size=32, verbose=1)
print('VGG train bottleneck features shape: {} size: {:,}'.format(train_vgg_bf.shape, train_vgg_bf.size))
print('VGG valid bottleneck features shape: {} size: {:,}'.format(valid_vgg_bf.shape, valid_vgg_bf.size))


VGG train bottleneck features shape: (200, 512) size: 102,400
VGG valid bottleneck features shape: (51, 512) size: 26,112


In [17]:
compare_loss={}
compare_accuracy = {}
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', random_state=147)
logreg.fit(train_vgg_bf, (y_train * range(Num_Class)).sum(axis=1))
valid_probs = logreg.predict_proba(valid_vgg_bf)
valid_preds = logreg.predict(valid_vgg_bf)
compare_loss['Vgg16']=log_loss(y_test, valid_probs)
compare_accuracy['Vgg16']=accuracy_score((y_test * range(Num_Class)).sum(axis=1), valid_preds)
print('Validation VGG LogLoss {}'.format(compare_loss['Vgg16']))
print('Validation VGG Accuracy {}'.format(compare_accuracy['Vgg16']))
y_tl_test = (y_test * range(Num_Class)).sum(axis=1)

Validation VGG LogLoss 0.44055241175848586
Validation VGG Accuracy 0.9215686274509803


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [18]:
print ('Accuracy:', accuracy_score(y_tl_test, valid_preds))
print ('F1 score:', f1_score(y_tl_test, valid_preds, average='macro'))
print ('Recall:', recall_score(y_tl_test, valid_preds, average='macro'))
print ('Precision:', precision_score(y_tl_test, valid_preds, average='macro'))
print ('\n clasification report:\n', classification_report(y_tl_test,valid_preds))
print ('\n confussion matrix:\n',confusion_matrix(y_tl_test, valid_preds))

Accuracy: 0.9215686274509803
F1 score: 0.9180555555555556
Recall: 0.9215686274509803
Precision: 0.9290382819794584

 clasification report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94        15
           1       1.00      0.76      0.87        17
           2       0.90      1.00      0.95        19

    accuracy                           0.92        51
   macro avg       0.93      0.92      0.92        51
weighted avg       0.93      0.92      0.92        51


 confussion matrix:
 [[15  0  0]
 [ 2 13  2]
 [ 0  0 19]]
