In [50]:
from google_drive_downloader import GoogleDriveDownloader as gdd

gdd.download_file_from_google_drive(file_id='1BFc2Lt2N1swO8BKRvLMEyUhLzF52RFvr',
dest_path='content/covid_image_data.zip',
unzip=True)

In [1]:
import pandas as pd 
import numpy as np 
import tensorflow
import os 
import cv2 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import f1_score, accuracy_score

In [2]:
labels = pd.read_csv("content/covid_image_data/Training_set_covid.csv") # loading the labels
labels.head()

Unnamed: 0,filename,label
0,Image_1.jpg,1
1,Image_2.jpg,0
2,Image_3.jpg,0
3,Image_4.jpg,0
4,Image_5.jpg,0


In [3]:
labels.tail()

Unnamed: 0,filename,label
3474,Image_3475.jpg,0
3475,Image_3476.jpg,0
3476,Image_3477.jpg,0
3477,Image_3478.jpg,1
3478,Image_3479.jpg,0


In [4]:
file_paths = [[fname, 'content/covid_image_data/train/' + fname] for fname in labels['filename']]

In [5]:
# Confirm if number of images is same as number of labels given
if len(labels) == len(file_paths):
    print('Number of labels i.e. ', len(labels), 'matches the number of filenames i.e. ', len(file_paths))
else:
    print('Number of labels does not match the number of filenames')

Number of labels i.e.  3479 matches the number of filenames i.e.  3479


In [6]:
# Converting the file_paths to dataframe
images = pd.DataFrame(file_paths, columns=['filename', 'filepaths'])
images.head()

Unnamed: 0,filename,filepaths
0,Image_1.jpg,content/covid_image_data/train/Image_1.jpg
1,Image_2.jpg,content/covid_image_data/train/Image_2.jpg
2,Image_3.jpg,content/covid_image_data/train/Image_3.jpg
3,Image_4.jpg,content/covid_image_data/train/Image_4.jpg
4,Image_5.jpg,content/covid_image_data/train/Image_5.jpg


In [7]:
# Combining the labels with the images
train_data = pd.merge(images, labels, how = 'inner', on = 'filename')
train_data.head()

Unnamed: 0,filename,filepaths,label
0,Image_1.jpg,content/covid_image_data/train/Image_1.jpg,1
1,Image_2.jpg,content/covid_image_data/train/Image_2.jpg,0
2,Image_3.jpg,content/covid_image_data/train/Image_3.jpg,0
3,Image_4.jpg,content/covid_image_data/train/Image_4.jpg,0
4,Image_5.jpg,content/covid_image_data/train/Image_5.jpg,0


Data Preprocessing

In [8]:
data = [] # initialize an empty numpy array

image_size = 100 # image size taken is 100 here. one can take other size too
for i in range(len(train_data)):
    img_array = cv2.imread(train_data['filepaths'][i], cv2.IMREAD_GRAYSCALE) # converting the image to gray scale
    
    new_img_array = cv2.resize(img_array, (image_size, image_size)) # resizing the image array
    
    data.append([new_img_array, train_data['label'][i]])

In [9]:
# image pixels of a image
data[0]

[array([[110, 114, 118, ..., 129, 122, 227],
        [123, 124, 127, ..., 130, 127, 131],
        [156, 128, 134, ..., 124, 132,  94],
        ...,
        [166, 206, 224, ..., 194, 190, 162],
        [177, 203, 217, ..., 197, 190, 166],
        [194, 200, 205, ..., 199, 183, 163]], dtype=uint8),
 1]

In [10]:
# Shuffle the data
np.random.shuffle(data)

In [11]:
# Separating the images and labels

x = []
y = []
for image in data:
    x.append(image[0])
    y.append(image[1])

# converting x & y to numpy array as they are list
x = np.array(x)
y = np.array(y)

In [12]:
np.unique(y, return_counts=True)

(array([0, 1]), array([3073,  406]))

In [13]:
# split the data
X_train, X_val, y_train, y_val = train_test_split(x,y,test_size=0.3, random_state = 42)



Building a model

In [14]:
# Defining the model
import tensorflow as tf

model = tf.keras.Sequential([
tf.keras.layers.Flatten(input_shape=(100, 100)), # flattening the image
tf.keras.layers.Dense(100, activation='relu'),
tf.keras.layers.Dense(50, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
loss='binary_crossentropy',
metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff484708220>

Validate a model

In [15]:
pred = model.predict(X_val) # predict labels for validation set

y_pred = []
for item in pred:
    if item < 0.5:
        y_pred.append(0)
    else:
        y_pred.append(1)

In [16]:
f1_score(y_val, y_pred)

0.0

In [17]:
accuracy_score(y_val, y_pred)

0.8898467432950191

Predicting the Output for tesing dataset

In [18]:
# load test dataset

# Loading the order of the image's name that has been provided
test_image_order = pd.read_csv("content/covid_image_data/Testing_set_covid.csv")
test_image_order.head()

Unnamed: 0,filename
0,Image_1.jpg
1,Image_2.jpg
2,Image_3.jpg
3,Image_4.jpg
4,Image_5.jpg


In [19]:
# Getting images file path
file_paths = [[fname, 'content/covid_image_data/test/' + fname] for fname in test_image_order['filename']]

Confirm if number of images in test folder is same as number of image names in 'Testing_set_face_mask.csv'

In [20]:
# Confirm if number of images is same as number of labels given

if len(test_image_order) == len(file_paths):
    print('Number of image names i.e. ', len(test_image_order), 'matches the number of file paths i.e. ', len(file_paths))
else:
    print('Number of image names does not match the number of filepaths')

Number of image names i.e.  870 matches the number of file paths i.e.  870


In [21]:
# Converting the file_paths to dataframe
test_images = pd.DataFrame(file_paths, columns=['filename', 'filepaths'])
test_images.head()

Unnamed: 0,filename,filepaths
0,Image_1.jpg,content/covid_image_data/test/Image_1.jpg
1,Image_2.jpg,content/covid_image_data/test/Image_2.jpg
2,Image_3.jpg,content/covid_image_data/test/Image_3.jpg
3,Image_4.jpg,content/covid_image_data/test/Image_4.jpg
4,Image_5.jpg,content/covid_image_data/test/Image_5.jpg


Data Pre-processing on test_data

In [22]:
test_pixel_data = [] # initialize an empty numpy array
image_size = 100 # image size taken is 100 here. one can take other size too
for i in range(len(test_images)):

    img_array = cv2.imread(test_images['filepaths'][i], cv2.IMREAD_GRAYSCALE) # converting the image to gray scale

    new_img_array = cv2.resize(img_array, (image_size, image_size)) # resizing the image array

    test_pixel_data.append(new_img_array)

In [23]:
test_pixel_data = np.array(test_pixel_data)

Make Prediction on Test Dataset

In [24]:
pred = model.predict(test_pixel_data)

In [25]:
# The predicted values are the probabilities value
pred[0]

array([0.00066155], dtype=float32)

In [26]:
prediction = []
for value in pred:
    if value < 0.5:
        prediction.append(0)
    else:
        prediction.append(1)

In [27]:
prediction[0]

0

In [29]:
res = pd.DataFrame({'filename': test_images['filename'], 'label': prediction}) # prediction is nothing but the final predictions of your model on input features of your new unseen test data
res.to_csv("submission1.csv", index = False)