In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

In [3]:
df = pd.read_csv("/content/drive/MyDrive/Covid Data Set/metadata.csv")

In [4]:
missing_values = df.isnull()

In [5]:
constant_value = 0
df['RT_PCR_positive'] = df['RT_PCR_positive'].fillna(constant_value)

In [6]:
df['RT_PCR_positive'] = df['RT_PCR_positive'].replace("Y", 1)
df['RT_PCR_positive'] = df['RT_PCR_positive'].replace("Unclear", 1)


In [7]:
columns_to_keep = ['RT_PCR_positive', 'filename']

In [8]:
df = df.drop(columns=[col for col in df.columns if col not in columns_to_keep])

In [9]:
df.head

<bound method NDFrame.head of      RT_PCR_positive                                           filename
0                  1  auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...
1                  1  auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...
2                  1  auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...
3                  1  auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...
4                  1                              nejmc2001573_f1a.jpeg
..               ...                                                ...
945                0        072ecaf8c60a81980abb57150a8016_jumbo-9.jpeg
946                0        ff33c406392b968d483174c97eb857_jumbo-9.jpeg
947                0                                     000001-266.jpg
948                0                                     000001-272.jpg
949                0                                     000002-268.jpg

[950 rows x 2 columns]>

In [10]:
import os

In [11]:
file_column = df['filename']

In [12]:
images_path_valid = []

In [13]:
for file_name in file_column:
    _, file_extension = os.path.splitext(file_name)

    if file_extension.lower() == '.jpg':
        images_path_valid.append(file_name)
    if file_extension.lower() == '.jpeg':
        images_path_valid.append(file_name)
    if file_extension.lower() == '.png':
        images_path_valid.append(file_name)

In [14]:
jpg_count = len(images_path_valid)

print(f"Number of JPG files: {jpg_count}")

Number of JPG files: 929


In [15]:
filtered_df = df[df['filename'].str.lower().str.endswith(('.jpg', '.jpeg', '.png'))]

In [16]:
filtered_df

Unnamed: 0,RT_PCR_positive,filename
0,1,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...
1,1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...
2,1,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...
3,1,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...
4,1,nejmc2001573_f1a.jpeg
...,...,...
945,0,072ecaf8c60a81980abb57150a8016_jumbo-9.jpeg
946,0,ff33c406392b968d483174c97eb857_jumbo-9.jpeg
947,0,000001-266.jpg
948,0,000001-272.jpg


In [17]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.layers import Dropout

In [18]:
image_folder = "/content/drive/MyDrive/Covid Data Set/images/images/"

In [19]:
filtered_df['file_path'] = image_folder + filtered_df['filename']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['file_path'] = image_folder + filtered_df['filename']


In [20]:
filtered_df

Unnamed: 0,RT_PCR_positive,filename,file_path
0,1,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,/content/drive/MyDrive/Covid Data Set/images/i...
1,1,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,/content/drive/MyDrive/Covid Data Set/images/i...
2,1,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,/content/drive/MyDrive/Covid Data Set/images/i...
3,1,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,/content/drive/MyDrive/Covid Data Set/images/i...
4,1,nejmc2001573_f1a.jpeg,/content/drive/MyDrive/Covid Data Set/images/i...
...,...,...,...
945,0,072ecaf8c60a81980abb57150a8016_jumbo-9.jpeg,/content/drive/MyDrive/Covid Data Set/images/i...
946,0,ff33c406392b968d483174c97eb857_jumbo-9.jpeg,/content/drive/MyDrive/Covid Data Set/images/i...
947,0,000001-266.jpg,/content/drive/MyDrive/Covid Data Set/images/i...
948,0,000001-272.jpg,/content/drive/MyDrive/Covid Data Set/images/i...


In [21]:
import cv2

In [22]:
def preprocess_image(image_path, target_size=(224, 224)):
  if os.path.exists(image_path):
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    image = cv2.resize(image, target_size)
    image = image / 255.0
    return image
  else:
    print(f"File not found: {image_path}")
    return None

In [23]:
import numpy as np

In [24]:
X = np.array([preprocess_image(file_path) for file_path in filtered_df['file_path']])
y = filtered_df['RT_PCR_positive'].values

In [25]:
from sklearn.model_selection import train_test_split

In [26]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [29]:
model = Sequential([
    Conv2D(128, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [31]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x7be2e03442b0>

In [32]:
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f'Test accuracy: {test_acc}')

Test accuracy: 0.7714285850524902
