In [17]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
import random
import pickle
from tqdm import tqdm
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
for dirname, _, filenames in os.walk('./dataset-2'):
    for filename in filenames:
        os.path.join(dirname, filename)

In [18]:
df = pd.read_csv("./dataset-2/full_df.csv")
# df.head()

In [19]:
def has_cataract(text):
    if "cataract" in text:
        return 1
    else:
        return 0

In [20]:
df["left_cataract"] = df["Left-Diagnostic Keywords"].apply(lambda x: has_cataract(x))
df["right_cataract"] = df["Right-Diagnostic Keywords"].apply(lambda x: has_cataract(x))
df["right_cataract"][1:5]

1    0
2    0
3    0
4    0
Name: right_cataract, dtype: int64

In [21]:
left_cataract = df.loc[(df.C ==1) & (df.left_cataract == 1)]["Left-Fundus"].values
right_cataract = df.loc[(df.C ==1) & (df.right_cataract == 1)]["Right-Fundus"].values

In [22]:
print("Number of images in left cataract: {}".format(len(left_cataract)))
print("Number of images in right cataract: {}".format(len(right_cataract)))

Number of images in left cataract: 304
Number of images in right cataract: 290


In [23]:
left_normal = df.loc[(df.C ==0) & (df["Left-Diagnostic Keywords"] == "normal fundus")]["Left-Fundus"].sample(250,random_state=42).values
right_normal = df.loc[(df.C ==0) & (df["Right-Diagnostic Keywords"] == "normal fundus")]["Right-Fundus"].sample(250,random_state=42).values

In [24]:
cataract = np.concatenate((left_cataract,right_cataract),axis=0)
normal = np.concatenate((left_normal,right_normal),axis=0)
print(len(cataract),len(normal))

594 500


In [25]:
from tensorflow.keras.preprocessing.image import load_img,img_to_array
dataset_dir = "./dataset-2/preprocessed_images"
image_size=224
labels = []
dataset = []
def create_dataset(image_category,label):
    for img in tqdm(image_category):
        image_path = os.path.join(dataset_dir,img)
        try:
            image = cv2.imread(image_path,cv2.IMREAD_COLOR)
            image = cv2.resize(image,(image_size,image_size))

        except:
            continue
        
        dataset.append([np.array(image),np.array(label)])
    random.shuffle(dataset)
    return dataset
        

In [26]:
dataset = create_dataset(cataract,1)
dataset = create_dataset(normal,0)

100%|██████████| 594/594 [00:02<00:00, 252.21it/s]
100%|██████████| 500/500 [00:02<00:00, 238.15it/s]


In [27]:
x = np.array([i[0] for i in dataset]).reshape(-1,image_size,image_size,3)
y = np.array([i[1] for i in dataset])

In [28]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(x_train.reshape(x_train.shape[0], -1), y_train)
dt_y_pred = dt.predict(x_test.reshape(x_test.shape[0], -1))

print("Decision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, dt_y_pred))
print("Precision:", precision_score(y_test, dt_y_pred))
print("Recall:", recall_score(y_test, dt_y_pred))
print("F1 Score:", f1_score(y_test, dt_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_y_pred))

# SVM Classifier
svm = SVC()
svm.fit(x_train.reshape(x_train.shape[0], -1), y_train)
svm_y_pred = svm.predict(x_test.reshape(x_test.shape[0], -1))

print("\nSVM Classifier")
print("Accuracy:", accuracy_score(y_test, svm_y_pred))
print("Precision:", precision_score(y_test, svm_y_pred))
print("Recall:", recall_score(y_test, svm_y_pred))
print("F1 Score:", f1_score(y_test, svm_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, svm_y_pred))


# Logistic Regression Classifier
lr = LogisticRegression(max_iter=2000,solver='saga')
lr.fit(x_train.reshape(x_train.shape[0], -1), y_train)
lr_y_pred = lr.predict(x_test.reshape(x_test.shape[0], -1))

print("\nLogistic Regression Classifier")
print("Accuracy:", accuracy_score(y_test, lr_y_pred))
print("Precision:", precision_score(y_test, lr_y_pred))
print("Recall:", recall_score(y_test, lr_y_pred))
print("F1 Score:", f1_score(y_test, lr_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, lr_y_pred))



Logistic Regression Classifier
Accuracy: 0.8807339449541285
Precision: 0.8439716312056738
Recall: 0.967479674796748
F1 Score: 0.9015151515151516
Confusion Matrix:
[[ 73  22]
 [  4 119]]


In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Decision Tree Classifier
dt = DecisionTreeClassifier()
dt.fit(x_train.reshape(x_train.shape[0], -1), y_train)
dt_y_pred = dt.predict(x_test.reshape(x_test.shape[0], -1))

print("Decision Tree Classifier")
print("Accuracy:", accuracy_score(y_test, dt_y_pred))
print("Precision:", precision_score(y_test, dt_y_pred))
print("Recall:", recall_score(y_test, dt_y_pred))
print("F1 Score:", f1_score(y_test, dt_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, dt_y_pred))

# SVM Classifier
svm = SVC()
svm.fit(x_train.reshape(x_train.shape[0], -1), y_train)
svm_y_pred = svm.predict(x_test.reshape(x_test.shape[0], -1))

print("\nSVM Classifier")
print("Accuracy:", accuracy_score(y_test, svm_y_pred))
print("Precision:", precision_score(y_test, svm_y_pred))
print("Recall:", recall_score(y_test, svm_y_pred))
print("F1 Score:", f1_score(y_test, svm_y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, svm_y_pred))

Decision Tree Classifier
Accuracy: 0.8119266055045872
Precision: 0.803030303030303
Recall: 0.8760330578512396
F1 Score: 0.8379446640316206
Confusion Matrix:
[[ 71  26]
 [ 15 106]]

SVM Classifier
Accuracy: 0.7889908256880734
Precision: 0.8048780487804879
Recall: 0.8181818181818182
F1 Score: 0.8114754098360656
Confusion Matrix:
[[73 24]
 [22 99]]


In [39]:
import pandas as pd

# Create a dictionary to store the results of the different classifiers
results = {
    'Classifier': ['Decision Tree', 'SVM', 'Logistic Regression'],
    'Accuracy': [accuracy_score(y_test, dt_y_pred), accuracy_score(y_test, svm_y_pred), accuracy_score(y_test, lr_y_pred)],
    'Precision': [precision_score(y_test, dt_y_pred), precision_score(y_test, svm_y_pred), precision_score(y_test, lr_y_pred)],
    'Recall': [recall_score(y_test, dt_y_pred), recall_score(y_test, svm_y_pred), recall_score(y_test, lr_y_pred)],
    'F1 Score': [f1_score(y_test, dt_y_pred), f1_score(y_test, svm_y_pred), f1_score(y_test, lr_y_pred)]
}

# Create a DataFrame from the results dictionary
df = pd.DataFrame(results)

# Set the 'Classifier' column as the index of the DataFrame
df.set_index('Classifier', inplace=True)

# Print the DataFrame
print(df)


                     Accuracy  Precision    Recall  F1 Score
Classifier                                                  
Decision Tree        0.811927   0.803030  0.876033  0.837945
SVM                  0.788991   0.804878  0.818182  0.811475
Logistic Regression  0.550459   0.581560  0.677686  0.625954
