SVM implementation for Lung Image Classification

In [None]:
#importing relevant libraries required for code execution
import os

import cv2
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display
from PIL import Image
from skimage.color import rgb2grey
from skimage.feature import hog
from sklearn.decomposition import PCA
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

%matplotlib inline

In [None]:
!pip install imutils
import imutils

Defining the functions to extract features from the images

In [None]:
#creating feature vector from pixel values by resizing size
def image_to_feature_vector(image, size=(32, 32)):
    """Resize `image` to `size` with cv2.resize and return its pixels as a 1-D vector.

    Parameters:
        image: image array accepted by cv2.resize.
        size: target size tuple forwarded unchanged to cv2.resize.

    Returns:
        The flattened resized image.
    """
    resized = cv2.resize(image, size)
    return resized.flatten()
#creating feature vector from color histogram
def extract_color_histogram(image, bins=(8, 8, 8)):
    """Build a flattened, normalized HSV color histogram to use as a feature vector.

    Parameters:
        image: image array that is converted with cv2.COLOR_BGR2HSV.
        bins: number of histogram bins for each of the three HSV channels.

    Returns:
        The histogram over all three channels, normalized by cv2.normalize
        (called with its default settings) and flattened to 1-D.
    """
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    channel_indices = [0, 1, 2]
    channel_ranges = [0, 180, 0, 256, 0, 256]
    histogram = cv2.calcHist([hsv_image], channel_indices, None, bins, channel_ranges)

    # Under OpenCV 2 the normalized histogram is taken from the return value;
    # otherwise the histogram is passed as both source and destination.
    if not imutils.is_cv2():
        cv2.normalize(histogram, histogram)
    else:
        histogram = cv2.normalize(histogram)

    return histogram.flatten() # return the flattened histogram as the feature vector

In [None]:
#creating imagePath array by concatenating paths of all images
DATASET_DIR = '/kaggle/input/covid19-radiography-database/COVID-19 Radiography Database/'

# os.walk yields directories and files in an arbitrary, filesystem-dependent
# order. The paths are sorted so that the row order of everything derived from
# them (and therefore the seeded, shuffled K-fold splits) is the same on every run.
imagePaths = sorted(
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk(DATASET_DIR)
    for filename in filenames
    if filename.endswith('.png')  # require the dot so only real .png extensions match
)

#Checking to validate all imagePaths have been included
len(imagePaths) == 219+1341+1345

In [None]:
# defining the arrays to store the feature vectors and labels
rawImages = []
features = []
labels = []
# NOTE(review): `img` keeps every decoded image in memory and is not read by any
# later cell visible in this notebook -- confirm whether it is still needed.
img=[]

for imagePath in imagePaths:
    # the label is the name of the directory that directly contains the image
    label = imagePath.split(os.path.sep)[-2]
    image = cv2.imread(imagePath)
    if image is None:
        # cv2.imread reports an unreadable/corrupt file by returning None rather
        # than raising; fail here with the offending path instead of letting
        # cv2.resize fail later with an unrelated-looking error
        raise IOError("Could not read image file: {}".format(imagePath))
    pixels = image_to_feature_vector(image) #function call to first feature extraction method
    hist = extract_color_histogram(image) #function call to second feature extraction method
    rawImages.append(pixels) #first feature vector
    features.append(hist) #second feature vector
    labels.append(label)
    img.append(image)

# convert the per-image lists into arrays (one row per image)
rawImages = np.array(rawImages)
features = np.array(features)
labels = np.array(labels)

Checking model performance with color histogram features with 5-fold cross validation

In [None]:
X = pd.DataFrame(features)
y = pd.Series(labels)
svm = SVC(kernel='linear', probability=True, random_state=42) #defining the model
cv = KFold(n_splits=5, random_state=1, shuffle=True) #5-fold cross validation setup

# Compute all four metrics in a single cross-validation pass. The splitter and
# the model are both seeded, so separate cross_val_score calls would refit the
# same model on the same folds once per metric; cross_validate fits each fold
# only once and scores it with every metric.
cv_scoring = {'recall': 'recall_macro', 'precision': 'precision_macro',
              'accuracy': 'accuracy', 'f1': 'f1_macro'}
cv_scores = cross_validate(svm, X, y, scoring=cv_scoring, cv=cv)

recall = cv_scores['test_recall'] #per-fold macro recall
print("Recall: ",np.mean(recall))
precision = cv_scores['test_precision'] #per-fold macro precision
print("Precision: ",np.mean(precision))
accuracy = cv_scores['test_accuracy'] #per-fold accuracy
print("Accuracy: ",np.mean(accuracy))
f1score = cv_scores['test_f1'] #per-fold macro f1score
print("F1 Score: ",np.mean(f1score))

In [None]:
#Plotting the model performance metrics 
svm_metrics = ['Accuracy', 'Recall', 'Precision', 'F1-Score']
# mean over the cross-validation folds of each metric
svm_metric_values = [np.mean(accuracy), np.mean(recall), np.mean(precision), np.mean(f1score)]
svm_pos = list(range(len(svm_metrics)))
plt.bar(svm_pos, svm_metric_values , color='green')
# one bar per metric along x, bar height (y) is the score -- label the axes accordingly
plt.xlabel("Metrics")
plt.ylabel("Mean cross-validated score")
plt.title("Performance metrics for SVM")
plt.xticks(svm_pos, svm_metrics)
plt.show()

Checking model performance with pixel intensity as features with 5-fold cross validation

In [None]:
X = pd.DataFrame(rawImages)
y = pd.Series(labels)
# define support vector classifier
svm = SVC(kernel='linear', probability=True, random_state=42)
cv = KFold(n_splits=5, random_state=1, shuffle=True)

# Compute all four metrics in a single cross-validation pass. The splitter and
# the model are both seeded, so separate cross_val_score calls would refit the
# same model on the same folds once per metric; cross_validate fits each fold
# only once and scores it with every metric.
cv_scoring = {'recall': 'recall_macro', 'precision': 'precision_macro',
              'accuracy': 'accuracy', 'f1': 'f1_macro'}
cv_scores = cross_validate(svm, X, y, scoring=cv_scoring, cv=cv)

recall = cv_scores['test_recall'] #per-fold macro recall
print("Recall: ",np.mean(recall))
precision = cv_scores['test_precision'] #per-fold macro precision
print("Precision: ",np.mean(precision))
accuracy = cv_scores['test_accuracy'] #per-fold accuracy
print("Accuracy: ",np.mean(accuracy))
f1score = cv_scores['test_f1'] #per-fold macro f1score
print("F1 Score: ",np.mean(f1score))


In [None]:
#Plotting the model performance metrics 
svm_metrics = ['Accuracy', 'Recall', 'Precision', 'F1-Score']
# mean over the cross-validation folds of each metric
svm_metric_values = [np.mean(accuracy), np.mean(recall), np.mean(precision), np.mean(f1score)]
svm_pos = list(range(len(svm_metrics)))
plt.bar(svm_pos, svm_metric_values , color='gray')
# one bar per metric along x, bar height (y) is the score -- label the axes accordingly
plt.xlabel("Metrics")
plt.ylabel("Mean cross-validated score")
plt.title("Performance metrics for SVM without bagging")
plt.xticks(svm_pos, svm_metrics)
plt.show()

Checking model performance with pixel intensity as features with 5-fold cross validation and bagging with 5 estimators

In [None]:
X = pd.DataFrame(rawImages)
y = pd.Series(labels)
# define support vector classifier
svm = SVC(kernel='linear', probability=True, random_state=42)
cv = KFold(n_splits=5, random_state=1, shuffle=True)
# The wrapped estimator is passed positionally: the keyword for this first
# parameter was renamed from `base_estimator` to `estimator` in newer
# scikit-learn releases, and the positional form works with either name.
svm = BaggingClassifier(svm, n_estimators=5, random_state=314) #defining the model with bagging

# Compute all four metrics in a single cross-validation pass. The splitter and
# the model are both seeded, so separate cross_val_score calls would refit the
# same ensemble on the same folds once per metric; cross_validate fits each fold
# only once and scores it with every metric.
cv_scoring = {'recall': 'recall_macro', 'precision': 'precision_macro',
              'accuracy': 'accuracy', 'f1': 'f1_macro'}
cv_scores = cross_validate(svm, X, y, scoring=cv_scoring, cv=cv)

recall = cv_scores['test_recall'] #per-fold macro recall
print("Recall: ",np.mean(recall))
precision = cv_scores['test_precision'] #per-fold macro precision
print("Precision: ",np.mean(precision))
accuracy = cv_scores['test_accuracy'] #per-fold accuracy
print("Accuracy: ",np.mean(accuracy))
f1score = cv_scores['test_f1'] #per-fold macro f1score
print("F1 Score: ",np.mean(f1score))

In [None]:
#Plotting the model performance metrics
svm_metrics = ['Accuracy', 'Recall', 'Precision', 'F1-Score']
# mean over the cross-validation folds of each metric
svm_metric_values = [np.mean(accuracy), np.mean(recall), np.mean(precision), np.mean(f1score)]
svm_pos = list(range(len(svm_metrics)))
plt.bar(svm_pos, svm_metric_values, color='green')
# one bar per metric along x, bar height (y) is the score -- label the axes accordingly
plt.xlabel("Metrics")
plt.ylabel("Mean cross-validated score")
plt.title("Performance metrics for SVM with bagging")
plt.xticks(svm_pos, svm_metrics)
plt.show()

Evaluating model with different values of K for K-Fold cross validation

In [None]:
splits = list(range(2, 7)) # define the folds for cross-validation
#create empty arrays to store the model evaluation metrics
recall_final = [] 
precision_final = []
accuracy_final = []
f1_final = []

# all four metrics are scored from one cross-validation pass per K, so each
# fold is fitted once instead of once per metric on the same seeded splits
cv_scoring = {'recall': 'recall_macro', 'precision': 'precision_macro',
              'accuracy': 'accuracy', 'f1': 'f1_macro'}

#training and testing the model for different values of K between 2 and 6
# NOTE: `svm`, `X` and `y` are not defined in this cell; it uses whatever the
# previously executed cell left behind (in notebook order: the bagged SVM on
# the raw-pixel features).
for split in splits: 
    cv = KFold(n_splits=split, random_state=1, shuffle=True)
    cv_scores = cross_validate(svm, X, y, scoring=cv_scoring, cv=cv)
    recall = cv_scores['test_recall']
    recall_final.append(np.mean(recall))
    precision = cv_scores['test_precision']
    precision_final.append(np.mean(precision))
    accuracy = cv_scores['test_accuracy']
    accuracy_final.append(np.mean(accuracy))
    f1score = cv_scores['test_f1']
    f1_final.append(np.mean(f1score))
print("Recall: ",recall_final)
print("Precision: ",precision_final)
print("Accuracy: ",accuracy_final)
print("F1 Score: ",f1_final)

In [None]:
#plotting the model performance metrics for different values of K 
fig2 = plt.figure(figsize =(12, 7))
# one line per metric, each holding the mean score obtained for every K in `splits`
for metric_values, metric_label in [(recall_final, 'recall'),
                                    (precision_final, 'precision'),
                                    (accuracy_final, 'accuracy'),
                                    (f1_final, 'f1score')]:
    plt.plot(splits, metric_values, label=metric_label)
# the number of folds is an integer; pin the ticks to the evaluated K values so
# the axis does not show fractional fold counts
plt.xticks(splits)
plt.xlabel('Folds')
plt.ylabel('Model Evaluation Metrics')
plt.title('SVM performance obtained at different folds ')
plt.legend(loc='lower right')
plt.show()