In [1]:
import io
from io import StringIO
import os
import sys
import cv2
from pyspark import SparkContext
from pyspark.ml.feature import CountVectorizer
from pyspark.sql import Row, SparkSession 
from pyspark.sql.functions import udf, col
from pyspark.sql.types import ArrayType, FloatType, StringType, IntegerType
import pyspark.sql.functions as F
# for visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 

In [2]:
# Reuse the active Spark session if one exists; otherwise start a new one.
from pyspark.sql import SparkSession

spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [3]:
# Location of the metadata CSV, relative to the notebook's working directory.
data_path = "./coronahack-chest-xraydataset/Chest_xray_Corona_Metadata.csv"

# Read the CSV using its first row as the header and let Spark infer column types.
csv_reader = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .option('inferschema', 'true')
)
meta_data = csv_reader.load(data_path)

In [4]:
# meta_data.show(5)

In [5]:
# Binary target: 0 when Label equals "Normal", 1 for every other row.
is_normal = F.col("Label") == "Normal"
binary_label = F.when(is_normal, 0).otherwise(1)
meta_data_with_labels = meta_data.withColumn('new_label', binary_label)

In [7]:
# Total number of rows in the labelled metadata.
number_of_samples = meta_data_with_labels.count()

# Partition the rows using the Dataset_type column from the metadata.
dataset_type = meta_data.Dataset_type
train_set = meta_data_with_labels.filter(dataset_type == "TRAIN")
test_set = meta_data_with_labels.filter(dataset_type == "TEST")
# num_train = train_set.count()
# num_test = test_set.count()

# print(f"Number of samples: ",number_of_samples )
# print(f"Train samples: ", num_train)
# print(f"Test samples: ",num_test)

In [8]:
def _make_path_udf(base_dir):
    """Return a Spark UDF that prefixes an image file name with ``base_dir``.

    Args:
        base_dir: Directory prefix (including its trailing separator) that is
            prepended to every file name.

    Returns:
        A string-typed UDF mapping a file-name column to ``base_dir + name``.
    """
    @udf(returnType=StringType())
    def _prefix_with_base_dir(img_file):
        # str() keeps the behaviour of the earlier hand-written UDFs for
        # non-string values.
        return base_dir + str(img_file)

    return _prefix_with_base_dir


# Both image folders are resolved against the notebook's working directory.
train_path = os.getcwd() +'/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/train/'
get_absolute_path = _make_path_udf(train_path)
train = train_set.withColumn("image_path", get_absolute_path(col("X_ray_image_name")))

test_path = os.getcwd() +'/coronahack-chest-xraydataset/Coronahack-Chest-XRay-Dataset/Coronahack-Chest-XRay-Dataset/test/'
get_test_absolute_path = _make_path_udf(test_path)
test = test_set.withColumn("image_path", get_test_absolute_path(col("X_ray_image_name")))



In [10]:
# Keep only the image location and the binary target.
model_columns = ["image_path", "new_label"]
select_train = train.select(*model_columns)
select_test = test.select(*model_columns)
# select_train.show(4)
# select_test.show(4)
#print(select_train.count())
#print(select_test.count())


# Convert images to Descriptors : Hu Moments, Haralick Texture and Color Histogram

In [11]:
import mahotas

def fetch_descriptor(img):
    """Build a global feature descriptor for the image stored at ``img``.

    The image is read with OpenCV, resized to 500x500 and described by three
    feature groups concatenated in this order:

    1. an 8x8x8 HSV colour histogram (512 values, normalised),
    2. the Haralick texture features averaged along axis 0,
    3. the Hu moments of the grayscale image.

    Args:
        img: Path of the image file to read.

    Returns:
        A flat list of floats, or ``None`` when ``img`` is ``None`` or the
        file cannot be read/decoded. Returning ``None`` lets callers drop the
        row with an ``isNotNull`` filter instead of failing the whole job.
    """
    if img is None:
        return None

    image = cv2.imread(img)
    # cv2.imread signals a missing or undecodable file by returning None
    # rather than raising; without this guard cv2.resize would raise.
    if image is None:
        return None
    image = cv2.resize(image, (500, 500))

    # One grayscale conversion feeds both the Hu moments and the Haralick features.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    fv_hu_moments = cv2.HuMoments(cv2.moments(gray)).flatten().tolist()

    # compute the haralick texture feature vector
    fv_haralick = mahotas.features.haralick(gray).mean(axis=0).flatten().tolist()

    bins = 8
    # change to HSV
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # find the color histogram
    # NOTE(review): for 8-bit images OpenCV stores hue in 0-179, so a 0-256 hue
    # range leaves the upper hue bins empty. Kept unchanged so descriptors stay
    # comparable with previously computed ones - confirm before changing.
    hist = cv2.calcHist([hsv], [0, 1, 2], None, [bins, bins, bins], [0, 256, 0, 256, 0, 256])
    # histogram normalization (in place)
    cv2.normalize(hist, hist)
    fv_histogram = hist.flatten().tolist()

    # concatenate the 3 feature groups
    final_descriptor = fv_histogram + fv_haralick + fv_hu_moments

    return final_descriptor

# Wrap the extractor as a Spark UDF that returns an array of 32-bit floats.
udf_image = udf(fetch_descriptor, ArrayType(FloatType()))

# Descriptor extraction reads and processes every image file, so the results
# are cached: without it each later action (show, count, split, fit,
# transform) would re-run the UDF over all images.
train_descriptor = (
    select_train
    .withColumn("descriptors", udf_image("image_path"))
    .filter(col("descriptors").isNotNull())
    .cache()
)

test_descriptor = (
    select_test
    .withColumn("descriptors", udf_image("image_path"))
    .filter(col("descriptors").isNotNull())
    .cache()
)

#train_descriptor.show(4)
test_descriptor.show(4)
#print(train_descriptor.count())
print(test_descriptor.count())

+--------------------+---------+--------------------+
|          image_path|new_label|         descriptors|
+--------------------+---------+--------------------+
|/Users/mma525/Doc...|        0|[0.015090854, 0.0...|
|/Users/mma525/Doc...|        0|[0.12612872, 0.11...|
|/Users/mma525/Doc...|        0|[0.09986997, 0.02...|
|/Users/mma525/Doc...|        0|[0.3424064, 0.038...|
+--------------------+---------+--------------------+
only showing top 4 rows

624


In [17]:
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf


def _as_dense_vector(values):
    """Turn a list of numbers into a dense ML vector."""
    return Vectors.dense(values)


# Spark ML estimators take a vector column, so convert the array column.
list_to_vector_udf = udf(_as_dense_vector, VectorUDT())

train_with_vec = train_descriptor.withColumn(
    "vec_descriptors",
    list_to_vector_udf("descriptors"),
)
test_with_vec = test_descriptor.withColumn(
    "vec_descriptors",
    list_to_vector_udf("descriptors"),
)

# print("Total train examples: ",train_with_vec.count())
# print("Total test examples: ",test_with_vec.count())


Split the training data 90/10 into training and validation sets

In [18]:
# A fixed seed makes the 90/10 split (and every metric computed from it)
# repeatable across runs, given the same input data and partitioning.
train, val = train_with_vec.randomSplit([0.9, 0.1], seed=42)
train.show()
val.show()
# print("Total train split: ",train.count())
# print("Total validation split: ",val.count())

+--------------------+---------+--------------------+--------------------+
|          image_path|new_label|         descriptors|     vec_descriptors|
+--------------------+---------+--------------------+--------------------+
|/Users/mma525/Doc...|        1|[0.03601592, 0.03...|[0.03601592034101...|
|/Users/mma525/Doc...|        1|[0.0, 0.006120887...|[0.0,0.0061208875...|
|/Users/mma525/Doc...|        1|[0.4169933, 0.217...|[0.41699329018592...|
|/Users/mma525/Doc...|        1|[0.27934158, 0.09...|[0.27934157848358...|
|/Users/mma525/Doc...|        1|[0.0023198084, 0....|[0.00231980835087...|
|/Users/mma525/Doc...|        1|[0.2190629, 0.195...|[0.21906289458274...|
|/Users/mma525/Doc...|        1|[0.15192017, 0.12...|[0.15192016959190...|
|/Users/mma525/Doc...|        1|[0.04534819, 0.07...|[0.04534818977117...|
|/Users/mma525/Doc...|        1|[0.08312031, 0.08...|[0.08312030881643...|
|/Users/mma525/Doc...|        1|[0.068923436, 0.0...|[0.06892343610525...|
|/Users/mma525/Doc...|   

# Logistic Regression Classifier

### Training

Define our model and train on the training data

In [19]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the descriptor vectors, capped at 100 iterations.
logistic_regression = LogisticRegression(
    featuresCol='vec_descriptors',
    labelCol='new_label',
    maxIter=100,
)
logistic_regression_model = logistic_regression.fit(train)

### Evaluating on training data

In [20]:
# Score the training split and preview a few predictions.
predictions_train = logistic_regression_model.transform(train)
preview_columns = ["image_path", "new_label", "probability", "prediction"]
predictions_train.select(*preview_columns).show(4)

+--------------------+---------+--------------------+----------+
|          image_path|new_label|         probability|prediction|
+--------------------+---------+--------------------+----------+
|/Users/mma525/Doc...|        1|[0.24766249816287...|       1.0|
|/Users/mma525/Doc...|        1|[0.25253832007836...|       1.0|
|/Users/mma525/Doc...|        1|[0.25565537911483...|       1.0|
|/Users/mma525/Doc...|        1|[0.24931572951512...|       1.0|
+--------------------+---------+--------------------+----------+
only showing top 4 rows



##### ROC evaluator

In [21]:
# Area under the ROC curve on the training predictions.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='new_label',
    metricName='areaUnderROC',
)
train_auc = evaluator.evaluate(predictions_train)
print('Train Area Under ROC: ', train_auc)


Train Area Under ROC:  0.9219045256220604


##### Classifier evaluator

In [22]:

# Accuracy on the training predictions, via the multiclass evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='new_label',
    metricName='accuracy',
)
train_accuracy = evaluator.evaluate(predictions_train)
print('Train Accuracy: ', train_accuracy)




Train Accuracy:  0.7473772555602182


### Evaluating on validation data

In [23]:
# Score the validation split and preview a few predictions.
predictions_val = logistic_regression_model.transform(val)
preview_columns = ["image_path", "new_label", "probability", "prediction"]
predictions_val.select(*preview_columns).show(4)

+--------------------+---------+--------------------+----------+
|          image_path|new_label|         probability|prediction|
+--------------------+---------+--------------------+----------+
|/Users/mma525/Doc...|        1|[0.24945619709254...|       1.0|
|/Users/mma525/Doc...|        1|[0.25469124804894...|       1.0|
|/Users/mma525/Doc...|        1|[0.24967060904034...|       1.0|
|/Users/mma525/Doc...|        1|[0.25399986404169...|       1.0|
+--------------------+---------+--------------------+----------+
only showing top 4 rows



##### ROC evaluator

In [24]:
# Area under the ROC curve on the validation predictions.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='new_label',
    metricName='areaUnderROC',
)
val_auc = evaluator.evaluate(predictions_val)
print('Val Area Under ROC: ', val_auc)


Val Area Under ROC:  0.9184118673647469


##### Classifier evaluator

In [25]:

# Accuracy on the validation predictions, via the multiclass evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='new_label',
    metricName='accuracy',
)
val_accuracy = evaluator.evaluate(predictions_val)
print('Val Accuracy: ', val_accuracy)




Val Accuracy:  0.7346153846153847


### Evaluating on test data

We evaluate on the test data

In [26]:
# Score the held-out test set (with vector features) and preview a few predictions.
test = test_with_vec
predictions_test = logistic_regression_model.transform(test)
preview_columns = ["image_path", "new_label", "probability", "prediction"]
predictions_test.select(*preview_columns).show(4)

+--------------------+---------+--------------------+----------+
|          image_path|new_label|         probability|prediction|
+--------------------+---------+--------------------+----------+
|/Users/mma525/Doc...|        0|[0.25581789941654...|       1.0|
|/Users/mma525/Doc...|        0|[0.25705819765774...|       1.0|
|/Users/mma525/Doc...|        0|[0.25725671745614...|       1.0|
|/Users/mma525/Doc...|        0|[0.25874611885734...|       1.0|
+--------------------+---------+--------------------+----------+
only showing top 4 rows



##### ROC evaluator

In [27]:
# Area under the ROC curve on the test predictions.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(
    rawPredictionCol='probability',
    labelCol='new_label',
    metricName='areaUnderROC',
)
test_auc = evaluator.evaluate(predictions_test)
print('Test Area Under ROC: ', test_auc)

Test Area Under ROC:  0.8611439842209088


##### Classifier evaluator

In [28]:
# Accuracy on the test predictions, via the multiclass evaluator.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(
    predictionCol='prediction',
    labelCol='new_label',
    metricName='accuracy',
)
test_accuracy = evaluator.evaluate(predictions_test)
print('Test Accuracy: ', test_accuracy)

Test Accuracy:  0.625
