In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os  # required by the directory walk below; without it this cell fails on a fresh kernel

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Print the full path of every file below /kaggle/input (prints nothing if the directory is absent).
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## EDA is inspired by meghnasingh's notebook; the link is below

https://www.kaggle.com/meghnasingh2080/covid-19-x-ray-classifier

In [None]:
# Import library
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from glob import glob
from PIL import Image
import os
import random
import cv2

In [None]:
# Root folder of the dataset; the glob below looks for .jpg files exactly one directory level beneath it.
path = '../input/hrct-chest-covid-data-ct-scan/HRCT-Chest-Covid-Data-CT-SCAN/HRCT Chest Covid Data CT SCAN'

# Diagnosis label -> numeric target code.
diag_code_dict = {
    'Covid': 0,
    'Normal': 1,
}

# Diagnosis label -> title shown in plots.
diag_title_dict = {
    'Covid': 'Covid-19',
    'Normal': 'Healthy',
}

# Image id (file name without directory or extension) -> full path of the file.
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x for x in glob(os.path.join(path, '*','*.jpg'))}

# Report the size and preview a few entries rather than echoing every path in the dataset.
print('images found: %d' % len(imageid_path_dict))
dict(list(imageid_path_dict.items())[:5])

In [None]:
# One row per image: the dict key becomes `image_id`, the dict value becomes `path`.
covidData = pd.DataFrame.from_dict(imageid_path_dict, orient='index').reset_index()
covidData.columns = ['image_id', 'path']

# The diagnosis label is whatever precedes the first '-' in the image id.
classes = covidData['image_id'].str.split('-').str.get(0)

# Attach the raw label, its numeric code and its display title.
# Labels absent from the lookup dicts come back as missing values (dict.get returns None).
covidData = covidData.assign(
    diag=classes,
    target=classes.map(diag_code_dict.get),
    Class=classes.map(diag_title_dict.get),
)

In [None]:
# Headline numbers: row and column counts, fully duplicated rows, and missing cells.
samples = covidData.shape[0]
feature = covidData.shape[1]
duplicated = covidData.duplicated().sum()
null_values = covidData.isna().sum().sum()

summary_lines = [
    'Simple EDA',
    'Number of samples: %d' % samples,
    'duplicates: %d' % duplicated,
    'null values: %d' % null_values,
]
print('\n'.join(summary_lines))

In [None]:
# Samples per class as a bar chart, with bars ordered by class frequency.
plt.figure(figsize=(20, 8))
sns.set(style="ticks", font_scale=1)
class_order = covidData['Class'].value_counts().index
ax = sns.countplot(x='Class', data=covidData, order=class_order, palette="flare")
sns.despine(top=True, right=True, left=True, bottom=False)
plt.xticks(rotation=0, fontsize=12)
ax.set_xlabel('Sample Type - Diagnosis', fontsize=14, weight='bold')
# The bars are annotated with percentages below, so the count axis is hidden.
ax.set_yticklabels([])
ax.yaxis.set_visible(False)
ax.set_title('Number of Samples per Class', fontsize=16, weight='bold')

# Label every bar with its share of all samples (uses `samples` from the summary cell).
for bar in ax.patches:
    bar_height = bar.get_height()
    share = 100 * float(bar_height / samples)
    label_x = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        "%.1f%%" % share,
        (label_x, abs(bar_height)),
        ha='center', va='bottom', color='black',
        xytext=(0, 10), textcoords='offset points',
        rotation='horizontal',
    )

In [None]:
def _load_thumbnail(image_path):
    """Open the file at `image_path`, resize it to 75x75 and return it as a NumPy array."""
    return np.asarray(Image.open(image_path).resize((75, 75)))


# Store one small array per row so later cells can compute per-image statistics.
covidData['image'] = covidData['path'].map(_load_thumbnail)

In [None]:
n_samples = 3

# Rows are grouped per diagnosis label. The sort is kept because it determines the
# row order that the seeded `.sample` call below draws from.
groups_by_diag = covidData.sort_values(['diag']).groupby('diag')
n_classes = groups_by_diag.ngroups

# One row of thumbnails per label; squeeze=False keeps m_axs two-dimensional
# even when there is a single row or a single column.
fig, m_axs = plt.subplots(n_classes, n_samples,
                          figsize=(2 * n_samples, 3 * n_classes), squeeze=False)

for n_axs, (type_name, type_rows) in zip(m_axs, groups_by_diag):
    # Put the class title over the middle column of the row.
    n_axs[n_samples // 2].set_title(type_name, fontsize=14, weight='bold')
    for c_ax, (_, c_row) in zip(n_axs, type_rows.sample(n_samples, random_state=1234).iterrows()):
        picture = c_row['path']
        image = cv2.imread(picture)
        if image is None:
            # cv2.imread signals an unreadable file by returning None instead of raising.
            raise FileNotFoundError('cv2.imread could not read {}'.format(picture))
        # cv2.imread yields BGR channel order while imshow interprets 3-channel data as RGB.
        # Convert only for display: `image` itself stays BGR for the inspection cells that follow.
        c_ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        c_ax.axis('off')

In [None]:
# Array dimensions of `image`, i.e. the last picture read by the sample-grid loop.
image_shape = image.shape
print('shape of the image: {}'.format(image_shape))

In [None]:
# Total number of array elements in `image` (the array's `size` attribute, not the file size).
element_count = image.size
print('image size {}'.format(element_count))

In [None]:
# Element data type of the `image` array (the last picture read by the sample-grid loop).
image.dtype

In [None]:
# Largest value found anywhere in `image`, across all channels.
brightest_value = image.max()
print('max rgb: {}'.format(brightest_value))

In [None]:
# Smallest value found anywhere in `image`, across all channels.
darkest_value = image.min()
print('min rgb: {}'.format(darkest_value))

In [None]:
# Channel values of the pixel at row 0, column 0, in the channel order produced by cv2.imread.
image[0, 0]

In [None]:
# Show only channel index 0 of `image`, rendered with matplotlib's default colormap.
channel_fig, channel_ax = plt.subplots()
channel_ax.set_title('Bchannel', fontsize=14, weight='bold')
channel_ax.imshow(image[:, :, 0])
channel_ax.axis('off')
plt.show()

In [None]:
# Per-image intensity statistics. The column's values are iterated directly, so the
# result does not depend on the index being 0..n-1 or on `samples` from an earlier cell.
images = covidData['image']
mean_val = [img.mean() for img in images]
std_dev_val = [np.std(img) for img in images]
max_val = [img.max() for img in images]
min_val = [img.min() for img in images]

# Keep the columns needed for plotting and attach the statistics as new columns.
# NOTE: 'stedev' (sic) is the column name the plotting cells look up, so it is kept as is.
imageEDA = covidData.loc[:, ['image', 'Class', 'path']].assign(
    mean=mean_val,
    stedev=std_dev_val,
    max=max_val,
    min=min_val,
)

# Signed difference between the dataset-wide average of the means and each image's mean.
subt_mean_samples = imageEDA['mean'].mean() - imageEDA['mean']
imageEDA['subt_mean'] = subt_mean_samples

In [None]:
# Kernel density estimate of the per-image mean over the whole dataset.
# displot returns a FacetGrid; its single Axes is reached through `.ax`.
ax = sns.displot(imageEDA, x='mean', kind='kde')
ax.ax.set_title('Image color mean value distribution')

In [None]:
# Kernel density estimate of the per-image mean, one curve per class.
# displot returns a FacetGrid; its single Axes is reached through `.ax`.
ax = sns.displot(imageEDA, x='mean', hue='Class', kind='kde')
ax.ax.set_title('Image color mean value distribution by class')

In [None]:
# Kernel density estimate of the per-image maximum, one curve per class.
# displot returns a FacetGrid; its single Axes is reached through `.ax`.
ax = sns.displot(imageEDA, x='max', hue='Class', kind='kde')
ax.ax.set_title('Image color max value distribution by class')

In [None]:
# Kernel density estimate of the per-image minimum, one curve per class.
min_grid = sns.displot(imageEDA, x='min', hue='Class', kind='kde')
min_grid.ax.set_title('Image color min value distribution by class')

In [None]:
# Per-image mean against standard deviation, coloured by class.
plt.figure(figsize=(20, 8))
sns.set(style="ticks", font_scale=1)
ax = sns.scatterplot(x="mean", y="stedev", hue='Class', data=imageEDA, alpha=0.8)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=0, fontsize=12)

# Both axis labels share the same text styling.
axis_label_style = dict(fontsize=14, weight='bold')
ax.set_xlabel('Image Channel Colour Mean', **axis_label_style)
ax.set_ylabel('Image Channel Colour Standard Deviation', **axis_label_style)
ax.set_title('Mean and Standard Deviation of Image Samples', fontsize=16, weight='bold');

In [None]:
# One scatter panel per class. FacetGrid creates its own figure, so no plt.figure()
# call is made beforehand; one would only leave an extra, empty figure in the output.
g = sns.FacetGrid(imageEDA, col="Class", height=6)
g.map_dataframe(sns.scatterplot, x='mean', y='stedev')
g.set_titles(col_template="{col_name}", row_template="{row_name}", size=14)
g.fig.subplots_adjust(top=.7)
g.fig.suptitle('Mean and standard dev of img samples')

# Only the first panel gets a y label; every panel gets the same x label.
axes = g.axes.flatten()
axes[0].set_ylabel('std dev')
for ax in axes:
    ax.set_xlabel('Mean')

# tight_layout runs last and recomputes the subplot spacing for the final labels.
g.fig.tight_layout()