In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide plotting defaults: default figure size (width, height in inches),
# seaborn's 'white' axes style, then the 'tableau-colorblind10' matplotlib style sheet.
# Order matters here: each call overrides the rcParams entries set by the previous one.
plt.rcParams["figure.figsize"] = (15,5)
sns.set_style('white')
plt.style.use('tableau-colorblind10')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print every directory found under the Kaggle input mount (file names are not listed).
for input_dir, _subdirs, _files in os.walk('/kaggle/input'):
    print(input_dir)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three per-class metadata tables (normal, CAP, COVID) in one pass.
normal, cap, covid = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/large-covid19-ct-slice-dataset/meta_data_normal.csv',
        '/kaggle/input/large-covid19-ct-slice-dataset/meta_data_cap.csv',
        '/kaggle/input/large-covid19-ct-slice-dataset/meta_data_covid.csv',
    )
)

In [None]:
# Stack the three per-class metadata tables into a single frame with a fresh 0..n-1 index
class_frames = [normal, covid, cap]
merged = pd.concat(class_frames, ignore_index=True, sort=False)
merged

In [None]:
# Dimensions of the combined metadata table as (rows, columns)
merged.shape

In [None]:
# Per-column count of NaN/None entries.
# NOTE: values recorded as the '-' placeholder are ordinary strings and are not counted here.
merged.isna().sum()

In [None]:
# Names of the columns stored with the generic object dtype (treated as categorical here)
cat_var = [name for name, dtype in merged.dtypes.items() if dtype == 'O']
cat_var

In [None]:
# Derive the originating dataset of every slice from the second '_'-separated
# token of its file name.
merged['source_dataset'] = merged["File name"].str.split('_', expand=True)[1]

# Rows from this institution, and every CAP row, are attributed to the "Afshar" dataset.
# Boolean-mask assignment has to go through .loc: .at only accepts a single
# scalar row label / column label pair and rejects a mask on current pandas.
merged.loc[merged['Institution'] == 'Babak Imaging Center, Tehran', 'source_dataset'] = "Afshar"
merged.loc[merged['Diagnosis'] == 'CAP', 'source_dataset'] = "Afshar"

merged

In [None]:
# Frequency table (NaN included) for each categorical column of interest
columns_to_tally = ('Gender', 'Country', 'Image Source', 'Diagnosis', 'source_dataset')
for col in columns_to_tally:
    tally = merged[col].value_counts(dropna=False)
    print(col, ':')
    # end='\n\n' leaves one blank line between consecutive tables
    print(tally, end='\n\n')

In [None]:
# Number of slices contributed by each source dataset.
# The column is passed as `x=`: recent seaborn releases take `data` as the first
# positional parameter, so a positional column name clashes with `data=merged`.
ax = sns.countplot(x='source_dataset', data=merged)

In [None]:
# Slices per diagnosis, broken down by source dataset; the legend is positioned
# relative to the axes point (1, 1).
axis = sns.countplot(data=merged, x='Diagnosis', hue='source_dataset', palette="tab10")
axis.legend(bbox_to_anchor=(1, 1))


In [None]:
# Summary statistics of the combined metadata table (pandas' default describe() output)
merged.describe()

In [None]:
# Number of cases from each of the classes.
# The column is passed as `x=`: recent seaborn releases take `data` as the first
# positional parameter, so a positional column name clashes with `data=merged`.
ax = sns.countplot(x='Diagnosis', data=merged)

In [None]:
def augment(x):
    """Convert an age entry to ``int``, or return ``'-'`` when it cannot be converted.

    Parameters
    ----------
    x : object
        Raw cell value (e.g. a numeric string, a number, ``None`` or ``'-'``).

    Returns
    -------
    int or str
        ``int(x)`` when the conversion succeeds, otherwise the placeholder ``'-'``.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: non-numeric text or NaN;
        # OverflowError: +/-inf. Anything else (e.g. KeyboardInterrupt) propagates.
        return '-'
# Normalise the Age column: integer where parseable, '-' otherwise
merged['Age'] = merged['Age'].apply(augment)

In [None]:
# Demographic overview: gender per class, age per class, and frames per country.
# Each plot first drops the rows whose own variable is the '-' placeholder.
# (No bare plt.figure() up front: it only produced an extra, empty figure.)
sns.set(style="white", context="talk")

# --- Gender split within each diagnosis class ---
total1 = merged[merged.Gender != '-'].copy()
plt.figure(figsize=(20,10))
sns_plot = sns.countplot(x='Diagnosis', data = total1, hue='Gender',palette="tab10")
plt.ylabel('Number of Frames')
plt.xlabel('Class')
fig = sns_plot.get_figure()

# --- Age distribution per diagnosis ---
# NOTE: this rcParams change is global; it also applies to figures drawn by later cells.
plt.rcParams.update({'font.size': 50})
total2 = merged[merged.Age != '-'].copy()
# After the filter only integer ages remain; give the column a numeric dtype so
# the box statistics are computed on numbers rather than generic Python objects.
total2['Age'] = total2['Age'].astype(int)
plt.figure(figsize=(20,10))
sns_plot = sns.boxplot(x = 'Diagnosis',y = 'Age',data = total2,palette="tab10")
plt.ylabel('Age (year)')
plt.xlabel('Disease')
fig = sns_plot.get_figure()

# --- Frames per country (log-scaled counts, most frequent country first) ---
total3 = merged[merged.Country != '-'].copy()
plt.figure(figsize=(20,10))
sns_plot = sns.countplot(x='Country', data = total3, palette="rocket", log=True,order = total3['Country'].value_counts().index)

plt.ylabel('Number of Frames', fontsize=27)
plt.xlabel('Country', fontsize=27)
plt.xticks(rotation=33, fontsize=23)

fig = sns_plot.get_figure()

In [None]:
# Age box plot per diagnosis with three value labels drawn on every box.
snapshot = merged.copy()
total2 = snapshot[snapshot.Age != '-']
plt.figure(figsize=(10, 8), dpi=80)
box_plot = sns.boxplot(data=total2, x='Diagnosis', y='Age', palette="rocket")
plt.ylabel('Age (year)', fontsize=23)
plt.xlabel('Diagnosis', fontsize=23)
plt.xticks(rotation=33, fontsize=15)
sns.set(style="white", context="talk")
ax = box_plot.axes
lines = ax.get_lines()
categories = ax.get_xticks()

# Text appearance shared by every label.
label_style = dict(ha='center', va='center', fontweight='bold', size=12, color='white')

for cat in categories:
    # The indexing assumes each box contributes 6 consecutive line artists
    # (TODO confirm for the installed seaborn/matplotlib version). Per the
    # original note, get_ydata()[0] of line k within a group is:
    #   0 -> p25, 1 -> p75, 2 -> lower whisker, 3 -> upper whisker,
    #   4 -> p50 (median), 5 -> upper extreme value
    # Labels are drawn for the upper whisker (3), lower whisker (2) and median (4).
    for offset in (3, 2, 4):
        value = round(lines[cat*6 + offset].get_ydata()[0], 1)
        ax.text(cat, value, f'{value}', bbox=dict(facecolor='#445A64'), **label_style)

box_plot.figure.tight_layout()

fig = box_plot.get_figure()
    
    

In [None]:
def _gender_tally(values):
    """Count the 'M', 'F' and '-' entries of a gender array, keyed for str.format."""
    return dict(
        m=np.sum(values == 'M'),
        f=np.sum(values == 'F'),
        mm=np.sum(values == '-'),
    )


# Gender columns as NumPy arrays: whole cohort first, then one per class
gender_all = merged['Gender'].values
gender_normal = normal['Gender'].values
gender_covid = covid['Gender'].values
gender_cap = cap['Gender'].values

print('                    Gender                       ')
print('-------------------------------------------------')
print('Covid: Males: {m} , Females: {f}, Missing: {mm}'.format(**_gender_tally(gender_covid)))
print('Normal: Males: {m} , Females: {f}, Missing: {mm}'.format(**_gender_tally(gender_normal)))
print('Cap: Males: {m} , Females: {f}, Missing: {mm}'.format(**_gender_tally(gender_cap)))

print('All: Males: {m} , Females: {f}, Missing: {mm}'.format(**_gender_tally(gender_all)))

In [None]:
print('                    Age                       ')
print('------------------------------------------------')


def _valid_ages(frame):
    """Return the ages of ``frame`` that convert to integers, as an int array.

    Every entry goes through ``augment``; entries it maps to the '-' placeholder
    are dropped, so the result is safe to pass to np.mean / np.std.
    """
    ages = frame['Age'].apply(augment)
    return ages[ages != '-'].astype(int).values


# Kept under their original names for any cell that still refers to them.
merged_filtered = merged[merged.Age != '-'].reset_index()
covid_filtered = covid[covid.Age != '-'].reset_index()

# All four groups are cleaned the same way. Previously only the COVID and merged
# tables were filtered, so a single missing age in the normal or CAP table put a
# '-' string into the array and made the statistics below fail.
age_all = _valid_ages(merged)
age_covid = _valid_ages(covid)
age_normal = _valid_ages(normal)
age_cap = _valid_ages(cap)

print('Covid Age : {m:.2f} +/- {s:.2f}'.format(m = np.mean(age_covid), s =np.std(age_covid)))
print('Normal Age : {m:.2f} +/- {s:.2f}'.format(m = np.mean(age_normal), s =np.std(age_normal)))
print('Cap Age : {m:.2f} +/- {s:.2f}'.format(m = np.mean(age_cap), s =np.std(age_cap)))
print('All Age : {m:.2f} +/- {s:.2f}'.format(m = np.mean(age_all), s =np.std(age_all)))


In [None]:
# Rows with a known age, with Age as a real integer column.
# .assign builds a new frame, so no column is written onto a filtered slice of
# `merged` (the pattern that triggers pandas' SettingWithCopyWarning).
merged_filtered = (
    merged[merged.Age != '-']
    .assign(Age=lambda frame: frame['Age'].astype(int))
)

# Mean / max age and row count per (Diagnosis, Gender), highest mean age first
(
    merged_filtered[['Gender', 'Age', 'Diagnosis']]
    .groupby(['Diagnosis', 'Gender'])
    .agg(['mean', 'max', 'count'])
    .sort_values(by=[('Age', 'mean')], ascending=False)
)

In [None]:
# Distribution of the known ages (distplot's default histogram + density curve).
# NOTE(review): `distplot` is deprecated in recent seaborn releases - consider
# `histplot` / `displot` when upgrading.
ax = sns.distplot(total2['Age'])

In [None]:
# Letter-value (boxen) plot of age for each diagnosis class
ax = sns.boxenplot(data=merged_filtered, x='Diagnosis', y='Age')