# Exploración inicial del dataset de COVID-19

In [1]:
%matplotlib notebook
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Seed NumPy's global random number generator so random draws are repeatable between runs.
np.random.seed(0)
# Display every column of a DataFrame instead of truncating wide tables.
pd.set_option('display.max_columns', None)

In [2]:
# Relative locations of the metadata CSV and of the folder holding the image files.
covid_csv_path = '../data/raw/covid_xray/metadata.csv'
covid_images_path = '../data/raw/covid_xray/images/'
# Load the metadata table and preview its first rows.
covid_df = pd.read_csv(covid_csv_path)
covid_df.head()

Unnamed: 0,patientid,offset,sex,age,finding,survival,intubated,intubation_present,went_icu,needed_supplemental_O2,extubated,temperature,pO2_saturation,leukocyte_count,neutrophil_count,lymphocyte_count,view,modality,date,location,folder,filename,doi,url,license,clinical_notes,other_notes,Unnamed: 27
0,2,0.0,M,65.0,COVID-19,Y,,,,,,,,,,,PA,X-ray,"January 22, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-a-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,infiltrate in the upper lobe of the left lung,,
1,2,3.0,M,65.0,COVID-19,Y,,,,,,,,,,,PA,X-ray,"January 25, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-b-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
2,2,5.0,M,65.0,COVID-19,Y,,,,,,,,,,,PA,X-ray,"January 27, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-c-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
3,2,6.0,M,65.0,COVID-19,Y,,,,,,,,,,,PA,X-ray,"January 28, 2020","Cho Ray Hospital, Ho Chi Minh City, Vietnam",images,auntminnie-d-2020_01_28_23_51_6665_2020_01_28_...,10.1056/nejmc2001272,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,progressive infiltrate and consolidation,,
4,4,0.0,F,52.0,COVID-19,,,,,,,,,,,,PA,X-ray,"January 25, 2020","Changhua Christian Hospital, Changhua City, Ta...",images,nejmc2001573_f1a.jpeg,10.1056/NEJMc2001573,https://www.nejm.org/doi/full/10.1056/NEJMc200...,,diffuse infiltrates in the bilateral lower lungs,,


In [3]:
# Summary statistics (count, mean, std, quartiles) of the numeric columns.
# NOTE(review): temperature peaks at 95.0 while its quartiles sit between 37.8 and 39.0,
# which looks like mixed units or an entry error -- verify before using this column.
covid_df.describe()

Unnamed: 0,patientid,offset,age,temperature,pO2_saturation,leukocyte_count,neutrophil_count,lymphocyte_count
count,312.0,225.0,262.0,22.0,33.0,9.0,2.0,5.0
mean,82.983974,6.817778,54.553435,40.795455,88.948485,5.524444,3.59,1.132
std,51.974917,7.056383,15.687166,12.132894,11.226646,3.099456,2.771859,0.433786
min,2.0,0.0,12.0,36.5,50.0,0.22,1.63,0.63
25%,34.75,2.0,43.0,37.8,88.0,3.15,2.61,0.8
50%,83.5,5.0,55.0,38.1,92.0,6.37,3.59,1.2
75%,126.25,10.0,68.75,39.0,97.0,6.84,4.57,1.3
max,176.0,35.0,87.0,95.0,98.0,11.2,5.55,1.73


In [4]:
# Non-null entries per column, to see which metadata fields are actually populated.
covid_df.count()

patientid                 312
offset                    225
sex                       273
age                       262
finding                   312
survival                   92
intubated                  48
intubation_present         43
went_icu                   14
needed_supplemental_O2      3
extubated                   3
temperature                22
pO2_saturation             33
leukocyte_count             9
neutrophil_count            2
lymphocyte_count            5
view                      312
modality                  312
date                      288
location                  214
folder                    312
filename                  312
doi                        99
url                       312
license                   167
clinical_notes            306
other_notes               215
Unnamed: 27                 4
dtype: int64

### Nos quedaremos sólo con las imágenes que sean de rayos X

In [5]:
# Distinct imaging modalities present in the metadata.
covid_df['modality'].unique()

array(['X-ray', 'CT'], dtype=object)

In [6]:
# Keep only the rows whose modality is X-ray, then report how many rows remain.
covid_xray_df = covid_df.loc[covid_df['modality'].eq('X-ray')]
covid_xray_df.shape[0]

268

### Existen distintos tipos de vistas en las radiografías, que se analizan a continuación

In [7]:
# Number of X-ray images per view.
views = covid_xray_df['view'].unique()
# One grouped count replaces re-filtering the whole frame once per distinct view.
# sort=False keeps the views in order of first appearance, matching `views`.
# Rows with a missing view are left out of the tally (groupby drops NaN keys by
# default) instead of being reported with a misleading count of 0.
view_counts = covid_xray_df.groupby('view', sort=False).size()
for view, n_images in view_counts.items():
    print(view, n_images)

PA 177
L 25
AP 37
AP Supine 28
AP semi erect 1


In [8]:
# Show four randomly chosen AP-view radiographs in a 2x2 grid.
covid_xray_ap_df = covid_xray_df[covid_xray_df['view'] == 'AP']
fig, ax = plt.subplots(2, 2)
# Shuffle the rows and keep (at most) the first four file names, resolved against the image folder.
imgs_path = [
    os.path.join(covid_images_path, filename)
    for filename in covid_xray_ap_df.sample(frac=1)['filename'].head(4)
]
# zip() pairs each image with one panel, so a selection with fewer than four
# images simply leaves the remaining panels empty.
for panel, img_path in zip(ax.flat, imgs_path):
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None rather than raising.
        raise IOError('Could not read image: {}'.format(img_path))
    # OpenCV decodes into BGR channel order, whereas matplotlib interprets arrays as RGB.
    panel.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

<IPython.core.display.Javascript object>

In [9]:
# Show four randomly chosen PA-view radiographs in a 2x2 grid.
covid_xray_pa_df = covid_xray_df[covid_xray_df['view'] == 'PA']
fig, ax = plt.subplots(2, 2)
# Shuffle the rows and keep (at most) the first four file names, resolved against the image folder.
imgs_path = [
    os.path.join(covid_images_path, filename)
    for filename in covid_xray_pa_df.sample(frac=1)['filename'].head(4)
]
# zip() pairs each image with one panel, so a selection with fewer than four
# images simply leaves the remaining panels empty.
for panel, img_path in zip(ax.flat, imgs_path):
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports a missing or undecodable file by returning None rather than raising.
        raise IOError('Could not read image: {}'.format(img_path))
    # OpenCV decodes into BGR channel order, whereas matplotlib interprets arrays as RGB.
    panel.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))

<IPython.core.display.Javascript object>

### La mayoría de las imágenes son de vista PA
PA significa posteroanterior, de acuerdo con la descripción de las vistas en https://github.com/ieee8023/covid-chestxray-dataset/blob/master/SCHEMA.md

In [10]:
# Number of PA-view X-ray images per reported finding.
findings = covid_xray_pa_df['finding'].unique()
# One grouped count replaces re-filtering the whole frame once per distinct finding.
# sort=False keeps the findings in order of first appearance, matching `findings`.
# Rows with a missing finding are left out of the tally (groupby drops NaN keys by
# default) instead of being reported with a misleading count of 0.
finding_counts = covid_xray_pa_df.groupby('finding', sort=False).size()
for finding, n_images in finding_counts.items():
    print(finding, n_images)

COVID-19 115
ARDS 4
SARS 16
Pneumocystis 13
Streptococcus 13
No Finding 2
Chlamydophila 1
COVID-19, ARDS 10
Klebsiella 1
Legionella 2
