
# Tabular data EDA

In [1]:
import numpy as np
import pandas as pd
import pydicom
import os
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

In [3]:
import warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

In [4]:
def seed_everything(seed=2020):
    """Seed the random number generators this notebook relies on.

    Seeds Python's ``random`` module, exports ``PYTHONHASHSEED`` and seeds
    NumPy's legacy global generator. TensorFlow is seeded too, but only when
    it has already been imported somewhere in the session.

    Args:
        seed: Integer seed applied to every generator.
    """
    import sys  # local import keeps the cell runnable on its own

    random.seed(seed)
    # Exported for child processes; the running interpreter's hash seed is
    # fixed at start-up and is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

    # The original called `tf.random.set_seed(seed)` although TensorFlow is
    # never imported in this notebook, which raised NameError. Seed it only
    # when the module is actually loaded.
    tensorflow = sys.modules.get('tensorflow')
    if tensorflow is not None:
        tensorflow.random.set_seed(seed)
    
seed_everything(42)

In [5]:
# Directory the competition CSV files are read from.
ROOT = "../input/osic-pulmonary-fibrosis-progression"
# Batch size; not referenced by any cell visible in this section.
BATCH_SIZE = 128

In [6]:
# Training measurements. keep=False discards *every* row of a repeated
# (Patient, Weeks) pair instead of keeping one representative.
train = pd.read_csv(f"{ROOT}/train.csv")
train.drop_duplicates(subset=['Patient', 'Weeks'], keep=False, inplace=True)
test = pd.read_csv(f"{ROOT}/test.csv")

print("add infos")
sub = pd.read_csv(f"{ROOT}/sample_submission.csv")
# Patient_Week is split on "_": the first piece becomes Patient, the last
# piece (converted to int) becomes Weeks.
key_parts = sub['Patient_Week'].str.split('_')
sub = sub.assign(
    Patient=key_parts.str[0],
    Weeks=key_parts.str[-1].map(int),
)[['Patient', 'Weeks', 'Confidence', 'Patient_Week']]
# Attach the patient's test.csv columns (everything except Weeks) to each row.
sub = sub.merge(test.drop(columns='Weeks'), on="Patient")

add infos


In [7]:
# Preview the first five rows of the training frame.
train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


In [8]:
# Preview the first five rows of the test frame.
test.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00419637202311204720264,6,3020,70.186855,73,Male,Ex-smoker
1,ID00421637202311550012437,15,2739,82.045291,68,Male,Ex-smoker
2,ID00422637202311677017371,6,1930,76.672493,73,Male,Ex-smoker
3,ID00423637202312137826377,17,3294,79.258903,72,Male,Ex-smoker
4,ID00426637202313170790466,0,2925,71.824968,73,Male,Never smoked


In [9]:
# Tag every row with the frame it came from so the parts can still be told
# apart after they are stacked into one table.
train['WHERE'] = 'train'
test['WHERE'] = 'val'
sub['WHERE'] = 'test'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0.
# pd.concat stacks the frames the same way and, like append, keeps each
# frame's own index labels (so the combined index contains repeats).
data = pd.concat([train, test, sub])

In [10]:
# DataFrame.info() writes its report to stdout and returns None. Passing it
# to print() therefore printed the report first and then "Training Data: None".
# Print the label, then call info() on its own line.
print('Training Data:')
train.info()
print("\n")

print('Testing Data:')
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1535 entries, 0 to 1548
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        1535 non-null   object 
 1   Weeks          1535 non-null   int64  
 2   FVC            1535 non-null   int64  
 3   Percent        1535 non-null   float64
 4   Age            1535 non-null   int64  
 5   Sex            1535 non-null   object 
 6   SmokingStatus  1535 non-null   object 
 7   WHERE          1535 non-null   object 
dtypes: float64(1), int64(3), object(4)
memory usage: 107.9+ KB
Training Data: None


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        5 non-null      object 
 1   Weeks          5 non-null      int64  
 2   FVC            5 non-null      int64  
 3   Percent        5 non-null      float64
 4  

## Visualising the Dataset

In [11]:
# Visualising Train DataSet
# train holds several measurement rows per patient (1535 rows for 176
# patients), so a histogram over the raw frame counts visits, not patients.
# Keep one row per patient so the bars match the "Patient Count" title.
# NOTE(review): assumes Sex is constant across a patient's rows - confirm.
fig = px.histogram(train.drop_duplicates(subset="Patient"), x="Sex")
fig.update_layout(title_text= "Patient Count in Training Dataset")
fig.show()

In [12]:
# Row count per smoking-status category of the training frame.
fig = px.histogram(data_frame=train, x="SmokingStatus").update_layout(
    title={"text": "Ex-Smoker , Never Smoked, Present Smoker"}
)
fig.show()

In [13]:
# Age Distribution
# train has several measurement rows per patient, so the raw frame would
# weight each patient by their number of visits. One row per patient makes
# the histogram the patient count its title announces.
# NOTE(review): assumes Age is constant across a patient's rows - confirm.
fig = px.histogram(train.drop_duplicates(subset="Patient"), x="Age")
fig.update_layout(title_text= "Patient Count in Training Dataset")
fig.show()

In [14]:
# Horizontal bars per Sex, split into one coloured segment per Age value.
fig = px.histogram(data_frame=train, y="Sex", color="Age").update_layout(
    title={"text": "Affected Patient wr Age"}
)
fig.show()

In [15]:
# Age histogram with the bars coloured by smoking status.
fig = px.histogram(data_frame=train, x="Age", color="SmokingStatus").update_layout(
    title={"text": "Age wr Smoking Status"}
)
fig.show()

In [16]:
# Report (rows, columns) and the number of distinct patients for each frame,
# in the order train, test, sub, data.
frames = (train, test, sub, data)
print(*(frame.shape for frame in frames))
print(*(frame.Patient.nunique() for frame in frames))

(1535, 8) (5, 8) (730, 10) (2270, 10)
176 5 5 176


In [17]:
# Filled-area chart of Percent against Weeks, one colour per smoking status.
# The original first assigned `df = px.data.gapminder()` (plotly's bundled demo
# dataset); nothing in the visible cells reads `df`, so that load was dropped.
fig = px.area(train, x="Weeks", y="Percent", color = "SmokingStatus")
fig.update_layout(title_text= "Percent Affected wr Weeks and Smoking Status")
fig.show()

In [18]:
# Earliest week per patient, ignoring rows tagged WHERE == 'test': those rows
# are blanked to NaN first so they cannot win the per-patient minimum.
is_test_row = data['WHERE'].eq('test')
data['min_week'] = np.where(is_test_row, np.nan, data['Weeks'])
data['min_week'] = data.groupby('Patient')['min_week'].transform('min')

In [19]:
# Scatter of every training measurement: Weeks on x, Percent on y.
fig = px.scatter(x=train["Weeks"], y=train["Percent"]).update_layout(
    title={"text": "Weeks vs Percent"}
)
fig.show()

In [20]:
# FVC histogram with the bars coloured by Sex.
fig = px.histogram(data_frame=train, x="FVC", color="Sex").update_layout(
    title={"text": "FVC wr Gender"}
)
fig.show()

In [21]:
# FVC histogram with the bars coloured by smoking status.
fig = px.histogram(data_frame=train, x="FVC", color="SmokingStatus").update_layout(
    title={"text": "FVC wr Smoking Status"}
)
fig.show()

In [22]:
# NOTE(review): this cell is identical to the cell directly before it (same
# column, colour and title), so it renders the same figure a second time.
# Presumably a different column or colouring was intended - confirm, or delete.
fig = px.histogram(train, x="FVC", color = "SmokingStatus")
fig.update_layout(title_text= "FVC wr Smoking Status")
fig.show()

In [23]:
# List the column labels; WHERE is present because an earlier cell added it.
train.columns

Index(['Patient', 'Weeks', 'FVC', 'Percent', 'Age', 'Sex', 'SmokingStatus',
       'WHERE'],
      dtype='object')

In [24]:
# Columns handed to the parallel-categories diagram (everything except WHERE).
parallel_diagram = train.loc[
    :, ['Weeks', 'Patient', 'FVC', 'Percent', 'Age', 'Sex', 'SmokingStatus']
]

fig = px.parallel_categories(
    parallel_diagram,
    color_continuous_scale=px.colors.sequential.Inferno,
).update_layout(title='Parallel category diagram on trainset')
fig.show()

In [25]:
def individual_patient_detail(patient_id, df=None):
    """Plot one patient's FVC and Percent readings against Weeks.

    FVC is drawn as a labelled line on the primary y-axis and Percent as
    markers on a secondary y-axis, then the figure is shown.

    Args:
        patient_id: Value looked up in the ``Patient`` column.
        df: Frame with ``Patient``, ``Weeks``, ``FVC`` and ``Percent`` columns.
            Defaults to the notebook-level ``train`` frame.

    Raises:
        ValueError: If no row of the frame belongs to ``patient_id``.
    """
    source = train if df is None else df
    # Sort by week so the connecting line runs left to right regardless of
    # the row order of the input frame (mergesort keeps ties in input order).
    patient_df = source[source['Patient'] == patient_id].sort_values('Weeks', kind='mergesort')
    if patient_df.empty:
        # An unknown id used to produce an empty figure with no explanation.
        raise ValueError(f"No rows found for patient {patient_id!r}")

    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Scatter(
            x=patient_df['Weeks'],
            y=patient_df['FVC'],
            mode='lines+markers+text',
            text=patient_df['FVC'],
            name='FVC',
        ),
        secondary_y=False,
    )
    fig.add_trace(
        go.Scatter(
            x=patient_df['Weeks'],
            y=patient_df['Percent'],
            mode='markers',
            text=patient_df['Percent'].round(2),
            name='Percent',
        ),
        secondary_y=True,
    )
    fig.update_traces(textposition='top center')
    fig.update_layout(title_text=f'Forced Vital Capacity and Percent of {patient_id}',
                      xaxis_title="Weeks",
                      width=1000,
                      height=500)
    fig.update_yaxes(title_text="Forced vital capacity", secondary_y=False)
    fig.update_yaxes(title_text="Percent", secondary_y=True)

    fig.show()

In [26]:
# Plot up to two distinct patients per smoking status. train has several rows
# per patient, so sampling from the raw Patient column could draw the same
# patient twice; sample from the de-duplicated ids instead. The min() guard
# avoids random.sample raising when a status has fewer than two patients.
n_per_status = 2
for ss in train['SmokingStatus'].unique():
    patients = train.loc[train['SmokingStatus'] == ss, 'Patient'].unique().tolist()
    for sample in random.sample(patients, min(n_per_status, len(patients))):
        individual_patient_detail(sample)