In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import pydicom
import seaborn as sns
from pandas_profiling import ProfileReport

# Reading data

In [None]:
# Load the three competition CSV files (train, test, sample submission)
# and preview the first rows of the training table.
train_df = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/train.csv")
test_df = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')
sample_sub = pd.read_csv(
    '../input/osic-pulmonary-fibrosis-progression/sample_submission.csv'
)
train_df.head()

In [None]:
# Build a pandas-profiling overview report of the training table.
# ProfileReport is already imported in the notebook's import cell, so it is
# not re-imported here; progress_bar=False keeps the cell output free of
# progress widgets.
ProfileReport(train_df, progress_bar=False)

In [None]:
# How to read/plot an image

# pydicom: Python library for DICOM (Digital Imaging and Communications in Medicine) files.
filename = '../input/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430/10.dcm'
ds = pydicom.dcmread(filename)

# Draw the slice's pixel data on an explicit axes rather than through the
# implicit pyplot "current figure" state.
fig, ax = plt.subplots(figsize=(3, 3))
ax.imshow(ds.pixel_array, cmap='gray')
plt.show()

In [None]:
# Check which test-set patients also appear in the training set.
patients_train_ids = train_df["Patient"].unique()
patient_test_list = test_df["Patient"].unique()

# Number of distinct patients in each table (test first, then train).
print(test_df["Patient"].nunique())
print(train_df["Patient"].nunique())

# Boolean mask over the training rows whose patient is also a test patient.
list_p = train_df["Patient"].isin(patient_test_list)
print(train_df.loc[list_p, "Patient"].unique())

# **Statistics**


### Images

In [None]:
# Get total number of images + nb of images per patient

# Map each training patient id to the number of entries in that patient's
# image directory. Each directory is listed exactly once (the previous
# version listed it twice and stored the same count twice).
nb_image_patient = {}
for ids in patients_train_ids:
    path = f'../input/osic-pulmonary-fibrosis-progression/train/{ids}'
    nb_image_patient[ids] = len(os.listdir(path))

# patients_train_ids holds unique ids, so summing the per-patient counts
# gives the overall total.
tot = sum(nb_image_patient.values())
print ("Total number of images in training set: \n", tot)

In [None]:
# Distribution of the number of image files per training patient.
fig, ax = plt.subplots()
# Materialise the dict view as a list: a plain sequence is accepted by every
# matplotlib version, whereas dict views are not handled uniformly.
ax.hist(list(nb_image_patient.values()), bins=20)
ax.set_xlabel('Number of images per patient')
ax.set_ylabel('Number of patients')
ax.set_title('Images per patient (training set)')
plt.show()

### Categorical variables

In [None]:
def _category_shares(column):
    """Return each category's share of the rows in ``column``, rounded to 2 decimals.

    The result is a one-column ('count') DataFrame indexed by category: the
    crosstab of category counts divided by the total number of rows.
    """
    return round(pd.crosstab(column, columns="count") / len(column), 2)


def _draw_share_pie(ax, shares, title, colors):
    """Draw ``shares`` (as returned by ``_category_shares``) as a pie chart on ``ax``."""
    ax.pie(list(shares['count']), labels=list(shares.index), autopct='%1.1f%%',
           shadow=False, startangle=90, colors=colors)
    ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
    ax.set_title(title)


# Row shares of each Sex and SmokingStatus category in the training table.
freq_sex = _category_shares(train_df.Sex)
freq_smoke = _category_shares(train_df.SmokingStatus)

fig, ax = plt.subplots(1, 2, figsize=(15, 5))
_draw_share_pie(ax[0], freq_sex, 'Sex', ['lightgrey', 'lightblue'])
_draw_share_pie(ax[1], freq_smoke, 'Smoking Status',
                ['lightgrey', 'lightblue', 'mediumblue'])
plt.show()

In [None]:
# Row counts per sex, split by smoking status (trailing ';' hides the Axes repr).
sns.countplot(
    data=train_df,
    x='Sex',
    hue='SmokingStatus',
    palette="ch:.25",
);

### Numerical variables

In [None]:
# Side-by-side box plots of the Age and FVC columns.
fig, (ax_age, ax_fvc) = plt.subplots(1, 2)
train_df[['Age']].boxplot(ax=ax_age)
train_df[['FVC']].boxplot(ax=ax_fvc)
# Widen the gap between the two panels so their tick labels do not collide.
fig.subplots_adjust(hspace=0.25, wspace=0.55)
plt.show()

## Details of variables

In [None]:
#Age
# Series.min()/max() are vectorised and skip missing values, unlike the
# Python builtins applied to a Series.
print('Minimum aged patient:', train_df['Age'].min())
print('Maximum aged patient:', train_df['Age'].max(), '\n')

# Histogram + density estimate of patient age, drawn on an explicit axes.
fig, ax = plt.subplots()
sns.distplot(train_df['Age'], ax=ax)
ax.set_title('Age Distribution')
ax.set_xlabel('Age', size=12)
plt.show()

## FVC - the recorded lung capacity in ml = volume of air exhaled in one breath


In [None]:
# Value ranges of the FVC and Percent columns.
print('Minimum FVC value:', train_df['FVC'].min())
print('Maximum FVC value:', train_df['FVC'].max(), '\n')

#Percent
print('Minimum Percentage:', train_df['Percent'].min())
print('Maximum Percentage:', train_df['Percent'].max())

fig, axes = plt.subplots(1, 3, figsize = (20, 6))

# Panel 0: FVC distribution.
sns.distplot(train_df['FVC'], ax = axes[0])
axes[0].set_title('FVC Distribution')
# Label each panel through its own Axes. plt.xlabel() acts on the *current*
# axes, which after plt.subplots() is the last panel, so the first two
# panels never received their labels.
axes[0].set_xlabel('FVC', size=12)

# Panel 1: Percent distribution.
sns.distplot(train_df['Percent'], ax = axes[1])
axes[1].set_title('Percentage Distribution')
axes[1].set_xlabel('Percent', size=12)

# Panel 2: FVC against Percent, coloured by sex.
sns.scatterplot(data = train_df, x="FVC", y="Percent", hue = 'Sex',ax = axes[2])

# plt.show() instead of fig.show(): the latter warns on non-GUI (inline) backends.
plt.show()

## FVC evolution for 10 patients

In [None]:
# Weeks_norm: weeks elapsed since each patient's first row. Take the
# week-to-week difference within each patient (0 for the patient's first
# row), then accumulate those steps per patient.
# fillna() on the computed Series replaces the previous
# `train_df[...].replace(..., inplace=True)`, which mutates a column
# selection in place and is not reliable under pandas copy-on-write.
week_steps = train_df.groupby(by = 'Patient').Weeks.diff().fillna(0)
train_df['Weeks_norm'] = week_steps.groupby(train_df['Patient']).cumsum()

# Visit: 1-based counter of each patient's rows, in row order.
train_df['Visit'] = train_df.groupby(by = 'Patient').cumcount() + 1

In [None]:
# This cell used to display patient["Sex"], but `patient` is only created in
# the following cell, so it raised NameError on a fresh kernel
# (Restart & Run All). Preview the same column from the training table,
# which is already loaded at this point.
train_df["Sex"].head()

In [None]:
fig, axes = plt.subplots(1, 3, figsize = (20, 6))

deg = 1  # degree of the polynomial trend fitted for each sex

# Draw 10 *distinct* patients. Sampling the raw Patient column draws rows, so
# a patient with several visits could be picked more than once and the sample
# could hold fewer than 10 patients. The fixed random_state makes the figure
# reproducible from run to run.
pat = train_df.Patient.drop_duplicates().sample(n=10, random_state=42)

patient = train_df[train_df.Patient.isin(pat)]

# Panel 0: one FVC curve per sampled patient.
sns.lineplot(data = patient, x="Weeks_norm", y="FVC", hue='Patient', ax = axes[0])
axes[0].set_title('FVC Evolution for the patients of the sample')

# Panel 1: FVC over time for the whole training table, split by sex.
sns.lineplot(data = train_df, x="Weeks_norm", y="FVC", hue = 'Sex', ax = axes[1])
axes[1].set_title('FVC Evolution per Sex')

# Panel 2: polynomial trend of FVC over time for each sex within the sample.
# Drawn on axes[2] explicitly instead of relying on pyplot's current axes.
for sex in ('Male', 'Female'):
    sample_sex = patient[patient['Sex'] == sex].sort_values(by = 'Weeks_norm')
    if len(sample_sex) <= deg:
        # np.polyfit raises on empty input, and the random sample may contain
        # no (or too few) rows for this sex; skip the trend line in that case.
        continue
    trend = np.poly1d(np.polyfit(sample_sex['Weeks_norm'], sample_sex['FVC'], deg))
    axes[2].plot(sample_sex['Weeks_norm'], trend(sample_sex['Weeks_norm']), label=sex)

axes[2].set_xlabel('Weeks')
axes[2].set_ylabel('FVC')
axes[2].set_title('FVC evolution per sex, smoothed')
axes[2].legend()
axes[2].grid()

plt.show()

In [None]:
# Number of non-null Weeks entries (i.e. recorded rows) per patient.
visites = train_df.groupby(by = 'Patient')['Weeks'].count()
# Pass x and y as keywords: seaborn deprecated positional data vectors in
# 0.11 and rejects them from 0.12 on; keywords work in every version.
sns.barplot(x=visites.index, y=visites)
plt.title('Nombre de visites par patient')
plt.show()

## Encoding categorical variables

In [None]:
# Encode the categorical 'Sex' column in place, for both the training and the
# test set: 'Female' becomes 1 and 'Male' becomes 0. Rows holding any other
# value are left untouched.
for frame in (train_df, test_df):
    frame.loc[frame['Sex'] == 'Female', 'Sex'] = 1
    frame.loc[frame['Sex'] == 'Male', 'Sex'] = 0

In [None]:
# Encode the categorical 'SmokingStatus' column in place for both sets.
# Codes are assigned in the order the categories first appear in the
# training set; test rows whose value is not one of those categories are
# left unchanged.
SmokingCategories = train_df.SmokingStatus.unique()
print(SmokingCategories)
# 3 different categories and no none type
for code, cat in enumerate(SmokingCategories):
    train_df.loc[train_df['SmokingStatus'] == cat, 'SmokingStatus'] = code
    test_df.loc[test_df['SmokingStatus'] == cat, 'SmokingStatus'] = code