In [None]:
import numpy as np
import pandas as pd
# Raise pandas' display limits for the whole notebook: show up to 500 rows and
# 500 columns before truncating, and allow 1000 characters of output width.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

from scipy import stats

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the training and test tables from the shared input directory.
INPUT_DIR = '../input/osic-pulmonary-fibrosis-progression'
df_train = pd.read_csv(f'{INPUT_DIR}/train.csv')
df_test = pd.read_csv(f'{INPUT_DIR}/test.csv')

In [None]:
# Report the (rows, columns) shape and the number of distinct patients for each
# table, then list the column names of the training table.
for set_name, frame in (('Training', df_train), ('Test', df_test)):
    patient_count = frame['Patient'].nunique()
    print(f'{set_name} Set Shape = {frame.shape} - Patients = {patient_count}')

feature_names = list(df_train.columns)
print(f'The names of the features are {feature_names}')

In [None]:
# Preview the first rows of the training table.
df_train.head()

In [None]:
# Preview the first rows of the test table.
df_test.head()

In [None]:
# Count the non-null 'Weeks' entries of every patient (one entry per recorded
# measurement), then tally how many patients share each count.
measurements_per_patient = (
    df_train
    .groupby('Patient')['Weeks']
    .count()
    .rename('FVCMeasurements')
)
FVCMeasurements_train = measurements_per_patient.value_counts()

separator = '-' * 41
print(f'Training Set FVC Measurements Per Patient \n{separator}\n{FVCMeasurements_train}')

In [None]:
# Read the sample submission file and show it as the cell output.
submission_csv = '../input/osic-pulmonary-fibrosis-progression/sample_submission.csv'
df_submission = pd.read_csv(submission_csv)
df_submission

In [None]:
def laplace_log_likelihood(y_true, y_pred, sigma, sigma_min=70, delta_max=1000):
    """Mean clipped Laplace log-likelihood of predictions with a confidence.

    For every observation the score is

        -sqrt(2) * delta / sigma_c - log(sqrt(2) * sigma_c)

    where ``sigma_c = max(sigma, sigma_min)`` and
    ``delta = min(|y_true - y_pred|, delta_max)``. The returned value is the
    mean of these scores (higher is better).

    Parameters
    ----------
    y_true, y_pred : array-like or scalar
        Observed and predicted values. They are converted to float arrays and
        compared position by position (pandas index labels are ignored), so
        plain lists work and two Series with different indexes cannot be
        silently turned into NaNs by index alignment.
    sigma : array-like or scalar
        Confidence (standard-deviation-like) value(s); broadcast against the
        targets.
    sigma_min : float, default 70
        Lower bound applied to ``sigma``.
    delta_max : float, default 1000
        Upper bound applied to the absolute error.

    Returns
    -------
    numpy.float64
        Mean score. NaN inputs propagate to the result instead of being
        skipped.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    sigma = np.asarray(sigma, dtype=float)

    sigma_clipped = np.maximum(sigma, sigma_min)
    delta_clipped = np.minimum(np.abs(y_true - y_pred), delta_max)
    metric = -np.sqrt(2) * delta_clipped / sigma_clipped - np.log(np.sqrt(2) * sigma_clipped)
    return np.mean(metric)

In [None]:
# Score the training FVC values against themselves with sigma = 70: the error
# term is zero, so the result comes only from the log(sqrt(2) * sigma) term.
laplace_log_likelihood(df_train['FVC'], df_train['FVC'], 70)

In [None]:
# Summary statistics of the FVC column over the whole training table.
fvc_train = df_train['FVC']

print(f'FVC Statistical Properties \n{"-" * 26}\n')
fvc_summary_lines = [
    f'Mean: {fvc_train.mean():.6}',
    f'Median: {fvc_train.median():.6}',
    f'Std: {fvc_train.std():.6}',
    f'Min: {fvc_train.min()}',
    f'25%: {fvc_train.quantile(0.25)}',
    f'50%: {fvc_train.quantile(0.5)}',
    f'75%: {fvc_train.quantile(0.75)}',
    f'Max: {fvc_train.max()}',
    f'Skew: {fvc_train.skew():.6}',
    f'Kurtosis: {fvc_train.kurtosis():.6}',
]
print('\n'.join(fvc_summary_lines))

# Number of rows whose FVC is null, reported against the total row count.
missing_values = int(fvc_train.isnull().sum())
training_samples = len(df_train)
missing_share = missing_values * 100 / training_samples
print(f'Missing Values: {missing_values}/{training_samples} ({missing_share:.4}%)')

In [None]:
# Histogram (density scale) with a KDE overlay of FVC across the training table.
# sns.distplot is deprecated and removed from newer seaborn; histplot with
# kde=True, stat='density' is its documented replacement. The plot is drawn on
# the axes created here rather than on whatever axes happens to be current.
fig, axes = plt.subplots(figsize=(18, 6))
sns.histplot(df_train['FVC'], kde=True, stat='density', kde_kws={'cut': 3}, label='FVC', ax=axes)
axes.set_title('FVC Distribution in Training Set', pad=15, size=18)
plt.show()

In [None]:
# Print the same FVC summary for every patient, one block per patient.
for patient, df in df_train.groupby('Patient'):
    patient_fvc = df['FVC']
    report_lines = [
        f'Patient: {patient} FVC Statistical Properties\n{"-" * 61}',
        f'Mean: {patient_fvc.mean():.6}',
        f'Median: {patient_fvc.median():.6}',
        f'Std: {patient_fvc.std():.6}',
        f'Min: {patient_fvc.min()}',
        f'Max: {patient_fvc.max()}',
        f'Skew: {patient_fvc.skew():.6}',
        f'Kurtosis: {patient_fvc.kurtosis():.6}',
        '\n',  # trailing blank lines separating consecutive patients
    ]
    print('\n'.join(report_lines))

In [None]:
def plot_fvc(df, patient):
    """Plot one patient's FVC values against the week of measurement.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single patient. The columns 'Weeks', 'FVC', 'Age',
        'Sex' and 'SmokingStatus' are read; the first row supplies the age,
        sex and smoking status shown in the title.
    patient : object
        Identifier shown in the title.

    The figure is displayed with ``plt.show()`` and then closed, so calling
    this once per patient does not keep every figure open.
    """
    weeks = df['Weeks']
    first_row = df.iloc[0]

    fig, ax = plt.subplots(figsize=(30, 6))
    # legend=False replaces the former "create a legend, then hide it" dance.
    df.set_index('Weeks')['FVC'].plot(ax=ax, legend=False)

    ax.tick_params(axis='x', labelsize=20)
    ax.tick_params(axis='y', labelsize=20)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_title(
        f'Patient: {patient} - {first_row["Age"]} years old - {first_row["Sex"]} - {first_row["SmokingStatus"]} '
        f'({len(df)} Measurements in {(weeks.max() - weeks.min())} Weeks Period)',
        size=25,
        pad=25,
    )

    print('\n')  # blank lines between consecutive plots in the cell output
    plt.show()
    plt.close(fig)
      
# Draw the FVC-over-weeks plot for every patient in the training table.
for patient, df in df_train.groupby('Patient'):
    plot_fvc(df=df, patient=patient)

In [None]:
# Pairwise regression plots (KDE on the diagonal) of the continuous columns.
pair_features = ['FVC', 'Weeks', 'Percent', 'Age']
g = sns.pairplot(df_train[pair_features], aspect=1.4, height=5, diag_kind='kde', kind='reg')

# Enlarge the labels and ticks on the outer edge of the grid: x labels live on
# the bottom row, y labels on the first column.
bottom_row = len(pair_features) - 1
for position, feature in enumerate(pair_features):
    bottom_ax = g.axes[bottom_row, position]
    left_ax = g.axes[position, 0]
    bottom_ax.set_xlabel(feature, fontsize=25)
    left_ax.set_ylabel(feature, fontsize=25)
    bottom_ax.tick_params(axis='x', labelsize=20)
    left_ax.tick_params(axis='y', labelsize=20)

g.fig.suptitle('Pair Plots for Continuous features', fontsize=35, y=1.08)

plt.show()

In [None]:
# Correlation matrix of the numeric columns only. Calling .corr() on the full
# frame fails on pandas 2.x because of the string columns (older versions
# silently dropped them), so the numeric subset is selected explicitly.
numeric_corr = df_train.select_dtypes(include='number').corr()

fig = plt.figure(figsize=(8, 8), dpi=100)

sns.heatmap(numeric_corr, annot=True, square=True, cmap='BuPu', annot_kws={'size': 10}, fmt='.2f', linewidths=.5)

plt.tick_params(axis='x', labelsize=10, rotation=0)
plt.tick_params(axis='y', labelsize=10, rotation=0)
plt.title('Correlation Matrix for Continuous Data', size=18, pad=18)

plt.show()

In [None]:
# Patient-level category counts: take each patient's first recorded value and
# count patients per category. Computed once and reused below instead of
# repeating the groupby for every number.
sex_counts = df_train.groupby('Patient')['Sex'].first().value_counts()
smoking_counts = df_train.groupby('Patient')['SmokingStatus'].first().value_counts()

# Share of patients per category, in percent, rounded to two decimals.
percentages0 = (sex_counts / sex_counts.sum() * 100).round(2).tolist()
percentages1 = (smoking_counts / smoking_counts.sum() * 100).round(2).tolist()

# NOTE(review): the hard-coded labels below are matched to the counts by
# position, i.e. they assume value_counts() orders the categories as
# Male, Female and Ex-smoker, Never smoked, Currently smokes - confirm on new data.
sex_labels = [f'Male (%{percentages0[0]})', f'Female (%{percentages0[1]})']
smoking_labels = [
    f'Ex-smoker (%{percentages1[0]})',
    f'Never smoked (%{percentages1[1]})',
    f'Currently Smokes (%{percentages1[2]})',
]

fig, axes = plt.subplots(ncols=2, figsize=(18, 6))

sns.barplot(x=sex_counts.index, y=sex_counts, ax=axes[0], color="skyblue")
axes[0].set_ylabel('')
axes[0].set_xticks(np.arange(2), sex_labels)
axes[0].tick_params(axis='x', labelsize=12)
axes[0].tick_params(axis='y', labelsize=12)
axes[0].set_title('Sex Histogram', size=15, pad=15)

sns.barplot(x=smoking_counts.index, y=smoking_counts, ax=axes[1], color="violet")
axes[1].set_ylabel('')
axes[1].set_xticks(np.arange(3), smoking_labels)
axes[1].tick_params(axis='x', labelsize=12)
axes[1].tick_params(axis='y', labelsize=12)
axes[1].set_title('Smoking Status histogram', size=15, pad=15)

plt.show()

# Positional access uses .iloc: indexing a string-labelled Series with an
# integer key relies on a deprecated fallback that newer pandas rejects.
print(f'Sex\n{"-" * 40}')
print('Number of males:')
print(sex_counts.iloc[0])
print('Number of females:')
print(sex_counts.iloc[1])
print(*sex_labels)
print('\n')
print(f'Smoking Status\n{"-" * 40}')
print('Ex-smoker:')
print(smoking_counts.iloc[0])
print('Never Smoked:')
print(smoking_counts.iloc[1])
print('Currently Smokes:')
print(smoking_counts.iloc[2])
print(*smoking_labels)

In [None]:
# Pairwise regression plots of the continuous columns, coloured by Sex.
sex_pair_features = ['FVC', 'Weeks', 'Percent', 'Age']
g = sns.pairplot(df_train[sex_pair_features + ['Sex']], hue='Sex', aspect=1.4, height=5, diag_kind='kde', kind='reg')

# Enlarge the labels and ticks on the outer edge of the grid: x labels live on
# the bottom row, y labels on the first column.
sex_bottom_row = len(sex_pair_features) - 1
for position, feature in enumerate(sex_pair_features):
    bottom_ax = g.axes[sex_bottom_row, position]
    left_ax = g.axes[position, 0]
    bottom_ax.set_xlabel(feature, fontsize=25)
    left_ax.set_ylabel(feature, fontsize=25)
    bottom_ax.tick_params(axis='x', labelsize=20)
    left_ax.tick_params(axis='y', labelsize=20)

# Add a larger legend on the current axes, then drop the grid's own legend.
# NOTE(review): g._legend is a private seaborn attribute - confirm it exists
# in the installed version.
plt.legend(prop={'size': 20})
g._legend.remove()
g.fig.suptitle('Pair Plots Between Sex Groups', fontsize=40, y=1.1)
plt.show()

In [None]:
# Pairwise regression plots of the continuous columns, coloured by SmokingStatus.
smoking_pair_features = ['FVC', 'Weeks', 'Percent', 'Age']
g = sns.pairplot(df_train[smoking_pair_features + ['SmokingStatus']], hue='SmokingStatus', aspect=1.4, height=5, diag_kind='kde', kind='reg')

# Enlarge the labels and ticks on the outer edge of the grid: x labels live on
# the bottom row, y labels on the first column.
smoking_bottom_row = len(smoking_pair_features) - 1
for position, feature in enumerate(smoking_pair_features):
    bottom_ax = g.axes[smoking_bottom_row, position]
    left_ax = g.axes[position, 0]
    bottom_ax.set_xlabel(feature, fontsize=25)
    left_ax.set_ylabel(feature, fontsize=25)
    bottom_ax.tick_params(axis='x', labelsize=20)
    left_ax.tick_params(axis='y', labelsize=20)

# Add a larger legend on the current axes, then drop the grid's own legend.
# NOTE(review): g._legend is a private seaborn attribute - confirm it exists
# in the installed version.
plt.legend(prop={'size': 20})
g._legend.remove()
g.fig.suptitle('Pair Plots Between SmokingStatus Groups', fontsize=40, y=1.1)
plt.show()