In [None]:
import pandas as pd
import glob

# For plots we use plotly express and plotly graph objects
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from plotly.colors import n_colors

**NOTE: The project is currently under development**

**Table of Contents**

* Understand train.csv
* Understand test.csv
* Understand the dcm file


**Some preliminary understanding of the domain we are dealing with:**

**Forced vital capacity (FVC)** is the total amount of air exhaled during the FEV (Forced Expiratory Volume) test. It is important to add that FVC serves as a measure of the volume of the lung, since the amount of air you are able to exhale after a complete inhalation should represent the amount of air that the lung is able to accommodate at a point in time.



### Understanding train.csv 

First let us check and try to understand the data in train.csv

In [None]:
# Load the training table, then preview it: first five rows followed by
# the (rows, columns) shape.
train_csv_path = "../input/osic-pulmonary-fibrosis-progression/train.csv"
train_df = pd.read_csv(train_csv_path)
for preview in (train_df.head(), train_df.shape):
    print(preview)

Let's do a brief description of the data

How many unique patients do we have?

In [None]:
# Number of distinct patient IDs. dropna=False keeps the count equal to
# len(unique()), which would include a missing value if one were present.
n_patients = train_df['Patient'].nunique(dropna=False)
print(n_patients)
#print(train_df['Patient'].unique())

So we know that we have 1549 records with 176 unique patients.

In [None]:
# Summary statistics (count, mean, spread, quartiles) of the Weeks column.
weeks_summary = train_df['Weeks'].describe()
print(weeks_summary)
# print(train_df[train_df['Weeks'].isnull()])

According to the data given, we know that the FVC is measured at intervals of several weeks over the course of 1 - 2 years, hence some time series visualization could be plausible.

However we should have at the back of our mind that there is no specific week intervals but maybe we should first check that.

Question: In which weeks were measurements most frequently taken?

In [None]:
# Number of rows recorded for each week number (index = week, value = count).
value_count_week = train_df['Weeks'].value_counts()

# One bar per week number; the bar height is the row count for that week.
data = go.Bar(
    x=value_count_week.index,
    y=value_count_week.values,
    width=1.0,
    marker_color='orangered',
)

# Axis settings are built separately so the layout call stays readable.
# The x-range is pinned (autorange off) to the window [-5, 130].
x_axis = dict(range=[-5, 130], autorange=False, zeroline=False, title="Appointment week")
y_axis = dict(zeroline=False, title='Number of patients on week')
layout = go.Layout(
    title="Which weeks are people most coming in for checkup",
    height=500,
    width=1200,
    xaxis=x_axis,
    yaxis=y_axis,
)

fig = go.Figure(data=data, layout=layout)
fig.show()

This graph is very interesting as we see that the patients do come in for their follow-up checkups in the early weeks. Also, we can see that there is sometimes no record for the zeroth week (i.e. the first appointment), hence the first appointment could be some week in the future.

Here are the interesting questions.
* What are the characteristics of the majority of people up to the 20th week of checkups, and what characterizes their FVC values?
    - An interesting fact could be that the majority stop going for checkups after the 20th week.
* Is it possible that patients stop coming for checkups after their FVC value drops to a particular point?

We need to check whether the negative week numbers mark the start of follow-up for those patients (if yes, then we can assume that the negative week is their first appointment).

* Let us check how many checkups each unique individual makes within the first 20 weeks.
* Also check appointments count for the various smoker and non-smoker groups

In [None]:
# One marker per recorded measurement: x is the week, y is the patient,
# colour is the smoking status. (No import here: this cell only needs `px`,
# which the notebook's import cell already provides.)
fig1 = px.scatter(train_df, x="Weeks", y="Patient", color='SmokingStatus',
                 title="Appointment distribution for all patients",
                 labels={"Weeks":"Appointment week (when they showed up)"})
# Patient IDs are long strings; hide the tick labels so the rows stay readable.
fig1.update_yaxes(showticklabels=False)
fig1.show()

One critical observation from the appointment distribution of the various patients is that the group that currently smokes tended to show up more often up to the 40-week mark compared to the other groups.

In [None]:
# Appointment counts per patient: over all weeks (grouped together with
# SmokingStatus), and restricted to rows with Weeks <= 20 and Weeks <= 40.
total_appointment = train_df.groupby(['Patient', 'SmokingStatus']).size().reset_index(name="total_appointment")
total_appointment_20wks = train_df[train_df["Weeks"] <= 20].groupby(['Patient']).size().reset_index(name="appt_within_20_Weeks")
total_appointment_40wks = train_df[train_df["Weeks"] <= 40].groupby(['Patient']).size().reset_index(name="appt_within_40_Weeks")

fig = make_subplots(rows=1, cols=3,
                   subplot_titles=("Total Appointment within 20 weeks",
                                   "Total Appointment within 40 weeks",
                                   "Total Appointment count all weeks"))

# (counts frame, count column) for each panel, in left-to-right order so it
# lines up with the subplot titles above.
panels = (
    (total_appointment_20wks, 'appt_within_20_Weeks'),
    (total_appointment_40wks, 'appt_within_40_Weeks'),
    (total_appointment, 'total_appointment'),
)
for panel_col, (counts, count_col) in enumerate(panels, start=1):
    fig.add_trace(go.Scatter(
        x=counts['Patient'], y=counts[count_col],
        mode='markers'
    ), row=1, col=panel_col)

fig.update_xaxes(showticklabels=False, title_text='Patient')
# The count belongs on the y-axis; setting it via update_xaxes would replace
# the 'Patient' x-axis title and leave the y-axis unlabelled.
fig.update_yaxes(title_text='Count appointments')
fig.update_layout(showlegend=False, title_text="Comparing appointments within various week intervals")
fig.show()

In [None]:
# Appointment counts per (patient, smoking status) for rows with Weeks <= 20
# and Weeks <= 40. SmokingStatus is kept so the plots can be faceted by it.
total_appontment_20wks = (
    train_df[train_df["Weeks"] <= 20]
    .groupby(['Patient', 'SmokingStatus'])
    .size()
    .reset_index(name="appt_within_20_Weeks")
)
total_appontment_40wks = (
    train_df[train_df["Weeks"] <= 40]
    .groupby(['Patient', 'SmokingStatus'])
    .size()
    .reset_index(name="appt_within_40_Weeks")
)

# (counts frame, count column, marker colour) for each faceted figure, in the
# order they are shown. `total_appointment` comes from the previous cell.
facet_specs = (
    (total_appontment_20wks, "appt_within_20_Weeks", "coral"),
    (total_appontment_40wks, "appt_within_40_Weeks", "goldenrod"),
    (total_appointment, "total_appointment", "blueviolet"),
)
for counts, count_col, marker_color in facet_specs:
    fig = px.scatter(
        counts,
        x="Patient",
        y=count_col,
        facet_col='SmokingStatus',
        color_discrete_sequence=[marker_color],
    )
    fig.update_xaxes(showticklabels=False, title_text='Patient')
    fig.update_yaxes(title_text='Count appointments')
    fig.show()

We usually have a record of 9 appointments for each of the IDs given in the dataset, and more of these appointments occurred in the first 20 weeks.
According to the plots above, there is really no special pattern of appointments for the various smoking statuses.
Let's have a discussion around the appointment weeks and the FVC.

In [None]:
# Every measurement as a point: appointment week vs. FVC, coloured by smoking
# status. The title names the plotted column (FVC).
fig = px.scatter(train_df, x="Weeks", y="FVC", color='SmokingStatus',
                 title="Appointment distribution based on FVC level",
                 labels={"Weeks":"Appointment week (when they showed up)"})

fig.show()

In [None]:
# Ridgeline-style FVC distributions: one figure per smoking status, with one
# horizontal half-violin per patient in that group.

# One gradient colour per patient in the whole training set, so a patient keeps
# the same colour no matter which figure they appear in.
patient_ids = train_df['Patient'].unique()
colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', len(patient_ids), colortype='rgb')
color_by_patient = dict(zip(patient_ids, colors))

# (SmokingStatus value, figure title, violin width) for each figure, in order.
fvc_figures = (
    ("Ex-smoker", "FVC distribution - Ex smokers", 12),
    ("Never smoked", "FVC distribution - Never smoked patients", 8),
    ("Currently smokes", "FVC distribution - Currently smokes", 8),
)

for status, title, violin_width in fvc_figures:
    smoking_status = train_df[train_df['SmokingStatus'] == status]
    fig = go.Figure()
    # Group the subset once instead of re-filtering it for every patient in the
    # data set; patients outside this status no longer contribute empty traces.
    # sort=False keeps patients in order of first appearance.
    for patient, data in smoking_status.groupby('Patient', sort=False):
        fig.add_trace(go.Violin(
            x=data['FVC'], line_color=color_by_patient[patient]
        ))
    fig.update_layout(showlegend=False, title_text=title)
    fig.update_yaxes(showticklabels=False, title_text='Patient')
    fig.update_traces(orientation='h', side='positive', width=violin_width, points=False)
    fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False)
    fig.show()

We can see an obvious irregularity in the FVC distribution of current smokers over the course of their appointment weeks, which is not very visible for ex-smokers and those who have never smoked.

Also, an interesting observation that corroborates the evaluation of appointment weeks above: the data presents quite a number of smokers' appointments after the 10th week. We may need to go back and recalculate how many appointments occur ***after the first appointment rather than from point 0***

In [None]:
# FVC by smoking status, drawn as violins over all measurements.
violin_options = dict(
    x="SmokingStatus",
    y="FVC",
    box=True,      # draw box plot inside the violin
    points='all',  # can be 'outliers', or False
)
fig = px.violin(train_df, **violin_options)
fig.show()

We can see a clear disparity between the FVC values of the smokers and the other groups, although, something is a little fishy in the violin plot for the never smoked individuals, Why would some values be close to the FVC of smokers?

In [None]:
# Ridgeline-style distributions of the Percent column: one figure per smoking
# status, with one horizontal half-violin per patient in that group.

# One gradient colour per patient in the whole training set, so a patient keeps
# the same colour no matter which figure they appear in.
patient_ids = train_df['Patient'].unique()
colors = n_colors('rgb(5, 200, 200)', 'rgb(200, 10, 10)', len(patient_ids), colortype='rgb')
color_by_patient = dict(zip(patient_ids, colors))

# (SmokingStatus value, figure title, violin width) for each figure, in order.
percent_figures = (
    ("Ex-smoker", "Percent distribution - Ex smokers", 12),
    ("Never smoked", "Percent distribution - Never smoked patients", 8),
    ("Currently smokes", "Percent distribution - Currently smokes", 8),
)

for status, title, violin_width in percent_figures:
    smoking_status = train_df[train_df['SmokingStatus'] == status]
    fig = go.Figure()
    # Group the subset once instead of re-filtering it for every patient in the
    # data set; patients outside this status no longer contribute empty traces.
    # sort=False keeps patients in order of first appearance.
    for patient, data in smoking_status.groupby('Patient', sort=False):
        fig.add_trace(go.Violin(
            x=data['Percent'], line_color=color_by_patient[patient]
        ))
    fig.update_layout(showlegend=False, title_text=title)
    fig.update_yaxes(showticklabels=False, title_text='Patient')
    fig.update_traces(orientation='h', side='positive', width=violin_width, points=False)
    fig.update_layout(xaxis_showgrid=False, xaxis_zeroline=False)
    fig.show()

The Percent distribution also corroborates the FVC distribution well; the graphs are very similar in distribution.

Although we do have a clear observation of a difference in FVC distribution clearly separating the smokers from the non-smokers/ex-smokers, we don't yet see anything that differentiates the non-smokers and ex-smokers. Let's try some mean and median scaling over the Percent and FVC and see what the distribution looks like.

In [None]:
# One row per (Patient, SmokingStatus, Age, Sex) combination with the mean,
# median, min and max of that group's FVC measurements.
patient_keys = ['Patient', 'SmokingStatus', 'Age', 'Sex']
statistical_df = (
    train_df.groupby(patient_keys)['FVC']
    .agg(fvc_mean='mean', fvc_median='median', fvc_min='min', fvc_max='max')
    .reset_index()
)
statistical_df.head()

In [None]:
# One box plot per per-patient FVC statistic, split by sex (x) and smoking
# status (colour). Each entry is (statistic column, figure title).
fvc_box_specs = (
    ("fvc_mean", "Mean FVC distribution among smokers"),
    ("fvc_median", "Median FVC distribution among smokers"),
    ("fvc_min", "Minimum FVC distribution among smokers"),
    ("fvc_max", "Maximum FVC distribution among smokers"),
)
for stat_col, box_title in fvc_box_specs:
    fig = px.box(statistical_df, x="Sex", y=stat_col, color="SmokingStatus")
    fig.update_layout(showlegend=True, title_text=box_title)
    fig.show()

I think it is safe to assume that your lung capacity can actually improve as an ex-smoker because most FVC distributions of never smoked individuals and ex-smokers usually fall into the same distribution according to the data given.

The distributions across the mean, median, min and max all point to the same observation above.

Let us check age distribution

We should also see the general age distribution that is provided by the data.

In [None]:
# Age by sex, coloured by smoking status, using the per-patient summary table.
# Each violin carries an inner box plot and shows all of its points.
age_violin_options = dict(
    x='Sex',
    y="Age",
    color="SmokingStatus",
    box=True,
    points='all',
)
fig = px.violin(statistical_df, **age_violin_options)
fig.update_layout(title_text="Age distribution", showlegend=True)
fig.show()

Finally, let us look at the FVC distribution over age for the various smoking status and sex.

In [None]:
# Mean FVC (x) against age (y), one facet per smoking status, coloured by sex.
fig = px.scatter(
    statistical_df,
    x='fvc_mean',
    y='Age',
    color='Sex',
    facet_col='SmokingStatus',
    facet_col_wrap=4,
)
fig.show()

In [None]:
# We need to find the elapsed time between measurements for each person;
# here it is the week-to-week difference for a single patient.
patient_mask = train_df['Patient'] == "ID00336637202286801879145"
train_df.loc[patient_mask, 'Weeks'].diff()

We are safe when it comes to the weeks column, it seems all weeks were reported for all the patients, but what is it with the negative week numbers?

In [None]:
# Something to keep in mind: validate that the number of tests recorded matches
# the number of dcm files provided for each Patient ID.
patient_mask = train_df['Patient'] == 'ID00007637202177411956430'
train_df.loc[patient_mask].shape

Credit:

I took a lot of inspiration for this work from an interesting Kaggler - [Laura Fink](https://www.kaggle.com/allunia). You should check her page out; she has done a lot of awesome stuff.

In [None]:
# NOTE(review): everything below is disabled exploratory code kept for
# reference; nothing in this cell executes.
# It sketches per-patient traces of FVC against Weeks for the
# "Currently smokes" subset (first as violins, then as scatter traces).
# If revived: the second loop adds to the same `fig` and titles it
# "Ex smokers" although `smoking_status` still holds "Currently smokes" rows.
# smoking_status = train_df[train_df['SmokingStatus']=="Currently smokes"]
# fig = go.Figure()
# for j in train_df['Patient'].unique():
#     data = smoking_status[smoking_status['Patient'] == j]
#     fig.add_trace(go.Violin(
#         x = data['Weeks'], y=data['FVC']
#     ))
# fig.update_layout(
#     showlegend=False, title_text="FVC distribution - Currently smokes"
# )

# fig.show()


# for j in train_df['Patient'].unique():
#     data = smoking_status[smoking_status['Patient'] == j]
#     fig.add_trace(go.Scatter(
#         x = data['Weeks'], y=data['FVC']
#     ))
# fig.update_layout(
#     showlegend=False, title_text="FVC distribution - Ex smokers"
# )

# fig.show()