In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Pulmonary fibrosis

**Pulmonary fibrosis** is a lung disease that occurs when lung tissue becomes damaged and scarred. This thickened, stiff tissue makes it more difficult for your lungs to work properly. As pulmonary fibrosis worsens, you become progressively more short of breath.

![Pulmonary fibrosis](https://upload.wikimedia.org/wikipedia/commons/thumb/e/e1/IPF_amiodarone.JPG/300px-IPF_amiodarone.JPG)

# Training Data 

* In this notebook we are going to do some Exploratory Data Analysis (EDA):

    1. Finding the correlations of the columns
    2. Finding the Null Values in the data
    3. Finding the Different Ages
    4. Count of the sex 
          * Male or Female
    5. Class of the data 
    6. Plotting some images 

Here we are only going to use the train data for our EDA.

In [None]:
# Load the training table from the competition's input directory.
data = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')

In [None]:
# Preview the first rows of the training table.
data.head()

In [None]:
# Use the Patient ID column as the row index.
# Running this cell a second time raises KeyError, because 'Patient' is then
# already the index and no longer a column.
data = data.set_index('Patient')

`data.info()` gives a summary of the dataset. From it we get:
* total rows is 1549
* total columns 6
* All the datatypes
* size of the dataset

In [None]:
# Print the summary (row count, columns, dtypes, memory usage) described above.
data.info()

<h1>Correlation</h1>

Here we have 4 numeric columns and 2 object (string) columns.   
* Correlation is only defined for numeric data, so to include the remaining columns we first need to encode them from strings to numbers.

In [None]:
# Correlation is only defined for numeric columns, and at this point Sex and
# SmokingStatus are still strings. Select the numeric columns explicitly:
# pandas >= 2.0 raises on string columns instead of silently dropping them.
data.select_dtypes(include='number').corr()

Label encoding is a technique in machine learning that converts string data into numeric codes. Here we use the `LabelEncoder` from sklearn.

* We are encoding Sex column into the numeric
* We are encoding Smoking Status column into the numeric

In [None]:
# Display the frame as it stands before the label-encoding cell below.
data

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# One encoder instance; the encoding cell below re-fits it for each column it converts.
le = LabelEncoder()

In [None]:
# Replace the two string columns with integer codes. The same encoder is
# re-fitted for each column, so afterwards `le` only describes the last one.
for column in ('Sex', 'SmokingStatus'):
    data[column] = le.fit_transform(data[column])

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Annotated correlation heatmap with a horizontal colour bar.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(
    data.corr(),
    annot=True,
    cbar_kws={"orientation": "horizontal"},
    ax=ax,
)
plt.show()

<h1>Null Values in the Data</h1>

In [None]:
# One bar per column; the bar height is the number of missing entries in that column.
lab = data.columns
val = data.isna().sum().values

plt.bar(lab, val)
plt.title("Null values")
plt.xlabel('Labels')
plt.ylabel("Empty or not")
plt.tight_layout()
plt.show()

<h1>Ages Counts</h1>

In [None]:
# Number of rows per age. Labels and heights are both taken from the same
# value_counts() result so that each bar is paired with its own age. Taking
# labels from unique() (order of appearance) and heights from value_counts()
# (descending frequency) pairs ages with the wrong counts.
age_counts = data['Age'].value_counts().sort_index()
labs = age_counts.index
vals = age_counts.values

figu = plt.figure(figsize=(15, 5))
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
fig = sns.barplot(x=labs, y=vals)
plt.xlabel('Ages')
plt.ylabel("Total Pateints")
plt.title("Total number of patients with the particular age")
plt.tight_layout()
plt.show()

In [None]:

# Take the labels from the counts' own index instead of a hard-coded list, so
# a label can never be attached to the wrong bar. LabelEncoder numbers classes
# in sorted order, so the encoded column is assumed to use 0 = Female and
# 1 = Male; if the column still holds strings, rename() leaves them untouched.
sex_counts = data['Sex'].value_counts().rename(index={0: 'Female', 1: 'Male'})
vals = sex_counts.values
labs = list(sex_counts.index)

# Horizontal bars: counts run along x, categories along y. Keyword arguments
# are required by seaborn >= 0.12.
sns.barplot(x=vals, y=labs)
plt.xlabel("Number of males or females")
plt.ylabel('Labels')
plt.title("Sex Count")
plt.tight_layout()
plt.show()

<h1> Male and Female SexCount </h1>

In [None]:
# Slice labels come from the counts' own index, so they always match the
# slice sizes. The encoded column is assumed to use 0 = Female and 1 = Male
# (LabelEncoder numbers classes in sorted order); string values pass through
# rename() unchanged.
sex_counts = data['Sex'].value_counts().rename(index={0: 'Female', 1: 'Male'})
vals = sex_counts.values
labs = list(sex_counts.index)

# Pull the largest slice(s) slightly out of the pie.
largest = vals.max() if len(vals) else 0
explode = [0.1 if count == largest else 0 for count in vals]

plt.pie(vals, explode=explode, labels=labs, autopct='%1.1f%%',
        shadow=True, startangle=90)
plt.title("Sex Count")
plt.tight_layout()
plt.show()

In [None]:
# Show the distinct values present in the SmokingStatus column.
vals = data['SmokingStatus'].unique()
vals

<h1>Smoking Status</h1>

In [None]:
# Slice labels come from the counts' own index rather than a hard-coded list
# that only matches while the frequency ranking stays the same.
# The encoded column is assumed to use the sorted-label codes
# 0 = Currently smokes, 1 = Ex-smoker, 2 = Never smoked; string values pass
# through rename() unchanged.
smoking_counts = data['SmokingStatus'].value_counts().rename(
    index={0: 'Currently smokes', 1: 'Ex-smoker', 2: 'Never smoked'}
)
vals = smoking_counts.values
labs = list(smoking_counts.index)

fig = plt.figure(figsize=(7, 7))

# Pull the largest slice(s) slightly out of the pie.
largest = vals.max() if len(vals) else 0
explode = [0.1 if count == largest else 0 for count in vals]

plt.pie(vals, explode=explode, labels=labs, autopct='%1.1f%%',
        shadow=True, startangle=90)
# The title previously read "Sex Count", copied from the sex plot.
plt.title("Smoking Status")
plt.tight_layout()
plt.show()

In [None]:
# Bar labels come from the counts' own index so they always match the bars.
# The encoded column is assumed to use the sorted-label codes
# 0 = Currently smokes, 1 = Ex-smoker, 2 = Never smoked; string values pass
# through rename() unchanged.
smoking_counts = data['SmokingStatus'].value_counts().rename(
    index={0: 'Currently smokes', 1: 'Ex-smoker', 2: 'Never smoked'}
)
vals = smoking_counts.values
labs = list(smoking_counts.index)

fig = plt.figure(figsize=(12, 5))
# Horizontal bars: counts along x, categories along y. Keyword arguments are
# required by seaborn >= 0.12.
sns.barplot(x=vals, y=labs)

# Axis labels and title previously described the sex plot they were copied from.
plt.xlabel('Count')
plt.ylabel("Smoking status")
plt.title("Smoking Status Count")
plt.tight_layout()
plt.show()

<h1> Plotting the Images </h1>

Here we plot the images with the help of matplotlib and pydicom. pydicom is a library for reading medical images stored in DICOM files, which end with **.dcm**

In [None]:
path = '../input/osic-pulmonary-fibrosis-progression/train/ID00007637202177411956430'

# os.listdir() returns entries in arbitrary order, so sort the names to show
# the same slices on every run. Sorting by (length, name) puts shorter names
# first, so numerically named files come out as 1, 2, ..., 10 rather than
# 1, 10, 2 (assuming the slices are named by number -- TODO confirm).
slice_names = sorted(
    (name for name in os.listdir(path) if name.endswith('dcm')),
    key=lambda name: (len(name), name),
)
images = [path + '/' + name for name in slice_names]

In [None]:
import pydicom

# Not read anywhere in this cell; kept in case other cells use them.
w=10
h=10

columns = 3
rows = 3
fig = plt.figure(figsize=(14, 8))

# Show up to rows*columns slices, starting with the first file in the list.
# The previous range(1, rows*columns + 1) used the subplot position as the
# list index, which skipped images[0] and raised IndexError whenever the
# folder held fewer than rows*columns + 1 files.
for position, image_path in enumerate(images[:rows * columns], start=1):
    ds = pydicom.dcmread(image_path)
    fig.add_subplot(rows, columns, position)
    plt.imshow(ds.pixel_array, cmap='hsv')
plt.show()

# FVC


**Forced vital capacity (FVC)** is the total amount of air exhaled during the FEV test. Forced expiratory volume and forced vital capacity are lung function tests that are measured during spirometry. They are used to help diagnose obstructive lung diseases such as asthma and chronic obstructive pulmonary disease (COPD).

In [None]:
fvc_female = []
fvc_male = []

for i in range(len(data)):
    
    if data['Sex'][i] == 1 :
        fvc_female.append(data['FVC'][i])
    elif data['Sex'][i] == 0:
        fvc_male.append(data['FVC'][i])
    else:
        pass

In [None]:
plt.scatter(fvc_female, list(range(len(fvc_female))), c='r', label='Male')
plt.scatter(fvc_male,list(range(len(fvc_male))), c='y', label='Female')
plt.xlabel("FVC")
plt.ylabel("Range")
plt.title("Forced vital capacity (FVC)")
plt.legend()
plt.show()

In [None]:
# Report the range of the Weeks column. The second message previously also
# said "Max" although it prints the minimum.
print(f"The Max week of the patient {data['Weeks'].max()}")
print(f"The Min week of the patient {data['Weeks'].min()}")

# Testing data

In [None]:
# NOTE: `data` is rebound here; from this cell on it holds the test table, not the training table.
data = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

In [None]:
# Preview the first rows of the test table.
data.head()

In [None]:
# (rows, columns) of the test table.
data.shape

In [None]:
# Use the Patient ID column as the row index of the test table.
# Running this cell a second time raises KeyError, because 'Patient' is then
# already the index and no longer a column.
data = data.set_index('Patient')

# Images of the patients in testset

In [None]:
path = '../input/osic-pulmonary-fibrosis-progression/test/ID00419637202311204720264'

# os.listdir() returns entries in arbitrary order, so sort the names to show
# the same slices on every run. Sorting by (length, name) puts shorter names
# first, so numerically named files come out as 1, 2, ..., 10 rather than
# 1, 10, 2 (assuming the slices are named by number -- TODO confirm).
slice_names = sorted(
    (name for name in os.listdir(path) if name.endswith('dcm')),
    key=lambda name: (len(name), name),
)
images = [path + '/' + name for name in slice_names]

In [None]:
# Read the first listed slice and show its pixel data as a float32 array.
# `pydicom` must already have been imported by an earlier cell when this runs.
ds = pydicom.dcmread(images[0])
val = ds.pixel_array
img = np.array(val, dtype=np.float32)
img

In [None]:
import pydicom
import cv2

# Not read anywhere in this cell; kept in case other cells use them.
w=10
h=10

columns = 3
rows = 3
fig = plt.figure(figsize=(14, 8))

# Show up to rows*columns slices, starting with the first file in the list.
# The previous range(1, rows*columns + 1) used the subplot position as the
# list index, which skipped images[0] and raised IndexError whenever the
# folder held fewer than rows*columns + 1 files.
for position, image_path in enumerate(images[:rows * columns], start=1):
    ds = pydicom.dcmread(image_path)
    fig.add_subplot(rows, columns, position)
    plt.imshow(ds.pixel_array, cmap='hsv')
plt.show()

In [None]:
# Report the range of the Weeks column. The second message previously also
# said "Max" although it prints the minimum.
print(f"The Max week of the patient {data['Weeks'].max()}")
print(f"The Min week of the patient {data['Weeks'].min()}")

# Working On Images

In [None]:
# Reload the training table under its own name (`data` was rebound to the test table above).
train_Data = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/train.csv')

In [None]:
# Number of rows per patient for the 25 patients with the most rows.
# Counts and labels are both taken from the same value_counts() result.
# Taking labels from unique()[:25] (the first 25 IDs in file order) and counts
# from value_counts()[:25] (the 25 largest counts) attached counts to
# unrelated patient IDs.
record_counts = train_Data['Patient'].value_counts().head(25)
vals = record_counts.values
labs = list(record_counts.index)

fig = plt.figure(figsize=(12, 5))
# Horizontal bars; keyword arguments are required by seaborn >= 0.12.
sns.barplot(x=vals, y=labs)

plt.xlabel('Count')
plt.ylabel("Patients IDS")
plt.title("Unqiue patient Count out of duplicate")
plt.tight_layout()
plt.show()


In [None]:
# Keep only the first row that appears for each Patient ID.
nodupData = train_Data.drop_duplicates(subset = 'Patient', keep='first')

In [None]:
# Index the one-row-per-patient frame by Patient ID.
# Rebinding instead of inplace=True, plus the column check, keeps this cell
# re-runnable: once 'Patient' is the index, a second inplace call raises KeyError.
if 'Patient' in nodupData.columns:
    nodupData = nodupData.set_index('Patient')

In [None]:
# Display the one-row-per-patient frame.
nodupData

In [None]:

# Patient IDs in order of first appearance in the training table.
Unique_patients = list(train_Data['Patient'].unique())


def getting_group(groupID):
    """Return the rows of ``train_Data`` that belong to one patient.

    Parameters
    ----------
    groupID
        A value of the ``Patient`` column.

    Returns
    -------
    pandas.DataFrame
        The result of ``train_Data.groupby('Patient').get_group(groupID)``.
    """
    return train_Data.groupby('Patient').get_group(groupID)


# One line plot per patient for the first two patients. The calls are no
# longer wrapped in print(), which only echoed the Axes object's repr.
for patient_id in Unique_patients[:2]:
    getting_group(patient_id).plot(title=patient_id)
plt.show()

In [None]:
# Load the test table under its own name.
test_Data = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/test.csv')

In [None]:
# Do the train and test tables contain the same patient IDs?
# Compared as sets: list equality also requires identical ordering, so it can
# report False for two collections that hold exactly the same IDs.
set(Unique_patients) == set(test_Data['Patient'].unique())

In [None]:
# Display the test table.
test_Data

In [None]:
# Load the competition's sample submission file.
samplt_Data = pd.read_csv('../input/osic-pulmonary-fibrosis-progression/sample_submission.csv')

In [None]:
#samplt_Data['Patient_Week'].unique()

# Conclusion

We have seen many visualizations and learned a lot about the data.

I believe you have loved the repo.

Please upvote if you like it.