# import necessary dependencies

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# list from directory

In [None]:
os.listdir("../input/osic-pulmonary-fibrosis-progression")

# read train, test, submission csv

In [None]:
# Load the training and test tables from the competition input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/osic-pulmonary-fibrosis-progression/train.csv",
        "../input/osic-pulmonary-fibrosis-progression/test.csv",
    )
)

# take a look

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Load the sample submission; its Patient_Week and Confidence columns are used further down.
submission = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/sample_submission.csv")

In [None]:
submission.head()

# shape of train, test and submission

In [None]:
print(f"Train info {train.shape}")
print(f"test info {test.shape}")
print(f"submission info {submission.shape}")

# import visualization packages

In [None]:
import pydicom
import matplotlib.pyplot as plt
import seaborn as sns

# patient column unique id value_counts

In [None]:
print(f"Total Patient Id {train['Patient'].count()}")
print(f"NUmber of Unique Id {train['Patient'].value_counts().shape[0]}")

# smoking status bar plot

In [None]:
train["SmokingStatus"].value_counts().plot(kind="bar")

# take a look at a single .dcm image

In [None]:
# Read one hard-coded DICOM slice and show its pixel data with the "bone" colormap.
img = "../input/osic-pulmonary-fibrosis-progression/train/ID00009637202177434476278/100.dcm"
ds = pydicom.dcmread(img)
# Create the figure first so imshow draws into it (pyplot uses the current figure).
plt.figure(figsize = (5,5))
plt.imshow(ds.pixel_array, cmap=plt.cm.bone)

# take a random id and images

In [None]:
import random


def get_random(smokes):
    """Plot up to 30 DICOM slices of one randomly chosen patient.

    A patient id is drawn at random from the rows of the global ``train``
    frame whose ``SmokingStatus`` equals ``smokes``. Rows are not
    de-duplicated, so a patient with more rows is more likely to be drawn.
    The chosen id is printed. Every file in the patient's image directory
    whose name (without extension) is an integer from 1 to 30 is read and
    drawn on a 6 x 5 grid, in the grid cell matching its number.

    Parameters
    ----------
    smokes : str
        ``SmokingStatus`` value to filter on; also used as the figure title.

    Raises
    ------
    IndexError
        If no row of ``train`` has the requested smoking status
        (raised by ``random.choice`` on an empty list).
    """
    rows, columns = 6, 5
    wanted = range(1, rows * columns + 1)  # slice numbers that fit on the grid

    smoke_pat = train[train["SmokingStatus"] == smokes]
    patient_id = random.choice(smoke_pat["Patient"].tolist())
    print(patient_id)
    image_dir = f"../input/osic-pulmonary-fibrosis-progression/train/{patient_id}"

    slice_numbers = []
    for file_name in os.listdir(image_dir):
        stem, _ext = os.path.splitext(file_name)
        try:
            number = int(stem)
        except ValueError:
            # Skip files whose name is not a slice number instead of crashing.
            continue
        if number in wanted:
            slice_numbers.append(number)

    fig = plt.figure(figsize=(10, 10))
    for number in slice_numbers:
        ds = pydicom.dcmread(f"{image_dir}/{number}.dcm")
        # The slice number doubles as the 1-based subplot position.
        fig.add_subplot(rows, columns, number)
        plt.imshow(ds.pixel_array, cmap=plt.cm.bone)
    plt.suptitle(smokes)

# random ex-smoker patient

In [None]:
get_random("Ex-smoker")

In [None]:
get_random("Never smoked")

# random currently-smoking patient (the submission split and merge with test data follows in the next cell)

In [None]:
get_random("Currently smokes")

In [None]:
# Patient_Week has the form "<patient id>_<week>": break it into its two parts.
submission["Patient"] = [key.split("_")[0] for key in submission["Patient_Week"]]
submission["Weeks"] = [key.split("_")[1] for key in submission["Patient_Week"]]

# Keep only the columns needed, then attach each patient's test-set attributes.
# The test frame's own Weeks column is dropped so it cannot clash with the
# per-row week parsed above.
wanted_columns = ['Patient', 'Weeks', 'Confidence', 'Patient_Week']
submission = submission[wanted_columns].merge(test.drop(columns='Weeks'), on="Patient")

In [None]:
submission.tail()

# shape of submission

In [None]:
submission.shape

# submission data patient unique id

In [None]:
submission["Patient"].unique()

# new column

In [None]:
# Tag every row with the frame it came from, so the frames can be told apart
# once they are stacked into a single table.
for frame, origin in ((train, "train"), (test, "test"), (submission, "submission")):
    frame["Dataset"] = origin

# Merge test and submission with train

In [None]:
# Stack train, test and submission rows into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append (deprecated in pandas 1.4, removed in 2.0);
# ignore_index=True does the job of the former reset_index + drop('index').
dataset = pd.concat([train, test, submission], ignore_index=True)

In [None]:
dataset.head()

# convert object to int64

In [None]:
# Weeks parsed from Patient_Week are strings; cast the whole column to int64.
dataset = dataset.astype({"Weeks": "int64"})

# dataset information

In [None]:
dataset.info()

# make new column First_week: each patient's earliest week, ignoring submission rows

In [None]:
# Blank out the week on submission rows *before* storing it, so only weeks
# from train/test rows take part in the per-patient minimum. Masking with
# where() yields a float column directly instead of writing NaN into an
# int64 column afterwards (an upcast newer pandas versions warn about).
dataset["First_week"] = dataset["Weeks"].where(dataset["Dataset"] != "submission")
# Broadcast each patient's earliest week to all of that patient's rows
# (min skips NaN, so submission rows receive the value too).
dataset["First_week"] = dataset.groupby('Patient')['First_week'].transform('min')

In [None]:
dataset.head()

In [None]:
# Rows recorded at each patient's first week, reduced to the FVC measurement.
first_visits = dataset.loc[dataset["Weeks"] == dataset["First_week"], ["Patient", "FVC"]]
# One row per patient: the first non-null FVC among those rows, named First_FVC.
first_fvc = (
    first_visits.rename({"FVC": "First_FVC"}, axis=1)
    .groupby("Patient")
    .first()
    .reset_index()
)
# Attach First_FVC to every row of the patient; patients without a match get NaN.
dataset = dataset.merge(first_fvc, on="Patient", how="left")

# check the week difference

In [None]:
# Weeks elapsed since the patient's first recorded week.
dataset["Week_diff"] = dataset["Weeks"].sub(dataset["First_week"])

# One-hot encode the two categorical columns, append the indicator columns,
# and drop the originals.
indicator_frames = [pd.get_dummies(dataset[name]) for name in ("Sex", "SmokingStatus")]
dataset = pd.concat([dataset, *indicator_frames], axis=1).drop(columns=['Sex', 'SmokingStatus'])

In [None]:
dataset.head()

In [None]:
dataset.info()

# split dataset

In [None]:
# Separate the combined table back into its three sources. Explicit copies
# make each frame independent of `dataset`, so columns added to them later
# do not trigger pandas' SettingWithCopyWarning.
train = dataset[dataset["Dataset"] == "train"].copy()
test = dataset[dataset["Dataset"] == "test"].copy()
submission = dataset[dataset["Dataset"] == "submission"].copy()

# Select the feature columns to normalize with StandardScaler

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature columns passed to the scaler and to the models.
col = ['Weeks', 'Percent', 'Age', 'First_week', 'First_FVC', 'Week_diff',
       'Female', 'Male', 'Currently smokes', 'Ex-smoker', 'Never smoked']

# Take a copy: the scaled values are assigned back into train_data later, and
# writing into a column slice of `train` would raise SettingWithCopyWarning.
train_data = train[col].copy()

# check null

In [None]:
train_data.isnull().any()

# Correlation check

In [None]:
plt.subplots(figsize=(14,10))
# Correlate numeric/boolean columns only: `train` also holds string columns
# (e.g. Patient, Dataset), on which DataFrame.corr() raises in pandas >= 2.0.
g = train.select_dtypes(include=["number", "bool"]).corr()
sns.heatmap(g, annot=True, fmt='.2', cmap="Dark2_r")

# correlation between label("FVC") and others

In [None]:
g["FVC"].sort_values(ascending=False)

# Normalize

In [None]:
# Learn per-column mean and standard deviation from the training features,
# then replace the features with their standardized values.
stdscale = StandardScaler().fit(train_data[col])
train_data[col] = stdscale.transform(train_data[col])

In [None]:
train_data[col]

# import necessary packages

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

# Hyperparameter tuning

In [None]:
# Candidate estimators and the hyperparameter grid to search for each one.
# An empty "params" dict means the estimator is evaluated with its defaults.
model_params = {
    "svr": {
        "model": SVR(gamma="auto"),
        "params": {
            "C": [1, 5, 10, 15, 20],
            "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
        },
    },
    "RandomForest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [100, 200],
        },
    },
    "LR": {
        "model": LinearRegression(),
        "params": {},
    },
    "Decision Tree": {
        "model": DecisionTreeRegressor(),
        "params": {
            "splitter": ["best", "random"],
            # "mse"/"mae" were renamed in scikit-learn 1.0 and removed in 1.2;
            # these names require scikit-learn >= 1.0.
            "criterion": ["squared_error", "absolute_error"],
            "max_depth": [5, 10, 15],
        },
    },
}

# GridSearch

In [None]:
from sklearn.model_selection import GridSearchCV

# Run a 10-fold grid search per candidate model and record its best
# cross-validated score together with the parameters that produced it.
scores = []
for model_name, spec in model_params.items():
    clf = GridSearchCV(
        estimator=spec["model"],
        param_grid=spec["params"],
        cv=10,
        return_train_score=False,
    )
    clf.fit(train_data[col], train["FVC"])
    scores.append(
        dict(model=model_name, best_score=clf.best_score_, best_params=clf.best_params_)
    )

# One row per model for side-by-side comparison.
df = pd.DataFrame(scores, columns=["model", "best_score", "best_params"])
df

# best score given by Linear Regression

In [None]:
# Fit the final linear model on all standardized training features.
model = LinearRegression()
model.fit(X=train_data[col], y=train["FVC"])

# predict data

In [None]:
# In-sample predictions; train_data holds exactly the columns listed in `col`.
pred = model.predict(train_data[col])
pred

# root mean squared error and mean absolute error

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Root mean squared error on the training data. The root is taken explicitly
# because the `squared=False` argument is deprecated in newer scikit-learn
# releases and later removed. The name `mse` is kept for compatibility even
# though the value is the RMSE.
mse = np.sqrt(mean_squared_error(train["FVC"], pred))

print(mse)

# Mean absolute error on the training data.
mae = mean_absolute_error(train["FVC"], pred)
print(mae)

# plot of actual and prediction

In [None]:
# Compare the first ten training targets with the model's predictions for them.
a = list(train["FVC"])
b = list(pred)

sns.set_style('whitegrid')
f, ax = plt.subplots(figsize=(15, 5))
ax.plot(b[:10], c='green', label= 'predictions')
ax.plot(a[:10], c='red', label= 'actual')
ax.legend()

In [None]:
submission[col].isnull().any()

# Normalization

In [None]:
# Scale the submission features with the statistics learned from the training
# data. Refitting the scaler here (fit_transform) would put these features on a
# different scale from the one the model was trained on.
sub_data = stdscale.transform(submission[col])

# prediction

In [None]:
# Predict FVC for every submission row from its scaled features.
pred_2 = model.predict(sub_data)

# visualize

In [None]:
# Plot the predictions for all submission rows against the FVC column of the
# submission frame.
# NOTE(review): that FVC column appears to come from the merge with `test`,
# i.e. one value repeated per patient - confirm this is the intended "actual".
a = list(submission["FVC"])
b = list(pred_2)

sns.set_style('whitegrid')
f, ax = plt.subplots(figsize=(15, 5))
ax.plot(b, c='green', label= 'predictions')
ax.plot(a, c='red', label= 'actual')
ax.legend()

# confidence

In [None]:
# Attach the predictions. assign() returns a new frame, so nothing is written
# into a slice of `dataset`.
submission = submission.assign(FVC_1=pred_2)

# Per patient: absolute difference between the FVC recorded in `test` and the
# prediction made for that same week.
confidence_dict = {}
for patient_id in submission['Patient'].unique():
    # First (expected: only) test row of this patient.
    test_row = test[test['Patient'] == patient_id].iloc[0]
    same_week = (
        (submission['Patient'] == patient_id)
        & (submission['Weeks'].astype(int) == int(test_row['Weeks']))
    )
    predicted = float(submission.loc[same_week, 'FVC_1'].iloc[0])
    confidence_dict[patient_id] = abs(float(test_row['FVC']) - predicted)

# Every row of a patient gets that patient's value; look it up by column name
# rather than by column position.
submission['Confidence'] = submission['Patient'].map(confidence_dict)

In [None]:
# Keep the three output columns and expose the prediction under the name "FVC".
# rename() returns a new frame, avoiding an in-place rename on a column slice
# (which triggers SettingWithCopyWarning).
new = submission[["Patient_Week", "FVC_1", "Confidence"]].rename(columns={"FVC_1": "FVC"})

# create csv

In [None]:
# Write the final table to submission.csv in the working directory, without the index column.
new.to_csv("submission.csv", index=False)