In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf

from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import keras.backend as K

In [None]:
!pip install sklearn

In [None]:
import pathlib

# Read the training CSV into a DataFrame.
train_df = pd.read_csv(
    filepath_or_buffer='../input/osic-pulmonary-fibrosis-progression/train.csv'
)

In [None]:
train_df.head()

In [None]:
# Read the test CSV into a DataFrame and preview its first five rows.
test_df = pd.read_csv(
    filepath_or_buffer='../input/osic-pulmonary-fibrosis-progression/test.csv'
)
test_df.head(5)

In [None]:
# Tag every row with its origin so the two sets can be separated again after
# the shared preprocessing below.
train_df['Source'] = 'train'
test_df['Source'] = 'test'

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True gives the combined frame a fresh
# 0..n-1 index without leaving the old index behind as an extra column.
dataframe = pd.concat([train_df, test_df], ignore_index=True)

In [None]:
'''
Add patient-level baseline information, using only information that the test dataset will also have:
1. Number of visits
2. Visit number (0, 1, 2, 3, 4)
3. Variation in Percent
4. Change in smoking status
5. Range of Percent

'''

In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  """Turn a DataFrame with a 'target' column into a batched tf.data.Dataset.

  The input frame is copied, so the caller's DataFrame keeps its 'target'
  column. Each dataset element is a ``(features, label)`` pair where
  ``features`` is a dict keyed by the remaining column names.

  Args:
    dataframe: DataFrame containing the feature columns and a 'target' column.
    shuffle: when true, shuffle with a buffer as large as the frame before
      batching.
    batch_size: number of rows per batch.

  Returns:
    The batched (and optionally shuffled) dataset.
  """
  features = dataframe.copy()
  targets = features.pop('target')
  dataset = tf.data.Dataset.from_tensor_slices((dict(features), targets))
  if not shuffle:
    return dataset.batch(batch_size)
  return dataset.shuffle(buffer_size=len(features)).batch(batch_size)

In [None]:
def own_ZScaler(df, columns):
    """Min-max scale the given columns of ``df`` in place.

    For every name in ``columns`` a new column ``<name>Z`` is added holding
    ``(value - min) / (max - min)``. Despite the function name this is
    min-max scaling, not a z-score. The source columns are left untouched.

    Args:
        df: DataFrame to modify in place.
        columns: iterable of column names to scale.

    Returns:
        None; the scaled columns are written onto ``df``.
    """
    for col in columns:
        col_min = df[col].min()
        span = df[col].max() - col_min
        # A constant column has a zero span; dividing by 1 instead maps every
        # value to 0.0 rather than producing NaN from 0 / 0.
        divisor = span if span != 0 else 1.0
        df[col + 'Z'] = (df[col] - col_min) / divisor

In [None]:
numeric_columns = ['FVC', 'Weeks', 'Age', "Percent"]

In [None]:
own_ZScaler(dataframe, numeric_columns)

In [None]:
dataframe.head()

In [None]:
# The regression target is the min-max-scaled FVC column produced by
# own_ZScaler; df_to_dataset later pops it off as the label.
dataframe['target'] = dataframe['FVCZ']

In [None]:
## Put All the Features Together
# Start the feature list with one numeric column per scaled numeric input.
feature_columns = [
    feature_column.numeric_column(header)
    for header in ['WeeksZ', 'AgeZ', "PercentZ"]
]

In [None]:
# indicator_columns
# Each categorical input becomes an indicator column whose vocabulary is the
# set of distinct values seen in the combined frame.

indicator_column_names = ['Sex','SmokingStatus']
feature_columns.extend(
    feature_column.indicator_column(
        feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique()))
    for col_name in indicator_column_names
)

In [None]:
'''## Try instead embedding columns
# embedding columns
embedded_column_names = ['Sex','SmokingStatus']
for col_name in embedded_column_names:
  m = len(dataframe[col_name].unique())
  categorical_column = feature_column.categorical_column_with_vocabulary_list(
      col_name, dataframe[col_name].unique())
  embedded_column = feature_column.embedding_column(categorical_column, dimension = min(50,m//2))
  feature_columns.append(embedded_column)'''


In [None]:
# Try crossed columns
# crossed columns
# need to create one hot encoded variables first
'''sex_smoker_feature = feature_column.crossed_column(['Sex', 'SmokingStatus'], hash_bucket_size=100)
feature_columns.append(feature_column.indicator_column(sex_smoker_feature))'''

In [None]:
feature_columns

In [None]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [None]:
# Recover the original train / test rows from the combined frame using the
# 'Source' tag.
is_train = dataframe['Source'] == 'train'
is_test = dataframe['Source'] == 'test'
train_df = dataframe[is_train]
test_df = dataframe[is_test]

In [None]:
# Hold out 20% of the training rows for validation. The seed is fixed so the
# split (and therefore the reported validation metrics) is the same on every
# run.
# NOTE(review): the split is row-wise; if a patient has several rows they can
# land on both sides. Consider a group-wise split on 'Patient' - confirm.
train, val = train_test_split(train_df, test_size=0.2, random_state=42)
print(len(train), 'train examples')
print(len(val), 'validation examples')

In [None]:
### config file
# Train for fewer epochs when the test frame has fewer than 10 rows.
# NOTE(review): presumably a quick run for the small public test file - confirm.
EPOCHS = 200 if len(test_df) < 10 else 1000
batch_size = 128

In [None]:
# Training batches are shuffled; validation batches keep row order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)

In [None]:
# Feature-column layer followed by two 64-unit ReLU layers with 20% dropout,
# ending in a single linear unit that predicts the scaled target.
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(64, activation='relu'),
  layers.Dropout(.2),
  layers.Dense(64, activation='relu'),
  layers.Dropout(.2),
  layers.Dense(1, activation='linear')
])

# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by current Keras releases.
ADAM = tf.keras.optimizers.Adam(learning_rate = 0.001)

optimizer = ADAM

# Mean absolute error is both the training loss and the reported metric.
model.compile(optimizer= optimizer,
              loss='mae',
              metrics=['mae'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=EPOCHS)

In [None]:
test_df.head()

In [None]:
# Build the prediction grid: one row per (test patient row, week) pair, with
# the patient's scaled features repeated for every week.
column_names = ["Patient", "Weeks", 'PercentZ' , 'AgeZ', 'Sex', 'SmokingStatus']
patient_columns = ['Patient', 'PercentZ', 'AgeZ', 'Sex', 'SmokingStatus']
weeks = np.arange(-12, 134)  # weeks -12 .. 133 inclusive

# Collect all rows first and build the frame once. Growing a DataFrame with
# DataFrame.append inside a loop is quadratic, and append was removed in
# pandas 2.0.
records = [
    {**patient, 'Weeks': week}
    for patient in test_df[patient_columns].to_dict('records')
    for week in weeks
]
df = pd.DataFrame(records, columns=column_names)

# Placeholder label so df_to_dataset (which pops 'target') accepts this frame.
df['target'] = 0

In [None]:
# Scale the prediction weeks with the SAME min/max that produced the training
# feature 'WeeksZ' (computed on `dataframe`). Re-fitting the scaler on the
# prediction grid would use the grid's own -12..133 range, so a given week
# would map to a different feature value at prediction time than it did
# during training.
weeks_min = dataframe['Weeks'].min()
weeks_max = dataframe['Weeks'].max()
df['WeeksZ'] = (df['Weeks'] - weeks_min) / (weeks_max - weeks_min)

In [None]:
df.head()

In [None]:
# Give the prediction grid explicit dtypes before it is turned into tensors.
df = df.astype({
    'Weeks': int,
    'WeeksZ': float,
    'AgeZ': float,
    'PercentZ': float,
    'Patient': str,
})

In [None]:
test_ds = df_to_dataset(df, shuffle=False, batch_size=batch_size)

In [None]:
# test_ds is already batched by df_to_dataset; Keras documents that
# `batch_size` must not be passed together with a tf.data.Dataset input, so
# it is omitted here.
preds = model.predict(test_ds, verbose = 0)

In [None]:
preds

In [None]:
# Attach the model output (still on the scaled FVC range) to the grid.
df['FVCZ'] = preds
# Weeks becomes a string so it can be joined onto the patient id below.
df['Weeks'] = df['Weeks'].astype(str)
# The same constant confidence value is submitted for every row.
df['Confidence'] = 100

In [None]:
# Invert the min-max scaling using the FVC range of the combined frame, which
# maps the scaled predictions back to FVC units.
fvc_min = dataframe['FVC'].min()
fvc_span = dataframe['FVC'].max() - fvc_min
df['FVC'] = df['FVCZ'] * fvc_span + fvc_min

In [None]:
# Build the submission key "<patient>_<week>" and store it under the column
# name 'Patient_Week'.
patient_week = df['Patient'].str.cat(df['Weeks'], sep="_")
df = df.assign(Patient=patient_week).rename(columns={"Patient": "Patient_Week"})

In [None]:
# Keep only the three columns that go into the submission file.
submission_columns = ['Patient_Week','FVC', 'Confidence']
df = df.loc[:, submission_columns]


In [None]:
# Write the submission file with the three columns in this order and no
# index column.
df.to_csv("submission.csv", columns=["Patient_Week","FVC","Confidence"], index = False)

In [None]:
df.head()