## Imports & Setup

In [None]:
# installing necessary libraries
!pip install rich efficientnet_pytorch
!pip install --upgrade wandb

In [None]:
import numpy as np
import pandas as pd
import glob
import random
from pydicom import dcmread
import matplotlib.pyplot as plt
from rich.progress import track
from sklearn.metrics import mean_squared_error, r2_score

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

In [None]:
# import wandb
# wandb.login()

In [None]:
# seed every random number generator used in this notebook with the same value
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

## Data Preparation

In [None]:
# load the competition CSV files
train_csv_path = '/kaggle/input/osic-pulmonary-fibrosis-progression/train.csv'
test_csv_path = '/kaggle/input/osic-pulmonary-fibrosis-progression/test.csv'
train_df = pd.read_csv(train_csv_path)
test_df = pd.read_csv(test_csv_path)
# number of distinct patients in the training file
train_df['Patient'].nunique()

In [None]:
# replace the patient, sex and smoking-status values with integer category codes
for column in ('Patient', 'Sex', 'SmokingStatus'):
    train_df[column] = train_df[column].astype('category').cat.codes
train_df

In [None]:
# get shifted future values for Weeks and FVC -> goal is to predict FVC at next time point: FVC(t+1)
def shift(x: pd.DataFrame) -> pd.DataFrame:
    """Pair every row with the `Weeks` and `FVC` values of the row that follows it.

    Two columns are appended, `Weeks(t+1)` and `FVC(t+1)`, holding the next
    row's `Weeks` and `FVC`. The final row has no successor and is dropped,
    so a one-row input yields an empty frame.

    Args:
        x: rows to shift; must contain `Weeks` and `FVC` columns. The rows are
            assumed to belong to one patient in visit order (not checked here).

    Returns:
        A new DataFrame with one row fewer than `x`. `x` itself is left
        unmodified, so a slice handed out by `groupby` is never written to.
    """
    shifted = x.assign(**{
        'Weeks(t+1)': x['Weeks'].shift(-1),
        'FVC(t+1)': x['FVC'].shift(-1),
    })
    return shifted.iloc[:-1]

# one shifted dataframe per patient
split = [shift(patient_rows) for _, patient_rows in train_df.groupby('Patient', as_index=False)]

# shuffle patient positions so that whole patients (not single rows) are
# assigned to train / validation / test
split_len = len(split)
rand_idx = list(range(split_len))
random.shuffle(rand_idx)

# cut points for train: 70%, validation: 15%, test: 15%
train_end = int(split_len * 0.7)
val_end = int(split_len * 0.85)

train_list = [split[idx] for idx in rand_idx[:train_end]]
val_list = [split[idx] for idx in rand_idx[train_end:val_end]]
test_list = [split[idx] for idx in rand_idx[val_end:]]

# stack the per-patient frames of each subset back into a single dataframe
train_df = pd.concat(train_list)
val_df = pd.concat(val_list)
test_df = pd.concat(test_list)
train_df

In [None]:
# write each split to the Kaggle working directory
for frame, path in (
    (train_df, '/kaggle/working/train.csv'),
    (val_df, '/kaggle/working/val.csv'),
    (test_df, '/kaggle/working/test.csv'),
):
    frame.to_csv(path)

In [None]:
# keep the statistics of the un-normalized target so predictions can be mapped back
target_mean, target_stdev = train_df['FVC(t+1)'].mean(), train_df['FVC(t+1)'].std()

# standardize the FVC columns; mean_list / stdev_list record the statistics
# in the order the columns appear in train_df
mean_list, stdev_list = [], []
fvc_columns = [name for name in train_df.columns if name in ('FVC', 'FVC(t+1)')]
for c in fvc_columns:
    mean, stdev = train_df[c].mean(), train_df[c].std()
    mean_list.append(mean)
    stdev_list.append(stdev)
    # all three splits are scaled with the statistics of the training split
    for frame in (train_df, val_df, test_df):
        frame[c] = (frame[c] - mean) / stdev

# drop rows containing NaN and renumber the rows before training starts
for frame in (train_df, val_df, test_df):
    frame.dropna(inplace=True)
    frame.reset_index(drop=True, inplace=True)

## PyTorch Dataset and Model Classes

In [None]:
class OASISDataset(Dataset):
    """Sliding-window dataset over the rows of a feature/target dataframe.

    The last column of the dataframe is the target; every other column is a
    feature. Item `i` is the window of `sequence_length` consecutive rows that
    ends at row `i`, paired with the target of row `i`. Windows that would
    start before row 0 are left-padded with copies of row 0.

    NOTE(review): windows are cut over consecutive rows with no notion of
    grouping. If the dataframe stacks the rows of several patients, windows
    near a boundary mix rows of two patients -- confirm this is intended.
    """

    def __init__(self, dataframe, sequence_length=10):
        """Store features and target of `dataframe` as float tensors.

        Args:
            dataframe: numeric frame whose last column is the target.
            sequence_length: number of rows in every returned window (>= 1).

        Raises:
            ValueError: if `sequence_length` is smaller than 1.
        """
        if sequence_length < 1:
            raise ValueError(f'sequence_length must be >= 1, got {sequence_length}')
        self.sequence_length = sequence_length

        # get column names for target and features
        self.target = dataframe.columns[-1]
        self.features = dataframe.columns[:-1]

        # save df values as torch tensors
        self.y = torch.tensor(dataframe[self.target].values).float()
        self.X = torch.tensor(dataframe[self.features].values).float()

    def __len__(self):
        """Number of rows, i.e. number of windows that can be requested."""
        return self.X.shape[0]

    def __getitem__(self, i):
        """Return `{'X': window, 'y': target}` for row `i`.

        `X` has shape `(sequence_length, n_features)`; `y` is the scalar target
        of row `i`. Negative indices count from the end, like list indexing.

        Raises:
            IndexError: if `i` is outside `[-len(self), len(self))`.
        """
        n_rows = len(self)
        if i < 0:
            # without this, a negative index would produce a window made only
            # of row-0 padding instead of the window ending at that row
            i += n_rows
        if not 0 <= i < n_rows:
            raise IndexError(f'index {i} is out of range for dataset of length {n_rows}')

        i_start = i - self.sequence_length + 1
        if i_start >= 0:
            # enough history: plain slice of `sequence_length` rows
            x = self.X[i_start:(i + 1), :]
        else:
            # not enough rows before i: pad on the left with copies of row 0
            padding = self.X[0].repeat(-i_start, 1)
            x = torch.cat((padding, self.X[0:(i + 1), :]), 0)

        # X and y were converted to float32 in __init__, no further cast needed
        return {
            'X': x,
            'y': self.y[i]
        }

In [None]:
class OASISModel(nn.Module):
    """Stacked LSTM followed by a linear layer that emits one value per sequence."""

    def __init__(self, input_size, hidden_units, num_layers):
        """Build the network.

        Args:
            input_size: number of features per time step.
            hidden_units: size of the hidden state of every LSTM layer.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_units = hidden_units
        self.num_layers = num_layers

        self.lstm = nn.LSTM(
            input_size=self.input_size,
            hidden_size=self.hidden_units,
            batch_first=True,
            num_layers=self.num_layers
        )

        self.linear = nn.Linear(hidden_units, 1)

    def forward(self, x):
        """Map a batch of sequences to one prediction each.

        Args:
            x: tensor of shape `(batch, seq_len, input_size)` (batch_first=True).

        Returns:
            Tensor of shape `(batch, 1)`.
        """
        # No explicit (h0, c0): nn.LSTM then starts from zero states that it
        # allocates on the same device and dtype as `x`. Building them here with
        # torch.zeros(...) pinned them to the CPU and broke GPU runs.
        _, (hn, _) = self.lstm(x)

        # hn is (num_layers, batch, hidden_units); the last entry is the final
        # hidden state of the top layer. hn[0] would read the first layer only,
        # leaving all deeper layers without any gradient.
        return self.linear(hn[-1])

In [None]:
# hyperparameters shared by the dataset, the model and the optimizer
batch_size = 16
sequence_length = 10
learning_rate = 5e-4
num_hidden_units = 10

# wrap each split in a windowed dataset using the same sequence length
train_dataset, valid_dataset, test_dataset = (
    OASISDataset(frame, sequence_length=sequence_length)
    for frame in (train_df, val_df, test_df)
)

# loaders keep the row order (no shuffling); the test loader serves the whole
# test set as a single batch
train_dataloader, valid_dataloader = (
    DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for dataset in (train_dataset, valid_dataset)
)
test_dataloader = DataLoader(test_dataset, batch_size=len(test_dataset), shuffle=False)

In [None]:
# build the model; the input size is the number of feature columns in the dataset
n_features = train_dataset.X.shape[1]
model = OASISModel(
    input_size=n_features,
    hidden_units=num_hidden_units,
    num_layers=3,
)

In [None]:
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over `loader`, training if `optimizer` is given, else evaluating.

    Returns:
        `(mean_loss, rmse, r2, predicted, actual)`. `mean_loss` is the
        sample-weighted mean of the criterion, `rmse` and `r2` are computed
        once over all samples of the pass, `predicted` is the stacked model
        output and `actual` the stacked targets, both as numpy arrays.

    Raises:
        ValueError: if `loader` yields no batches.
    """
    training = optimizer is not None
    model.train(training)

    total_loss = 0.0
    n_samples = 0
    predicted_batches = []
    actual_batches = []

    # autograd is only needed while the parameters are being updated
    with torch.set_grad_enabled(training):
        for batch in loader:
            # get batch and load on to device
            X, y = batch['X'].to(device), batch['y'].to(device)

            if training:
                # set to zero so gradients are not accumulated
                optimizer.zero_grad()

            out = model(X)
            loss = criterion(out, y.view(-1, 1))

            if training:
                loss.backward()
                optimizer.step()

            # the criterion averages over the batch, so weight by batch size
            total_loss += loss.item() * y.size(0)
            n_samples += y.size(0)

            predicted_batches.append(out.detach().cpu().numpy())
            actual_batches.append(y.detach().cpu().numpy())

    if n_samples == 0:
        raise ValueError('data loader produced no samples')

    predicted = np.concatenate(predicted_batches)
    actual = np.concatenate(actual_batches)

    # Metrics are computed once over the whole pass: averaging per-batch RMSE
    # is not the RMSE of the pass, and R^2 is undefined for a one-sample batch.
    # np.sqrt(MSE) is used because the `squared=False` argument of
    # mean_squared_error is not available in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(actual, predicted)))
    rsq = r2_score(actual, predicted)

    return total_loss / n_samples, rmse, rsq, predicted, actual


def train_reg(model, train_oader, valid_loader, test_loader, epochs):
    """Train `model`, evaluate it on the test loader, then plot and save the results.

    Args:
        model: network mapping a batch `X` to predictions of shape (batch, 1).
        train_oader: training data loader (the misspelled name is kept so
            existing keyword callers keep working).
        valid_loader: validation data loader.
        test_loader: test data loader.
        epochs: number of training epochs.

    Each loader must yield dicts with keys 'X' and 'y'.

    Besides its arguments the function reads the notebook globals
    `learning_rate`, `test_df`, `mean_list` and `stdev_list`; `test_loader` is
    expected to serve the rows of `test_df` in order. It prints metrics, shows
    three figures and writes `/kaggle/working/results.csv`. Returns None.
    """
    # have torch recognize GPU if it exists, otherwise use CPU
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # load model onto the device
    model.to(device)

    # Adam optimizer with L1 (mean absolute error) loss
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.L1Loss()

    print("start of training loop")

    # creating lists to store training and validation losses
    train_losses = []
    val_losses = []

    # wandb.init(project='osic_lstm', entity='ashcher51')
    # wandb.watch(model)

    # iterating model training for each epoch
    for epoch in track(range(epochs), description='Training...'):
        train_loss, train_rmse, train_rsq, _, _ = _run_epoch(
            model, train_oader, criterion, device, optimizer
        )
        train_losses.append(train_loss)

        # wandb.log({'train_loss':train_loss,'train_rmse':train_rmse,'train_rsq':train_rsq,})

        # Validation
        val_loss, val_rmse, val_rsq, _, _ = _run_epoch(
            model, valid_loader, criterion, device
        )
        val_losses.append(val_loss)

        # wandb.log({'val_loss':val_loss,'val_rmse':val_rmse,'val_rsq':val_rsq,})

        # print metrics
        print(
            "\n",
            "\n",
            f"Epoch {epoch+1}/{epochs}:\n",
            f"Train loss: {train_loss:.3f}...\n",
            f"Valid loss: {val_loss:.3f}...\n",
            "\n",
            f"Train RMSE: {train_rmse:.3f}...\n",
            f"Valid RMSE: {val_rmse:.3f}...\n",
            "\n",
            f"Train R^2: {train_rsq}...\n",
            f"Valid R^2: {val_rsq}...\n",
        )

    # Test
    _, test_rmse, test_rsq, test_predicted, test_actual = _run_epoch(
        model, test_loader, criterion, device
    )

    # wandb.log({'test_rmse':test_rmse,'test_rsq':test_rsq,})

    print(
        "\n",
        "\n",
        "\n",
        "Training Finished!\n",
        f"Test RMSE: {test_rmse:.3f}...\n",
        f"Test R^2: {test_rsq:.3f}...\n\n",
        f"Predictions: {test_predicted}\n",
        f"Actual: {test_actual} \n\n",
    )

    print(f'\nlen of predicted: {len(test_predicted)}')
    # display loss curves
    print('Loss Curves: ')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    # plot train losses per epoch
    plt.plot(list(range(epochs)), train_losses, label='train')
    # plot validation losses per epoch
    plt.plot(list(range(epochs)), val_losses, label='valid')
    plt.legend()
    plt.show()

    # display scatterplot
    plt.scatter(test_actual, test_predicted)
    # plt.scatter(test_actual * target_stdev + target_mean, test_predicted * target_stdev + target_mean) # <- scaled version of scatterplot
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()

    # mean_list / stdev_list hold the statistics of the input FVC column at
    # index 0 and of the target column at index 1
    fvc_mean, fvc_stdev = mean_list[0], stdev_list[0]
    next_mean, next_stdev = mean_list[1], stdev_list[1]
    predicted_flat = test_predicted.ravel()

    # get sample patient progression from test rows 9..15
    # NOTE(review): this assumes those rows belong to a single patient -- confirm
    start, stop = 9, 16
    if min(len(test_df), len(predicted_flat)) > start:
        # The curve starts at the measured FVC of the first row and continues
        # with the next-visit values. The test outputs are used here; the
        # variables of the last validation batch would not match test_df.
        baseline_fvc = test_df['FVC'].iloc[start] * fvc_stdev + fvc_mean
        sub_pred = np.concatenate(
            ([baseline_fvc], predicted_flat[start:stop] * next_stdev + next_mean)
        )
        sub_actual = np.concatenate(
            ([baseline_fvc], test_actual[start:stop] * next_stdev + next_mean)
        )
        sub_time = np.concatenate(
            ([test_df['Weeks'].iloc[start]], test_df['Weeks(t+1)'].iloc[start:stop].to_numpy())
        )

        print(f'actual: {sub_actual}')
        print(f'pred: {sub_pred}')

        print(sub_time.reshape(1, -1))
        print(sub_pred.reshape(1, -1))
        print(sub_actual.reshape(1, -1))

        plt.plot(sub_time, sub_pred, label='predicted')
        plt.plot(sub_time, sub_actual, label='actual')
        plt.legend()
        plt.show()
    else:
        print('not enough test rows for the sample progression plot')

    # get results and save test_df, scaled FVC, actual values, and predicted values to file
    res_df = pd.concat(
        [
            test_df,
            pd.Series(test_df['FVC'] * fvc_stdev + fvc_mean, name='FVC scaled'),
            pd.Series(test_actual * next_stdev + next_mean, name='actual'),
            pd.Series(predicted_flat * next_stdev + next_mean, name='predicted'),
        ],
        axis=1,
    )
    res_df.to_csv('/kaggle/working/results.csv', index=False)

    # wandb.finish()
    # simulate progression
#     sub = 0
#     weeks = [-1.442765, -1.123454, -0.963798, -0.857361, -0.804142, -0.484831, 0.153792, 0.792414, 2.687218]
#     FVC = 2.865436
#     percent = 2.188235
#     age = 0
#     sex = 0.513665
#     smoking = -0.397809
    
#     predicted_fvc = []
    
#     for i, week in enumerate(weeks[:-1]):
#         model_in = torch.tensor([sub, week, FVC, percent, age, sex, smoking, weeks[i+1]], dtype=torch.float)
#         model_out = model(model_in)
#         predicted_fvc.append(model_out * target_stdev + target_mean)
#         FVC = model_out
    
#     plt.plot(weeks, predicted_fvc)
#     plt.show()

In [None]:
# train for 500 epochs, then evaluate on the test split and plot the results
train_reg(
    model,
    train_dataloader,
    valid_dataloader,
    test_dataloader,
    epochs=500,
)