# OSIC Pulmonary Fibrosis Progression

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
import pydicom

from os import listdir

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torchvision import transforms
import torch.nn.functional as F
import torch.optim as optim
import torchvision.models as tmodels

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

torch.manual_seed(42)

In [None]:
# Device selection: use CUDA when PyTorch can see a GPU, otherwise the CPU.
# Batches hold a single patient; test() and predict() call .item() on each
# batch output, which only works for single-element tensors.
BATCH_SIZE = 1

use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda")
    print("GPU enabled")
else:
    device = torch.device("cpu")
    print("WARNING: CPU will be used for training.")

# To force CPU execution (e.g. for debugging), uncomment:
#device = torch.device("cpu"); print("WARNING: CPU will be used for training.")#

## Data

In [None]:
# --- Load the competition tables ---------------------------------------------
train_data = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/train.csv")
test_data = pd.read_csv("../input/osic-pulmonary-fibrosis-progression/test.csv")

# Patients excluded from training; the trailing labels are the original
# author's reasons for the exclusion.
bad_ids = [
    'ID00011637202177653955184', 'ID00052637202186188008618',  # gdcm
    'ID00026637202179561894768', 'ID00078637202199415319443',  # bad window
    'ID00128637202219474716089', 'ID00132637202222178761324',  # bad window
    'ID00214637202257820847190', 'ID00242637202264759739921',  # bad window
    'ID00248637202266698862378',                               # bad window
]
train_data.drop(train_data[train_data.Patient.isin(bad_ids)].index, inplace=True)

# The original author notes that the test patients also appear in the train
# table. Their train rows are set aside in `cheat_results` and then removed
# from the training data.
test_ids = test_data.Patient.unique()
cheat_results = train_data[train_data.Patient.isin(test_ids)]
train_data.drop(cheat_results.index, inplace=True)

# keep=False discards *every* row of a duplicated (Patient, Weeks) pair,
# not just the repeats.
train_data.drop_duplicates(subset=['Patient', 'Weeks'], keep=False, inplace=True)

# --- Encode the tabular features identically for train and test --------------
avg_age = 67
max_age = 100
sex_codes = {'Male': 0,'Female': 1}
smoking_codes = {'Never smoked': -1,
                 'Ex-smoker': 0,
                 'Currently smokes': 1}

for table in (train_data, test_data):
    table['Sex'] = table['Sex'].map(sex_codes)
    table['SmokingStatus'] = table['SmokingStatus'].map(smoking_codes)
    table['Age'] = table['Age'] / max_age

# Location of each test patient's preprocessed scan volume.
test_data['ImagePath'] = '../input/preprocessed-scans-osic-pfp/' + test_data.Patient + '.npy'

In [None]:
# Collapse the per-visit training rows into one row per patient:
# demographics, the first recorded visit, and a straight-line fit of FVC over
# time. The Percent column is ignored.

patient_id, age, sex, smoking_status = [], [], [], []
slope, intercept = [], []
first_week, first_fvc = [], []
image_path = []

patients = train_data.Patient.unique()[:]
for patient in patients:

    # All visits of this patient, earliest week first.
    patient_table = train_data[train_data.Patient == patient].sort_values(by=['Weeks'])
    weeks = patient_table.Weeks.to_numpy()
    fvc = patient_table.FVC.to_numpy()

    # Least-squares fit of fvc ~ slope * week + intercept.
    design = np.vstack([weeks, np.ones(len(weeks))]).T
    slope_, intercept_ = np.linalg.lstsq(design, fvc, rcond=None)[0]

    patient_id.append(patient)
    image_path.append('../input/preprocessed-scans-osic-pfp/' + patient + '.npy')

    # Demographics and baseline measurement come from the earliest visit.
    age.append(patient_table.Age.values[0])
    sex.append(patient_table.Sex.values[0])
    smoking_status.append(patient_table.SmokingStatus.values[0])
    first_week.append(patient_table.Weeks.values[0])
    first_fvc.append(patient_table.FVC.values[0])

    slope.append(slope_)
    intercept.append(intercept_)


patients_data = pd.DataFrame({
    "Patient": patient_id,
    "Age": age,
    "Sex": sex,
    "SmokingStatus": smoking_status,
    "Weeks": first_week,
    "FVC": first_fvc,
    "Slope": slope,
    "Intercept": intercept,
    "ImagePath": image_path,
})

# Rescale Slope linearly so that -30 maps to 0 and 15 maps to 1 (the original
# note gives these as the slope min/max). The post-processing cell inverts
# this with slope*45 - 30.
patients_data.Slope = (patients_data.Slope + 30) / 45

print('patients_data shape:', patients_data.shape)
patients_data.head()

In [None]:
class PulmonaryFibrosisProgressionDataset(Dataset):
    """Patient-level dataset yielding ``(tensor_images, tabular_data, target)``.

    Args:
        df: Table with one row per patient. Must contain the columns
            'Age', 'Sex' and 'SmokingStatus', plus 'Slope' when ``train`` is
            set.
        train: When true, the target is read from the 'Slope' column.
        test: When true (and ``train`` is false), the target is a zero
            placeholder of length 2.
        transform: Stored on the instance; only referenced by the currently
            disabled image-loading code.

    NOTE(review): image loading is disabled at the moment, so every item
    carries an all-zero (3, 512, 512) image tensor.
    """

    def __init__(
            self,
            df,
            train: bool = False,
            test: bool = False,
            transform = None,
    ) -> None:

        self.train = train
        self.test = test

        self.df = df

        # 'ImagePath' is listed last so it can be sliced off when building
        # the numeric tabular vector.
        self.data_features = ['Age', 'Sex', 'SmokingStatus', 'ImagePath']
        self.target_features = ['Slope']#, 'Intercept']

        self.transform = transform

    @staticmethod
    def _sorted_slice_numbers(files):
        """Return the integer stems of *files* (e.g. '12.dcm' -> 12), descending."""
        # Built-in int replaces np.int, which was removed in NumPy 1.24.
        file_nums = [int(each_file.split(".")[0]) for each_file in files]
        return np.sort(file_nums)[::-1]

    def __getitem__(self, index: int):
        """Return ``(tensor_images, tabular_data, target)`` for row *index*.

        Raises:
            ValueError: If the dataset was built with neither ``train`` nor
                ``test`` set, so no target can be produced.
        """
        patient_data = self.df.iloc[index]

        # NOTE(review): the DICOM directory is hard-coded to a single patient
        # and its listing is only consumed by the disabled code below.
        dcm_path = '../input/osic-pulmonary-fibrosis-progression/train/{}/'.format('ID00007637202177411956430')
        sorted_file_nums = self._sorted_slice_numbers(listdir(dcm_path))

        tensor_images = torch.zeros((3, 512, 512))
        # Disabled: fill the three channels with the middle DICOM slices.
        # NOTE(review): the third line writes channel 1 again, probably meant 2.
        #tensor_images[0] = self.transform(pydicom.dcmread(dcm_path + str(sorted_file_nums[len(sorted_file_nums)//2 - 1]) + ".dcm" ).pixel_array.astype(np.float32))
        #tensor_images[1] = self.transform(pydicom.dcmread(dcm_path + str(sorted_file_nums[len(sorted_file_nums)//2    ]) + ".dcm" ).pixel_array.astype(np.float32))
        #tensor_images[1] = self.transform(pydicom.dcmread(dcm_path + str(sorted_file_nums[len(sorted_file_nums)//2 + 1]) + ".dcm" ).pixel_array.astype(np.float32))

        # Disabled: load slices from the preprocessed .npy volume instead.
        #loaded_images = np.load(patient_data['ImagePath'])
        #tensor_images = torch.zeros((1, 512, 512))
        #
        #for i in [0,1,2]:
        #    tensor_images[i] = self.transform(-1*loaded_images[4+i])

        # Every feature except the trailing 'ImagePath', as a float vector.
        tabular_data = patient_data[self.data_features[:-1]].to_numpy(dtype=float)

        if self.train:
            target = patient_data[self.target_features].to_numpy(dtype=float)
        elif self.test:
            # Placeholder only; no ground truth is available in test mode.
            target = np.zeros(2, dtype=float)
        else:
            # Previously this fell through to an UnboundLocalError on return.
            raise ValueError("Dataset must be created with train=True or test=True")

        return tensor_images, tabular_data, target

    def __len__(self) -> int:
        """Return the number of rows in the underlying table."""
        return len(self.df)

In [None]:
# Image preprocessing pipeline handed to the datasets. Only tensor conversion
# is active; the candidate extra steps below are kept disabled:
#   transforms.Normalize(-2000, 5000)
#   transforms.RandomAffine(degrees=(-5, 5), translate=(0.0, 0.0))
transform = transforms.Compose([transforms.ToTensor()])

In [None]:
# Smoke test: build a training-mode dataset over the patient table and show
# the shape of the first sample's image tensor.
a = PulmonaryFibrosisProgressionDataset(patients_data, transform=transform, train=True)

a[0][0].shape

In [None]:
# First sample's image tensor with its leading axis moved to the end
# (axes reordered 0,1,2 -> 1,2,0).
image = a[0][0].permute(1, 2, 0)

In [None]:
# Distribution of all values in the sample image.
# NOTE(review): distplot is reported as deprecated in recent seaborn releases,
# and later cells in this notebook already use histplot - consider switching.
sns.distplot(image.flatten())

## Model

In [None]:
class ModelM3(nn.Module):
    """DenseNet-161 image branch fused with three tabular features.

    The 1000-wide DenseNet output is reduced to 10 values, concatenated with
    the 3 tabular features, and passed through two further linear layers to a
    single tanh-squashed output.
    """

    def __init__(self):
        super().__init__()

        final_image_width = 10
        features_length = 3
        concat_length = final_image_width + features_length

        self.densenet = tmodels.densenet161(pretrained=True)
        self.fc1 = nn.Linear(1000, final_image_width)
        self.fc2 = nn.Linear(concat_length, concat_length)
        self.fc3 = nn.Linear(concat_length, 1)
        # Registered as a submodule so model.train()/model.eval() toggle it.
        # Dropout has no parameters or buffers, so state_dict keys (and
        # existing checkpoints) are unaffected.
        self.dropout = nn.Dropout()

    def forward(self, x):
        """Compute the prediction for one batch.

        Args:
            x: Sequence whose first element is the image batch and whose
                second element is the tabular feature batch. Any further
                elements (callers also pass the target) are ignored.

        Returns:
            Tensor with one tanh-squashed value per sample.
        """
        image = x[0]
        data = x[1]

        result = self.densenet(image)
        result = torch.tanh(self.fc1(result))
        result = torch.cat((result, data), dim=1)
        # Previously `nn.Dropout()(result)`: a module created on every call is
        # always in training mode, so dropout stayed active during evaluation.
        result = self.dropout(result)
        result = torch.tanh(self.fc2(result))
        result = torch.tanh(self.fc3(result))

        return result

In [None]:
def get_n_params(model):
    """Return the total number of elements across ``model.parameters()``."""
    return sum(tensor.nelement() for tensor in model.parameters())

In [None]:
class EMA:
    """Exponential moving average of a model's trainable parameters.

    ``shadow`` holds the running averages, keyed by parameter name.
    ``original`` holds the live weights while the averages are swapped into
    the model by :meth:`assign`, so :meth:`resume` can put them back.
    """

    def __init__(self, model, decay):
        self.decay = decay
        # Start every average at the parameter's current value.
        self.shadow = {
            name: param.data.clone()
            for name, param in model.named_parameters()
            if param.requires_grad
        }
        self.original = {}

    def __call__(self, model, num_updates):
        """Blend the current weights into the averages.

        The effective decay is capped by ``(1 + n) / (10 + n)``, so early
        updates track the live weights more closely than ``self.decay`` alone
        would allow.
        """
        warmup = (1.0 + num_updates) / (10.0 + num_updates)
        decay = min(self.decay, warmup)
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            blended = (1.0 - decay) * param.data + decay * self.shadow[name]
            self.shadow[name] = blended.clone()

    def assign(self, model):
        """Swap the averaged weights into *model*, remembering the live ones."""
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            self.original[name] = param.data.clone()
            param.data = self.shadow[name]

    def resume(self, model):
        """Restore the live weights saved by the last :meth:`assign` call."""
        for name, param in model.named_parameters():
            if not param.requires_grad:
                continue
            assert name in self.shadow
            param.data = self.original[name]

In [None]:
# Loss function shared by train() and test(): mean absolute error.
criterion = F.l1_loss

# Per-batch loss history, appended to by train() and test().
loss_points = {'train': [], 'test': []}

def train(epoch, model):
    """Run one optimisation pass over the module-level ``train_loader``.

    Uses the module-level ``optimizer``, ``criterion``, ``device`` and ``ema``.
    Each batch loss is appended to ``loss_points['train']`` and a progress
    line is printed every 20 batches.

    Args:
        epoch: Zero-based epoch index, used only for the progress output.
        model: The network to train (modified in place).

    NOTE(review): the step counter handed to ``ema`` restarts on every call,
    so the EMA warm-up schedule restarts each epoch.
    """
    model.train()

    for step, (image, data, target) in enumerate(train_loader, start=1):
        # send to device
        image = image.to(device)
        data = data.float().to(device)
        target = target.float().to(device)

        optimizer.zero_grad()
        output = model([image, data, target])
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        loss_points['train'].append(loss_value)

        # `step` counts the updates performed so far in this epoch.
        ema(model, step)

        # Report on the 1st, 21st, 41st, ... batch. The printed "Loss" is the
        # squared batch loss.
        if step % 20 == 1:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\t output: {:.3f},\t target: {:.3f}'.format(
                epoch+1, step * len(data), len(train_loader.dataset),
                100. * step / len(train_loader), loss_value**2, output[0][0], target[0][0]))
                
def test(model):
    """Evaluate *model* on the module-level ``test_loader`` using EMA weights.

    The averaged weights from the module-level ``ema`` are swapped in for the
    evaluation and the live weights are restored afterwards, even if the
    evaluation raises. Each batch loss is appended to ``loss_points['test']``.

    Args:
        model: The network to evaluate.

    Returns:
        The sum over batches of the squared summed L1 loss, divided by the
        number of samples in the loader's dataset.

    NOTE(review): the ``.item()`` calls below require single-element outputs
    and targets, i.e. a batch size of 1 with one target column.
    """
    model.eval()
    ema.assign(model)
    test_loss = 0
    predictions = []
    targets = []

    try:
        with torch.no_grad():
            for image, data, target in test_loader:
                # send to device
                image, data, target = image.to(device), data.float().to(device), target.float().to(device)

                output = model([image, data, target])
                targets.append(target)

                loss_points['test'].append(criterion(output, target).item())

                test_loss += criterion(output, target, reduction='sum').item()**2 # sum up batch loss
                predictions.append(output)

                print('output: {:.3f},\t target: {:.3f}'.format(output[0][0], target[0][0]))
    finally:
        # Always restore the live weights; otherwise a failed evaluation would
        # leave the averaged weights in the model for the next training step.
        ema.resume(model)

    outputs = np.array([prediction.item() for prediction in predictions], dtype=float)
    targets = np.array([target.item() for target in targets], dtype=float)

    test_loss /= len(test_loader.dataset)
    correlation = np.corrcoef(outputs, targets)[0][1]
    print('\nTest set: Average loss: {:.4f}\t Correlation: {:.4f}\n'.format(test_loss, correlation))

    return test_loss


def predict(model):
    """Run *model* over the module-level ``prediction_loader``.

    The model's current weights are used as they are; the EMA weight swap is
    not applied here.

    Args:
        model: The network to run in evaluation mode.

    Returns:
        dict mapping each patient id to the scalar model output. The id for
        batch ``i`` is taken from row ``i`` of the module-level
        ``patients_testing_data``.
    """
    model.eval()
    predictions = {}

    with torch.no_grad():
        for i, (image, data, target) in enumerate(prediction_loader):
            patient = patients_testing_data.iloc[i].Patient
            print('Predicting patient:', patient)

            # send to device
            image = image.to(device)
            data = data.float().to(device)
            target = target.float().to(device)

            predictions[patient] = model([image, data, target]).item()

    return predictions

In [None]:
def score(fvc_true, fvc_pred, sigma):
    """Return the mean Laplace-style penalty of FVC predictions.

    For each element the penalty is
    ``sqrt(2) * delta / sigma_clip + log(sqrt(2) * sigma_clip)``, where
    ``delta`` is the absolute error capped at 1000 and ``sigma_clip`` is
    ``sigma`` floored at 70. The penalty grows with the absolute error.

    Args:
        fvc_true: Observed FVC values.
        fvc_pred: Predicted FVC values.
        sigma: Predicted uncertainty for each prediction.

    Returns:
        The mean penalty over all elements.
    """
    root_two = np.sqrt(2)
    clipped_sigma = np.maximum(sigma, 70)
    capped_error = np.minimum(np.abs(fvc_true - fvc_pred), 1000)
    penalty = (capped_error / clipped_sigma)*root_two + np.log(clipped_sigma* root_two)
    return np.mean(penalty)

## KFold

In [None]:
#transform = transforms.Compose([
#                transforms.ToTensor(),
#                transforms.Normalize(572.5, 255.),
#                #transforms.RandomAffine(degrees=(-5, 5), translate=(0.0, 0.0))
#             ])

In [None]:
# 5-fold split over the patient table.
# NOTE(review): no random_state is passed, so the shuffle differs between runs.
kfold = KFold(n_splits=5, shuffle=True)
# Materialise the (train_ids, test_ids) pairs. kfold.split() returns a one-shot
# generator, so re-running the fold loop cell (which breaks out after the first
# fold) would otherwise resume at the next split or find nothing left.
kfold_split = list(kfold.split(patients_data[:]))

In [None]:
# Training hyper-parameters.
n_epochs = 30
ema_decay = 0.999

# Reference model whose weights every fold starts from.
model = ModelM3()
# NOTE(review): state_dict() is not a copy - it refers to the model's own
# tensors. This stays valid here because this particular model instance is
# never trained (the fold loop builds a new one).
initial_state = model.state_dict()

# Best-so-far bookkeeping, updated by the fold loop.
best_state = initial_state
best_fold = 0
test_loss = 100  # initial "worse than anything" value for the comparison
best_test_ids = 0  # placeholder until a fold records its validation indices

In [None]:
# Reset the reference model to the stored initial weights.
model.load_state_dict(initial_state)

In [None]:
import copy  # cell-local import: used to snapshot the best weights below

# Train on the K-fold splits and keep the weights of the epoch with the lowest
# validation loss. Only the first fold is run (see the `break` at the end).
for fold, (train_ids, test_ids) in enumerate(kfold_split):

    patients_training_data = patients_data.iloc[train_ids]
    patients_testing_data  = patients_data.iloc[test_ids]

    if fold == 0:
        print('Train samples:', len(patients_training_data))
        print('Test samples:', len(patients_testing_data))

    print('-'*100)
    print('Fold number:', fold)

    train_loader = torch.utils.data.DataLoader(
        PulmonaryFibrosisProgressionDataset(patients_training_data, train=True, transform=transform),
        batch_size=BATCH_SIZE, shuffle=True)

    # The validation split also uses train=True so that real targets are returned.
    test_loader = torch.utils.data.DataLoader(
        PulmonaryFibrosisProgressionDataset(patients_testing_data, train=True, transform=transform),
        batch_size=BATCH_SIZE)

    # Every fold starts from the same initial weights.
    model = ModelM3()
    model.load_state_dict(initial_state)
    model.to(device)

    optimizer = optim.Adam(model.parameters(), lr=0.01)

    ema = EMA(model, decay=ema_decay)

    for epoch in range(n_epochs):

        train(epoch, model)
        current_loss = test(model)

        if current_loss < test_loss:
            test_loss = current_loss
            # Snapshot the weights. state_dict() refers to the live tensors,
            # so without a copy the "best" state would be overwritten by every
            # later training step.
            # NOTE(review): test() scores the EMA weights, while this stores
            # the live weights - confirm which set should be saved.
            best_state = copy.deepcopy(model.state_dict())
            best_fold = fold
            best_test_ids = test_ids

    print('\n\n')

    # Only the first fold is trained.
    break

print('Best fold:', best_fold)
torch.save(best_state, '../working/best_state.pt')
print('Done!')

## Predictions

In [None]:
# Predict for every row of the test table.
patients_testing_data  = test_data#.iloc[best_test_ids]

prediction_loader = torch.utils.data.DataLoader(
    PulmonaryFibrosisProgressionDataset(patients_testing_data, test=True, transform=transform),
    batch_size=BATCH_SIZE)

model = ModelM3()
# map_location lets a checkpoint saved on a GPU be loaded on a CPU-only
# machine (and the other way round).
model.load_state_dict(torch.load('../input/best-state/best_state.pt', map_location=device))
model.to(device)

# Keep the module-level `ema` in step with the freshly loaded model; test()
# relies on it, while predict() uses the model weights directly.
ema = EMA(model, decay=ema_decay)

#test(model)
predictions = predict(model)


print('\n\n')
print('Done!')

In [None]:
# Display the patient -> model output mapping returned by predict().
predictions

In [None]:
# All predicted values as one array.
# NOTE: this rebinds `a`, which earlier held the smoke-test dataset.
a = np.array(list(predictions.values()))

In [None]:
# One-standard-deviation band around the mean prediction.
print(a.mean()-a.std(), a.mean()+a.std())

In [None]:
# Histogram of the rescaled training slopes, for comparison with the predictions.
sns.histplot(patients_data.Slope.values, bins=200)

In [None]:
# Histogram of the predicted values, shown on the x-range 0..1 that the
# rescaled training slopes are mapped to.
sns.histplot(a, bins=200)
plt.xlim(0, 1)

## Post processing

In [None]:
# Build the submission table: for every predicted patient, extrapolate a
# straight FVC line through the patient's row in the test table.

# Weeks to predict for: -12 .. 133 inclusive (the same for every patient).
predicted_weeks = list(range(-12, 134))

# Rows are collected in a list and turned into a DataFrame once.
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
submission_rows = []
for patient, normalized_slope in predictions.items():

    patient_rows = test_data[test_data.Patient == patient]
    true_fvc = patient_rows.FVC.values[0]
    true_weeks = patient_rows.Weeks.values[0]

    # Undo the (slope + 30) / 45 rescaling applied to the training slopes,
    # then anchor the line at the known (week, FVC) measurement.
    # Distinct names keep the module-level `slope` / `intercept` lists intact.
    fvc_slope = normalized_slope*45 - 30
    fvc_intercept = true_fvc - fvc_slope*true_weeks

    predicted_fvc = [fvc_slope*weeks + fvc_intercept for weeks in predicted_weeks]

    # Measurements of this patient that were set aside from the train table;
    # only used by the disabled diagnostic plots below.
    cheat_results_weeks = cheat_results[cheat_results.Patient == patient].Weeks.values
    cheat_results_fvc = cheat_results[cheat_results.Patient == patient].FVC.values

    patient_week = [patient+'_'+str(week) for week in predicted_weeks]

    # Confidence is a constant 70 for every row.
    submission_rows.extend(
        {'Patient_week': row_id, 'FVC': fvc_value, 'Confidence': 70}
        for row_id, fvc_value in zip(patient_week, predicted_fvc)
    )

    #sns.scatterplot(x=predicted_weeks, y=predicted_fvc)
    #sns.scatterplot(x=cheat_results_weeks, y=cheat_results_fvc)


results = pd.DataFrame(submission_rows, columns=['Patient_week', 'FVC', 'Confidence'])
results

In [None]:
# Write the submission file without the DataFrame index column.
results.to_csv('../working/submission.csv',index=False)

In [None]:
# Per-batch training loss, in the order the batches were processed.
sns.lineplot(x=range(len(loss_points['train'])), y=loss_points['train'])

In [None]:
# Per-batch evaluation loss, in the order the batches were processed.
sns.lineplot(x=range(len(loss_points['test'])), y=loss_points['test'])

In [None]:
# List the names exposed by torchvision.models.
dir(tmodels)