# Fitting the data
You need to find a parameterized curve that can match all of these predictions. The simplest thing we can do is linear regression, and it seems to work fairly well with the known error of 70 ml. We want to use a neural network to predict the slope and intercept of each FVC curve; to do so, we first need to find the slope and intercept that best fit the data.

To deal with all of the issues associated with this fitting, we can try to use Bayesian programming, as outlined in [this](https://docs.pymc.io/notebooks/GLM-robust.html) tutorial and as performed below. This could potentially allow us to find the Bayesian estimate for confidence, etc., using a neural network.

In [None]:
import numpy as np
import pandas as pd
import pydicom
import os
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

In [None]:
import tensorflow as tf
import tensorflow.keras.backend as K
import tensorflow.keras.layers as L
import tensorflow.keras.models as M

In [None]:
import pymc3 as pm
import theano

In [None]:
def seed_everything(seed=2020):
    """Seed the random number sources used by this notebook.

    Sets the PYTHONHASHSEED environment variable and seeds Python's
    ``random`` module, NumPy's global generator and TensorFlow with the
    same value.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The three seeding calls are independent of one another, so apply them
    # in a single pass.
    for apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
        apply_seed(seed)


seed_everything(42)

In [None]:
# Directory that the competition CSV files are read from.
ROOT = "../input/osic-pulmonary-fibrosis-progression"
# Batch size setting; not referenced by any of the cells visible in this notebook.
BATCH_SIZE = 128

In [None]:
# Load the training table; the cells below read its Patient, Weeks, FVC and
# Percent columns.
train_csv_path = f"{ROOT}/train.csv"
tr = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the training table.
tr.head()

In [None]:
# Overlay the FVC-versus-week trajectories of the first 20 patients on one
# set of axes.
patient_ids = tr['Patient'].unique()
for pid in patient_ids[:20]:
    patient_rows = tr.loc[tr['Patient'] == pid]
    plt.plot(patient_rows['Weeks'], patient_rows['FVC'])
plt.show()

In [None]:
# Overlay the Percent-versus-week trajectories of the first 20 patients on
# one set of axes.
n_obs = []  # initialised here, but nothing in this cell appends to it
for pid in patient_ids[:20]:
    patient_rows = tr.loc[tr['Patient'] == pid]
    plt.plot(patient_rows['Weeks'], patient_rows['Percent'])
plt.show()

# Fit the data and sample curves
Each patient's weeks are rescaled by a different amount in the fit below, so the fitted slopes and intercepts are not directly comparable between patients; they need to be transformed back to the full (original) range of weeks.

In [None]:
# Fit a robust (Student-T likelihood) straight line FVC ~ Weeks to every
# patient and keep `nsamples` posterior draws of the slope and intercept.
#
# The week axis is min-max scaled to [0, 1] per patient before fitting, so the
# sampled coefficients describe  y = b + m * (x - x_min) / (x_max - x_min).
# Expressed on the original week axis this is
#     slope     = m / (x_max - x_min)
#     intercept = b - slope * x_min
# and those back-transformed values are what gets plotted and stored.
patient_ids = tr['Patient'].unique()
nsamples = 100
names = []
# Rows 2*i and 2*i + 1 hold the slope and intercept draws of patient i.
samples = np.zeros((2 * len(patient_ids), nsamples))
for i, pid in enumerate(patient_ids):
    mx = tr['Patient'] == pid
    x_sample = tr['Weeks'].loc[mx]
    y_sample = tr['FVC'].loc[mx]

    xarr = np.array(x_sample)
    x_min, x_max = xarr.min(), xarr.max()
    # NOTE(review): x_span is 0 for a patient whose visits all share one week,
    # which makes the scaling below divide by zero - confirm the data never
    # contains such a patient.
    x_span = x_max - x_min
    xd = (xarr - x_min) / x_span

    data = dict(x=xd, y=np.array(y_sample))

    with pm.Model() as model:
        family = pm.glm.families.StudentT()
        pm.glm.GLM.from_formula('y ~ x', data, family=family)
        trace = pm.sample(nsamples, cores=2)

    # Keep the last `nsamples` draws of each coefficient (scaled-axis units).
    mslp = trace['x'][-nsamples:]
    mint = trace['Intercept'][-nsamples:]

    # Undo the min-max scaling so both coefficients refer to real weeks.
    aslp = mslp / x_span
    aint = mint - aslp * x_min

    plt.figure(figsize=(7, 5))
    plt.plot(x_sample, y_sample, 'x', label='data')
    # Draw every sampled line over the observed range padded by 5 weeks.
    rng = np.linspace(x_min - 5, x_max + 5, 11)
    plt.plot(
        np.array([rng] * nsamples).T,
        (np.outer(aslp, rng) + aint[:, np.newaxis]).T,
        'k',
        alpha=0.05,
    )

    samples[2 * i:2 * i + 2] = np.vstack((aslp, aint))
    names.append(pid)

    plt.show()

In [None]:
# import matplotlib as mpl
# plt.hist2d(samples[0],samples[1])
# plt.show()

In [None]:
# Persist the coefficient draws and the matching patient ids; np.save appends
# the ".npy" extension to each file name.
names_array = np.array(names)
np.save('samples', samples)
np.save('names', names_array)