In [1]:
import os
import random

import numpy as np
import pandas as pd

import lightgbm as lgb
from tqdm.notebook import tqdm

## Utils

In [2]:
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Parameters
    ----------
    seed : int
        Value handed to both ``random.seed`` and ``np.random.seed``.

    Returns
    -------
    None
    """
    # Python's stdlib generator first, then NumPy's legacy global generator.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

## Config

In [3]:
# Column names
ID = "Patient_Week"   # name of the row-key column built from Patient and Weeks
TARGET = "FVC"        # name of the column treated as the target

# Number of folds (its use is outside this section of the notebook)
N_FOLD = 4

# Reproducibility: seed the global RNGs once, right after the seed is defined
SEED = 42
seed_everything(SEED)

## Data Loading

In [4]:
# Directory containing train.csv, test.csv and sample_submission.csv.
# Set the OSIC_INPUT_DIR environment variable to point the notebook at another
# location; without it the original local directory is used.
path = os.environ.get(
    'OSIC_INPUT_DIR',
    '/home/prakhar/Desktop/ml/OSIC-Pulmonary-Fibrosis-Progression/input/',
)
# Later cells build file names with `path + 'train.csv'`, so make sure the
# directory ends with a path separator even when the override omits it.
path = os.path.join(path, '')

In [5]:
# Read the three input CSVs from `path`: training rows, test rows and the
# sample submission (kept as `ss`).
train, test, ss = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [6]:
# Peek at the first rows of the raw training table.
train.head()

Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker


### Constructing input

In [7]:
# Row key in the form "<Patient>_<Weeks>", stored under the column named by ID.
patient_str = train['Patient'].astype('str')
weeks_str = train['Weeks'].astype('str')
train[ID] = patient_str + '_' + weeks_str

print(train.shape)
train.head()

(1549, 8)


Unnamed: 0,Patient,Weeks,FVC,Percent,Age,Sex,SmokingStatus,Patient_Week
0,ID00007637202177411956430,-4,2315,58.253649,79,Male,Ex-smoker,ID00007637202177411956430_-4
1,ID00007637202177411956430,5,2214,55.712129,79,Male,Ex-smoker,ID00007637202177411956430_5
2,ID00007637202177411956430,7,2061,51.862104,79,Male,Ex-smoker,ID00007637202177411956430_7
3,ID00007637202177411956430,9,2144,53.950679,79,Male,Ex-smoker,ID00007637202177411956430_9
4,ID00007637202177411956430,11,2069,52.063412,79,Male,Ex-smoker,ID00007637202177411956430_11


In [17]:
# Build the training table: for every patient, each visit is used in turn as
# the "base" measurement and paired with every visit of that same patient as
# the measurement to predict.
rename_cols = {'Weeks' : 'base_week', 'FVC' : 'base_FVC', 'Percent' : 'base_percent', 'Age' : 'base_age'}
drop_cols = ['Sex', 'SmokingStatus', 'Percent', 'Age']

gb = train.groupby('Patient')
tk0 = tqdm(gb, total=len(gb))

# Collect the per-(patient, base week) frames and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every
# accumulated row on each iteration.
pair_frames = []
for _, usr_df in tk0:
    # Prediction side of the pair; identical for every base week of a patient.
    predict_rows = usr_df.drop(columns=drop_cols).rename(columns={'Weeks' : 'predict_week'})
    for _week, tmp in usr_df.groupby('Weeks'):
        # Base side: the visit(s) at this week, with measurement columns
        # prefixed by `base_`; the row key is dropped so that the merged frame
        # keeps only the key of the predicted row.
        base_rows = tmp.drop(columns=ID).rename(columns=rename_cols)
        _usr_output = predict_rows.merge(base_rows, on='Patient')
        _usr_output['weeks_passed'] = _usr_output['predict_week'] - _usr_output['base_week']
        pair_frames.append(_usr_output)

# pd.concat raises on an empty list, so fall back to an empty frame when
# `train` has no rows.
output = pd.concat(pair_frames) if pair_frames else pd.DataFrame()

output.head()

HBox(children=(FloatProgress(value=0.0, max=176.0), HTML(value='')))




Unnamed: 0,Patient,predict_week,FVC,Patient_Week,base_week,base_FVC,base_percent,base_age,Sex,SmokingStatus,weeks_passed
0,ID00007637202177411956430,-4,2315,ID00007637202177411956430_-4,-4,2315,58.253649,79,Male,Ex-smoker,0
1,ID00007637202177411956430,5,2214,ID00007637202177411956430_5,-4,2315,58.253649,79,Male,Ex-smoker,9
2,ID00007637202177411956430,7,2061,ID00007637202177411956430_7,-4,2315,58.253649,79,Male,Ex-smoker,11
3,ID00007637202177411956430,9,2144,ID00007637202177411956430_9,-4,2315,58.253649,79,Male,Ex-smoker,13
4,ID00007637202177411956430,11,2069,ID00007637202177411956430_11,-4,2315,58.253649,79,Male,Ex-smoker,15


In [23]:
# Show the first 20 rows, enough to see the base week change for the first patient.
output.head(20)

Unnamed: 0,Patient,predict_week,FVC,Patient_Week,base_week,base_FVC,base_percent,base_age,Sex,SmokingStatus,weeks_passed
0,ID00007637202177411956430,-4,2315,ID00007637202177411956430_-4,-4,2315,58.253649,79,Male,Ex-smoker,0
1,ID00007637202177411956430,5,2214,ID00007637202177411956430_5,-4,2315,58.253649,79,Male,Ex-smoker,9
2,ID00007637202177411956430,7,2061,ID00007637202177411956430_7,-4,2315,58.253649,79,Male,Ex-smoker,11
3,ID00007637202177411956430,9,2144,ID00007637202177411956430_9,-4,2315,58.253649,79,Male,Ex-smoker,13
4,ID00007637202177411956430,11,2069,ID00007637202177411956430_11,-4,2315,58.253649,79,Male,Ex-smoker,15
5,ID00007637202177411956430,17,2101,ID00007637202177411956430_17,-4,2315,58.253649,79,Male,Ex-smoker,21
6,ID00007637202177411956430,29,2000,ID00007637202177411956430_29,-4,2315,58.253649,79,Male,Ex-smoker,33
7,ID00007637202177411956430,41,2064,ID00007637202177411956430_41,-4,2315,58.253649,79,Male,Ex-smoker,45
8,ID00007637202177411956430,57,2057,ID00007637202177411956430_57,-4,2315,58.253649,79,Male,Ex-smoker,61
0,ID00007637202177411956430,-4,2315,ID00007637202177411956430_-4,5,2214,55.712129,79,Male,Ex-smoker,-9


In [None]:
# Constructing test input: every row of the sample submission (one per
# Patient_Week to predict) is joined with that patient's row(s) from test.csv,
# which act as the base measurement.
# NOTE: this cell overwrites `test`; re-run the data-loading cell before
# re-running it.

test = test.rename(columns={'Weeks' : 'base_week', 'FVC' : 'base_FVC', 'Percent' : 'base_percent', 'Age' : 'base_age'})

# Patient_Week is "<Patient>_<week>"; split once on the last underscore.
week_parts = ss['Patient_Week'].str.rsplit('_', n=1)
ss['Patient'] = week_parts.str[0]
ss['predict_week'] = week_parts.str[1].astype(int)

test = ss.drop(columns=['FVC', 'Confidence']).merge(test, on='Patient')
# Same feature as in the training table: distance from the base visit to the
# week being predicted.
test['weeks_passed'] = test['predict_week'] - test['base_week']
test.head()