In [159]:
from collections import deque
import numpy as np
import pandas as pd
import sys
import os
import pathlib
import lightgbm as lgb
from PIL import Image
import pydicom as dicom
import sklearn
import tensorflow as tf
from tensorflow import keras
from typing import Deque, Dict, Any, List
# Report how many GPUs TensorFlow can see. Uses the stable
# tf.config.list_physical_devices API rather than its `experimental` alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [144]:
# Treat +/-inf as missing values (alongside None/NaN) and raise the display
# limits so wide or long DataFrames are rendered without truncation.
for option_name, option_value in (
    ('use_inf_as_na', True),
    ('display.max_columns', 999),
    ('display.max_rows', 999),
):
    pd.set_option(option_name, option_value)

In [161]:
# --- Notebook configuration -------------------------------------------------
# The first two entries of INPUT_SHAPE are used as the image generator's
# target_size further down; the third is presumably the channel count.
INPUT_SHAPE = (224, 224, 3)
# Batch size handed to the Keras image generator.
BATCH_SIZE = 32
# Target columns passed as y_col to the image generator.
TARGET = ['fvc_last_3', 'fvc_last_2', 'fvc_last_1']
# Columns excluded when the LightGBM feature list is derived from the frame.
NON_FEATURES = set(TARGET) | {'pid', 'week_last_1', 'week_last_2', 'week_last_3'}
# NOTE(review): not referenced anywhere else in the visible notebook.
CATEGORICALS = {'sex', 'smoking'}



# Falls back to 'Localhost' when the environment variable is not set.
KAGGLE_KERNEL_RUN_TYPE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost')
IS_KAGGLE = KAGGLE_KERNEL_RUN_TYPE != 'Localhost'
# Local defaults; the Kaggle branch below overrides INPUT/OUTPUT/RESOURCE.
INPUT = 'input'
OUTPUT = 'output/test'
RESOURCE = ''
MODEL_NAME = 'efficientnetb0'
CV_MODEL_FILE = f'models/{MODEL_NAME}/best_model.h5'
LGB1_MODEL_FILE = f'models/{MODEL_NAME}/lgb_fvc_last_1.txt'
LGB2_MODEL_FILE = f'models/{MODEL_NAME}/lgb_fvc_last_2.txt'
LGB3_MODEL_FILE = f'models/{MODEL_NAME}/lgb_fvc_last_3.txt'

if IS_KAGGLE:
    INPUT = '/kaggle/input'
    OUTPUT = '.'
    # NOTE(review): RESOURCE names a 'birdsongrecognition' dataset - confirm
    # this is the intended model dataset for this notebook.
    RESOURCE = 'birdsongrecognition/kaggle-birdsong-recognition-1.0'
    # NOTE(review): only CV_MODEL_FILE is re-rooted under INPUT/RESOURCE; the
    # LGB*_MODEL_FILE paths stay relative on Kaggle - confirm that is intended.
    CV_MODEL_FILE = f'{INPUT}/{RESOURCE}/{CV_MODEL_FILE}'
    # Make the bundled source directory importable (needed for `import krig`).
    sys.path.append(f'{INPUT}/kagglerig/src')
else:
    # IPython shell escape: installs the local kaggle-rig sdist.
    !pip install lib/kaggle-rig-0.2.0.tar.gz
    

# Tabular test data and the directory holding one sub-directory per patient.
TAB_DATA = f'{INPUT}/test.csv'
IMG_DATA = f'{INPUT}/test'
print(f"""IS_KAGGLE={IS_KAGGLE}
KAGGLE_KERNEL_RUN_TYPE={KAGGLE_KERNEL_RUN_TYPE}
CV_MODEL_FILE={CV_MODEL_FILE}
LGB1_MODEL_FILE={LGB1_MODEL_FILE}
LGB2_MODEL_FILE={LGB2_MODEL_FILE}
LGB3_MODEL_FILE={LGB3_MODEL_FILE}
""")

    

# Imported here, after the sys.path tweak / pip install above has run.
import krig
krig.seed_everything()

Processing ./lib/kaggle-rig-0.2.0.tar.gz
Building wheels for collected packages: kaggle-rig
  Building wheel for kaggle-rig (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle-rig: filename=kaggle_rig-0.2.0-py3-none-any.whl size=6539 sha256=efaad4bd74c972d5383557f859ace4d1bdcd3a37c149e51381e2bb7984217373
  Stored in directory: /home/pankun/.cache/pip/wheels/03/99/72/07676e0abd65551d5261aee9360ad1c85fcaca1c409857a40a
Successfully built kaggle-rig
Installing collected packages: kaggle-rig
  Attempting uninstall: kaggle-rig
    Found existing installation: kaggle-rig 0.2.0
    Uninstalling kaggle-rig-0.2.0:
      Successfully uninstalled kaggle-rig-0.2.0
Successfully installed kaggle-rig-0.2.0
IS_KAGGLE=False
KAGGLE_KERNEL_RUN_TYPE=Localhost
CV_MODEL_FILE=models/efficientnetb0/best_model.h5
LGB1_MODEL_FILE=models/efficientnetb0/lgb_fvc_last_1.txt
LGB2_MODEL_FILE=models/efficientnetb0/lgb_fvc_last_2.txt
LGB3_MODEL_FILE=models/efficientnetb0/lgb_fvc_last_3.txt



# Preprocess images

In [146]:
def load_scan(path):
    """Read every DICOM file found directly inside ``path``.

    Files are read in sorted filename order so repeated runs yield the slices
    (and therefore the exported frame numbers) in the same order;
    ``os.listdir`` alone returns entries in arbitrary order. The slices are
    NOT ordered by anatomical position.

    Args:
        path: directory containing the DICOM files of one scan.

    Returns:
        List of datasets as returned by ``pydicom.dcmread``, one per file.
    """
    # dcmread is the supported reader; read_file is its deprecated alias.
    return [dicom.dcmread(f'{path}/{name}') for name in sorted(os.listdir(path))]


def to_hu(slices, padding=-2000):
    """Convert the raw pixel data of DICOM slices to Hounsfield units (HU).

    For each slice, raw values at or below the padding threshold are set to 0,
    then the result is multiplied by ``RescaleSlope`` (when it is not 1) and
    shifted by ``RescaleIntercept``.

    Args:
        slices: non-empty sequence of DICOM datasets exposing ``pixel_array``,
            ``RescaleSlope`` and ``RescaleIntercept``, and optionally
            ``PixelPaddingValue``.
        padding: threshold used for slices that carry no
            ``PixelPaddingValue``.

    Returns:
        ``np.int16`` array holding one frame per slice, in input order.

    Raises:
        ValueError: if ``slices`` is empty.
    """
    if len(slices) == 0:
        raise ValueError('at least one slice is required')
    # Values are expected to fit into int16 (< 32k).
    frames = np.stack([s.pixel_array for s in slices]).astype(np.int16)
    # Iterating the stacked array yields views, so every in-place operation
    # below writes straight into `frames`.
    for frame, s in zip(frames, slices):
        # Resolve the threshold per slice so one slice's PixelPaddingValue
        # never leaks into the slices that follow it.
        if "PixelPaddingValue" in s:
            slice_padding = np.int16(s.PixelPaddingValue)
        else:
            slice_padding = padding
        slope = np.float64(s.RescaleSlope)
        intercept = np.int16(s.RescaleIntercept)
        # Set outside-of-scan pixels to 0.
        frame[frame <= slice_padding] = 0
        if slope != 1:
            # Write back through the view; rebinding the name would leave
            # `frames` untouched and drop both slope and intercept.
            frame[...] = (slope * frame.astype(np.float64)).astype(np.int16)
        frame += intercept
    return frames


def window(frames, hu_min=-1000, hu_max=600):
    """Clip frames to the [hu_min, hu_max] window and render them as 8-bit RGB.

    Values are scaled linearly so that ``hu_min`` maps to 0 and ``hu_max`` to
    255 (values outside the window saturate), then each grayscale frame is
    replicated into three identical channels on a new trailing axis.

    Returns:
        ``np.uint8`` array with one RGB frame per input frame.
    """
    scaled = np.clip((frames - hu_min) / (hu_max - hu_min), 0, 1)
    gray = (scaled * 255).astype(np.uint8)
    # Stack each frame three times along a new last axis -> (..., 3).
    return np.array(
        [np.stack((plane, plane, plane), axis=-1) for plane in gray],
        dtype=np.uint8,
    )


def resize(frames, target_size=(600, 600), slices=None):
    """Resize RGB frames to ``target_size`` using nearest-neighbour sampling.

    Args:
        frames: sequence of RGB frames.
        target_size: size passed to ``Image.resize`` for the final resize.
        slices: optional sequence parallel to ``frames``; when given, each
            frame is first resized to its pixel dimensions multiplied by the
            matching slice's ``PixelSpacing``.

    Returns:
        ``np.uint8`` array holding the resized frames.
    """
    resized = []
    for idx, frame in enumerate(frames):
        im = Image.fromarray(frame, mode='RGB')
        if slices is not None:
            spacing = slices[idx].PixelSpacing
            rows, cols = float(spacing[0]), float(spacing[1])
            # PIL expects (width, height).
            scaled_size = (int(frame.shape[1] * cols), int(frame.shape[0] * rows))
            im = im.resize(scaled_size, Image.NEAREST)
        resized.append(np.asarray(im.resize(target_size, Image.NEAREST)))
    return np.array(resized, dtype=np.uint8)


def preprocess(dir):
    """Run the image pipeline for one scan directory.

    Loads the DICOM slices, converts them to Hounsfield units, applies the
    default window and resizes the frames to ``resize``'s default target size.
    """
    slices = load_scan(dir)
    windowed = window(to_hu(slices))
    # slices=None: skip the pixel-spacing rescale and only apply the final resize.
    return resize(windowed, slices=None)

In [147]:
# Every entry of the image directory is treated as one patient.
patients = list(os.listdir(IMG_DATA))
print(f'{len(patients)} patients image data')

5 patients image data


In [148]:
# Preprocess every patient's scan and store the frames as numbered PNG files
# under OUTPUT/<patient>/.
for patient in patients:
    # Named `patient_dir` so the builtin `dir` is not shadowed for the rest of
    # the notebook session.
    patient_dir = f'{OUTPUT}/{patient}'
    pathlib.Path(patient_dir).mkdir(parents=True, exist_ok=True)
    try:
        frames = preprocess(f"{IMG_DATA}/{patient}")
    except Exception as ex:
        # Best effort: report the failure and carry on with the next patient.
        # The patient's output directory then stays without new frames.
        print(f'patient={patient}, Error={ex}')
        continue
    # Frame files are numbered starting at 1.
    for frame_no, frame in enumerate(frames, start=1):
        Image.fromarray(frame, mode='RGB').save(f"{patient_dir}/{frame_no}.png")

# Get last three FVC readings per patient

In [149]:
# Load the tabular test data (one row per patient visit) and show its schema.
test = pd.read_csv(TAB_DATA)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        5 non-null      object 
 1   Weeks          5 non-null      int64  
 2   FVC            5 non-null      int64  
 3   Percent        5 non-null      float64
 4   Age            5 non-null      int64  
 5   Sex            5 non-null      object 
 6   SmokingStatus  5 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 408.0+ bytes


In [150]:
def explode(row: Dict[str, Any], path: str) -> List[Dict[str, Any]]:
    """Fan one patient record out into one record per file of that patient.

    Every entry of ``{path}/{pid}`` produces a copy of ``row`` with an extra
    ``'img'`` key holding ``'{pid}/{filename}'``. ``row`` itself is left
    unchanged.
    """
    pid = row['pid']
    return [
        {**row, 'img': f'{pid}/{filename}'}
        for filename in os.listdir(f'{path}/{pid}')
    ]


def _pad_to_three(values, empty_message: str, overflow_message: str) -> List[Any]:
    """Return ``values`` as a 3-item list, repeating the newest item to fill up."""
    count = len(values)
    if count == 0:
        raise ValueError(empty_message)
    if count > 3:
        raise ValueError(overflow_message)
    padded = list(values)
    padded.extend([padded[-1]] * (3 - count))
    return padded


def set_last_visits(
    row: Dict[str, Any], 
    last_weeks: Deque[int],
    last_fvc: Deque[float]
) -> None:
    """Store a patient's last three FVC readings and week numbers in ``row``.

    Both sequences are ordered oldest to newest and hold one to three items.
    When fewer than three are present, the newest item is repeated so three
    values are always written. ``*_last_1`` is the newest entry and
    ``*_last_3`` the oldest. The input deques are not modified.

    Args:
        row: record updated in place with ``fvc_last_{1,2,3}`` and
            ``week_last_{1,2,3}``.
        last_weeks: week numbers of the patient's most recent visits.
        last_fvc: FVC readings of the patient's most recent visits.

    Raises:
        ValueError: if either sequence is empty or holds more than three items.
    """
    # FVC is validated before weeks, so its error wins when both are invalid.
    fvc = _pad_to_three(
        last_fvc,
        'there should be at least one fvc reading per patient',
        'get last 3 fvc readings per patient',
    )
    weeks = _pad_to_three(
        last_weeks,
        'there should be at least one week number per patient',
        'get last 3 week numbers per patient',
    )
    row['fvc_last_1'] = fvc[2]
    row['fvc_last_2'] = fvc[1]
    row['fvc_last_3'] = fvc[0]
    row['week_last_1'] = weeks[2]
    row['week_last_2'] = weeks[1]
    row['week_last_3'] = weeks[0]




# Collapse the per-visit table into one record per patient, then fan each
# record out into one row per exported image.
#
# Visits are grouped by patient id in a dict (first-appearance order), so a
# patient whose rows are not contiguous in the CSV still produces a single
# record. The first row seen for a patient supplies the baseline columns; the
# bounded deques keep only the three most recent visits, in file order.
patient_visits: Dict[Any, Dict[str, Any]] = {}
for t in test.itertuples():
    visits = patient_visits.get(t.Patient)
    if visits is None:
        visits = {
            'row': {
                'pid': t.Patient,
                'age': t.Age,
                'sex': t.Sex,
                'smoking': t.SmokingStatus,
                'week_1': t.Weeks,
                'fvc_1': t.FVC,
                'percent_1': t.Percent,
            },
            'weeks': deque(maxlen=3),
            'fvc': deque(maxlen=3),
        }
        patient_visits[t.Patient] = visits
    visits['weeks'].append(t.Weeks)
    visits['fvc'].append(t.FVC)

rows = []
for visits in patient_visits.values():
    row = visits['row']
    set_last_visits(row, visits['weeks'], visits['fvc'])
    rows += explode(row, OUTPUT)


# Compact dtypes for the exploded frame.
schema = {
    'pid': str,
    'img': str,
    'age': np.uint8,
    'sex': str,
    'smoking': str,
    'week_1': np.int16,
    'fvc_1': np.uint16,
    'percent_1': np.float32,
    'fvc_last_1': np.uint16,
    'fvc_last_2': np.uint16,
    'fvc_last_3': np.uint16,
    'week_last_1': np.int16,
    'week_last_2': np.int16,
    'week_last_3': np.int16,
}
# NOTE: `test` now holds the exploded frame (one row per image), replacing the
# raw CSV; re-running this cell requires reloading the CSV first.
test = pd.DataFrame.from_records(rows).astype(schema)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1261 entries, 0 to 1260
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   pid          1261 non-null   object 
 1   age          1261 non-null   uint8  
 2   sex          1261 non-null   object 
 3   smoking      1261 non-null   object 
 4   week_1       1261 non-null   int16  
 5   fvc_1        1261 non-null   uint16 
 6   percent_1    1261 non-null   float32
 7   fvc_last_1   1261 non-null   uint16 
 8   fvc_last_2   1261 non-null   uint16 
 9   fvc_last_3   1261 non-null   uint16 
 10  week_last_1  1261 non-null   int16  
 11  week_last_2  1261 non-null   int16  
 12  week_last_3  1261 non-null   int16  
 13  img          1261 non-null   object 
dtypes: float32(1), int16(4), object(4), uint16(4), uint8(1)
memory usage: 65.4+ KB


# Inference on image data

In [151]:
# Load the saved Keras model from CV_MODEL_FILE and print its layer summary.
model = keras.models.load_model(CV_MODEL_FILE)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficientnetb0 (Functional)  (None, 1280)              4049571   
_________________________________________________________________
batch_normalization (BatchNo (None, 1280)              5120      
_________________________________________________________________
dense (Dense)                (None, 1280)              1639680   
_________________________________________________________________
dropout (Dropout)            (None, 1280)              0         
_________________________________________________________________
output (Dense)               (None, 3)                 3843      
Total params: 5,698,214
Trainable params: 1,646,083
Non-trainable params: 4,052,131
_________________________________________________________________


In [152]:
# Image size fed to the model: the first two entries of INPUT_SHAPE.
target_size = (INPUT_SHAPE[0], INPUT_SHAPE[1])
color_mode='rgb'
class_mode='multi_output'
# No augmentation/rescaling options are configured on the generator.
idg = keras.preprocessing.image.ImageDataGenerator()
# Image paths in the 'img' column are resolved relative to OUTPUT.
test_gen = idg.flow_from_dataframe(
    dataframe = test,
    x_col='img',
    y_col=TARGET,
    directory=OUTPUT,
    target_size=target_size,
    color_mode=color_mode,
    # The predictions are written back into `test` by position in a later
    # cell, so the row order must be preserved.
    shuffle=False,
    batch_size=BATCH_SIZE,
    class_mode=class_mode
)
preds = model.predict(test_gen, use_multiprocessing=False, workers=4, verbose=1)
print(f'preds.shape={preds.shape}')

Found 1261 validated image filenames.
preds.shape=(1261, 3)


In [153]:
# Attach the per-image model outputs to the frame. Output column 0 goes to
# fvc_last_3_cv, column 1 to fvc_last_2_cv and column 2 to fvc_last_1_cv.
# `preds` is indexed by column rather than transposed in place, so re-running
# this cell gives the same result.
test['fvc_last_3_cv'] = preds[:, 0]
test['fvc_last_2_cv'] = preds[:, 1]
test['fvc_last_1_cv'] = preds[:, 2]
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1261 entries, 0 to 1260
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   pid            1261 non-null   object 
 1   age            1261 non-null   uint8  
 2   sex            1261 non-null   object 
 3   smoking        1261 non-null   object 
 4   week_1         1261 non-null   int16  
 5   fvc_1          1261 non-null   uint16 
 6   percent_1      1261 non-null   float32
 7   fvc_last_1     1261 non-null   uint16 
 8   fvc_last_2     1261 non-null   uint16 
 9   fvc_last_3     1261 non-null   uint16 
 10  week_last_1    1261 non-null   int16  
 11  week_last_2    1261 non-null   int16  
 12  week_last_3    1261 non-null   int16  
 13  img            1261 non-null   object 
 14  fvc_last_3_cv  1261 non-null   float32
 15  fvc_last_2_cv  1261 non-null   float32
 16  fvc_last_1_cv  1261 non-null   float32
dtypes: float32(4), int16(4), object(4), uint16(4), uint8

In [154]:
# The image path is no longer needed once predictions are attached.
# errors='ignore' keeps the cell re-runnable after the column is gone.
test = test.drop(columns=['img'], errors='ignore')

In [155]:
def set_dist(row, fvc_last_1_cv, fvc_last_2_cv, fvc_last_3_cv) -> None:
    """Summarise three value collections by fixed quantiles and store them in ``row``.

    For each of ``fvc_last_1``, ``fvc_last_2`` and ``fvc_last_3`` the keys
    ``<prefix>_min``, ``_p50``, ``_p75``, ``_p90``, ``_p95``, ``_p99`` and
    ``_max`` are written, holding the 0, 0.5, 0.75, 0.9, 0.95, 0.99 and 1
    quantiles of the matching argument.
    """
    levels = {'min': 0, 'p50': 0.5, 'p75': 0.75, 'p90': 0.9, 'p95': 0.95, 'p99': 0.99, 'max': 1}
    samples = {
        'fvc_last_1': fvc_last_1_cv,
        'fvc_last_2': fvc_last_2_cv,
        'fvc_last_3': fvc_last_3_cv,
    }
    for prefix, values in samples.items():
        quantile_values = pd.Series(values).quantile(list(levels.values())).to_numpy()
        for suffix, value in zip(levels, quantile_values):
            row[f'{prefix}_{suffix}'] = value



# Collapse the per-image frame back to one row per patient, replacing the
# per-image model outputs with their per-patient quantile summary.
#
# Rows are grouped by pid in a dict (first-appearance order), so a patient is
# aggregated exactly once even if its rows are not contiguous. The first row
# seen for a pid supplies the patient-level columns.
patient_columns = [
    'pid', 'age', 'sex', 'smoking', 'week_1', 'fvc_1', 'percent_1',
    'fvc_last_1', 'fvc_last_2', 'fvc_last_3',
    'week_last_1', 'week_last_2', 'week_last_3',
]
per_patient: Dict[Any, Dict[str, Any]] = {}
for t in test.itertuples():
    entry = per_patient.get(t.pid)
    if entry is None:
        entry = {
            'row': {col: getattr(t, col) for col in patient_columns},
            'cv_1': [],
            'cv_2': [],
            'cv_3': [],
        }
        per_patient[t.pid] = entry
    entry['cv_1'].append(t.fvc_last_1_cv)
    entry['cv_2'].append(t.fvc_last_2_cv)
    entry['cv_3'].append(t.fvc_last_3_cv)

rows = []
for entry in per_patient.values():
    row = entry['row']
    set_dist(row, entry['cv_1'], entry['cv_2'], entry['cv_3'])
    rows.append(row)


# Compact dtypes: patient-level columns as before, every quantile column as
# float32.
schema = {
    'pid': str,
    'age': np.uint8,
    'sex': str,
    'smoking': str,
    'week_1': np.int16,
    'fvc_1': np.uint16,
    'percent_1': np.float32,
    'fvc_last_1': np.uint16,
    'fvc_last_2': np.uint16,
    'fvc_last_3': np.uint16,
    'week_last_1': np.int16,
    'week_last_2': np.int16,
    'week_last_3': np.int16,
}
schema.update({
    f'fvc_last_{k}_{suffix}': np.float32
    for k in (1, 2, 3)
    for suffix in ('min', 'p50', 'p75', 'p90', 'p95', 'p99', 'max')
})
# NOTE: `test` is replaced by the one-row-per-patient frame.
test = pd.DataFrame.from_records(rows).astype(schema)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 34 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pid             5 non-null      object 
 1   age             5 non-null      uint8  
 2   sex             5 non-null      object 
 3   smoking         5 non-null      object 
 4   week_1          5 non-null      int16  
 5   fvc_1           5 non-null      uint16 
 6   percent_1       5 non-null      float32
 7   fvc_last_1      5 non-null      uint16 
 8   fvc_last_2      5 non-null      uint16 
 9   fvc_last_3      5 non-null      uint16 
 10  week_last_1     5 non-null      int16  
 11  week_last_2     5 non-null      int16  
 12  week_last_3     5 non-null      int16  
 13  fvc_last_1_min  5 non-null      float32
 14  fvc_last_1_p50  5 non-null      float32
 15  fvc_last_1_p75  5 non-null      float32
 16  fvc_last_1_p90  5 non-null      float32
 17  fvc_last_1_p95  5 non-null      float32

# Inference on tabular data

In [160]:
# One LightGBM booster per target, loaded from its saved model file.
m1, m2, m3 = (
    lgb.Booster(model_file=model_path)
    for model_path in (LGB1_MODEL_FILE, LGB2_MODEL_FILE, LGB3_MODEL_FILE)
)

In [163]:
# Numeric encodings for the categorical columns.
# NOTE(review): the values look like category frequencies taken from the
# training data - confirm they match what the boosters were trained with.
sex_encoder = {'Male': 0.7873563218390804, 'Female': 0.21264367816091953}
smoking_encoder = {'Ex-smoker': 0.6666666666666666, 'Never smoked': 0.28160919540229884, 'Currently smokes': 0.05172413793103448}
for column, encoder in (('sex', sex_encoder), ('smoking', smoking_encoder)):
    # Encode only while the column still holds raw labels. Re-running the cell
    # on an already encoded (numeric) column would map every value to NaN and
    # fillna(0) would then silently zero the whole feature.
    if not pd.api.types.is_numeric_dtype(test[column]):
        # Labels missing from the encoder are mapped to 0.
        test[column] = test[column].map(encoder).fillna(0)
    assert not test[column].isna().any()


In [164]:
# Model inputs: every column except targets/ids, in sorted order.
FEATURES = sorted(set(test.columns) - NON_FEATURES)
print(f'{len(FEATURES)} FEATURES={FEATURES}')

27 FEATURES=['age', 'fvc_1', 'fvc_last_1_max', 'fvc_last_1_min', 'fvc_last_1_p50', 'fvc_last_1_p75', 'fvc_last_1_p90', 'fvc_last_1_p95', 'fvc_last_1_p99', 'fvc_last_2_max', 'fvc_last_2_min', 'fvc_last_2_p50', 'fvc_last_2_p75', 'fvc_last_2_p90', 'fvc_last_2_p95', 'fvc_last_2_p99', 'fvc_last_3_max', 'fvc_last_3_min', 'fvc_last_3_p50', 'fvc_last_3_p75', 'fvc_last_3_p90', 'fvc_last_3_p95', 'fvc_last_3_p99', 'percent_1', 'sex', 'smoking', 'week_1']


In [166]:
# Score the per-patient feature matrix with each booster and keep the
# predictions next to the features.
x_test = test[FEATURES]
for pred_column, booster in (
    ('pred_fvc_last_1', m1),
    ('pred_fvc_last_2', m2),
    ('pred_fvc_last_3', m3),
):
    test[pred_column] = booster.predict(x_test)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   pid              5 non-null      object 
 1   age              5 non-null      uint8  
 2   sex              5 non-null      float64
 3   smoking          5 non-null      float64
 4   week_1           5 non-null      int16  
 5   fvc_1            5 non-null      uint16 
 6   percent_1        5 non-null      float32
 7   fvc_last_1       5 non-null      uint16 
 8   fvc_last_2       5 non-null      uint16 
 9   fvc_last_3       5 non-null      uint16 
 10  week_last_1      5 non-null      int16  
 11  week_last_2      5 non-null      int16  
 12  week_last_3      5 non-null      int16  
 13  fvc_last_1_min   5 non-null      float32
 14  fvc_last_1_p50   5 non-null      float32
 15  fvc_last_1_p75   5 non-null      float32
 16  fvc_last_1_p90   5 non-null      float32
 17  fvc_last_1_p95   5 n

In [167]:
# Preview the per-patient frame including the LightGBM predictions.
test.head()

Unnamed: 0,pid,age,sex,smoking,week_1,fvc_1,percent_1,fvc_last_1,fvc_last_2,fvc_last_3,week_last_1,week_last_2,week_last_3,fvc_last_1_min,fvc_last_1_p50,fvc_last_1_p75,fvc_last_1_p90,fvc_last_1_p95,fvc_last_1_p99,fvc_last_1_max,fvc_last_2_min,fvc_last_2_p50,fvc_last_2_p75,fvc_last_2_p90,fvc_last_2_p95,fvc_last_2_p99,fvc_last_2_max,fvc_last_3_min,fvc_last_3_p50,fvc_last_3_p75,fvc_last_3_p90,fvc_last_3_p95,fvc_last_3_p99,fvc_last_3_max,pred_fvc_last_1,pred_fvc_last_2,pred_fvc_last_3
0,ID00419637202311204720264,73,0.787356,0.666667,6,3020,70.186852,3020,3020,3020,6,6,6,736.935303,868.139404,905.970642,921.780396,932.586914,957.406067,965.72522,763.85675,899.869629,939.067139,955.467529,966.650757,992.382996,1001.005371,740.670227,872.800415,910.872681,926.764465,937.617554,962.619812,970.992981,2568.683442,2635.628468,2638.685509
1,ID00421637202311550012437,68,0.787356,0.666667,15,2739,82.045288,2739,2739,2739,15,15,15,1025.081665,1147.988525,1180.419189,1205.130249,1227.766602,1239.548706,1242.770508,1062.468628,1189.860474,1223.465576,1249.096069,1272.548218,1284.748901,1288.094727,1030.665405,1154.449707,1187.039917,1211.926636,1234.678955,1246.530518,1249.79895,2835.694731,2802.428699,2889.822208
2,ID00422637202311677017371,73,0.787356,0.666667,6,1930,76.672493,1930,1930,1930,6,6,6,802.83429,1031.493042,1064.319824,1095.036987,1107.376831,1129.85376,1148.537109,832.119629,1069.132446,1103.149658,1134.930542,1147.772583,1171.04834,1190.386841,807.060486,1037.20459,1070.212646,1101.125488,1113.577759,1136.18396,1154.989014,2234.762258,2272.575092,2321.996163
3,ID00423637202312137826377,72,0.787356,0.666667,17,3294,79.258904,3294,3294,3294,17,17,17,897.556519,1114.310425,1161.867554,1184.512329,1194.188599,1211.555664,1220.291138,930.203918,1154.923096,1204.234863,1227.696411,1237.732178,1255.727661,1264.777466,902.229431,1120.449463,1168.399048,1191.111084,1200.887451,1218.325684,1227.120361,2766.25299,2759.971053,2846.383962
4,ID00426637202313170790466,73,0.787356,0.281609,0,2925,71.824966,2925,2925,2925,0,0,0,451.566132,771.557312,812.948059,842.767761,866.979553,893.517029,916.098206,467.886017,799.567871,842.478882,873.384216,898.481689,925.997559,949.406677,453.137817,775.273438,816.989685,846.955078,871.332153,898.073914,920.808716,2545.223582,2595.816039,2596.297996


# Submission

In [170]:
# Build the submission: for every patient, one row per week from the week
# after the first visit up to week_max. The span between the first visit and
# week_max is cut at one third and two thirds: the earliest segment uses
# pred_fvc_last_3, the middle one pred_fvc_last_2 and the last one
# pred_fvc_last_1. Confidence is a constant.
# NOTE(review): weeks at or before week_1 are not emitted - confirm this is
# what the submission format expects.
week_max = 133
rows = []
for t in test.itertuples():
    gap = week_max - t.week_1
    breakpoint_1 = int(t.week_1 + (gap * 1/3))
    breakpoint_2 = int(t.week_1 + (gap * 2/3))
    for week in range(t.week_1 + 1, week_max + 1):
        if week >= breakpoint_2:
            predicted = t.pred_fvc_last_1
        elif week >= breakpoint_1:
            predicted = t.pred_fvc_last_2
        else:
            predicted = t.pred_fvc_last_3
        rows.append({
            'FVC': int(predicted),
            'Patient_Week': f'{t.pid}_{week}',
            'Confidence': 70,
        })


cols = ['Patient_Week', 'FVC', 'Confidence']
sub = pd.DataFrame.from_records(rows, columns=cols)
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 621 entries, 0 to 620
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Patient_Week  621 non-null    object
 1   FVC           621 non-null    int64 
 2   Confidence    621 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 14.7+ KB


In [171]:
# Write the submission to the current working directory, without the index.
sub.to_csv('submission.csv', index=False)

# DEBUG

In [156]:
!pip list

Package                Version
---------------------- ---------
absl-py                0.10.0
argon2-cffi            20.1.0
astunparse             1.6.3
async-generator        1.10
attrs                  20.2.0
backcall               0.2.0
bleach                 3.2.1
cachetools             4.1.1
certifi                2020.6.20
cffi                   1.14.3
chardet                3.0.4
cycler                 0.10.0
decorator              4.4.2
defusedxml             0.6.0
entrypoints            0.3
gast                   0.3.3
google-auth            1.21.2
google-auth-oauthlib   0.4.1
google-pasta           0.2.0
grpcio                 1.32.0
h5py                   2.10.0
idna                   2.10
imageio                2.9.0
importlib-metadata     1.7.0
ipykernel              5.3.4
ipython                7.18.1
ipython-genutils       0.2.0
jedi                   0.17.2
Jinja2                 2.11.2
joblib                 0.16.0
json5                 