In [1]:
from collections import deque
import numpy as np
import pandas as pd
import sys
import os
import pathlib
import lightgbm as lgb
from PIL import Image
import pydicom as dicom
import sklearn
import tensorflow as tf
from tensorflow import keras
from typing import Deque, Dict, Any, List, Set, Tuple, NamedTuple
# Report how many GPUs TensorFlow can see. Uses the stable
# `tf.config.list_physical_devices` API instead of the `experimental` alias.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [2]:
# Treat +/-inf as missing (NA) in pandas NA checks, alongside None and NaN.
pd.options.mode.use_inf_as_na = True
# Raise the display limits so wide/long frames are not truncated when shown.
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

In [3]:
MODEL_NAME = 'effnetb4_20201003_043455'
CONFIDENCE = 400
INPUT_SHAPE = (380, 380, 3)
BATCH_SIZE = 32
TARGET = ['fvc_last_3', 'fvc_last_2', 'fvc_last_1']
NON_FEATURES = set(TARGET) | {'pid', 'week_last_1', 'week_last_2', 'week_last_3'}
CATEGORICALS = {'sex', 'smoking'}



KAGGLE_KERNEL_RUN_TYPE = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost')
IS_KAGGLE = KAGGLE_KERNEL_RUN_TYPE != 'Localhost'
# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
INPUT = 'input'
DATA = INPUT
OUTPUT = 'output/test'
RESOURCE = ''
CV_MODEL_FILE = f'models/{MODEL_NAME}/best_model.h5'
LGB1_MODEL_FILE = f'models/{MODEL_NAME}/lgb_fvc_last_1.txt'
LGB2_MODEL_FILE = f'models/{MODEL_NAME}/lgb_fvc_last_2.txt'
LGB3_MODEL_FILE = f'models/{MODEL_NAME}/lgb_fvc_last_3.txt'

if IS_KAGGLE:
    INPUT = '/kaggle/input'
    DATA = f'{INPUT}/osic-pulmonary-fibrosis-progression'
    OUTPUT = '/kaggle/temp'
    RESOURCE = f'{INPUT}/osicpulmonaryfibrosis/kaggle-osic-pfp-1.0'
    CV_MODEL_FILE = f'{RESOURCE}/{CV_MODEL_FILE}'
    LGB1_MODEL_FILE = f'{RESOURCE}/{LGB1_MODEL_FILE}'
    LGB2_MODEL_FILE = f'{RESOURCE}/{LGB2_MODEL_FILE}'
    LGB3_MODEL_FILE = f'{RESOURCE}/{LGB3_MODEL_FILE}'
    sys.path.append(f'{INPUT}/kagglerig/src')
else:
    !pip install lib/kaggle-rig-0.2.0.tar.gz
    

TAB_DATA = f'{DATA}/test.csv'
IMG_DATA = f'{DATA}/test'
print(f"""IS_KAGGLE={IS_KAGGLE}
KAGGLE_KERNEL_RUN_TYPE={KAGGLE_KERNEL_RUN_TYPE}
CV_MODEL_FILE={CV_MODEL_FILE}
LGB1_MODEL_FILE={LGB1_MODEL_FILE}
LGB2_MODEL_FILE={LGB2_MODEL_FILE}
LGB3_MODEL_FILE={LGB3_MODEL_FILE}
OUTPUT={OUTPUT}
""")

    

import krig
krig.seed_everything()

Processing ./lib/kaggle-rig-0.2.0.tar.gz
Building wheels for collected packages: kaggle-rig
  Building wheel for kaggle-rig (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle-rig: filename=kaggle_rig-0.2.0-py3-none-any.whl size=6539 sha256=ce5a407b5b74fd65c5c1bca55203201b3ac6bba3fc7d6cc9e8cc1bf930aa7c87
  Stored in directory: /home/pankun/.cache/pip/wheels/03/99/72/07676e0abd65551d5261aee9360ad1c85fcaca1c409857a40a
Successfully built kaggle-rig
Installing collected packages: kaggle-rig
  Attempting uninstall: kaggle-rig
    Found existing installation: kaggle-rig 0.2.0
    Uninstalling kaggle-rig-0.2.0:
      Successfully uninstalled kaggle-rig-0.2.0
Successfully installed kaggle-rig-0.2.0
IS_KAGGLE=False
KAGGLE_KERNEL_RUN_TYPE=Localhost
CV_MODEL_FILE=models/effnetb4_20201001_204343/best_model.h5
LGB1_MODEL_FILE=models/effnetb4_20201001_204343/lgb_fvc_last_1.txt
LGB2_MODEL_FILE=models/effnetb4_20201001_204343/lgb_fvc_last_2.txt
LGB3_MODEL_FILE=models/effnetb4_20201001_204343/lg

# Preprocess images

In [4]:
def load_scan(path):
    """Read every DICOM file found directly inside ``path``.

    Args:
        path: Directory holding the DICOM files of one scan.

    Returns:
        A list with one dataset per directory entry, ordered by file name.
        The slices are NOT ordered by anatomical position
        (``ImagePositionPatient``).

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
    """
    # Sorting the listing makes the frame order reproducible; ``os.listdir``
    # returns entries in arbitrary order.
    names = sorted(os.listdir(path))
    # ``dcmread`` is the supported reader; ``read_file`` is a deprecated alias
    # that newer pydicom releases no longer provide.
    return [dicom.dcmread(os.path.join(path, name)) for name in names]


def to_hu(slices, padding=-2000):
    """Convert raw DICOM pixel data to Hounsfield units (HU).

    For every slice, pixels at or below the padding value are set to 0, the
    frame is multiplied by ``RescaleSlope`` and ``RescaleIntercept`` is added.

    Args:
        slices: Non-empty sequence of datasets exposing ``pixel_array``,
            ``RescaleSlope`` and ``RescaleIntercept``; ``PixelPaddingValue``
            is used when the slice contains it.
        padding: Padding threshold for slices without ``PixelPaddingValue``.

    Returns:
        An int16 array with one frame per slice, in the order of ``slices``.

    Raises:
        ValueError: If ``slices`` is empty or the frames differ in shape
            (raised by ``np.stack``).
    """
    # Convert to int16 once up front; values are expected to be low enough (<32k).
    frames = np.stack([s.pixel_array for s in slices]).astype(np.int16)
    for frame, s in zip(frames, slices):
        # Resolve the threshold per slice so that one slice's PixelPaddingValue
        # does not carry over to later slices that lack the attribute.
        pad = np.int16(s.PixelPaddingValue) if "PixelPaddingValue" in s else padding
        slope = np.float64(s.RescaleSlope)
        intercept = np.int16(s.RescaleIntercept)
        # Set outside-of-scan pixels to 0. `frame` is a view into `frames`,
        # so in-place writes land in the returned array.
        frame[frame <= pad] = 0
        if slope != 1:
            # Assign through the view: rebinding the name to the scaled copy
            # would leave `frames` without the slope (and intercept) applied.
            frame[:] = (slope * frame.astype(np.float64)).astype(np.int16)
        frame += intercept
    return frames


def window(frames, hu_min=-1000, hu_max=600):
    """Clip frames to the ``[hu_min, hu_max]`` window and render them as 8-bit RGB.

    Args:
        frames: Array of 2-D frames (iterated along the first axis).
        hu_min: Value mapped to pixel intensity 0.
        hu_max: Value mapped to pixel intensity 255.

    Returns:
        A uint8 array in which every 2-D frame of shape (h, w) has become
        (h, w, 3) with three identical channels.
    """
    span = hu_max - hu_min
    # Normalise to [0, 1]; everything outside the window saturates.
    scaled = np.clip((frames - hu_min) / span, 0, 1)
    grey = (scaled * 255).astype(np.uint8)
    # Replicate the single grey plane into three channels on the last axis.
    rgb_frames = [np.stack((plane, plane, plane), axis=-1) for plane in grey]
    return np.array(rgb_frames, dtype=np.uint8)


class Features(NamedTuple):
    """Per-frame area measurements produced by ``extract``."""

    # Number of pixels whose intensity lies inside the lung band.
    lung_area: int
    # Number of pixels whose intensity lies inside the tissue band.
    tissue_area: int


def extract(frames, slices, hu_min=-1000, hu_max=600, tissue=(100, 300), lung=(-700, -600)) -> List[Features]:
    """Measure lung and tissue areas on every windowed RGB frame.

    Each frame is rescaled by the slice's ``PixelSpacing`` before counting,
    so the counts are in units of spacing rather than raw pixels
    (presumably mm^2 - TODO confirm the PixelSpacing unit).

    Args:
        frames: RGB uint8 frames as returned by ``window``.
        slices: Datasets matching ``frames`` index by index; ``PixelSpacing``
            is read from each.
        hu_min: Lower end of the window that was used to produce ``frames``.
        hu_max: Upper end of that window.
        tissue: (low, high) HU band counted as tissue.
        lung: (low, high) HU band counted as lung.

    Returns:
        One ``Features`` tuple per frame, in frame order.
    """
    span = hu_max - hu_min

    def to_pixel(hu):
        # Map an HU value onto the 0..255 pixel scale, truncating to int.
        return int((hu - hu_min) / span * 255)

    def band_area(pixels, low, high):
        # The mask covers all three (identical) channels, hence the division by 3.
        inside = np.ma.masked_inside(pixels, low, high).mask
        return int(np.sum(inside) / 3)

    tissue_low, tissue_high = to_pixel(tissue[0]), to_pixel(tissue[1])
    lung_low, lung_high = to_pixel(lung[0]), to_pixel(lung[1])

    measurements = []
    for idx, frame in enumerate(frames):
        s = slices[idx]
        row_spacing = float(s.PixelSpacing[0])
        col_spacing = float(s.PixelSpacing[1])
        scaled_size = (int(frame.shape[1] * col_spacing), int(frame.shape[0] * row_spacing))
        # PIL expects (width, height).
        pixels = np.asarray(Image.fromarray(frame, mode='RGB').resize(scaled_size))
        measurements.append(Features(
            lung_area=band_area(pixels, lung_low, lung_high),
            tissue_area=band_area(pixels, tissue_low, tissue_high),
        ))
    return measurements


def resize(frames, target_size=(600, 600)):
    """Resize every RGB frame to ``target_size``.

    Args:
        frames: Iterable of RGB uint8 frames.
        target_size: Size passed to ``PIL.Image.resize`` (width, height).

    Returns:
        A uint8 array holding the resized frames in their original order.
    """
    resized = [
        np.asarray(Image.fromarray(frame, mode='RGB').resize(target_size))
        for frame in frames
    ]
    return np.array(resized, dtype=np.uint8)


def preprocess(dir):
    """Run the full image pipeline for one scan directory.

    Args:
        dir: Directory containing the DICOM files of one scan.

    Returns:
        A ``(frames, features)`` pair: the windowed RGB frames resized by
        ``resize``, and the per-frame ``Features`` list from ``extract``.
    """
    slices = load_scan(dir)
    windowed = window(to_hu(slices))
    # Areas are measured on the windowed frames before they are resized.
    features = extract(windowed, slices)
    return resize(windowed), features

In [5]:
# Every entry under IMG_DATA is treated as one patient's scan directory.
patients = os.listdir(IMG_DATA)
print(f'{len(patients)} patients image data')

5 patients image data


In [6]:
# Preprocess every scan, save its frames as PNG files under OUTPUT and collect
# one record of area features per saved frame.
rows = []
noscan_patients: Set[str] = set()
for patient in patients:
    out_dir = f'{OUTPUT}/{patient}'
    # Created before preprocessing, so the directory exists (possibly empty)
    # even for patients whose scan fails; `explode` lists it later.
    pathlib.Path(out_dir).mkdir(parents=True, exist_ok=True)
    try:
        # This patient is intentionally routed through the no-scan fallback path.
        if patient == 'ID00426637202313170790466':
            raise ValueError('deliberately omitting a public test set patient')
        frames, features = preprocess(f"{IMG_DATA}/{patient}")
    except Exception as ex:
        # Best effort: record the patient as having no usable scan and move on.
        print(f'patient={patient}, Error={ex}')
        noscan_patients.add(patient)
        continue
    for idx, frame in enumerate(frames):
        filename = f"{idx + 1}.png"
        Image.fromarray(frame, mode='RGB').save(f"{out_dir}/{filename}")
        rows.append({
            'img': f"{patient}/{filename}",
            'lung_area': features[idx].lung_area,
            'tissue_area': features[idx].tissue_area,
        })

patient=ID00426637202313170790466, Error=deliberately omitting a public test set patient


In [7]:
# Image-feature table: one row per saved PNG with its lung/tissue areas.
imf = pd.DataFrame.from_records(rows).astype(
    dict(img=str, lung_area=np.uint32, tissue_area=np.uint32)
)
imf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853 entries, 0 to 852
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   img          853 non-null    object
 1   lung_area    853 non-null    uint32
 2   tissue_area  853 non-null    uint32
dtypes: object(1), uint32(2)
memory usage: 13.5+ KB


# Put image file location in table

In [8]:
# Tabular records, indexed by patient id; `Patient` is also kept as a column.
data = pd.read_csv(TAB_DATA).set_index(['Patient'], drop=False)
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, ID00419637202311204720264 to ID00426637202313170790466
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Patient        5 non-null      object 
 1   Weeks          5 non-null      int64  
 2   FVC            5 non-null      int64  
 3   Percent        5 non-null      float64
 4   Age            5 non-null      int64  
 5   Sex            5 non-null      object 
 6   SmokingStatus  5 non-null      object 
dtypes: float64(1), int64(3), object(3)
memory usage: 320.0+ bytes


In [9]:
def explode(row: Dict[str, Any], path: str) -> List[Dict[str, Any]]:
    """Duplicate a patient record once per file in the patient's directory.

    Args:
        row: Patient record; must contain the key ``'pid'``. It is not modified.
        path: Root directory that contains one sub-directory per patient id.

    Returns:
        One shallow copy of ``row`` per entry of ``{path}/{pid}``, each with
        ``'img'`` set to ``'{pid}/{filename}'``. Empty if the directory is empty.
    """
    pid = row['pid']
    return [
        {**row, 'img': f'{pid}/{filename}'}
        for filename in os.listdir(f'{path}/{pid}')
    ]


def _pad_to_three(values: Deque[Any], empty_msg: str, overflow_msg: str) -> None:
    """Pad ``values`` in place to exactly three entries by repeating its newest entry."""
    if len(values) == 0:
        raise ValueError(empty_msg)
    if len(values) > 3:
        raise ValueError(overflow_msg)
    while len(values) < 3:
        values.append(values[-1])


def set_last_visits(
    row: Dict[str, Any],
    last_weeks: Deque[int],
    last_fvc: Deque[float]
) -> None:
    """Write a patient's last three FVC readings and week numbers into ``row``.

    Both deques hold up to three values, oldest first. Shorter deques are
    padded in place by repeating their newest value, so a patient with one
    visit gets the same value in all three slots.

    Args:
        row: Record that receives ``fvc_last_1..3`` and ``week_last_1..3``;
            suffix ``_1`` is the newest entry, ``_3`` the oldest.
        last_weeks: Week numbers of the most recent visits (mutated when padded).
        last_fvc: FVC readings of the most recent visits (mutated when padded).

    Raises:
        ValueError: If either deque is empty or holds more than three values.
    """
    # FVC is validated first, matching the order in which errors were reported before.
    _pad_to_three(
        last_fvc,
        'there should be at least one fvc reading per patient',
        'get last 3 fvc readings per patient',
    )
    _pad_to_three(
        last_weeks,
        'there should be at least one week number per patient',
        'get last 3 week numbers per patient',
    )
    row['fvc_last_1'] = last_fvc[2]
    row['fvc_last_2'] = last_fvc[1]
    row['fvc_last_3'] = last_fvc[0]
    row['week_last_1'] = last_weeks[2]
    row['week_last_2'] = last_weeks[1]
    row['week_last_3'] = last_weeks[0]




# Build one record per (patient, image): walk the tabular rows in order,
# collect the patient's baseline values and last three visits, then fan the
# record out over the patient's PNG files with `explode`.
# Rows of the same patient must be contiguous in `data`: a patient change is
# detected only by comparing with the previous row's id.
rows = []
row: Dict[str, Any] = {}
prev = None
last_weeks: Deque[int] = deque()
last_fvc: Deque[float] = deque()
for t in data.itertuples():
    # Patient changed: finalise and emit the record of the previous patient.
    if prev is not None and prev != t.Patient:
        set_last_visits(row, last_weeks, last_fvc)
        rows += explode(row, OUTPUT)
    # First row of a patient: start a fresh record with the baseline values.
    if prev is None or prev != t.Patient:
        row = {}
        last_weeks = deque()
        last_fvc = deque()
        row['pid'] = t.Patient
        row['age'] = t.Age
        row['sex'] = t.Sex
        row['smoking'] = t.SmokingStatus
        row['week_1'] = t.Weeks
        row['fvc_1'] = t.FVC
        row['percent_1'] = t.Percent
    prev = t.Patient
    # Keep only the three most recent weeks / FVC readings.
    last_weeks.append(t.Weeks)
    if len(last_weeks) == 4:
        last_weeks.popleft()
    last_fvc.append(t.FVC)
    if len(last_fvc) == 4:
        last_fvc.popleft()
    

# add the last patient!
if len(row) != 0:
    set_last_visits(row, last_weeks, last_fvc)
    rows += explode(row, OUTPUT)
    
    
    
# Attach the per-image area features, looked up by image path.
# NOTE(review): `set_index(..., inplace=True)` removes the `img` column, so
# running this cell a second time without rebuilding `imf` raises a KeyError.
# NOTE(review): `explode` lists whatever is in the output directory; PNG files
# left over from an earlier run that are not in `imf` would fail the lookup.
imf.set_index(['img'], drop=True, inplace=True)
for row in rows:
    k = row['img']
    row['lung_area'] = imf.loc[k]['lung_area']
    row['tissue_area'] = imf.loc[k]['tissue_area']
    # NOTE(review): a tissue_area of 0 makes this ratio non-finite - confirm that cannot occur.
    row['lung_tissue_ratio'] = row['lung_area'] / row['tissue_area']




# Per-image table with compact dtypes, grouped here by target dtype.
test = pd.DataFrame.from_records(rows).astype({
    **dict.fromkeys(('pid', 'img', 'sex', 'smoking'), str),
    'age': np.uint8,
    'percent_1': np.float32,
    'lung_tissue_ratio': np.float32,
    **dict.fromkeys(('week_1', 'week_last_1', 'week_last_2', 'week_last_3'), np.int16),
    **dict.fromkeys(('fvc_1', 'fvc_last_1', 'fvc_last_2', 'fvc_last_3'), np.uint16),
    **dict.fromkeys(('lung_area', 'tissue_area'), np.uint32),
})
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853 entries, 0 to 852
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                853 non-null    object 
 1   age                853 non-null    uint8  
 2   sex                853 non-null    object 
 3   smoking            853 non-null    object 
 4   week_1             853 non-null    int16  
 5   fvc_1              853 non-null    uint16 
 6   percent_1          853 non-null    float32
 7   fvc_last_1         853 non-null    uint16 
 8   fvc_last_2         853 non-null    uint16 
 9   fvc_last_3         853 non-null    uint16 
 10  week_last_1        853 non-null    int16  
 11  week_last_2        853 non-null    int16  
 12  week_last_3        853 non-null    int16  
 13  img                853 non-null    object 
 14  lung_area          853 non-null    uint32 
 15  tissue_area        853 non-null    uint32 
 16  lung_tissue_ratio  853 non

In [10]:
# Preview the first rows of the per-image table (one row per PNG file).
test.head()

Unnamed: 0,pid,age,sex,smoking,week_1,fvc_1,percent_1,fvc_last_1,fvc_last_2,fvc_last_3,week_last_1,week_last_2,week_last_3,img,lung_area,tissue_area,lung_tissue_ratio
0,ID00419637202311204720264,73,Male,Ex-smoker,6,3020,70.186852,3020,3020,3020,6,6,6,ID00419637202311204720264/1.png,1209,10598,0.114078
1,ID00419637202311204720264,73,Male,Ex-smoker,6,3020,70.186852,3020,3020,3020,6,6,6,ID00419637202311204720264/10.png,2897,5382,0.538276
2,ID00419637202311204720264,73,Male,Ex-smoker,6,3020,70.186852,3020,3020,3020,6,6,6,ID00419637202311204720264/11.png,3228,6479,0.498225
3,ID00419637202311204720264,73,Male,Ex-smoker,6,3020,70.186852,3020,3020,3020,6,6,6,ID00419637202311204720264/12.png,1166,12077,0.096547
4,ID00419637202311204720264,73,Male,Ex-smoker,6,3020,70.186852,3020,3020,3020,6,6,6,ID00419637202311204720264/13.png,2956,6687,0.442052


# Inference on image data

In [11]:
# Load the trained CNN from CV_MODEL_FILE and print its layer summary.
model = keras.models.load_model(CV_MODEL_FILE)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
efficientnetb4 (Functional)  (None, 1792)              17673823  
_________________________________________________________________
batch_normalization (BatchNo (None, 1792)              7168      
_________________________________________________________________
dense (Dense)                (None, 1792)              3213056   
_________________________________________________________________
dropout (Dropout)            (None, 1792)              0         
_________________________________________________________________
output (Dense)               (None, 3)                 5379      
Total params: 20,899,426
Trainable params: 3,222,019
Non-trainable params: 17,677,407
_________________________________________________________________


In [12]:
# Run the CNN over every image listed in `test`.
# Only height and width of INPUT_SHAPE are needed for resizing.
target_size = (INPUT_SHAPE[0], INPUT_SHAPE[1])
color_mode='rgb'
class_mode='multi_output'
idg = keras.preprocessing.image.ImageDataGenerator()
test_gen = idg.flow_from_dataframe(
    dataframe = test,
    x_col='img',
    y_col=TARGET,
    directory=OUTPUT,
    target_size=target_size,
    color_mode=color_mode,
    # Keep the generator in dataframe order: the predictions are written back
    # to `test` positionally in the next cell.
    shuffle=False,
    batch_size=BATCH_SIZE,
    class_mode=class_mode
)
# NOTE(review): the positional write-back also requires that the generator
# yields exactly one sample per row of `test` - confirm no file is skipped.
preds = model.predict(test_gen, use_multiprocessing=False, workers=4, verbose=1)
print(f'preds.shape={preds.shape}')

Found 853 validated image filenames.
preds.shape=(853, 3)


In [13]:
# `preds` holds one row per image and one column per CNN output. The columns
# are assumed to follow the order of TARGET (fvc_last_3, fvc_last_2, fvc_last_1).
# Columns are indexed directly instead of rebinding `preds = preds.T`, so that
# re-running this cell does not transpose `preds` a second time and fail with
# a length mismatch.
test['fvc_last_3_cv'] = preds[:, 0]
test['fvc_last_2_cv'] = preds[:, 1]
test['fvc_last_1_cv'] = preds[:, 2]
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 853 entries, 0 to 852
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   pid                853 non-null    object 
 1   age                853 non-null    uint8  
 2   sex                853 non-null    object 
 3   smoking            853 non-null    object 
 4   week_1             853 non-null    int16  
 5   fvc_1              853 non-null    uint16 
 6   percent_1          853 non-null    float32
 7   fvc_last_1         853 non-null    uint16 
 8   fvc_last_2         853 non-null    uint16 
 9   fvc_last_3         853 non-null    uint16 
 10  week_last_1        853 non-null    int16  
 11  week_last_2        853 non-null    int16  
 12  week_last_3        853 non-null    int16  
 13  img                853 non-null    object 
 14  lung_area          853 non-null    uint32 
 15  tissue_area        853 non-null    uint32 
 16  lung_tissue_ratio  853 non

In [14]:
# The image path is no longer needed once the CNN predictions are attached.
test.drop(columns=['img'], inplace=True)

In [15]:
def set_dist(row, fvc_last_1_cv, fvc_last_2_cv, fvc_last_3_cv, lung_area, tissue_area, lung_tissue_ratio) -> None:
    """Summarise six per-image value lists as quantiles and store them in ``row``.

    For each list the minimum, the 50/75/90/95/99th percentiles and the
    maximum are written to ``row`` under ``'<name>_min'``, ``'<name>_p50'``,
    ..., ``'<name>_max'``.

    Args:
        row: Record that receives the 42 summary values (mutated in place).
        fvc_last_1_cv: Values summarised under the prefix ``fvc_last_1``.
        fvc_last_2_cv: Values summarised under the prefix ``fvc_last_2``.
        fvc_last_3_cv: Values summarised under the prefix ``fvc_last_3``.
        lung_area: Values summarised under the prefix ``lung_area``.
        tissue_area: Values summarised under the prefix ``tissue_area``.
        lung_tissue_ratio: Values summarised under the prefix ``lung_tissue_ratio``.
    """
    probabilities = [0, 0.5, 0.75, 0.9, 0.95, 0.99, 1]
    suffixes = ('min', 'p50', 'p75', 'p90', 'p95', 'p99', 'max')
    # Insertion order of this mapping fixes the order of the keys added to `row`.
    samples = {
        'fvc_last_1': fvc_last_1_cv,
        'fvc_last_2': fvc_last_2_cv,
        'fvc_last_3': fvc_last_3_cv,
        'lung_area': lung_area,
        'tissue_area': tissue_area,
        'lung_tissue_ratio': lung_tissue_ratio,
    }
    for prefix, values in samples.items():
        summary = pd.Series(values).quantile(probabilities).to_numpy()
        for position, suffix in enumerate(suffixes):
            row[f'{prefix}_{suffix}'] = summary[position]



# Collapse the per-image table into one record per patient: copy the patient's
# tabular values once, gather the per-image values into lists and summarise
# them with `set_dist`.
# Rows of the same patient must be contiguous in `test`: a patient change is
# detected only by comparing with the previous row's id.
rows = []
row: Dict[str, Any] = {}
prev = None
fvc_last_1_cv = []
fvc_last_2_cv = []
fvc_last_3_cv = []
lung_area = []
tissue_area = []
lung_tissue_ratio = []
for t in test.itertuples():
    # Patient changed: summarise and emit the record of the previous patient.
    if prev is not None and prev != t.pid:
        set_dist(row, fvc_last_1_cv, fvc_last_2_cv, fvc_last_3_cv, lung_area, tissue_area, lung_tissue_ratio)
        rows.append(row)
    # First image of a patient: start a fresh record and fresh value lists.
    if prev is None or prev != t.pid:
        row = {}
        fvc_last_1_cv = []
        fvc_last_2_cv = []
        fvc_last_3_cv = []
        lung_area = []
        tissue_area = []
        lung_tissue_ratio = []
        row['pid'] = t.pid
        row['age'] = t.age
        row['sex'] = t.sex
        row['smoking'] = t.smoking
        row['week_1'] = t.week_1
        row['fvc_1'] = t.fvc_1
        row['percent_1'] = t.percent_1
        row['fvc_last_1'] = t.fvc_last_1
        row['fvc_last_2'] = t.fvc_last_2
        row['fvc_last_3'] = t.fvc_last_3
        row['week_last_1'] = t.week_last_1
        row['week_last_2'] = t.week_last_2
        row['week_last_3'] = t.week_last_3
    prev = t.pid
    # Accumulate this image's CNN predictions and area features.
    fvc_last_1_cv.append(t.fvc_last_1_cv)
    fvc_last_2_cv.append(t.fvc_last_2_cv)
    fvc_last_3_cv.append(t.fvc_last_3_cv)
    lung_area.append(t.lung_area)
    tissue_area.append(t.tissue_area)
    lung_tissue_ratio.append(t.lung_tissue_ratio)
    
# add the last patient!
if len(row) != 0:
    set_dist(row, fvc_last_1_cv, fvc_last_2_cv, fvc_last_3_cv, lung_area, tissue_area, lung_tissue_ratio)
    rows.append(row)


# Patients whose scans failed preprocessing have no image-derived values, so
# their record is filled with substitutes: the tabular FVC stands in for every
# CNN summary column, and fixed constants stand in for the area statistics
# (presumably typical values from the training data - TODO confirm).
# NOTE(review): `data.loc[p]` is a single record only when the patient has
# exactly one row in `data`.
stat_suffixes = ('min', 'p50', 'p75', 'p90', 'p95', 'p99', 'max')
area_defaults = {'lung_area': 1727, 'tissue_area': 6327, 'lung_tissue_ratio': 0.280338}
for p in noscan_patients:
    rec = data.loc[p]
    row = {
        'pid': rec['Patient'],
        'age': rec['Age'],
        'sex': rec['Sex'],
        'smoking': rec['SmokingStatus'],
        'week_1': rec['Weeks'],
        'fvc_1': rec['FVC'],
        'percent_1': rec['Percent'],
    }
    # The single tabular visit doubles as all three "last" visits.
    for visit in (1, 2, 3):
        row[f'fvc_last_{visit}'] = rec['FVC']
    for visit in (1, 2, 3):
        row[f'week_last_{visit}'] = rec['Weeks']
    for visit in (1, 2, 3):
        for suffix in stat_suffixes:
            row[f'fvc_last_{visit}_{suffix}'] = rec['FVC']
    for feature, default in area_defaults.items():
        for suffix in stat_suffixes:
            row[f'{feature}_{suffix}'] = default
    rows.append(row)



# Per-patient table with compact dtypes; all 42 quantile summary columns are float32.
test = pd.DataFrame.from_records(rows).astype({
    **dict.fromkeys(('pid', 'sex', 'smoking'), str),
    'age': np.uint8,
    'percent_1': np.float32,
    **dict.fromkeys(('week_1', 'week_last_1', 'week_last_2', 'week_last_3'), np.int16),
    **dict.fromkeys(('fvc_1', 'fvc_last_1', 'fvc_last_2', 'fvc_last_3'), np.uint16),
    **dict.fromkeys(
        (
            f'{name}_{suffix}'
            for name in ('fvc_last_1', 'fvc_last_2', 'fvc_last_3',
                         'lung_area', 'tissue_area', 'lung_tissue_ratio')
            for suffix in ('min', 'p50', 'p75', 'p90', 'p95', 'p99', 'max')
        ),
        np.float32,
    ),
})
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 55 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   pid                    5 non-null      object 
 1   age                    5 non-null      uint8  
 2   sex                    5 non-null      object 
 3   smoking                5 non-null      object 
 4   week_1                 5 non-null      int16  
 5   fvc_1                  5 non-null      uint16 
 6   percent_1              5 non-null      float32
 7   fvc_last_1             5 non-null      uint16 
 8   fvc_last_2             5 non-null      uint16 
 9   fvc_last_3             5 non-null      uint16 
 10  week_last_1            5 non-null      int16  
 11  week_last_2            5 non-null      int16  
 12  week_last_3            5 non-null      int16  
 13  fvc_last_1_min         5 non-null      float32
 14  fvc_last_1_p50         5 non-null      float32
 15  fvc_last_1

In [16]:
# Preview the per-patient table (one row per patient, including no-scan patients).
test.head()

Unnamed: 0,pid,age,sex,smoking,week_1,fvc_1,percent_1,fvc_last_1,fvc_last_2,fvc_last_3,week_last_1,week_last_2,week_last_3,fvc_last_1_min,fvc_last_1_p50,fvc_last_1_p75,fvc_last_1_p90,fvc_last_1_p95,fvc_last_1_p99,fvc_last_1_max,fvc_last_2_min,fvc_last_2_p50,fvc_last_2_p75,fvc_last_2_p90,fvc_last_2_p95,fvc_last_2_p99,fvc_last_2_max,fvc_last_3_min,fvc_last_3_p50,fvc_last_3_p75,fvc_last_3_p90,fvc_last_3_p95,fvc_last_3_p99,fvc_last_3_max,lung_area_min,lung_area_p50,lung_area_p75,lung_area_p90,lung_area_p95,lung_area_p99,lung_area_max,tissue_area_min,tissue_area_p50,tissue_area_p75,tissue_area_p90,tissue_area_p95,tissue_area_p99,tissue_area_max,lung_tissue_ratio_min,lung_tissue_ratio_p50,lung_tissue_ratio_p75,lung_tissue_ratio_p90,lung_tissue_ratio_p95,lung_tissue_ratio_p99,lung_tissue_ratio_max
0,ID00419637202311204720264,73,Male,Ex-smoker,6,3020,70.186852,3020,3020,3020,6,6,6,1755.218872,1993.170776,2040.519287,2170.290771,2288.594971,2388.203125,2422.076172,1749.875122,1988.050537,2035.876709,2164.606201,2282.508301,2382.138428,2415.994629,1753.001953,1991.195068,2038.71167,2168.175049,2286.382324,2386.138672,2420.05249,1062.0,2425.5,2894.75,3120.100098,3201.75,3520.72998,3629.0,4833.0,7686.5,10704.0,14028.099609,17253.699219,18706.470703,19068.0,0.055695,0.310563,0.452613,0.5201,0.558761,0.581905,0.586385
1,ID00421637202311550012437,68,Male,Ex-smoker,15,2739,82.045288,2739,2739,2739,15,15,15,1973.940674,2215.106934,2277.627441,2316.939209,2323.922119,2382.368408,2402.996094,1970.100098,2211.291992,2273.252197,2312.720215,2320.003662,2378.40918,2398.870361,1972.491577,2213.606445,2275.846191,2315.237793,2322.388672,2380.779541,2401.310547,304.0,1117.0,1559.5,1727.400024,1793.849976,1891.75,1907.0,2853.0,4605.0,6762.75,7816.899902,7951.649902,8282.169922,8284.0,0.046907,0.249877,0.353083,0.393445,0.419008,0.460098,0.505062
2,ID00422637202311677017371,73,Male,Ex-smoker,6,1930,76.672493,1930,1930,1930,6,6,6,1800.063843,2080.223877,2134.165039,2192.431396,2235.386719,2291.397461,2361.101318,1795.481812,2076.734375,2130.778564,2188.518799,2231.613281,2287.243164,2356.725342,1798.093506,2078.861572,2132.761719,2190.835449,2233.831299,2289.738525,2359.381104,363.0,1048.0,1858.0,2095.0,2217.800049,2480.159912,2573.0,2695.0,3716.0,5030.0,6156.200195,6321.399902,6923.720215,7137.0,0.057307,0.234274,0.554655,0.641637,0.688987,0.752324,0.780734
3,ID00423637202312137826377,72,Male,Ex-smoker,17,3294,79.258904,3294,3294,3294,17,17,17,2101.561523,2281.04834,2384.118652,2477.837646,2518.882324,2540.867676,2564.791016,2098.225098,2277.25,2380.603027,2474.005371,2515.038086,2537.089111,2560.825195,2100.091309,2279.53125,2382.694336,2476.292236,2517.290527,2539.388428,2563.157227,514.0,1976.0,3389.75,3991.699951,4223.0,4391.040039,4458.0,9135.0,10661.5,13399.0,17013.900391,17550.949219,17935.439453,18098.0,0.031469,0.184374,0.348307,0.394121,0.42795,0.477978,0.487028
4,ID00426637202313170790466,73,Male,Never smoked,0,2925,71.824966,2925,2925,2925,0,0,0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,1727.0,1727.0,1727.0,1727.0,1727.0,1727.0,1727.0,6327.0,6327.0,6327.0,6327.0,6327.0,6327.0,6327.0,0.280338,0.280338,0.280338,0.280338,0.280338,0.280338,0.280338


# Inference on tabular data

In [17]:
# One LightGBM booster per target: m1 -> fvc_last_1, m2 -> fvc_last_2, m3 -> fvc_last_3.
m1, m2, m3 = (
    lgb.Booster(model_file=model_path)
    for model_path in (LGB1_MODEL_FILE, LGB2_MODEL_FILE, LGB3_MODEL_FILE)
)

In [18]:
# Fixed numeric encodings for the two categorical columns (presumably the
# category frequencies seen in training - TODO confirm).
sex_encoder = {'Male': 0.7873563218390804, 'Female': 0.21264367816091953}
smoking_encoder = {'Ex-smoker': 0.6666666666666666, 'Never smoked': 0.28160919540229884, 'Currently smokes': 0.05172413793103448}
for column, encoder in (('sex', sex_encoder), ('smoking', smoking_encoder)):
    # Categories missing from the encoder are mapped to 0.
    test[column] = test[column].map(encoder).fillna(0)
    assert not test[column].isna().any()


In [19]:
# Every column that is not a target/identifier, in alphabetical order.
FEATURES = sorted(set(test.columns) - NON_FEATURES)
print(f'{len(FEATURES)} FEATURES={FEATURES}')

48 FEATURES=['age', 'fvc_1', 'fvc_last_1_max', 'fvc_last_1_min', 'fvc_last_1_p50', 'fvc_last_1_p75', 'fvc_last_1_p90', 'fvc_last_1_p95', 'fvc_last_1_p99', 'fvc_last_2_max', 'fvc_last_2_min', 'fvc_last_2_p50', 'fvc_last_2_p75', 'fvc_last_2_p90', 'fvc_last_2_p95', 'fvc_last_2_p99', 'fvc_last_3_max', 'fvc_last_3_min', 'fvc_last_3_p50', 'fvc_last_3_p75', 'fvc_last_3_p90', 'fvc_last_3_p95', 'fvc_last_3_p99', 'lung_area_max', 'lung_area_min', 'lung_area_p50', 'lung_area_p75', 'lung_area_p90', 'lung_area_p95', 'lung_area_p99', 'lung_tissue_ratio_max', 'lung_tissue_ratio_min', 'lung_tissue_ratio_p50', 'lung_tissue_ratio_p75', 'lung_tissue_ratio_p90', 'lung_tissue_ratio_p95', 'lung_tissue_ratio_p99', 'percent_1', 'sex', 'smoking', 'tissue_area_max', 'tissue_area_min', 'tissue_area_p50', 'tissue_area_p75', 'tissue_area_p90', 'tissue_area_p95', 'tissue_area_p99', 'week_1']


In [20]:
# Predict each target with its own booster from the same feature matrix.
x_test = test[FEATURES]
for visit, booster in enumerate((m1, m2, m3), start=1):
    test[f'pred_fvc_last_{visit}'] = booster.predict(x_test)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 58 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   pid                    5 non-null      object 
 1   age                    5 non-null      uint8  
 2   sex                    5 non-null      float64
 3   smoking                5 non-null      float64
 4   week_1                 5 non-null      int16  
 5   fvc_1                  5 non-null      uint16 
 6   percent_1              5 non-null      float32
 7   fvc_last_1             5 non-null      uint16 
 8   fvc_last_2             5 non-null      uint16 
 9   fvc_last_3             5 non-null      uint16 
 10  week_last_1            5 non-null      int16  
 11  week_last_2            5 non-null      int16  
 12  week_last_3            5 non-null      int16  
 13  fvc_last_1_min         5 non-null      float32
 14  fvc_last_1_p50         5 non-null      float32
 15  fvc_last_1

In [21]:
# Preview the per-patient table with the three LightGBM prediction columns appended.
test.head()

Unnamed: 0,pid,age,sex,smoking,week_1,fvc_1,percent_1,fvc_last_1,fvc_last_2,fvc_last_3,week_last_1,week_last_2,week_last_3,fvc_last_1_min,fvc_last_1_p50,fvc_last_1_p75,fvc_last_1_p90,fvc_last_1_p95,fvc_last_1_p99,fvc_last_1_max,fvc_last_2_min,fvc_last_2_p50,fvc_last_2_p75,fvc_last_2_p90,fvc_last_2_p95,fvc_last_2_p99,fvc_last_2_max,fvc_last_3_min,fvc_last_3_p50,fvc_last_3_p75,fvc_last_3_p90,fvc_last_3_p95,fvc_last_3_p99,fvc_last_3_max,lung_area_min,lung_area_p50,lung_area_p75,lung_area_p90,lung_area_p95,lung_area_p99,lung_area_max,tissue_area_min,tissue_area_p50,tissue_area_p75,tissue_area_p90,tissue_area_p95,tissue_area_p99,tissue_area_max,lung_tissue_ratio_min,lung_tissue_ratio_p50,lung_tissue_ratio_p75,lung_tissue_ratio_p90,lung_tissue_ratio_p95,lung_tissue_ratio_p99,lung_tissue_ratio_max,pred_fvc_last_1,pred_fvc_last_2,pred_fvc_last_3
0,ID00419637202311204720264,73,0.787356,0.666667,6,3020,70.186852,3020,3020,3020,6,6,6,1755.218872,1993.170776,2040.519287,2170.290771,2288.594971,2388.203125,2422.076172,1749.875122,1988.050537,2035.876709,2164.606201,2282.508301,2382.138428,2415.994629,1753.001953,1991.195068,2038.71167,2168.175049,2286.382324,2386.138672,2420.05249,1062.0,2425.5,2894.75,3120.100098,3201.75,3520.72998,3629.0,4833.0,7686.5,10704.0,14028.099609,17253.699219,18706.470703,19068.0,0.055695,0.310563,0.452613,0.5201,0.558761,0.581905,0.586385,2639.528147,2713.309536,2665.432654
1,ID00421637202311550012437,68,0.787356,0.666667,15,2739,82.045288,2739,2739,2739,15,15,15,1973.940674,2215.106934,2277.627441,2316.939209,2323.922119,2382.368408,2402.996094,1970.100098,2211.291992,2273.252197,2312.720215,2320.003662,2378.40918,2398.870361,1972.491577,2213.606445,2275.846191,2315.237793,2322.388672,2380.779541,2401.310547,304.0,1117.0,1559.5,1727.400024,1793.849976,1891.75,1907.0,2853.0,4605.0,6762.75,7816.899902,7951.649902,8282.169922,8284.0,0.046907,0.249877,0.353083,0.393445,0.419008,0.460098,0.505062,2634.089282,2544.287612,2709.768443
2,ID00422637202311677017371,73,0.787356,0.666667,6,1930,76.672493,1930,1930,1930,6,6,6,1800.063843,2080.223877,2134.165039,2192.431396,2235.386719,2291.397461,2361.101318,1795.481812,2076.734375,2130.778564,2188.518799,2231.613281,2287.243164,2356.725342,1798.093506,2078.861572,2132.761719,2190.835449,2233.831299,2289.738525,2359.381104,363.0,1048.0,1858.0,2095.0,2217.800049,2480.159912,2573.0,2695.0,3716.0,5030.0,6156.200195,6321.399902,6923.720215,7137.0,0.057307,0.234274,0.554655,0.641637,0.688987,0.752324,0.780734,1715.647497,1817.159729,1889.482779
3,ID00423637202312137826377,72,0.787356,0.666667,17,3294,79.258904,3294,3294,3294,17,17,17,2101.561523,2281.04834,2384.118652,2477.837646,2518.882324,2540.867676,2564.791016,2098.225098,2277.25,2380.603027,2474.005371,2515.038086,2537.089111,2560.825195,2100.091309,2279.53125,2382.694336,2476.292236,2517.290527,2539.388428,2563.157227,514.0,1976.0,3389.75,3991.699951,4223.0,4391.040039,4458.0,9135.0,10661.5,13399.0,17013.900391,17550.949219,17935.439453,18098.0,0.031469,0.184374,0.348307,0.394121,0.42795,0.477978,0.487028,2615.118184,2546.177662,2657.071946
4,ID00426637202313170790466,73,0.787356,0.281609,0,2925,71.824966,2925,2925,2925,0,0,0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,2925.0,1727.0,1727.0,1727.0,1727.0,1727.0,1727.0,1727.0,6327.0,6327.0,6327.0,6327.0,6327.0,6327.0,6327.0,0.280338,0.280338,0.280338,0.280338,0.280338,0.280338,0.280338,3239.519438,3285.449183,3192.177425


# Submission

In [22]:
def _breakpoints(week_1) -> Tuple[int, int]:
    if week_1 <= 29:
        return (41, 57)
    if week_1 <= 34:
        return (46, 62)
    return (62, 76)


# Inclusive range of weeks for which a row is emitted per patient.
week_min = -12
week_max = 133
rows = []
row: Dict[str, Any] = {}
for t in test.itertuples():
    # Weeks at or after `late_start` use pred_fvc_last_1, weeks at or after
    # `mid_start` use pred_fvc_last_2, and all earlier weeks use pred_fvc_last_3.
    mid_start, late_start = _breakpoints(t.week_1)
    for week in range(week_min, week_max + 1):
        if week >= late_start:
            predicted = t.pred_fvc_last_1
        elif week >= mid_start:
            predicted = t.pred_fvc_last_2
        else:
            predicted = t.pred_fvc_last_3
        row = {
            'FVC': int(predicted),  # int() truncates the float prediction
            'Patient_Week': f'{t.pid}_{week}',
            'Confidence': CONFIDENCE,  # same constant for every row
        }
        rows.append(row)


# Column order of the resulting frame is fixed by `cols`, not by dict key order.
cols = ['Patient_Week', 'FVC', 'Confidence']
sub = pd.DataFrame.from_records(rows, columns=cols)
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 730 entries, 0 to 729
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Patient_Week  730 non-null    object
 1   FVC           730 non-null    int64 
 2   Confidence    730 non-null    int64 
dtypes: int64(2), object(1)
memory usage: 17.2+ KB


In [23]:
# Write the submission frame to 'submission.csv' (a relative path, so it is
# created in the current working directory); index=False omits the row index.
sub.to_csv('submission.csv', index=False)

# DEBUG

In [24]:
!pip list

Package                  Version
------------------------ ---------
absl-py                  0.10.0
argon2-cffi              20.1.0
astunparse               1.6.3
async-generator          1.10
attrs                    20.2.0
backcall                 0.2.0
bleach                   3.2.1
cachetools               4.1.1
certifi                  2020.6.20
cffi                     1.14.3
chardet                  3.0.4
cycler                   0.10.0
decorator                4.4.2
defusedxml               0.6.0
entrypoints              0.3
fsspec                   0.8.3
gast                     0.3.3
gcsfs                    0.6.2
google-api-core          1.22.2
google-auth              1.21.2
google-auth-oauthlib     0.4.1
google-cloud-core        1.4.1
google-cloud-logging     1.15.1
google-cloud-storage     1.27.0
google-pasta             0.2.0
google-resumable-media   0.5.1
googleapis-common-protos 1.52.0
grpcio                   1.32.0
h5py                  