In [None]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import RobustScaler, MinMaxScaler, StandardScaler #whatever the scaling method you decide
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GroupKFold #for balanced division of data

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, SequentialSampler

In [None]:
# Locations of the competition CSV files (relative to the notebook's working directory).
train_path, test_path = (
    '../input/ventilator-pressure-prediction/train.csv',
    '../input/ventilator-pressure-prediction/test.csv',
)

In [None]:
#this piece of code is from one of the notebooks here in this competition
def difference_operator(df, feature):
    """Add a central finite-difference derivative of ``feature`` w.r.t. ``time_step``.

    For every row the new column ``f"{feature}_diff"`` holds
    ``(next(feature) - prev(feature)) / (next(time_step) - prev(time_step))``,
    where a missing neighbour (first/last row) is replaced by the nearest
    available shifted value, which turns the edge rows into one-sided
    differences.

    When ``df`` has a ``breath_id`` column the neighbours are taken within each
    breath only, so the derivative at a breath boundary never mixes values of
    two different breaths. Without that column the whole frame is treated as a
    single sequence.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``feature`` and ``time_step``; modified in place.
    feature : str
        Name of the column to differentiate.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``f"{feature}_diff"`` column added.
    """
    col_name = f"{feature}_diff"

    if "breath_id" in df.columns:
        keys = df["breath_id"]

        def following(series):
            # Next value inside the same breath; the last row of a breath
            # falls back to the previous shifted value of that breath.
            return series.groupby(keys).shift(-1).groupby(keys).ffill()

        def preceding(series):
            # Previous value inside the same breath; the first row of a
            # breath falls back to the next shifted value of that breath.
            return series.groupby(keys).shift(1).groupby(keys).bfill()
    else:
        def following(series):
            return series.shift(-1).ffill()

        def preceding(series):
            return series.shift(1).bfill()

    numerator = following(df[feature]) - preceding(df[feature])
    denominator = following(df["time_step"]) - preceding(df["time_step"])
    df[col_name] = numerator / denominator
    return df

In [None]:
# Extracting extra features derived from the main "u_in" feature may help our model.

def get_extra_features(df):
    """Return a copy of ``df`` extended with engineered per-breath features.

    Added columns:

    * ``state_<R>_<C>`` - one-hot encoding of the ``R``/``C`` pair.
    * ``time_diff``, ``flow_diff`` - first differences of ``time_step`` and
      ``u_in`` inside each breath (0 on the first row of a breath).
    * ``flow_cum`` - running sum of ``u_in`` inside each breath.
    * ``flow_lag{1,2,3}`` / ``flow_back{1,2,3}`` - ``u_in`` shifted backwards /
      forwards inside each breath, edge gaps filled from the nearest value.
    * ``time_lag`` / ``time_back`` - the same for ``time_step`` with shift 1.
    * ``area`` (``time_back * u_in``), ``area_cum`` (per-breath running sum),
      ``segment`` (``1 - u_out``).
    * ``flow_diff2/3/4`` - ``u_in`` minus its lag 1/2/3.
    * first to fourth order ``*_diff`` derivatives of ``u_in`` and ``area``
      produced by ``difference_operator``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``breath_id``, ``R``, ``C``, ``time_step``, ``u_in``
        and ``u_out``. The input frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the original columns plus the features above.
    """
    # One-hot encode the "<R>_<C>" combination; merging on the index yields a
    # new frame, so everything below leaves the caller's frame untouched.
    state = df['R'].astype(str) + '_' + df['C'].astype(str)
    df = df.merge(pd.get_dummies(state, prefix="state"), left_index=True, right_index=True)

    def per_breath(column):
        # A fresh groupby on every call so columns created further down
        # (e.g. "area") are visible to it.
        return df.groupby("breath_id")[column]

    # Differences and running sums are taken inside each breath so that the
    # first rows of a breath do not inherit values from the previous breath.
    df['time_diff'] = per_breath('time_step').diff().fillna(0)
    df['flow_diff'] = per_breath('u_in').diff().fillna(0)
    df['flow_cum'] = per_breath('u_in').cumsum()

    # The shift leaves NaN at the start (lag) or end (back) of each breath;
    # bfill/ffill replaces them with the nearest valid shifted value.
    # NOTE(review): the fill itself is not grouped, so this assumes every
    # breath has more rows than the largest shift (3).
    for step in (1, 2, 3):
        df[f"flow_lag{step}"] = per_breath("u_in").shift(step).bfill()
        df[f"flow_back{step}"] = per_breath("u_in").shift(-step).ffill()
    df["time_lag"] = per_breath("time_step").shift(1).bfill()
    df["time_back"] = per_breath("time_step").shift(-1).ffill()

    df["area"] = df["time_back"] * df["u_in"]
    df["segment"] = 1 - df["u_out"]
    df["area_cum"] = per_breath("area").cumsum()

    df['flow_diff2'] = df['u_in'] - df['flow_lag1']
    df['flow_diff3'] = df['u_in'] - df['flow_lag2']
    df['flow_diff4'] = df['u_in'] - df['flow_lag3']

    # First to fourth order derivatives: each pass differentiates the column
    # produced by the previous one ("u_in" -> "u_in_diff" -> "u_in_diff_diff" ...).
    for base in ("u_in", "area"):
        feature = base
        for _ in range(4):
            difference_operator(df, feature)
            feature = f"{feature}_diff"
    return df

In [None]:
#getting our data preprocessed , divided and grouped by 'breath_id'
def get_data(scaler=None, n_splits=10):
    """Load, featurise, scale and fold the data, one row per breath.

    Reads the CSV files at the module-level ``train_path`` / ``test_path``,
    adds the engineered features, optionally scales the continuous ones,
    assigns every training breath to a ``GroupKFold`` validation fold and
    finally collapses each breath into a single row whose cells are lists
    (one element per time step).

    Parameters
    ----------
    scaler : scikit-learn transformer or None, default None
        Fitted on the training features and applied to both frames. ``None``
        leaves the features unscaled.
    n_splits : int, default 10
        Number of folds used by ``GroupKFold``.

    Returns
    -------
    dict
        ``{'train': DataFrame, 'test': DataFrame}``. Both frames have the
        columns ``feats + not_to_scale + ['pressure', 'fold']`` in that order,
        so the last three columns are ``segment``, ``pressure`` and ``fold``.
        In the test frame ``pressure`` is a placeholder 0 and ``fold`` is -1.
    """
    train = get_extra_features(pd.read_csv(train_path))
    test = get_extra_features(pd.read_csv(test_path))

    feats = ['u_in', 'time_diff', 'flow_diff', 'flow_cum', 'flow_lag1', 'flow_back1', 'flow_lag2', 'flow_back2',
             'flow_lag3', 'flow_back3', 'time_lag', 'time_back', 'area', 'area_cum',
             'u_in_diff', 'u_in_diff_diff', 'u_in_diff_diff_diff', 'u_in_diff_diff_diff_diff',
             'area_diff', 'area_diff_diff', 'area_diff_diff_diff', 'area_diff_diff_diff_diff',
             'flow_diff2', 'flow_diff3', 'flow_diff4']

    not_to_scale = ['state_20_10', 'state_20_20', 'state_20_50', 'state_50_10',
                    'state_50_20', 'state_50_50', 'state_5_10', 'state_5_20', 'state_5_50', 'segment']

    # An R/C combination that never occurs in a file produces no dummy column;
    # its one-hot encoding is all zeros, so add it instead of failing later
    # with a KeyError when the columns are selected.
    for frame in (train, test):
        for col in not_to_scale:
            if col not in frame.columns:
                frame[col] = 0

    # Placeholders so that train and test share the same column layout.
    test['pressure'] = 0
    test['fold'] = -1

    if scaler is not None:
        trans = make_column_transformer((scaler, feats), remainder='passthrough', n_jobs=-1)

        train[feats] = trans.fit_transform(train[feats])
        test[feats] = trans.transform(test[feats])

    # Grouping by breath keeps all rows of one breath in the same fold.
    splitter = GroupKFold(n_splits=n_splits)
    groups = train['breath_id'].values
    train['fold'] = -1
    fold_pos = train.columns.get_loc('fold')
    for fold_id, (_, val_index) in enumerate(splitter.split(train, train['pressure'], groups)):
        # split() yields positional indices, hence iloc rather than loc.
        train.iloc[val_index, fold_pos] = fold_id

    # Collapse each breath into one row of lists; only the columns that are
    # kept are aggregated.
    columns = feats + not_to_scale + ['pressure', 'fold']
    train = train[['breath_id'] + columns].groupby('breath_id').agg(list).reset_index(drop=True)
    test = test[['breath_id'] + columns].groupby('breath_id').agg(list).reset_index(drop=True)

    # The fold is constant inside a breath, so keep it as a scalar.
    train['fold'] = train['fold'].apply(lambda x: x[0])
    test['fold'] = test['fold'].apply(lambda x: x[0])

    return {
        'train': train[columns],
        'test': test[columns],
    }

In [None]:
# Build the per-breath train/test frames, scaling the continuous features with MinMaxScaler.
data = get_data(scaler=MinMaxScaler()) # any of the other scalers imported above can be passed instead

In [None]:
# Unpack the two prepared frames from the dictionary returned by get_data.
train, test = data['train'], data['test']
# train.head()  # uncomment to inspect the prepared data

In [None]:
# Optionally persist the preprocessed frames so later runs can reload them.
# pickle is already imported in the notebook's first cell, so it is not re-imported here.
with open('preprocessed_data', 'wb') as handle:
    pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
class VentilatorData(Dataset):
    """Dataset yielding one breath per item from a frame of per-breath lists.

    Each row of ``df`` is converted to a plain Python list. The last column is
    ignored, the second to last is the target, the third to last is the
    segment mask, and every column except the last two (the segment mask
    included) forms the input tensor. This matches a column layout ending in
    ``segment``, ``pressure``, ``fold``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per breath; every cell except the last column holds a list
        with one value per time step.
    flip : float, default 0
        Probability of reversing an item along its last (time) axis as data
        augmentation. 0 disables it.
    """

    def __init__(self, df, flip=0):
        super().__init__()
        self.df = df.values.tolist()
        self.flip = flip              # probability of applying the time-flip augmentation

    def __getitem__(self, idx):
        row = self.df[idx]
        tensors = torch.as_tensor(row[:-2], dtype=torch.float)   # all columns except pressure and fold
        segment = torch.as_tensor(row[-3], dtype=torch.long)     # the 'segment' (1 - u_out) column
        target = torch.as_tensor(row[-2], dtype=torch.float)     # the 'pressure' column

        if np.random.rand() < self.flip:    # random augmentation
            # Reverse inputs, mask and target together: flipping only the
            # inputs would pair each time step with the wrong label.
            tensors = tensors.flip(-1)
            segment = segment.flip(-1)
            target = target.flip(-1)
        return {
            'tensors': tensors,
            'segment': segment,
            'target': target,
        }

    def __len__(self):
        return len(self.df)

In [None]:
# Smoke test: draw one batch of up to 64 breaths and display the shape of each entry.
loader = DataLoader(VentilatorData(train), batch_size=64)
batch = next(iter(loader))
tuple(batch[key].shape for key in ('tensors', 'target', 'segment'))

# we can start building our models 

# to be continued !
