In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(folder, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, RobustScaler

# Read the raw train and test CSV files (first row is the header, no index column).
train_ori, test_ori = (
    pd.read_csv(csv_path, index_col=None, header=0)
    for csv_path in (
        '../input/ventilator-pressure-prediction/train.csv',
        '../input/ventilator-pressure-prediction/test.csv',
    )
)



def add_features(df, n_lags=4):
    """Return a copy of ``df`` with per-breath features and one-hot R/C columns.

    The input frame is not modified; all columns are added to a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``breath_id``, ``time_step``, ``u_in``,
        ``u_out``, ``R`` and ``C``. Any other columns are carried through.
    n_lags : int, default 4
        Number of backward (``u_in_lag{k}``) and forward
        (``u_in_lag_back{k}``) shifts of ``u_in`` to create, together with
        the matching ``u_in_diff{k}`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which NaN values have been replaced by 0 and the
        string-typed ``R``, ``C`` and ``R__C`` columns have been replaced by
        ``uint8`` indicator columns appended after all other columns.
    """
    # Work on a copy: the assignments below would otherwise leak the
    # intermediate, NaN-containing columns into the caller's DataFrame.
    df = df.copy()

    # Running per-breath sum of time_step * u_in.
    df['area'] = df['time_step'] * df['u_in']
    df['area'] = df.groupby('breath_id')['area'].cumsum()

    # Running per-breath sum of u_in.
    df['u_in_cumsum'] = (df['u_in']).groupby(df['breath_id']).cumsum()

    # Shifted copies of u_in, never crossing a breath boundary. Columns are
    # created in lag/back pairs so the column order stays stable.
    u_in_by_breath = df.groupby('breath_id')['u_in']
    for k in range(1, n_lags + 1):
        df[f'u_in_lag{k}'] = u_in_by_breath.shift(k)
        df[f'u_in_lag_back{k}'] = u_in_by_breath.shift(-k)

    # The shifts leave NaN at the start/end of each breath; this zero-fills
    # them (and any other NaN present in the frame).
    df = df.fillna(0)

    df['breath_id__u_in__max'] = df.groupby(['breath_id'])['u_in'].transform('max')

    # Differences are taken against the zero-filled lags.
    for k in range(1, n_lags + 1):
        df[f'u_in_diff{k}'] = df['u_in'] - df[f'u_in_lag{k}']
    df['cross'] = df['u_in'] * df['u_out']
    df['cross2'] = df['time_step'] * df['u_out']

    # Cast R and C to strings so get_dummies treats them as categorical, and
    # add their combination as a third categorical column.
    df['R'] = df['R'].astype(str)
    df['C'] = df['C'].astype(str)
    df['R__C'] = df['R'] + '__' + df['C']

    # Pin the indicator dtype: the pandas default differs between versions
    # (uint8 before 2.0, bool from 2.0), and bool columns mixed with floats
    # turn ``DataFrame.values`` into an object array.
    return pd.get_dummies(df, dtype='uint8')


# Build the engineered feature tables for the train and test data.
train = add_features(train_ori)
test = add_features(test_ori)

# Number of rows per breath assumed by every reshape below.
BREATH_LEN = 80

# Pressure targets, one row per breath.
targets = train[['pressure']].to_numpy().reshape(-1, BREATH_LEN)

# Drop the columns that are not model inputs.
train = train.drop(columns=['pressure', 'id', 'breath_id'])
test = test.drop(columns=['id', 'breath_id'])

# The one-hot R / C / R__C columns are split off by position further down, so
# locate them by name and fail loudly if the layout is not what the positional
# slicing expects (instead of silently slicing the wrong columns).
rc_cols = [col for col in train.columns if col.startswith(('R_', 'C_'))]
n_rc = len(rc_cols)
if n_rc == 0 or list(train.columns[-n_rc:]) != rc_cols:
    raise ValueError('one-hot R/C columns must be the trailing columns of train')
if list(test.columns) != list(train.columns):
    raise ValueError('train and test feature columns differ; cannot split them by position')

# Force a float64 array: with boolean indicator columns a plain ``.values``
# would produce an object array.
train = train.to_numpy(dtype=np.float64)
test = test.to_numpy(dtype=np.float64)

# Split the remaining features from the one-hot encoded R/C features.
test_x_rc = test[:, -n_rc:]
test_x_other = test[:, :-n_rc]
train_x_other = train[:, :-n_rc]
train_x_rc = train[:, -n_rc:]

# Scale the non-R/C features; the scaler is fitted on the training rows only.
scale = RobustScaler()
scale.fit(train_x_other)
train_x_other = scale.transform(train_x_other)
train_x_other = train_x_other.reshape(-1, BREATH_LEN, train_x_other.shape[-1])
train_x_rc = train_x_rc.reshape(-1, BREATH_LEN, train_x_rc.shape[-1])

test_x_other = scale.transform(test_x_other)
test_x_other = test_x_other.reshape((-1, BREATH_LEN, test_x_other.shape[-1]))
test_x_rc = test_x_rc.reshape((-1, BREATH_LEN, test_x_rc.shape[-1]))
# Keep only the first time step's R/C encoding per breath. The train data
# skips this step because it is merged again later for k-fold, so it is
# simpler not to split it here.
test_x_rc = test_x_rc[:, 0, :]

# Save the processed arrays.
np.save('./train_x_rc.npy', train_x_rc)
np.save('./train_x_other.npy', train_x_other)
np.save('./target.npy', targets)
np.save('./test_x_rc.npy', test_x_rc)
np.save('./test_x_other.npy', test_x_other)