In [None]:
import pandas as pd
import numpy as np
from sklearn import metrics
import time

In [None]:
# Input files: the training data with its fold assignment (`kfold` column is
# used by the training cell), the competition test set, and the submission template.
TRAIN_FOLDS_CSV = '../input/google-brain5folds/Google_Brain-5Folds.csv'
TEST_CSV = '../input/ventilator-pressure-prediction/test.csv'
SAMPLE_SUBMISSION_CSV = '../input/ventilator-pressure-prediction/sample_submission.csv'

df = pd.read_csv(TRAIN_FOLDS_CSV)
test = pd.read_csv(TEST_CSV)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_CSV)

In [None]:
def add_features(frame):
    """Return a copy of ``frame`` with the engineered features appended.

    All sequential features (cumulative sum, time delta, lag) are computed
    inside each ``breath_id`` group, so the first row of a breath never picks
    up values from the last row of the preceding breath.

    Assumes the rows of each breath are stored in time order -- TODO confirm.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``breath_id``, ``u_in``, ``R``, ``C`` and
        ``time_step``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is left untouched. New columns are appended in
        this order: ``u_in_cumsum``, ``R*C``, ``breath_id__u_in__min``,
        ``time_diff``, ``u_in_lag``, ``u_in_diff``,
        ``breath_id__u_in__diffmax``, ``breath_id__u_in__diffmean``,
        ``last_value_u_in``.
    """
    frame = frame.copy()
    u_in_by_breath = frame.groupby('breath_id')['u_in']

    ## add u_in_cumsum
    frame['u_in_cumsum'] = u_in_by_breath.cumsum()

    ## add R*C
    frame['R*C'] = frame['R'] * frame['C']

    ## add breath_id__u_in__min
    frame['breath_id__u_in__min'] = u_in_by_breath.transform('min')

    ## add time_diff
    # Differenced per breath: a plain .diff() over the whole column yields a
    # spurious negative step on the first row of every breath.
    frame['time_diff'] = frame.groupby('breath_id')['time_step'].diff().fillna(0)

    ## add lag
    # Shifted per breath: a plain .shift(1) would give the first row of a
    # breath the last u_in of the previous breath.
    frame['u_in_lag'] = u_in_by_breath.shift(1)
    # Zero-fills the missing lag on each breath's first row; like the original
    # cell, this also zero-fills any NaN already present in other columns.
    frame = frame.fillna(0)

    # Re-group on the zero-filled frame so the aggregates below see the same
    # values the original cell used after its fillna.
    u_in_by_breath = frame.groupby('breath_id')['u_in']

    ## add u_in_diff
    frame['u_in_diff'] = frame['u_in'] - frame['u_in_lag']

    ## add breath_id__u_in__diffmax & breath_id__u_in__diffmean
    frame['breath_id__u_in__diffmax'] = u_in_by_breath.transform('max') - frame['u_in']
    frame['breath_id__u_in__diffmean'] = u_in_by_breath.transform('mean') - frame['u_in']

    ## add last_value_u_in
    frame['last_value_u_in'] = u_in_by_breath.transform('last')

    return frame


# Train and test go through the same function so their features cannot drift apart.
df = add_features(df)
test = add_features(test)

# Every column except the row id, the target and the fold label is a model input.
# NOTE(review): this keeps `breath_id` as a feature, as the original did --
# confirm an identifier column is really wanted as a model input.
useful_features = [c for c in df.columns if c not in ("id", "pressure", "kfold")]
test = test[useful_features]

test.head()

In [None]:
import lightgbm as lgb
# LightGBM hyper-parameters: L1/L2-regularised GBDT regression, evaluated with MAE.
params = dict(
    objective='regression',
    learning_rate=0.25,
    boosting_type='gbdt',
    min_data_in_leaf=600,
    max_bin=128,
    # GPU training is left disabled; to enable it add
    # device='gpu', gpu_platform_id=0, gpu_device_id=0.
    feature_fraction=0.4,
    lambda_l1=36,
    lambda_l2=80,
    max_depth=16,
    num_leaves=1000,
    metric='mae',
)

In [None]:
# Cross-validated training: for each fold, fit on the other folds, score MAE on
# the held-out fold and predict the test set. Per-fold test predictions are
# collected in `final_predictions`, per-fold MAE in `valid_scores`.
N_FOLDS = 5
EARLY_STOPPING_ROUNDS = 10
LOG_EVERY_N_ROUNDS = 200

# The `early_stopping_rounds` / `verbose` keyword arguments of fit() were
# removed in LightGBM 4.0; callbacks are accepted by old and new releases.
# `log_evaluation` was introduced in 3.3 -- older releases call it `print_evaluation`.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

final_predictions = []
valid_scores = []
for fold in range(N_FOLDS):
    print('Fold: {}'.format(fold))
    start = time.time()

    # Rows whose `kfold` equals the current fold are held out for validation.
    is_valid = df.kfold == fold
    xtrain = df[~is_valid].reset_index(drop=True)
    xvalid = df[is_valid].reset_index(drop=True)

    ytrain = xtrain.pressure
    yvalid = xvalid.pressure

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    model = lgb.LGBMRegressor(**params, n_estimators=10000, n_jobs=-1)
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            _log_evaluation(LOG_EVERY_N_ROUNDS),
        ],
    )

    # `test` is only read here, so no per-fold copy is made.
    test_preds = model.predict(test)
    preds_valid = model.predict(xvalid)

    final_predictions.append(test_preds)

    mae = metrics.mean_absolute_error(yvalid, preds_valid)
    print(fold, mae)

    valid_scores.append(mae)

    print(time.time() - start)

print('average of all valid folds {}'.format(np.mean(valid_scores)))

In [None]:
# Blend the folds: one column per fold model, then the row-wise mean is the
# submitted pressure for each test row.
fold_prediction_matrix = np.column_stack(final_predictions)
sample_submission.pressure = fold_prediction_matrix.mean(axis=1)

sample_submission.to_csv('third_sub.csv', index=False)
