In [1]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
#import matplotlib.pyplot as plt
#import seaborn as sns
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline



In [2]:
#import the data
data_features_train = pd.read_csv('train_features.csv',index_col='pid').sort_values(by=['pid', 'Time'])
data_features_test = pd.read_csv('test_features.csv',index_col='pid').sort_values(by=['pid', 'Time'])
data_labels_train = pd.read_csv('train_labels.csv', index_col='pid').sort_values(by=['pid'])


vitals = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
tests = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total',
         'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2',
         'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
sepsis = ['LABEL_Sepsis']

In [3]:
def feature_engineering(df):
    df_pop_mean = df.drop(['Age'], axis=1).mean()
    #construct the features
    
    df_age = df['Age'].groupby(by='pid', dropna=False).mean() #this is a bit hackerboy method lmao
    
    df_mean = df.drop(['Age'], axis=1).groupby(by='pid', dropna=False).mean()
    df_var = df.drop(['Age'], axis=1).groupby(by='pid', dropna=False).var()
    df_min = df.drop(['Age'], axis=1).groupby(by='pid', dropna=False).min()
    df_max = df.drop(['Age'], axis=1).groupby(by='pid', dropna=False).max()
    df_first = df.drop(['Age'], axis=1).groupby(by='pid', dropna=False).first()
    df_last = df.drop(['Age'], axis=1).groupby(by='pid', dropna=False).last()
    df_nan_count = df.drop(['Age'], axis=1).groupby(by='pid', dropna=False).count()
    #df_last_observed_value = df.groupby(by='pid', dropna=False)
    #df_first_observed_value
    
    
    
    #impute missing values
    df_mean.fillna({col:df_pop_mean[col] for col in df.drop(['Age'], axis=1).columns}, inplace=True)
    df_var.fillna({col:df_pop_mean[col] for col in df.drop(['Age'], axis=1).columns}, inplace=True)
    df_min.fillna({col:df_pop_mean[col] for col in df.drop(['Age'], axis=1).columns}, inplace=True)
    df_max.fillna({col:df_pop_mean[col] for col in df.drop(['Age'], axis=1).columns}, inplace=True)
    df_first.fillna({col:df_pop_mean[col] for col in df.drop(['Age'], axis=1).columns}, inplace=True)
    df_last.fillna({col:df_pop_mean[col] for col in df.drop(['Age'], axis=1).columns}, inplace=True)
    #df_last_observed_value.fillna({col:df_pop_mean[col] for col in df.drop(['Age'], axis=1).columns}, inplace=True)
    #df_first_observed_value.fillna({col:df_pop_mean[col] for col in df.drop(['Age'], axis=1).columns}, inplace=True)
    
    df_features = pd.concat([df_age, df_mean.add_suffix('_mean'), df_var.add_suffix('_var'),
                             df_min.add_suffix('_min'), df_max.add_suffix('_max'), df_first.add_suffix('_first'),
                             df_last.add_suffix('_last'), df_nan_count.add_suffix('_nan_count')], axis=1)
    return df_features

In [7]:
#set up features, took most correlated ones for each of 'RRate', 'ABPm', 'SpO2', 'Heartrate'

features_3 = ['Age','Temp', 'Lactate', 'ABPs', 'Chloride', 'Hgb', 'pH', 'ABPd', 'RRate', 'ABPm', 'SpO2', 'Heartrate']

X_3_train = feature_engineering(data_features_train[features_3]).to_numpy()
X_3_test = feature_engineering(data_features_test[features_3]).to_numpy()
y_3_train = data_labels_train[vitals].to_numpy()

In [8]:
# Define a pipeline to search for the best hyperparameters for ridge/randomforest

pipe = Pipeline([('scaler', StandardScaler()), ('regr', RandomForestRegressor())])

# Parameter space:
max_features = ['sqrt']
n_estimators = [500,600,700,800]

param_grid = {'regr':[RandomForestRegressor()],
            'regr__max_features':max_features,
            'regr__n_estimators':n_estimators}

model = GridSearchCV(pipe, param_grid=param_grid, scoring='r2', verbose=50)

predictions = np.empty([data_features_test.index.unique().size,4])
predictions.shape

best_scores = []
m = 0
for vital in vitals:

    model.fit(X_3_train, data_labels_train[vital])
    predictions[:,m] = model.predict(X_3_test) 
    best_scores.append(model.best_score_)
    m = m+1

print(f'The mean r2-score is: {np.mean(best_scores)}')


Fitting 5 folds for each of 4 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] regr=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False), regr__max_features=sqrt, regr__n_estimators=500 
[CV]  regr=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
       

In [24]:

predictions_fixed = predictions[:,0:4]

[[ 15.33147222  83.09596819  98.62266667  86.67726389]
 [ 17.71851516  87.19916667  97.2205      93.508125  ]
 [ 19.0175363   74.13116667  95.7065      71.414375  ]
 ...
 [ 18.61996591  77.72182828  98.4685      88.61475   ]
 [ 19.79520833  93.20283333  97.25766667 108.3435    ]
 [ 17.622625    81.96933333  98.9565     102.102375  ]]


In [30]:
submission_3 = pd.DataFrame(predictions_fixed, index=data_features_test.index.unique(), columns=data_labels_train[vitals].columns)
submission_3.head

<bound method NDFrame.head of        LABEL_RRate  LABEL_ABPm  LABEL_SpO2  LABEL_Heartrate
pid                                                        
0        15.331472   83.095968   98.622667        86.677264
3        17.718515   87.199167   97.220500        93.508125
5        19.017536   74.131167   95.706500        71.414375
7        17.878000   88.410167   97.758833        93.804250
9        20.222583   88.494778   96.003167        91.343375
...            ...         ...         ...              ...
31647    16.548000   71.837833   97.135833        74.072500
31649    16.432042   84.257029   96.387037        91.508250
31651    18.619966   77.721828   98.468500        88.614750
31652    19.795208   93.202833   97.257667       108.343500
31655    17.622625   81.969333   98.956500       102.102375

[12664 rows x 4 columns]>

In [42]:
subtask_1 = pd.read_csv('subtask_1.csv', index_col='pid')
subtask_2 = pd.read_csv('subtask_2.csv', index_col='pid')
submission = pd.concat((subtask_1, subtask_2, submission_3), axis=1)
submission.to_csv('prediction.zip', index=True, float_format='%.3f', compression='zip')
submission.to_csv('prediction.csv', index=True, float_format='%.3f')