In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [2]:
X_train = pd.read_csv('task2_k49am2lqi/train_features.csv')
y_train = pd.read_csv('task2_k49am2lqi/train_labels.csv')

In [3]:
X_test = pd.read_csv('task2_k49am2lqi/test_features.csv')

In [4]:
X_test.head(14)

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,0,1,39.0,,,,,,,,...,,,,,,,,,,
1,0,2,39.0,,44.2,17.0,,36.0,10.2,13.0,...,119.0,100.0,,98.0,31.0,82.0,21.8,,119.0,
2,0,3,39.0,,,,,,,,...,,100.0,,,,78.0,,,125.0,7.34
3,0,4,39.0,,,,,,,,...,,100.0,,,,80.0,,,136.0,
4,0,5,39.0,,,,,,,,...,,100.0,,,,83.0,,,135.0,
5,0,6,39.0,,,,,36.0,,,...,,100.0,,,,88.0,,,144.0,
6,0,7,39.0,,38.5,20.0,,,9.1,16.0,...,109.0,100.0,,102.0,25.9,,26.4,,,
7,0,8,39.0,,,,,36.0,,,...,,100.0,,,,90.0,,,129.0,7.4
8,0,9,39.0,,,,,36.0,,,...,,100.0,,,,90.0,,,121.0,
9,0,10,39.0,,,,,36.0,,,...,,100.0,,,,85.0,,,120.0,


In [5]:
def process_data(X):
    X = X.fillna(0)
    X_new = pd.DataFrame(columns=X.columns).drop('Time', axis=1)
    length = X.shape[1] - 3

    for i in range(0, X.shape[0], 12):
        new_f = np.zeros(length + 2)
        for j in range(length):
            temp = 0
            counts = 0
            for k in range(12):
                t = k*X.iloc[i + k ,3 + j]
                temp += t
                if t != 0:
                    counts += k
        
            if counts == 0:
                new_f[j + 2] = -1 #No valid measurement
            else:
                new_f[j + 2] = temp/counts
    
        new_f[0] = X.iloc[i, 0]
        new_f[1] = X.iloc[i, 2]
    
        X_new.loc[i] = new_f
    
    return X_new

In [6]:
def get_avg(X):
    X = X.values
    avg = np.zeros(X.shape[1])
    for i in range(X.shape[1]):
        temp = 0
        count = 0
        for j in range(X.shape[0]):
            if X[j, i] > 0:
                temp += X[j, i]
                count += 1
        avg[i] = temp/count
        
    return avg

In [7]:
def fill_na(X):
    header = X.columns
    avg = get_avg(X)
    X = X.values
    for i in range(X.shape[1]):
        for j in range(X.shape[0]):
            if X[j,i] < 0:
                X[j,i] = avg[i]

    return pd.DataFrame(X, columns=header)

In [8]:
X_train = process_data(X_train)

In [9]:
X_test = process_data(X_test)

In [10]:
X_train

Unnamed: 0,pid,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,BaseExcess,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1.0,34.0,-1.000000,-1.000000,12.000000,-1.00,37.181818,8.500000,26.000000,-2.000000,...,-1.000000,100.000000,-1.0,111.000000,22.983784,69.757576,-1.000000,-1.000000,112.151515,7.387241
12,10.0,71.0,-1.000000,27.800000,12.000000,-1.00,36.000000,14.600000,-1.000000,-1.000000,...,68.000000,98.075758,-1.0,-1.000000,42.100000,77.090909,1.300000,0.010000,131.272727,-1.000000
24,100.0,68.0,-1.000000,20.900000,21.000000,-1.00,37.142857,12.500000,27.000000,-1.000000,...,-1.000000,95.803030,-1.0,101.000000,36.800000,114.060606,-1.000000,-1.000000,112.387097,-1.000000
36,1000.0,79.0,32.143939,-1.000000,22.000000,3.79,37.227273,9.200000,-1.000000,-1.000000,...,-1.000000,98.803030,-1.0,-1.000000,27.300000,81.121212,-1.000000,-1.000000,145.303030,7.300000
48,10000.0,76.0,-1.000000,30.881818,22.000000,-1.00,36.923077,10.454545,25.909091,1.916667,...,-1.000000,98.212121,-1.0,103.090909,29.481818,80.924242,-1.000000,-1.000000,127.560606,7.397692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227880,9993.0,80.0,-1.000000,-1.000000,13.857143,-1.00,35.625000,9.028571,-1.000000,-1.000000,...,64.285714,99.833333,-1.0,-1.000000,27.685714,108.287879,0.514286,11.537143,97.212121,-1.000000
227892,9995.0,73.0,-1.000000,55.500000,50.000000,-1.00,36.000000,11.200000,29.000000,-1.000000,...,-1.000000,94.060606,-1.0,89.000000,35.600000,62.000000,-1.000000,-1.000000,182.636364,-1.000000
227904,9996.0,53.0,-1.000000,-1.000000,-1.000000,-1.00,37.833333,10.700000,-1.000000,0.428571,...,-1.000000,99.484848,-1.0,-1.000000,26.480000,100.075758,-1.000000,-1.000000,96.803030,7.414000
227916,9998.0,89.0,-1.000000,34.700000,13.000000,-1.00,36.848485,8.400000,-1.000000,-1.000000,...,-1.000000,100.000000,-1.0,-1.000000,25.600000,96.742424,-1.000000,1.224286,136.409091,-1.000000


In [34]:
#X_train_n = fill_na(X_train)

In [35]:
#X_test_n = fill_na(X_test)

In [11]:
count = X_train[X_train < 0].count()
print(count)

pid                     0
Age                     0
EtCO2               17676
PTT                 11910
BUN                  5656
Lactate             14303
Temp                  468
Hgb                  5490
HCO3                11748
BaseExcess          17123
RRate                 159
Fibrinogen          17494
Phosphate            9850
WBC                  6185
Creatinine           5850
PaCO2               11323
AST                 14307
FiO2                11396
Platelets            6012
SaO2                14050
Glucose              2794
ABPm                  110
Magnesium            6858
Potassium            4186
ABPd                 3889
Calcium              6931
Alkalinephos        14356
SpO2                   24
Bilirubin_direct    18380
Chloride            11162
Hct                  4487
Heartrate               8
Bilirubin_total     14388
TroponinI           16222
ABPs                  400
pH                  11090
dtype: int64

In [12]:
header = ['pid', 'LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis','LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

In [13]:
#Task1

t1_labels = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
t1_train = np.array(y_train[t1_labels])



In [15]:
final = np.zeros((X_test.shape[0], len(header)))
final[:, 0] = X_test.values[:, 0]
for i in range(1, t1_train.shape[1] + 1):
    print(t1_labels[i - 1])
    model = XGBRegressor(n_estimators=100, max_depth=4, objective="binary:logistic")
    model.fit(X_train.values, t1_train[:,i - 1])
    prediction = model.predict(X_test.values)
    final[:, i] = prediction

LABEL_BaseExcess
LABEL_Fibrinogen
LABEL_AST
LABEL_Alkalinephos
LABEL_Bilirubin_total
LABEL_Lactate
LABEL_TroponinI
LABEL_SaO2
LABEL_Bilirubin_direct
LABEL_EtCO2


In [16]:
#Task2
t2_labels = ['LABEL_Sepsis']
t2_train = np.array(y_train[t2_labels])

In [17]:
model = XGBRegressor(n_estimators=100, max_depth=4, objective="binary:logistic")
model.fit(X_train.values, t2_train)
prediction = model.predict(X_test.values)



In [18]:
final[:, len(t1_labels) + 1] = prediction

In [19]:
df = pd.DataFrame(final, columns=header)
df.head()

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0.0,0.951406,0.531434,0.600778,0.706396,0.857141,0.441245,0.00296,0.17427,0.004557,0.0015,0.026805,0.0,0.0,0.0,0.0
1,10001.0,0.074723,0.021638,0.252227,0.232096,0.220429,0.082792,0.056476,0.077888,0.020766,0.018537,0.011199,0.0,0.0,0.0,0.0
2,10003.0,0.014774,0.018175,0.148822,0.127359,0.135073,0.349485,0.041912,0.304239,0.016424,0.010695,0.037393,0.0,0.0,0.0,0.0
3,10004.0,0.014121,0.025122,0.201324,0.201327,0.179432,0.078796,0.017627,0.067454,0.020267,0.025975,0.013275,0.0,0.0,0.0,0.0
4,10005.0,0.111325,0.01605,0.080411,0.065819,0.093162,0.076376,0.00278,0.03395,0.001594,4.3e-05,0.01728,0.0,0.0,0.0,0.0


In [20]:
#Task3
t3_labels = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
t3_train = np.array(y_train[t3_labels])

In [23]:
offset = len(t1_labels) + len(t2_labels) + 1
for i in range(t3_train.shape[1]):
    print(t3_labels[i])
    model = XGBRegressor(n_estimators=100, max_depth=4)
    model.fit(X_train.values, t3_train[:,i])
    prediction = model.predict(X_test.values)
    final[:, offset + i] = prediction

LABEL_RRate
LABEL_ABPm
LABEL_SpO2
LABEL_Heartrate


In [24]:
df = pd.DataFrame(final, columns=header)
df.head()

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0.0,0.951406,0.531434,0.600778,0.706396,0.857141,0.441245,0.00296,0.17427,0.004557,0.0015,0.026805,13.395888,80.582001,98.8974,76.214035
1,10001.0,0.074723,0.021638,0.252227,0.232096,0.220429,0.082792,0.056476,0.077888,0.020766,0.018537,0.011199,18.017267,88.07135,94.68586,101.403404
2,10003.0,0.014774,0.018175,0.148822,0.127359,0.135073,0.349485,0.041912,0.304239,0.016424,0.010695,0.037393,17.453121,82.041344,97.854813,90.145638
3,10004.0,0.014121,0.025122,0.201324,0.201327,0.179432,0.078796,0.017627,0.067454,0.020267,0.025975,0.013275,16.002142,73.990669,95.746384,90.534981
4,10005.0,0.111325,0.01605,0.080411,0.065819,0.093162,0.076376,0.00278,0.03395,0.001594,4.3e-05,0.01728,19.779545,72.115662,95.858215,59.624149


In [25]:
df.to_csv('submission.zip', index=False, float_format='%.3f', compression = 'zip')