In [1]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [2]:
X_train = pd.read_csv('task2_k49am2lqi/train_features.csv')
y_train = pd.read_csv('task2_k49am2lqi/train_labels.csv')

In [3]:
X_test = pd.read_csv('task2_k49am2lqi/test_features.csv')

In [4]:
X_test.head(14)

Unnamed: 0,pid,Time,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,0,1,39.0,,,,,,,,...,,,,,,,,,,
1,0,2,39.0,,44.2,17.0,,36.0,10.2,13.0,...,119.0,100.0,,98.0,31.0,82.0,21.8,,119.0,
2,0,3,39.0,,,,,,,,...,,100.0,,,,78.0,,,125.0,7.34
3,0,4,39.0,,,,,,,,...,,100.0,,,,80.0,,,136.0,
4,0,5,39.0,,,,,,,,...,,100.0,,,,83.0,,,135.0,
5,0,6,39.0,,,,,36.0,,,...,,100.0,,,,88.0,,,144.0,
6,0,7,39.0,,38.5,20.0,,,9.1,16.0,...,109.0,100.0,,102.0,25.9,,26.4,,,
7,0,8,39.0,,,,,36.0,,,...,,100.0,,,,90.0,,,129.0,7.4
8,0,9,39.0,,,,,36.0,,,...,,100.0,,,,90.0,,,121.0,
9,0,10,39.0,,,,,36.0,,,...,,100.0,,,,85.0,,,120.0,


In [5]:
def process_data(X):
    X = X.fillna(0)
    X_new = pd.DataFrame(columns=X.columns).drop('Time', axis=1)
    length = X.shape[1] - 3

    for i in range(0, X.shape[0], 12):
        new_f = np.zeros(length + 2)
        for j in range(length):
            temp = 0
            counts = 0
            for k in range(12):
                t = k*X.iloc[i + k ,3 + j]
                temp += t
                if t != 0:
                    counts += k
        
            if counts == 0:
                new_f[j + 2] = -1 #No valid measurement
            else:
                new_f[j + 2] = temp/counts
    
        new_f[0] = X.iloc[i, 0]
        new_f[1] = X.iloc[i, 2]
    
        X_new.loc[i] = new_f
    
    return X_new

In [6]:
X_train = process_data(X_train)

In [7]:
X_test = process_data(X_test)

In [8]:
X_train

Unnamed: 0,pid,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,BaseExcess,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1.0,34.0,-1.000000,-1.000000,12.000000,-1.00,37.181818,8.500000,26.000000,-2.000000,...,-1.000000,100.000000,-1.0,111.000000,22.983784,69.757576,-1.000000,-1.000000,112.151515,7.387241
12,10.0,71.0,-1.000000,27.800000,12.000000,-1.00,36.000000,14.600000,-1.000000,-1.000000,...,68.000000,98.075758,-1.0,-1.000000,42.100000,77.090909,1.300000,0.010000,131.272727,-1.000000
24,100.0,68.0,-1.000000,20.900000,21.000000,-1.00,37.142857,12.500000,27.000000,-1.000000,...,-1.000000,95.803030,-1.0,101.000000,36.800000,114.060606,-1.000000,-1.000000,112.387097,-1.000000
36,1000.0,79.0,32.143939,-1.000000,22.000000,3.79,37.227273,9.200000,-1.000000,-1.000000,...,-1.000000,98.803030,-1.0,-1.000000,27.300000,81.121212,-1.000000,-1.000000,145.303030,7.300000
48,10000.0,76.0,-1.000000,30.881818,22.000000,-1.00,36.923077,10.454545,25.909091,1.916667,...,-1.000000,98.212121,-1.0,103.090909,29.481818,80.924242,-1.000000,-1.000000,127.560606,7.397692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227880,9993.0,80.0,-1.000000,-1.000000,13.857143,-1.00,35.625000,9.028571,-1.000000,-1.000000,...,64.285714,99.833333,-1.0,-1.000000,27.685714,108.287879,0.514286,11.537143,97.212121,-1.000000
227892,9995.0,73.0,-1.000000,55.500000,50.000000,-1.00,36.000000,11.200000,29.000000,-1.000000,...,-1.000000,94.060606,-1.0,89.000000,35.600000,62.000000,-1.000000,-1.000000,182.636364,-1.000000
227904,9996.0,53.0,-1.000000,-1.000000,-1.000000,-1.00,37.833333,10.700000,-1.000000,0.428571,...,-1.000000,99.484848,-1.0,-1.000000,26.480000,100.075758,-1.000000,-1.000000,96.803030,7.414000
227916,9998.0,89.0,-1.000000,34.700000,13.000000,-1.00,36.848485,8.400000,-1.000000,-1.000000,...,-1.000000,100.000000,-1.0,-1.000000,25.600000,96.742424,-1.000000,1.224286,136.409091,-1.000000


In [9]:
#count = X_train[X_train < 0].count()
#count

In [10]:
#h = X_train.columns

In [11]:
#drop = []
#for i in range(len(count)):
#    if count[i] > 0.6*X_train.shape[0]:
#        drop.append(h[i])

In [12]:
#X_train = X_train.drop(drop, axis=1)
#X_test = X_test.drop(drop, axis=1)

In [13]:
X_train

Unnamed: 0,pid,Age,EtCO2,PTT,BUN,Lactate,Temp,Hgb,HCO3,BaseExcess,...,Alkalinephos,SpO2,Bilirubin_direct,Chloride,Hct,Heartrate,Bilirubin_total,TroponinI,ABPs,pH
0,1.0,34.0,-1.000000,-1.000000,12.000000,-1.00,37.181818,8.500000,26.000000,-2.000000,...,-1.000000,100.000000,-1.0,111.000000,22.983784,69.757576,-1.000000,-1.000000,112.151515,7.387241
12,10.0,71.0,-1.000000,27.800000,12.000000,-1.00,36.000000,14.600000,-1.000000,-1.000000,...,68.000000,98.075758,-1.0,-1.000000,42.100000,77.090909,1.300000,0.010000,131.272727,-1.000000
24,100.0,68.0,-1.000000,20.900000,21.000000,-1.00,37.142857,12.500000,27.000000,-1.000000,...,-1.000000,95.803030,-1.0,101.000000,36.800000,114.060606,-1.000000,-1.000000,112.387097,-1.000000
36,1000.0,79.0,32.143939,-1.000000,22.000000,3.79,37.227273,9.200000,-1.000000,-1.000000,...,-1.000000,98.803030,-1.0,-1.000000,27.300000,81.121212,-1.000000,-1.000000,145.303030,7.300000
48,10000.0,76.0,-1.000000,30.881818,22.000000,-1.00,36.923077,10.454545,25.909091,1.916667,...,-1.000000,98.212121,-1.0,103.090909,29.481818,80.924242,-1.000000,-1.000000,127.560606,7.397692
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227880,9993.0,80.0,-1.000000,-1.000000,13.857143,-1.00,35.625000,9.028571,-1.000000,-1.000000,...,64.285714,99.833333,-1.0,-1.000000,27.685714,108.287879,0.514286,11.537143,97.212121,-1.000000
227892,9995.0,73.0,-1.000000,55.500000,50.000000,-1.00,36.000000,11.200000,29.000000,-1.000000,...,-1.000000,94.060606,-1.0,89.000000,35.600000,62.000000,-1.000000,-1.000000,182.636364,-1.000000
227904,9996.0,53.0,-1.000000,-1.000000,-1.000000,-1.00,37.833333,10.700000,-1.000000,0.428571,...,-1.000000,99.484848,-1.0,-1.000000,26.480000,100.075758,-1.000000,-1.000000,96.803030,7.414000
227916,9998.0,89.0,-1.000000,34.700000,13.000000,-1.00,36.848485,8.400000,-1.000000,-1.000000,...,-1.000000,100.000000,-1.0,-1.000000,25.600000,96.742424,-1.000000,1.224286,136.409091,-1.000000


In [14]:
header = ['pid', 'LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2','LABEL_Sepsis','LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']

In [15]:
#Task1

t1_labels = ['LABEL_BaseExcess', 'LABEL_Fibrinogen', 'LABEL_AST', 'LABEL_Alkalinephos', 'LABEL_Bilirubin_total', 'LABEL_Lactate', 'LABEL_TroponinI', 'LABEL_SaO2', 'LABEL_Bilirubin_direct', 'LABEL_EtCO2']
t1_train = np.array(y_train[t1_labels])



In [16]:
final = np.zeros((X_test.shape[0], len(header)))
final[:, 0] = X_test.values[:, 0]
for i in range(1, t1_train.shape[1] + 1):
    print(t1_labels[i - 1])
    model = LogisticRegression()
    model.fit(X_train.values, t1_train[:,i - 1])
    prediction = model.predict_proba(X_test)
    final[:, i] = prediction[:,1]

LABEL_BaseExcess


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_Fibrinogen


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_AST


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_Alkalinephos


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_Bilirubin_total


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_Lactate


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_TroponinI


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_SaO2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_Bilirubin_direct


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LABEL_EtCO2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [17]:
#Task2
t2_labels = ['LABEL_Sepsis']
t2_train = np.array(y_train[t2_labels])

In [18]:
model = LogisticRegression()
model.fit(X_train.values, t2_train)
prediction = model.predict_proba(X_test)

  return f(*args, **kwargs)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
final[:, len(t1_labels) + 1] = prediction[:,1]

In [20]:
df = pd.DataFrame(final, columns=header)
df.head()

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0.0,0.478073,0.213332,0.429573,0.424615,0.4289,0.336401,0.029687,0.191726,0.056952,0.056261,0.087789,0.0,0.0,0.0,0.0
1,10001.0,0.135209,0.04237,0.288334,0.301258,0.300343,0.099263,0.101697,0.117142,0.02679,0.014268,0.027601,0.0,0.0,0.0,0.0
2,10003.0,0.224026,0.041447,0.209204,0.203758,0.209389,0.180291,0.075414,0.46331,0.028344,0.019295,0.042164,0.0,0.0,0.0,0.0
3,10004.0,0.089568,0.063311,0.459593,0.433254,0.438759,0.132743,0.093367,0.108805,0.0548,0.033907,0.053827,0.0,0.0,0.0,0.0
4,10005.0,0.192072,0.042131,0.108861,0.089913,0.107082,0.107839,0.025628,0.120542,0.009234,0.005271,0.058323,0.0,0.0,0.0,0.0


In [21]:
#Task3
t3_labels = ['LABEL_RRate', 'LABEL_ABPm', 'LABEL_SpO2', 'LABEL_Heartrate']
t3_train = np.array(y_train[t3_labels])

In [22]:
offset = len(t1_labels) + len(t2_labels) + 1
for i in range(t3_train.shape[1]):
    print(t3_labels[i])
    model = Ridge(alpha=10)
    model.fit(X_train.values, t3_train[:,i])
    prediction = model.predict(X_test)
    final[:, offset + i] = prediction

LABEL_RRate
LABEL_ABPm
LABEL_SpO2
LABEL_Heartrate


In [23]:
df = pd.DataFrame(final, columns=header)
df.head()

Unnamed: 0,pid,LABEL_BaseExcess,LABEL_Fibrinogen,LABEL_AST,LABEL_Alkalinephos,LABEL_Bilirubin_total,LABEL_Lactate,LABEL_TroponinI,LABEL_SaO2,LABEL_Bilirubin_direct,LABEL_EtCO2,LABEL_Sepsis,LABEL_RRate,LABEL_ABPm,LABEL_SpO2,LABEL_Heartrate
0,0.0,0.478073,0.213332,0.429573,0.424615,0.4289,0.336401,0.029687,0.191726,0.056952,0.056261,0.087789,15.20192,82.992139,98.42731,87.05158
1,10001.0,0.135209,0.04237,0.288334,0.301258,0.300343,0.099263,0.101697,0.117142,0.02679,0.014268,0.027601,18.185678,89.718451,96.269716,100.571859
2,10003.0,0.224026,0.041447,0.209204,0.203758,0.209389,0.180291,0.075414,0.46331,0.028344,0.019295,0.042164,18.137523,80.379343,97.527356,88.885062
3,10004.0,0.089568,0.063311,0.459593,0.433254,0.438759,0.132743,0.093367,0.108805,0.0548,0.033907,0.053827,16.846605,78.775024,96.787306,88.224513
4,10005.0,0.192072,0.042131,0.108861,0.089913,0.107082,0.107839,0.025628,0.120542,0.009234,0.005271,0.058323,19.182738,74.531563,96.625458,61.659645


In [24]:
df.to_csv('submission.zip', index=False, float_format='%.3f', compression = 'zip')