In [125]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import pandas as pd
import torch.nn as nn
import torch
import random as rd
import matplotlib.pyplot as plt
import numpy as np

In [126]:
total_feats = ['PTT',
 'Bicarbonate',
 'Calcium, Total',
 'Glucose',
 'pCO2',
 'Sodium',
 'Neutrophils',
 'pH',
 'Chloride',
 'Hemoglobin',
 'Phosphate',
 'Alanine Aminotransferase (ALT)',
 'Urea Nitrogen']
labs = {
    "51221": "Hematocrit",
    "51265": "Platelet Count",
    "50912": "Creatinine",
    "50971": "Potassium",
    "51222": "Hemoglobin",
    "51301": "White Blood Cells",
    "51249": "MCHC",
    "51279": "Red Blood Cells",
    "51250": "MCV",
    "51248": "MCH",
    "51277": "RDW",
    "51006": "Urea Nitrogen",
    "50983": "Sodium",
    "50902": "Chloride",
    "50882": "Bicarbonate",
    "50868": "Anion Gap",
    "50931": "Glucose",
    "50960": "Magnesium",
    "50893": "Calcium, Total",
    "50970": "Phosphate",
    "51237": "INR(PT)",
    "51274": "PT",
    "51275": "PTT",
    "51146": "Basophils",
    "51256": "Neutrophils",
    "51254": "Monocytes",
    "51200": "Eosinophils",
    "51244": "Lymphocytes",
    "52172": "RDW-SD",
    "50934": "H",
    "51678": "L",
    "50947": "I",
    "50861": "Alanine Aminotransferase (ALT)",
    "50878": "Asparate Aminotransferase (AST)",
    "50813": "Lactate",
    "50863": "Alkaline Phosphatase",
    "50885": "Bilirubin, Total",
    "50820": "pH",
    "50862": "Albumin",
    "50802": "Base Excess",
    "50821": "pO2",
    "50804": "Calculated Total CO2",
    "50818": "pCO2",
    "52075": "Absolute Neutrophil Count",
    "52073": "Absolute Eosinophil Count",
    "52074": "Absolute Monocyte Count",
    "52069": "Absolute Basophil Count",
    "51133": "Absolute Lymphocyte Count",
    "50910": "Creatine Kinase (CK)",
    "52135": "Immature Granulocytes"
}
labs_reversed = {value: key for key, value in labs.items()}
targets = [
    "RDW", "INR(PT)", "pO2", "Hematocrit", "Alkaline Phosphatase", "Lactate", 
    "Eosinophils", "Potassium", "Anion Gap", "Magnesium", "Base Excess", 
    "Lymphocytes", "Creatinine", "Basophils", "Bilirubin, Total", "Monocytes", 
    "MCV", "Albumin", "PT", "MCH", "Red Blood Cells", "Platelet Count", 
    "RDW-SD", "MCHC", "White Blood Cells", "Asparate Aminotransferase (AST)", 
    "Calculated Total CO2", "Creatine Kinase (CK)"
]
encode = lambda x: [labs_reversed[i] for i in x]
decode = lambda x: [labs[i] for i in x]
total_feats = encode(total_feats)
targets = encode(targets)

for i in range (len(targets)):
    targets[i] += "_percentile"

for i in range (len(total_feats)):
    total_feats[i] += "_percentile"

relevant_cols = total_feats + targets
print(relevant_cols)

['51275_percentile', '50882_percentile', '50893_percentile', '50931_percentile', '50818_percentile', '50983_percentile', '51256_percentile', '50820_percentile', '50902_percentile', '51222_percentile', '50970_percentile', '50861_percentile', '51006_percentile', '51277_percentile', '51237_percentile', '50821_percentile', '51221_percentile', '50863_percentile', '50813_percentile', '51200_percentile', '50971_percentile', '50868_percentile', '50960_percentile', '50802_percentile', '51244_percentile', '50912_percentile', '51146_percentile', '50885_percentile', '51254_percentile', '51250_percentile', '50862_percentile', '51274_percentile', '51248_percentile', '51279_percentile', '51265_percentile', '52172_percentile', '51249_percentile', '51301_percentile', '50878_percentile', '50804_percentile', '50910_percentile']


In [127]:
big_data = pd.read_csv('train.csv', index_col = "subject_id")
big_data = big_data[relevant_cols]
def create_dataset(original_data, input_list, output_list):
    selected_columns = input_list + output_list
    df = original_data[selected_columns]
    df = df.dropna()
    return df

In [128]:
def rf2(xs, y, n_estimators=44, max_samples=329,
       max_features=0.9128376331116463, min_samples_leaf=1, max_depth= 23, **kwargs):
    return RandomForestRegressor(n_jobs=-1, max_depth=max_depth, n_estimators=n_estimators,
        max_samples=max_samples, max_features=max_features,
        min_samples_leaf=min_samples_leaf, oob_score=True).fit(xs, y)
MSE = nn.MSELoss()

In [129]:
loss_df= pd.DataFrame(columns=['output', 'loss'])

for target in targets:
    target = [target]
    df = create_dataset(big_data, total_feats, target)
    print(f"dataset size: {df.size}")
    y_df = df[target]
    x_df = df[total_feats]

    y = y_df.to_numpy()
    y = y.ravel()
    x = x_df.to_numpy()

    
    m = rf2(x,y)

    y_preds = m.predict(x)

    loss = float(MSE(torch.tensor(y), torch.tensor(y_preds)))

    loss_df.loc[len(loss_df)] = [target, loss]
    
    print(loss)

dataset size: 35728
0.04648103594589998
dataset size: 35714
0.03405000348391568
dataset size: 35728
0.05292338348205663
dataset size: 35728
0.004385431350481164
dataset size: 35392
0.056475297048003935
dataset size: 34608
0.04995920710959325
dataset size: 35728
0.03860824376647158
dataset size: 35728
0.05742760606622611
dataset size: 35728
0.024325064230996802
dataset size: 35728
0.057273666289082203
dataset size: 35728
0.009217731741778796
dataset size: 35728
0.01934703020696012
dataset size: 35728
0.026685472183739428
dataset size: 35728
0.046360135106084645
dataset size: 35252
0.04760887735059669
dataset size: 35728
0.04946482387868697
dataset size: 35728
0.06680067194495053
dataset size: 30156
0.042019054379264086
dataset size: 35714
0.04264253250898958
dataset size: 35728
0.06478607772691568
dataset size: 35728
0.014946313667125122
dataset size: 35728
0.06739638105323195
dataset size: 13496
0.03255835132489193
dataset size: 35728
0.0529651149861786
dataset size: 35728
0.0535755960

Unnamed: 0,output,loss
0,[51277_percentile],0.046481
1,[51237_percentile],0.03405
2,[50821_percentile],0.052923
3,[51221_percentile],0.004385
4,[50863_percentile],0.056475
5,[50813_percentile],0.049959
6,[51200_percentile],0.038608
7,[50971_percentile],0.057428
8,[50868_percentile],0.024325
9,[50960_percentile],0.057274


In [131]:
loss_df

Unnamed: 0,output,loss
0,[51277_percentile],0.046481
1,[51237_percentile],0.03405
2,[50821_percentile],0.052923
3,[51221_percentile],0.004385
4,[50863_percentile],0.056475
5,[50813_percentile],0.049959
6,[51200_percentile],0.038608
7,[50971_percentile],0.057428
8,[50868_percentile],0.024325
9,[50960_percentile],0.057274
