In [10]:
import numpy as np
import os
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt

In [3]:
import random  
seed_num = 42
random.seed(seed_num)

x = np.load('/project/LSH/x_(7727,10,4068).npy').reshape(7727,-1)
y = np.load('/project/LSH/y_(7727,1).npy')

idx = list(range(len(x)))
random.shuffle(idx)

i = round(x.shape[0]*0.8)
X_train, y_train = x[idx[:i],:], y[idx[:i]]
X_test, y_test = x[idx[i:],:], y[idx[i:]]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6182, 40680), (6182,), (1545, 40680), (1545,))

In [6]:
# Random Forest
model = RandomForestClassifier()

model.fit(X_train, y_train)
pred = model.predict(X_test)

precision = precision_score(y_test, pred)
recall = recall_score(y_test, pred)
f1 = f1_score(y_test, pred)
roc_auc = roc_auc_score(y_test, pred)
acc = accuracy_score(y_test, pred)

print(f'정확도 : {acc}, Precision : {precision}, Recall : {recall}, F1 : {f1}, roc_auc : {roc_auc}')


정확도 : 0.7508090614886731, Precision : 0.7516218721037998, Recall : 0.8739224137931034, F1 : 0.8081714000996513, roc_auc : 0.7197813041412843


In [7]:
model.feature_importances_

array([4.87272883e-04, 2.18282277e-05, 2.23960745e-04, ...,
       1.08758380e-04, 2.37369291e-05, 0.00000000e+00])

In [14]:
len(model.feature_importances_)

40680

In [42]:
COLS = list(pd.read_csv('/project/LSH/total_data_7727.csv')['ITEMID'].sort_values().unique())*10
df = pd.DataFrame(COLS, columns=['cols'])
df['fi'] = model.feature_importances_

In [45]:
df = df.sort_values('fi', ascending=False)
df

Unnamed: 0,cols,fi
36733,51006,0.004053
36622,50818,0.003758
36818,51277,0.003679
28597,51006,0.003593
32665,51006,0.003563
...,...,...
16356,50957,0.000000
16351,50951,0.000000
16349,50949,0.000000
16348,50948,0.000000


In [47]:
df[:12]

Unnamed: 0,cols,fi
36733,51006,0.004053
36622,50818,0.003758
36818,51277,0.003679
28597,51006,0.003593
32665,51006,0.003563
40444,63323026201,0.003519
36376,63323026201,0.003518
36623,50820,0.003475
36620,50813,0.003241
24614,51277,0.00288


In [63]:
df1 = df.groupby(by='cols').sum().sort_values('fi',ascending=False)
df2 = df1[:10].reset_index()

In [64]:
df2

Unnamed: 0,cols,fi
0,51277,0.023777
1,51006,0.021961
2,63323026201,0.014319
3,50912,0.01037
4,50818,0.008391
5,50882,0.008053
6,50820,0.007674
7,50931,0.007436
8,50821,0.006995
9,55390000401,0.006973


In [65]:
name = {51277: 'RDW', 51006: 'Urea Nitrogen', 50912: 'Creatinine', 50882: 'Bicarbonate',
         63323026201 : 'Heparin', 55390000401 : 'Glucagon', 50818:'pCO2', 50820:'pH', 50931:'Glucose', 50821:'pO2'}

In [66]:
df2['name'] = df2['cols'].apply(lambda x:name[x])

In [69]:
df2.cols[:8].tolist()

[51277, 51006, 63323026201, 50912, 50818, 50882, 50820, 50931]