# KISS Model

Simplest possible model that still (sort of) accomplishes our goal, mostly for rapid prototyping

In [182]:
import onnxruntime as rt
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import StringTensorType
from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer

from nltk.corpus import stopwords

from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score, fbeta_score, jaccard_score

In [4]:
diagnosis_df = pd.read_csv(
    "../data/mimiciii-14/DIAGNOSES_ICD.csv.gz", usecols=["HADM_ID", "ICD9_CODE"]
)
note_events_df = pd.read_csv(
    "../data/mimiciii-14/NOTEEVENTS.csv.gz", usecols=["HADM_ID", "TEXT"]
)
df = (note_events_df.merge(diagnosis_df)
      .set_index("HADM_ID")
      .dropna()
      .reset_index())
df.head()

Unnamed: 0,HADM_ID,TEXT,ICD9_CODE
0,167853.0,Admission Date: [**2151-7-16**] Dischar...,1193
1,167853.0,Admission Date: [**2151-7-16**] Dischar...,4254
2,167853.0,Admission Date: [**2151-7-16**] Dischar...,42731
3,167853.0,Admission Date: [**2151-7-16**] Dischar...,2639
4,167853.0,Admission Date: [**2151-7-16**] Dischar...,2762


In [5]:
df = df.reset_index()

In [6]:
codes_freq = df["ICD9_CODE"].value_counts()
most_frequent = set(codes_freq.index[:10])
df_subset = df[df.ICD9_CODE.isin(most_frequent)]
df_subset.shape

(3616703, 4)

In [7]:
df_subset.ICD9_CODE.unique()

array(['42731', '51881', '5849', '41401', '4019', '4280', '5990', '25000',
       '769', '7742'], dtype=object)

In [8]:
df_subset2 = df_subset.sample(n=50000)

In [10]:
df_subset2.shape

(50000, 4)

In [11]:
icd9_codes = list(df_subset2.ICD9_CODE.unique())
ddf = pd.DataFrame(index=df_subset2.HADM_ID.unique(), columns=["TEXT"] + icd9_codes).fillna(False)
for idx, row in df_subset2.iterrows():
    ddf.loc[row.HADM_ID, "TEXT"] = row.TEXT
    ddf.loc[row.HADM_ID, row.ICD9_CODE] = True
ddf.head()

Unnamed: 0,TEXT,7742,5849,5990,4280,769,42731,51881,41401,4019,25000
156143.0,Neonatology NP Note\nPE\nswaddled in open crib...,True,False,False,False,True,False,False,False,False,False
150509.0,"34F with asthma, DM, HTN, G3P1 POD#9 from Csec...",False,True,False,False,False,False,True,False,False,True
154108.0,Neonatology Attending\nDOL 7\n\nIn room air wi...,True,False,False,False,True,False,False,False,False,False
131259.0,[**2138-3-24**] 12:30 PM\n VIDEO OROPHARYNGEAL...,False,False,True,False,False,True,False,True,True,False
100694.0,Normal sinus rhythm. Borderline A-V conductio...,False,False,False,True,False,False,False,False,False,False


In [22]:
ddf[icd9_codes].sum(axis=1).max()

7

In [191]:
X = ddf[["TEXT"]].values
y = ddf[icd9_codes[0]].values.astype("int64")

In [192]:
X.shape, y.shape

((18984, 1), (18984,))

In [193]:
X_train, X_test, y_train, y_test = \
    train_test_split(X, y, test_size=0.33, random_state=42)

In [194]:
pipeline = Pipeline(steps=[
    ("reshape", ColumnTransformer([
        ("vectorize", TfidfVectorizer(), 0),
    ])),
    ("dimensionality_reduction", TruncatedSVD(n_components=50)),
    ("classifier", RandomForestClassifier()),
])

In [195]:
X_train.shape, y_train.shape

((12719, 1), (12719,))

In [196]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('reshape',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('vectorize',
                                                  TfidfVectorizer(analyzer='word',
                                                                  binary=False,
                                                                  decode_error='strict',
                                                                  dtype=<class 'numpy.float64'>,
                                                                  encoding='utf-8',
                                                                  input='content',
                                                                  lowercase=True,
                                                                  max_df=1.0,
                           

In [177]:
# dummy = Pipeline(steps=[
#     ("reshape", ColumnTransformer([
#         ("vectorize", TfidfVectorizer(stop_words="english"), 0),
#     ])),
#     ("dimensionality_reduction", TruncatedSVD(n_components=50)),
#     ("classifier", DummyClassifier(strategy="uniform")),
# ])
# dummy.fit(X_train, y_train)

In [153]:
accuracy_score(y_test, dummy.predict(X_test)), accuracy_score(y_test, pipeline.predict(X_test))

(0.0009577015163607343, 0.04325618515562649)

In [154]:
jaccard_score(y_test, dummy.predict(X_test), average="weighted"), jaccard_score(y_test, pipeline.predict(X_test), average="weighted")

(0.14369919653691138, 0.0598540386760255)

In [155]:
fbeta_score(y_test, dummy.predict(X_test), beta=1, average="weighted"), fbeta_score(y_test, pipeline.predict(X_test), beta=1, average="weighted")

(0.25479296832605325, 0.07932491745885362)

In [156]:
pipeline.predict(X_test.iloc[0:2])

array([[False, False, False, False, False, False, False, False, False,
        False],
       [ True, False, False, False,  True, False, False, False, False,
        False]])

In [199]:
onx = convert_sklearn(
    pipeline,
    "autoicd-kiss",
    initial_types=[
        ("input", StringTensorType(shape=[None, 1]))
    ],
    options={
        TfidfVectorizer: {
            "separators": [
                ' ', '.', '\\?', ',', ';', ':', '!',
                '\\(', '\\)', '\n', '"', "'",
                "-", "\\[", "\\]", "@"
            ]
        }
    }
)
with open("rf_iris.onnx", "wb") as f:
    f.write(onx.SerializeToString())

  op_type, domain))


In [200]:
sess = rt.InferenceSession("rf_iris.onnx")
pred_onx = sess.run(None, {"input": X_test})[0]
print("predict", pred_onx[0])
print("predict_proba", pred_onx[1])

predict 1
predict_proba 1
