# KISS Model

The simplest possible model that still (roughly) accomplishes our goal: predicting ICD-9 diagnosis codes from the text of clinical notes. It is intended mostly for rapid prototyping.

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load only the columns the model needs from each table.
note_events_df = pd.read_csv(
    "../data/mimiciii-14/NOTEEVENTS.csv.gz",
    usecols=["HADM_ID", "TEXT"],
)
diagnosis_df = pd.read_csv(
    "../data/mimiciii-14/DIAGNOSES_ICD.csv.gz",
    usecols=["HADM_ID", "ICD9_CODE"],
)

# With no explicit key, merge() joins on the one column both frames share
# (HADM_ID), producing one row per matching (note, diagnosis code) pair.
# NOTE(review): if an admission carries several ICD-9 codes, each of its notes
# is repeated once per code -- confirm that duplication is intended.
df = pd.merge(note_events_df, diagnosis_df)
# Index by HADM_ID, then discard rows with a missing TEXT or ICD9_CODE.
df = df.set_index("HADM_ID").dropna()
df.head()

In [None]:
# Number of rows per ICD-9 code, ordered from the rarest code to the most common.
codes_freq = df.groupby("ICD9_CODE").size().sort_values()
# sort_values() sorts ascending, so the 100 most frequent codes are the LAST
# 100 index entries. (The earlier slice `[:-100]` kept every code *except*
# those, i.e. the rare tail.)
most_frequent = set(codes_freq.index[-100:])
# Keep only the rows whose code is one of those 100 most frequent codes.
df_subset = df[df.ICD9_CODE.isin(most_frequent)]

In [None]:
# Inputs are the raw note texts; targets are the ICD-9 codes on the same rows.
corpus, y = df_subset["TEXT"], df_subset["ICD9_CODE"]

# Hold out 33% of the rows for evaluation; the fixed seed keeps the split
# identical from run to run.
(corpus_train, corpus_test, y_train, y_test) = train_test_split(
    corpus,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Turn the raw note text into TF-IDF feature matrices, using the vectorizer's
# default settings.
vectorizer = TfidfVectorizer()
# Fit the vectorizer on the training notes only and transform them in one pass.
corpus_train_embedded = vectorizer.fit_transform(corpus_train)
# Apply the already-fitted vectorizer to the test notes (transform only, no
# refitting), so the test set does not influence the learned features.
corpus_test_embedded = vectorizer.transform(corpus_test)

In [None]:
# Baseline classifier: a random forest with scikit-learn's default
# hyperparameters, trained on the TF-IDF features of the training notes.
# The forest is seeded with the same value as the train/test split so that
# re-running the notebook rebuilds the same model instead of a different one
# on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(corpus_train_embedded, y_train)