# Cross Training with MIMIC and MLH

## Imports & Inits

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pdb
import pandas as pd
import pickle
import numpy as np
np.set_printoptions(precision=4)

from tqdm import tqdm_notebook as tqdm
from ast import literal_eval
from pathlib import Path
from scipy import stats

from sklearn.feature_extraction.text import TfidfVectorizer

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style("darkgrid")
%matplotlib inline

In [3]:
# Directory layout (all relative to the notebook's working directory):
#   mimic_path / mlh_path -- where each dataset's notes_all_proc.csv is read from
#   path/workdir/vectordir -- where the pickled vectorizers and matrices are written
mimic_path, mlh_path, path = map(Path, ('mimic_data', 'mlh_data', 'data'))
workdir = path.joinpath('workdir')
vectordir = workdir.joinpath('vectordir')

In [4]:
# Load only the columns used below from each dataset's processed notes file,
# drop every row whose label is -1, and renumber the index from 0.
notes_cols = ['hadm_id', 'note', 'imi_adm_label']

mimic_notes_df = (
    pd.read_csv(mimic_path/'notes_all_proc.csv', usecols=notes_cols)
      .loc[lambda frame: frame['imi_adm_label'] != -1]
      .reset_index(drop=True)
)

mlh_notes_df = (
    pd.read_csv(mlh_path/'notes_all_proc.csv', usecols=notes_cols)
      .loc[lambda frame: frame['imi_adm_label'] != -1]
      .reset_index(drop=True)
)

# Last expression: display both (rows, columns) shapes as the cell output.
mimic_notes_df.shape, mlh_notes_df.shape

((38112, 3), (116400, 3))

In [5]:
# MIMIC -> MLH: the TF-IDF vocabulary and weights are fitted on the MIMIC notes
# only; the MLH notes are then transformed with that already-fitted vectorizer.
# Features are unigrams and bigrams, capped at 60,000 terms.
mimic2mlh_vec = TfidfVectorizer(ngram_range=(1,2), max_features=60_000)

x_train_mimic = mimic2mlh_vec.fit_transform(mimic_notes_df['note'])
x_test_mlh = mimic2mlh_vec.transform(mlh_notes_df['note'])

y_train_mimic = mimic_notes_df['imi_adm_label']
y_test_mlh = mlh_notes_df['imi_adm_label']

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists; otherwise a fresh checkout fails with FileNotFoundError.
vectordir.mkdir(parents=True, exist_ok=True)

# The five objects are written as consecutive pickles in a single file. A reader
# must call pickle.load on the same handle five times, in this exact order:
# vectorizer, x_train, x_test, y_train, y_test.
with open(vectordir/'mimic2mlh.pkl', 'wb') as f:
    pickle.dump(mimic2mlh_vec, f)
    pickle.dump(x_train_mimic, f)
    pickle.dump(x_test_mlh, f)
    pickle.dump(y_train_mimic, f)
    pickle.dump(y_test_mlh, f)

In [6]:
# MLH -> MIMIC: the TF-IDF vocabulary and weights are fitted on the MLH notes
# only; the MIMIC notes are then transformed with that already-fitted vectorizer.
# Features are unigrams and bigrams, capped at 60,000 terms.
mlh2mimic_vec = TfidfVectorizer(ngram_range=(1,2), max_features=60_000)

x_train_mlh = mlh2mimic_vec.fit_transform(mlh_notes_df['note'])
x_test_mimic = mlh2mimic_vec.transform(mimic_notes_df['note'])

y_train_mlh = mlh_notes_df['imi_adm_label']
y_test_mimic = mimic_notes_df['imi_adm_label']

# open(..., 'wb') does not create missing parent directories, so make sure the
# output directory exists; otherwise a fresh checkout fails with FileNotFoundError.
vectordir.mkdir(parents=True, exist_ok=True)

# The five objects are written as consecutive pickles in a single file. A reader
# must call pickle.load on the same handle five times, in this exact order:
# vectorizer, x_train, x_test, y_train, y_test.
with open(vectordir/'mlh2mimic.pkl', 'wb') as f:
    pickle.dump(mlh2mimic_vec, f)
    pickle.dump(x_train_mlh, f)
    pickle.dump(x_test_mimic, f)
    pickle.dump(y_train_mlh, f)
    pickle.dump(y_test_mimic, f)