In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the notes table from the msgpack file under preprocess/Readmission.
# NOTE(review): `pd.read_msgpack` was deprecated in pandas 0.25 and removed in 1.0,
# so this cell presumably requires an older pandas — confirm the pinned version.
notes_path = 'preprocess/Readmission/Split_Notes_Texts_combined.msg'
df = pd.read_msgpack(notes_path)

In [None]:
# Show the first five rows as a quick sanity check of the loaded frame
# (later cells read the 'cseq', 'exp_split' and 'y' columns).
df.head(n=5)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
import re
from tqdm import tqdm_notebook

# Clean every note in two passes:
#   1. replace each whitespace-delimited run of non-word characters with one space;
#   2. collapse any remaining run of whitespace into a single space.
# NOTE(review): recent tqdm releases deprecate `tqdm_notebook` in favour of
# `tqdm.notebook.tqdm` / `tqdm.auto.tqdm` — confirm against the installed version.
pattern = re.compile(r'\s\W+\s')
texts = [
    re.sub(r'\s+', ' ', pattern.sub(' ', note))
    for note in tqdm_notebook(list(df['cseq']))
]

In [None]:
# Replace the 'cseq' column with the cleaned strings built in `texts`; the
# vectoriser cells below read `df['cseq']`, so they operate on the cleaned text.
# This mutates `df` in place: the uncleaned text is only recoverable by reloading.
df['cseq'] = texts

In [None]:
def tokenizer(s):
    """Split a cleaned note into tokens on whitespace.

    Uses ``str.split()`` with no separator rather than ``split(' ')``: the
    latter returns empty-string tokens whenever the text starts or ends with a
    space (or contains consecutive spaces), and those empty strings would be
    counted as a vocabulary term. With no separator, whitespace runs are
    treated as one delimiter and no empty tokens are produced.

    Parameters
    ----------
    s : str
        The note text handed over by ``CountVectorizer`` after its own
        preprocessing.

    Returns
    -------
    list of str
        The whitespace-separated tokens, never containing ``''``.
    """
    return s.split()


# Fit the vocabulary on the training split only, so no dev-set terms leak into
# the feature space. The vocabulary is capped at 25,000 terms and uses
# scikit-learn's built-in English stop-word list.
train_notes = df.loc[df['exp_split'] == 'train', 'cseq']
vocab = CountVectorizer(max_features=25000, tokenizer=tokenizer, stop_words='english')
vocab.fit(list(train_notes))

In [None]:
# Encode the notes of each split as a term-count matrix using the fitted
# vocabulary; the result is keyed by split name ('train' / 'dev').
bow = {
    split: vocab.transform(list(df.loc[df['exp_split'] == split, 'cseq']))
    for split in ('train', 'dev')
}

In [None]:
# Collect the target column 'y' for each split, in the same row order as the
# corresponding `bow` matrix (both select rows with the same 'exp_split' mask).
label = {
    split: list(df.loc[df['exp_split'] == split, 'y'])
    for split in ('train', 'dev')
}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_auc_score

In [None]:
# Class-balanced linear SVM on the raw term counts, scored by ROC AUC on dev.
# (The variable keeps the name `lr` because the next cell refers to it, even
# though the estimator is a LinearSVC.)
#
# LinearSVC does not implement `predict_proba`, so calling it raises
# AttributeError. ROC AUC only needs a ranking score, and `roc_auc_score`
# accepts the signed margins returned by `decision_function`, so `predict`
# now holds a 1-D array of margins instead of a two-column probability matrix.
#
# The sparse count matrices are passed straight through: LinearSVC accepts
# sparse input, which avoids materialising a dense n_docs x vocab array.
lr = LinearSVC(class_weight='balanced')
lr.fit(bow['train'], label['train'])
predict = lr.decision_function(bow['dev'])
print(roc_auc_score(label['dev'], predict))

In [None]:
# Display the hard class predictions of the model bound to `lr` for the dev split.
lr.predict(X=bow['dev'].toarray())

In [None]:
from sklearn.preprocessing import normalize
# L2-normalise the count matrix of each split. `normalize` works row-wise by
# default, and copy=True leaves the matrices in `bow` untouched.
normalised_bow = {
    split: normalize(bow[split], norm='l2', copy=True)
    for split in ('train', 'dev')
}

In [None]:
# Class-balanced logistic regression on the L2-normalised counts, scored by
# ROC AUC on the dev split.
#
# The sparse matrices are passed to the estimator as they are: LogisticRegression
# accepts sparse input, so converting to dense with `.toarray()` only costs
# memory (n_docs x vocab floats) without changing the model being fitted.
lr_norm = LogisticRegression(class_weight='balanced')
lr_norm.fit(normalised_bow['train'], label['train'])
# Column 1 of predict_proba is the probability of the second class in
# `lr_norm.classes_` — presumably the positive label; confirm the encoding of 'y'.
predict = lr_norm.predict_proba(normalised_bow['dev'])
print(roc_auc_score(label['dev'], predict[:, 1]))

In [None]:
# Turn the counts into presence/absence indicators: densify each split's
# matrix and clip every entry into [0, 1], so any count >= 1 becomes 1.
# NOTE(review): this materialises dense arrays for both splits, which may be
# large with a 25,000-term vocabulary.
binary_bow = {
    split: np.clip(bow[split].toarray(), 0, 1)
    for split in ('train', 'dev')
}

In [None]:
# Class-balanced logistic regression on the binary presence features, scored
# by ROC AUC on the dev split.
lr_bin = LogisticRegression(class_weight='balanced').fit(
    binary_bow['train'], label['train']
)
# Column 1 of predict_proba is the probability of the second class in
# `lr_bin.classes_` — presumably the positive label; confirm the encoding of 'y'.
predict = lr_bin.predict_proba(binary_bow['dev'])
print(roc_auc_score(y_true=label['dev'], y_score=predict[:, 1]))

In [None]:
# L2-normalise the binary presence matrix of each split. `normalize` works
# row-wise by default, and copy=True leaves the arrays in `binary_bow` untouched.
bin_normalised_bow = {
    split: normalize(binary_bow[split], norm='l2', copy=True)
    for split in ('train', 'dev')
}

In [None]:
# Class-balanced logistic regression on the L2-normalised binary features,
# scored by ROC AUC on the dev split.
# NOTE(review): this rebinds `lr_bin`, replacing the model fitted on the
# un-normalised binary features; a distinct name would keep both available.
lr_bin = LogisticRegression(class_weight='balanced').fit(
    bin_normalised_bow['train'], label['train']
)
# Column 1 of predict_proba is the probability of the second class in
# `lr_bin.classes_` — presumably the positive label; confirm the encoding of 'y'.
predict = lr_bin.predict_proba(bin_normalised_bow['dev'])
print(roc_auc_score(y_true=label['dev'], y_score=predict[:, 1]))