In [None]:
#!git clone -b master https://github.com/charles9n/bert-sklearn
#!cd bert-sklearn; pip install .

In [29]:
import os
import math
import random
import csv
import sys

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
import statistics as stats

from bert_sklearn import BertClassifier
from bert_sklearn import BertRegressor
from bert_sklearn import BertTokenClassifier
from bert_sklearn import load_model
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
import xgboost

In [2]:
import sys
sys.path.insert(0,"../src")

In [3]:
import vectorize
import helpers
import transformers
from utils import *
from loss.loss import *

from bert_sklearn_transformer import BertTransformer

04/22/2020 12:15:00 - INFO - transformers.file_utils -   PyTorch version 1.4.0 available.
04/22/2020 12:15:01 - INFO - transformers.file_utils -   TensorFlow version 2.0.0 available.
04/22/2020 12:15:02 - INFO - numexpr.utils -   NumExpr defaulting to 8 threads.


In [4]:
full_df = pd.read_pickle("../data/full_df.pkl")

In [5]:
MAX_VOCAB = None
MAX_SEQ_LENGTH = 5000
full_df['TEXT_PROCESSED'] = vectorize.clean_notes(full_df, 'TEXT')

In [6]:
full_df['ICD9_GRP_LIST'] = full_df.ICD9_GRP.apply(lambda x: re.split(" +", x.strip()))

# Split the dataset

In [36]:
# Another way of splitting the data
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

split = np.random.choice(
    ["train", "val", "test"],
    size=full_df.shape[0],
    p=[.7, .15, .15]
)
full_df["split"] = split

sample_df = full_df.sample(frac=0.3)


X_train = sample_df[sample_df["split"] == "train"]['TEXT_PROCESSED']
y_train = mlb.fit_transform(sample_df[sample_df["split"] == "train"]['ICD9_GRP_LIST'])

X_test = sample_df[sample_df["split"] == "test"]['TEXT_PROCESSED']
y_test = mlb.transform(sample_df[sample_df["split"] == "test"]['ICD9_GRP_LIST'])

# SKLearn Pipeline

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score

In [None]:
models = []

models.append(("LogisticRegression",LogisticRegression()))
models.append(("SVC",SVC()))
models.append(("LinearSVC",LinearSVC()))
models.append(("KNeighbors",KNeighborsClassifier()))
models.append(("DecisionTree",DecisionTreeClassifier()))
models.append(("RandomForest",RandomForestClassifier()))
rf2 = RandomForestClassifier(n_estimators=100, criterion='gini',
                                max_depth=10, random_state=0, max_features=None)
models.append(("RandomForest2",rf2))
models.append(("MLPClassifier",MLPClassifier(solver='lbfgs', random_state=0)))

results = []
names = []
for name,model in models:
    result = cross_val_score(model, X_train, y_train,  cv=3)
    names.append(name)
    results.append(result)

for i in range(len(names)):
    print(names[i],results[i].mean())

In [46]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, multilabel_confusion_matrix, classification_report
from sklearn.ensemble import RandomForestClassifier

#classifier = SGDClassifier()
#classifier = XGBClassifier(n_jobs=-1, max_depth=4)
classifier = RandomForestClassifier()
model = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range=(2,2))),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(classifier)),
])

In [47]:
model.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
parameters = {'vectorizer__ngram_range': [(1, 1), (1, 2),(2,2)],
               'tfidf__use_idf': (True, False)}
gs_clf_svm = GridSearchCV(model, parameters, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)
print(gs_clf_svm.best_score_)
print(gs_clf_svm.best_params_)

# Evaluation on the test data

In [11]:
#evaluation on test data
pred = model.predict(X_test)

multilabel_confusion_matrix(pred, y_test)

array([[[2355,    0],
        [   0,    0]],

       [[1674,  274],
        [  78,  329]],

       [[1972,  204],
        [  10,  169]],

       [[ 390,   96],
        [ 406, 1463]],

       [[ 741,  269],
        [ 402,  943]],

       [[1561,  391],
        [  99,  304]],

       [[ 269,   23],
        [ 240, 1823]],

       [[1094,  298],
        [ 170,  793]],

       [[1328,  375],
        [ 113,  539]],

       [[1243,  290],
        [ 192,  630]],

       [[2350,    5],
        [   0,    0]],

       [[2112,  230],
        [   0,   13]],

       [[1884,  421],
        [  11,   39]],

       [[2111,   64],
        [   0,  180]],

       [[1512,  470],
        [ 134,  239]],

       [[2169,  186],
        [   0,    0]],

       [[2285,   70],
        [   0,    0]],

       [[1201,  356],
        [ 190,  608]]])

In [12]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.81      0.55      0.65       603
           2       0.94      0.45      0.61       373
           3       0.78      0.94      0.85      1559
           4       0.70      0.78      0.74      1212
           5       0.75      0.44      0.55       695
           6       0.88      0.99      0.93      1846
           7       0.82      0.73      0.77      1091
           8       0.83      0.59      0.69       914
           9       0.77      0.68      0.72       920
          10       0.00      0.00      0.00         5
          11       1.00      0.05      0.10       243
          12       0.78      0.08      0.15       460
          13       1.00      0.74      0.85       244
          14       0.64      0.34      0.44       709
          15       0.00      0.00      0.00       186
          16       0.00      0.00      0.00        70
          17       0.76    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Using Word2Vec

In [48]:
documents = full_df.TEXT_PROCESSED.values

In [49]:
documents = [d.split() for d in documents]

In [50]:
import gensim
from gensim.models import KeyedVectors

model = gensim.models.Word2Vec(
    documents,
    size=150,
    window=10,
    min_count=2,
    workers=10,
    iter=5)
    
#odel = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin.gz", binary=True)

04/22/2020 13:07:32 - INFO - gensim.models.word2vec -   collecting all words and their counts
04/22/2020 13:07:32 - INFO - gensim.models.word2vec -   PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
04/22/2020 13:07:35 - INFO - gensim.models.word2vec -   PROGRESS: at sentence #10000, processed 16230158 words, keeping 60607 word types
04/22/2020 13:07:38 - INFO - gensim.models.word2vec -   PROGRESS: at sentence #20000, processed 32505504 words, keeping 85084 word types
04/22/2020 13:07:42 - INFO - gensim.models.word2vec -   PROGRESS: at sentence #30000, processed 48845407 words, keeping 104352 word types
04/22/2020 13:07:45 - INFO - gensim.models.word2vec -   PROGRESS: at sentence #40000, processed 65298054 words, keeping 120787 word types
04/22/2020 13:07:49 - INFO - gensim.models.word2vec -   PROGRESS: at sentence #50000, processed 81702362 words, keeping 135390 word types
04/22/2020 13:07:50 - INFO - gensim.models.word2vec -   collected 139102 word types from a corpu

In [52]:
model.wv.most_similar(positive=['smoking'], topn=10)

[('tobacco', 0.730054497718811),
 ('tob', 0.6298959255218506),
 ('tobacoo', 0.6190227270126343),
 ('smokes', 0.6182222366333008),
 ('smoke', 0.6151465177536011),
 ('cigar', 0.6050142049789429),
 ('cigars', 0.6028656363487244),
 ('smoker', 0.5915770530700684),
 ('ivda', 0.5884487628936768),
 ('pipe', 0.5807419419288635)]

In [None]:
mo

In [None]:
model['medicine']

# Using Bio Sent2Vec - Not working

In [13]:
import sent2vec
from nltk import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from scipy.spatial import distance

In [None]:

model_path = "/notebooks/storage/Downloads/transfer_model/BioSentVec_PubMed_MIMICIII-bigram_d700.bin"
model = sent2vec.Sent2vecModel()
try:
    model.load_model(model_path)
except Exception as e:
    print(e)
print('model successfully loaded')

In [None]:
stop_words = set(stopwords.words('english'))
def preprocess_sentence(text):
    text = text.replace('/', ' / ')
    text = text.replace('.-', ' .- ')
    text = text.replace('.', ' . ')
    text = text.replace('\'', ' \' ')
    text = text.lower()

    tokens = [token for token in word_tokenize(text) if token not in punctuation and token not in stop_words]

    return ' '.join(tokens)

In [None]:

sentence = preprocess_sentence('Breast cancers with HER2 amplification have a higher risk of CNS metastasis and poorer prognosis.')
print(sentence)

In [None]:
sentence_vector = model.embed_sentence(sentence)
print(sentence_vector)

# Using Bert

In [14]:
from transformers import BertConfig, BertForSequenceClassification, BertModel, BertTokenizer
from sklearn import svm
from sklearn.pipeline import Pipeline


In [15]:
# Another way of splitting the data

split = np.random.choice(
    ["train", "val", "test"],
    size=full_df.shape[0],
    p=[.7, .15, .15]
)
full_df["split"] = split

In [16]:
sample_df = full_df.sample(frac=0.3)

In [17]:
X_train = sample_df[sample_df["split"] == "train"]['TEXT']
y_train = mlb.fit_transform(sample_df.ICD9_GRP_LIST)


In [19]:
config = BertConfig.from_pretrained('bert-base-uncased') 

bert_tok = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel(config)

bert_transformer = BertTransformer(bert_tok, bert_model)

04/22/2020 12:20:45 - INFO - transformers.configuration_utils -   loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /root/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.8f56353af4a709bf5ff0fbc915d8f5b42bfff892cbb6ac98c3c45f481a03c685
04/22/2020 12:20:45 - INFO - transformers.configuration_utils -   Model config BertConfig {
  "_num_labels": 2,
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bad_words_ids": null,
  "bos_token_id": null,
  "decoder_start_token_id": null,
  "do_sample": false,
  "early_stopping": false,
  "eos_token_id": null,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "is_encoder_decoder": false,
  "label2id": {
   

In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

In [21]:
classifier = MultinomialNB(fit_prior=True, class_prior = None)
classifier = LogisticRegression(solver='sag')
model = Pipeline(
    [
        ("vectorizer", bert_transformer),
        ("classifier", OneVsRestClassifier(classifier)),
    ]
)
model.fit(X_train, y_train)

KeyboardInterrupt: 

# Training

In [None]:
classifier = svm.LinearSVC(C=1.0, class_weight="balanced")
model = Pipeline(
    [
        ("vectorizer", bert_transformer),
        ("classifier", classifier),
    ]
)
model.fit(X_train, y_train)

# Adding TF-IDF

In [None]:
from sklearn.feature_extraction.text import (
   CountVectorizer, TfidfTransformer
)

tf_idf = Pipeline([
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer())
    ])

model = Pipeline([
    ("union", FeatureUnion(transformer_list=[
        ("bert", bert_transformer),
        ("tf_idf", tf_idf)
        ])),
        ("classifier", classifier),
    ])

In [None]:
model.fit(X_train, y_train)

In [None]:
# Not working 


# from sklearn.preprocessing import MultiLabelBinarizer
# mlb = MultiLabelBinarizer()

# X_train = sample_df['TEXT']
# y_train = mlb.fit_transform(sample_df.ICD9_GRP_LIST)
# model = BertClassifier(max_seq_length=128, train_batch_size=4)
# model_gradient_accumulation_steps = 4

# model
# model = model.fit(X_train, y_train)