## Data augmentation and classification

In [1]:
import tensorflow as tf
import pandas as pd
from pandas import ExcelWriter
from pandas import ExcelFile
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from transformers import AutoModel,AutoTokenizer,AutoConfig,AutoModelWithLMHead
from transformers import AutoModelForSequenceClassification
import numpy as np
from nltk import word_tokenize
import random
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
from nltk.corpus import stopwords


In [2]:
## load labelled data: first sheet (sheet_name=0) of the labelled-message workbook
df = pd.read_excel('master_label_data_classifiers_25feb_20 - J_Wosik.xlsx', sheet_name=0)

In [3]:
## keep only the rows whose Urgency value is one of -1.0, 0.0 or 1.0
labeled_data = df[df.Urgency.isin({0.0, 1.0,-1.0})]

In [4]:
df.columns

Index(['Unnamed: 0_x', 'DEPARTMENT_NAME', 'ENCOUNTER_DATE', 'ENCOUNTER_ID',
       'ENCOUNTER_TYPE', 'END_DATE', 'FINANCIAL_CLASS', 'INDICATOR',
       'MESSAGE_DATE', 'MESSAGE_ID', 'MESSAGE_TEXT', 'MESSAGE_TO_FROM_PATIENT',
       'MESSAGE_TO_FROM_PROVIDER', 'MESSAGE_TYPE', 'MRN', 'PAT_ID', 'PAT_NAME',
       'PROVIDER_TYPE', 'START_DATE', 'Clean_Message', 'cluster_label',
       'binary_cluster', 'doc_binary', 'doc_topic_perct', 'dominant_topic_ids',
       'cluster_topic_interp', 'cluster', 'Urgency', '# of questions',
       'statin related', 'Bleeding', 'Thanks/best wishes', 'ECG question',
       'Stopping Meds Prior to Surgery', 'Requests for referrals',
       'Simple Refill', 'Includes vitals', 'INR message'],
      dtype='object')

In [5]:
## Urgency value -> class index: -1.0 -> 0, 0.0 -> 1, 1.0 -> 2
maps = {-1.0: 0, 0.0: 1, 1.0:2} ## non-urgent, medium, and urgent

In [6]:
## sentences (cleaned message text) & labels (raw Urgency values)
sentences = labeled_data.Clean_Message.to_list()
labels = labeled_data.Urgency.to_list()

In [7]:
## map the original labels to 0, ..., C-1, with C the number of classes
## Built from labeled_data rather than from `labels` itself so the cell can be
## re-run safely: mapping the already-mapped list again raises KeyError on 2.
labels = [maps[it] for it in labeled_data.Urgency.to_list()]

In [8]:
Counter(labels).most_common()

[(1, 955), (0, 631), (2, 170)]

In [29]:
## split dataset into train/test: 20% test, stratified by label, fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    random_state=2020,
    stratify=labels,
    test_size=0.2,
)

In [30]:
Counter(y_test).most_common()

[(1, 191), (0, 127), (2, 34)]

In [31]:
Counter(y_train).most_common()

[(1, 764), (0, 504), (2, 136)]

### Data augmentation for positive labels

In [14]:
## positive sentences: training examples whose label is 2
pos_sens = [sen for sen, lab in zip(x_train, y_train) if lab == 2]

In [96]:
len(pos_sens)

136

### Approach 1: Easy Data Augmentation
A simple baseline for data augmentation is to shuffle the elements of a text to create new text. For example, if we have labeled sentences and want more of them, we can shuffle the words of each sentence to create a new sentence.

In [17]:
import random
from random import shuffle
random.seed(1)
import re
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet 

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/DHE/ss1043/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [18]:
import json

In [75]:
## reload sentences / labels from the JSON snapshot; the `with` block closes
## the file handle, which json.load(open(...)) left open
with open('original_labelled_data.json', 'r') as jf:
    data0 = json.load(jf)
sentences,labels = data0['sentences'],data0['labels']

In [76]:
from sklearn.model_selection import train_test_split
## split the reloaded data: 20% test, stratified by label, fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    random_state=2020,
    stratify=labels,
    test_size=0.2,
)


In [40]:
len(sentences)

1756

In [42]:
## If you use ContextualWordEmbsAug or ContextualWordEmbsForSentenceAug, install the 
##following dependencies as well
import nlpaug

In [46]:
from nlpaug import augmenter 

In [66]:
import nlpaug.augmenter.word as naw

In [100]:
## nlpaug contextual word-embedding augmenter in 'substitute' mode, with the model read from ./bio_bert_model
## NOTE(review): top_k / aug_max presumably bound the candidate pool and the number of edited words -- confirm against the nlpaug docs
aug_sub = naw.ContextualWordEmbsAug(model_path="./bio_bert_model",action='substitute',top_k=5,aug_max=5)


In [101]:
## alternative substitute augmenter on ./bert-base-uncased, kept for reference (disabled)
#aug_sub = naw.ContextualWordEmbsAug(model_path="./bert-base-uncased",action='substitute',top_k=5,aug_max=5)
## same augmenter class in 'insert' mode, again reading the model from ./bio_bert_model
aug_ins = naw.ContextualWordEmbsAug(model_path="./bio_bert_model",action='insert',top_k=20,aug_max=5)

In [55]:
from nlpaug.augmenter import sentence,word

In [60]:

def augment_train(x_train, y_train, label_aug=2, num_aug=4):
    """ Use Data augmentation to augment the training examples of one class
    :params[in]: x_train, training examples
    :params[in]: y_train, labels for training set
    :params[in]: label_aug, the label to do augmentation
    :params[in]: num_aug, number of augmented sentences requested from eda()
                 for every example of that label

    :return: (x_train, y_train), new lists holding the original data followed
             by the augmented sentences and their labels. The lists passed in
             are left untouched, so calling the function again on the same
             inputs gives the same sizes.
    """
    ## augment sentences
    aug_sentences = []
    for sen0, label in zip(x_train, y_train):
        if label == label_aug:
            ## eda() puts the cleaned original sentence last; [:-1] drops it
            aug_sentences += eda(sen0, alpha_sr=0.05, alpha_ri=0.05,
                                 alpha_rs=0.05, p_rd=0.05, num_aug=num_aug)[:-1]
    ## build new lists instead of extending the caller's lists in place
    x_augmented = list(x_train) + aug_sentences
    y_augmented = list(y_train) + [label_aug]*len(aug_sentences)
    ## return augmented data
    return x_augmented, y_augmented

In [25]:
Counter(y_train)

Counter({0: 504, 1: 764, 2: 136})

In [27]:
2*764/136

11.235294117647058

In [77]:
## data augmentation -- only the call for label=2 is active; the calls for the
## other two labels are commented out
## double label=1
#x_train, y_train=augment_train(x_train, y_train, label_aug=1, num_aug=1)
## 3 times label=0
#x_train, y_train=augment_train(x_train, y_train, label_aug=0, num_aug=2)
## label=2: num_aug=4 adds 4 augmented sentences per original, i.e. 5 times (136 -> 680), not 11 times
x_train, y_train=augment_train(x_train, y_train, label_aug=2, num_aug=4)


In [78]:
Counter(y_train)

Counter({0: 504, 1: 764, 2: 680})

In [79]:
Counter(y_test)

Counter({0: 127, 1: 191, 2: 34})

In [80]:
## bundle the current train and test splits into one dict for saving
data_augment = {'x_train':x_train, 'y_train':y_train, 'x_test':x_test, 'y_test':y_test}

In [81]:
## write the bundle to eda_data_9mar.json
with open('eda_data_9mar.json', 'w') as jf:
    json.dump(data_augment, jf)

In [82]:
## read the file back into `res` to check what was written
with open('eda_data_9mar.json', 'r') as jf:
    res=json.load(jf)

In [84]:
len(res['y_train'])

1948

In [86]:
Counter(res['y_train'])

Counter({0: 504, 1: 764, 2: 680})

### Easy Data Augmentation
We present EDA: easy data augmentation techniques for boosting performance on text classification tasks. These are a generalized set of data augmentation techniques that are easy to implement and have shown improvements on five NLP classification tasks, with substantial improvements on datasets of size N < 500. While other techniques require you to train a language model on an external dataset just to get a small boost, we found that simple text editing operations using EDA result in good performance gains. Given a sentence in the training set, we perform the following operations:

    Synonym Replacement (SR): Randomly choose n words from the sentence that are not stop words. Replace each of these words with one of its synonyms chosen at random.
    Random Insertion (RI): Find a random synonym of a random word in the sentence that is not a stop word. Insert that synonym into a random position in the sentence. Do this n times.
    Random Swap (RS): Randomly choose two words in the sentence and swap their positions. Do this n times.
    Random Deletion (RD): For each word in the sentence, randomly remove it with probability p.

In [20]:
# Easy data augmentation techniques for text classification
#stop words list

# Words that synonym_replacement never picks as replacement candidates.
# The trailing '' makes empty tokens count as stop words too.
stop_words = [
    # pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', 'her', 'hers', 'herself',
    'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives / demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles / conjunctions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs / question words
    'again', 'further', 'then', 'once', 'here', 'there', 'when',
    'where', 'why', 'how',
    # quantifiers
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    # negation / degree
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # contraction leftovers, modals, misc
    's', 't', 'can', 'will', 'just', 'don', 'should', 'now', '',
]

def get_only_chars(line):
    """Reduce a sentence to lowercase a-z words separated by single spaces.

    Straight and curly apostrophes are dropped. Hyphens, tabs, newlines and
    every other character outside a-z become a space. Runs of spaces are
    collapsed and one leading space is removed; a trailing space, if present,
    is kept.

    :param line: raw sentence
    :return: the cleaned sentence, '' when the input contains no letters
    """
    line = line.replace("’", "").replace("'", "")
    line = line.replace("-", " ") #replace hyphens with spaces
    line = line.replace("\t", " ").replace("\n", " ")
    line = line.lower()

    allowed = 'qwertyuiopasdfghjklzxcvbnm '
    clean_line = ''.join(char if char in allowed else ' ' for char in line)

    clean_line = re.sub(' +',' ',clean_line) #delete extra spaces
    # startswith() instead of clean_line[0]: indexing raised IndexError on ''
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line

########################################################################
# Synonym replacement
# Replace n words in the sentence with synonyms from wordnet
########################################################################

#for the first time you use wordnet

def synonym_replacement(words, n):
    """Replace up to ``n`` distinct non-stop-words with a random synonym.

    Every occurrence of a chosen word is replaced by the same synonym.
    Words for which get_synonyms() returns nothing do not count towards ``n``.

    :param words: list of tokens; not modified
    :param n: maximum number of distinct words to replace
    :return: new list of tokens
    """
    new_words = words.copy()
    # sorted() gives the shuffle a stable starting order. The iteration order
    # of a set of strings can differ between interpreter runs, which made the
    # result irreproducible even with random.seed() set.
    candidates = sorted({word for word in words if word not in stop_words})
    random.shuffle(candidates)
    num_replaced = 0
    for candidate in candidates:
        # budget check first, so that n == 0 replaces nothing
        if num_replaced >= n:
            break
        synonyms = get_synonyms(candidate)
        if len(synonyms) >= 1:
            synonym = random.choice(list(synonyms))
            new_words = [synonym if word == candidate else word for word in new_words]
            num_replaced += 1
    # a synonym may contain spaces ("united states"); join and re-split so the
    # result is one word per element again
    return ' '.join(new_words).split(' ')

def get_synonyms(word):
    """Collect WordNet synonyms of ``word``.

    The lemma names of every synset of ``word`` are lower-cased, underscores
    and hyphens become spaces, and characters other than a-z and space are
    removed.

    :param word: the word to look up
    :return: sorted list of distinct synonyms, without ``word`` itself and
        without names that are empty after cleaning
    """
    allowed = ' qwertyuiopasdfghjklzxcvbnm'
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            name = lemma.name().replace("_", " ").replace("-", " ").lower()
            name = "".join(char for char in name if char in allowed).strip()
            # a name made only of digits/punctuation is empty here; skip it so
            # callers never insert an empty "synonym"
            if name:
                synonyms.add(name)
    synonyms.discard(word)
    # sorted: a stable order instead of set iteration order
    return sorted(synonyms)

########################################################################
# Random deletion
# Randomly delete words from the sentence with probability p
########################################################################

def random_deletion(words, p):
    """Drop each word independently with probability ``p``.

    :param words: list of tokens; not modified
    :param p: deletion probability per word
    :return: ``words`` itself when it has fewer than two elements; otherwise
        a new list of the surviving words, or a single randomly chosen word
        if every word was dropped
    """
    #nothing to delete from an empty or one-word sentence; the empty case
    #used to reach random.randint(0, -1) and raise ValueError
    if len(words) <= 1:
        return words

    #randomly delete words with probability p
    new_words = [word for word in words if random.uniform(0, 1) > p]

    #if you end up deleting all words, just return a random word
    if not new_words:
        return [random.choice(words)]

    return new_words

########################################################################
# Random swap
# Randomly swap two words in the sentence n times
########################################################################

def random_swap(words, n):
    """Apply swap_word ``n`` times to a copy of ``words`` and return the copy."""
    shuffled = words.copy()
    for _round in range(n):
        shuffled = swap_word(shuffled)
    return shuffled

def swap_word(new_words):
    """Swap the words at two distinct random positions, in place.

    :param new_words: list of tokens; modified in place
    :return: the same list object (unchanged when it has fewer than two words)
    """
    # fewer than two words: nothing to swap. An empty list used to raise
    # ValueError from random.randint(0, -1).
    if len(new_words) < 2:
        return new_words
    # sample() always yields two different positions. The former retry loop
    # gave up after four draws, even when the fourth was valid, so the swap
    # was sometimes silently skipped.
    idx_1, idx_2 = random.sample(range(len(new_words)), 2)
    new_words[idx_1], new_words[idx_2] = new_words[idx_2], new_words[idx_1]
    return new_words

########################################################################
# Random insertion
# Randomly insert n words into the sentence
########################################################################

def random_insertion(words, n):
    """Call add_word ``n`` times on a copy of ``words`` and return the copy."""
    expanded = words.copy()
    for _round in range(n):
        add_word(expanded)
    return expanded

def add_word(new_words):
    """Insert a synonym of a randomly chosen word at a random position, in place.

    Up to 10 randomly chosen words are tried. If none of them has a synonym,
    or the list is empty, the list is left unchanged.

    :param new_words: list of tokens; modified in place
    :return: None
    """
    # no word to take a synonym from; randint(0, -1) used to raise ValueError
    if not new_words:
        return
    synonyms = []
    for _attempt in range(10):
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        if len(synonyms) >= 1:
            break
    else:
        # ten tries without a synonym: give up
        return
    # pick at random; synonyms[0] always took the first list entry
    random_synonym = random.choice(synonyms)
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)



In [23]:
########################################################################
# main data augmentation function
########################################################################

def eda(sentence, alpha_sr=0.05, alpha_ri=0.05, alpha_rs=0.05, p_rd=0.05, num_aug=9):
    """Generate augmented variants of ``sentence`` with the four EDA operations.

    The sentence is cleaned with get_only_chars(). Synonym replacement, random
    insertion, random swap and random deletion each produce
    ``int(num_aug/4)+1`` candidates, which are cleaned, shuffled and trimmed.

    :param sentence: raw sentence
    :param alpha_sr: fraction of the word count used as the number of synonym
        replacements (at least 1)
    :param alpha_ri: same, for the number of random insertions
    :param alpha_rs: same, for the number of random swaps
    :param p_rd: per-word deletion probability
    :param num_aug: number of augmented sentences to keep. Values below 1 are
        treated as an expected fraction of the candidates to keep.
    :return: list of augmented sentences followed by the cleaned original
        sentence as the last element. If the cleaned sentence has no words,
        the list contains only that cleaned sentence.
    """
    sentence = get_only_chars(sentence)
    # `!=` instead of `is not`: an identity test against a string literal
    # depends on interning and triggers a SyntaxWarning on recent Pythons
    words = [word for word in sentence.split(' ') if word != '']
    # nothing to augment; the insertion/swap helpers cannot pick from an
    # empty word list
    if not words:
        return [sentence]
    num_words = len(words)

    num_new_per_technique = int(num_aug/4)+1
    n_sr = max(1, int(alpha_sr*num_words))
    n_ri = max(1, int(alpha_ri*num_words))
    n_rs = max(1, int(alpha_rs*num_words))

    # same order as before: sr, ri, rs, rd
    techniques = (
        (synonym_replacement, n_sr),
        (random_insertion, n_ri),
        (random_swap, n_rs),
        (random_deletion, p_rd),
    )
    augmented_sentences = []
    for technique, amount in techniques:
        for _ in range(num_new_per_technique):
            a_words = technique(words, amount)
            augmented_sentences.append(' '.join(a_words))

    augmented_sentences = [get_only_chars(candidate) for candidate in augmented_sentences]
    shuffle(augmented_sentences)

    #trim so that we have the desired number of augmented sentences
    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    #append the original sentence
    augmented_sentences.append(sentence)

    return augmented_sentences

In [26]:
## quick check of eda() on a toy sentence: with num_aug=1 the result is one
## augmented sentence followed by the cleaned original
sen0 = 'i am in duhram, which is located in north carolina, united states'
eda(sen0, num_aug=1)

['i am in north which is located in duhram carolina united states',
 'i am in duhram which is located in north carolina united states']

In [59]:
## augment sentences
aug_sentences = []
for sen in pos_sens:
    ## pass the loop variable `sen`; the cell previously passed the demo
    ## sentence `sen0`, so every urgent message got the same toy augmentations.
    ## eda() puts the cleaned original last; [:-1] drops it
    aug_sentences += eda(sen, num_aug=4)[:-1]
## append to the original data --only augment urgent sentences
x_train += aug_sentences
y_train += [2]*len(aug_sentences)

In [60]:
len(aug_sentences)

544

In [61]:
len(pos_sens)

136

In [62]:
## The augmented sentences are already appended to x_train / y_train in the
## cell that builds aug_sentences. Appending them here a second time added
## every augmented sentence twice, so this cell now only checks that examples
## and labels still line up.
assert len(x_train) == len(y_train)

In [63]:
Counter(y_train).most_common()

[(1, 764), (2, 680), (0, 504)]

In [65]:
Counter(y_test).most_common()

[(1, 191), (0, 127), (2, 34)]

In [66]:
import json

In [67]:
help(json.dump)

Help on function dump in module json:

dump(obj, fp, skipkeys=False, ensure_ascii=True, check_circular=True, allow_nan=True, cls=None, indent=None, separators=None, default=None, sort_keys=False, **kw)
    Serialize ``obj`` as a JSON formatted stream to ``fp`` (a
    ``.write()``-supporting file-like object).
    
    If ``skipkeys`` is true then ``dict`` keys that are not basic types
    (``str``, ``int``, ``float``, ``bool``, ``None``) will be skipped
    instead of raising a ``TypeError``.
    
    If ``ensure_ascii`` is false, then the strings written to ``fp`` can
    contain non-ASCII characters if they appear in strings contained in
    ``obj``. Otherwise, all such characters are escaped in JSON strings.
    
    If ``check_circular`` is false, then the circular reference check
    for container types will be skipped and a circular reference will
    result in an ``OverflowError`` (or worse).
    
    If ``allow_nan`` is false, then it will be a ``ValueError`` to
    serialize o

In [68]:
## save the current train and test splits to eda_augment_data.json;
## ensure_ascii=False writes non-ASCII characters as they are instead of escaping them
data = {'x_train':x_train, 'y_train':y_train, 'x_test':x_test, 'y_test':y_test}
with open('eda_augment_data.json', 'w') as jf:
    json.dump(data, jf, ensure_ascii=False)