# Setup

## Colab Specific Setup

In [None]:
# Pin Keras to 2.6.0 for this notebook (Colab-only setup step).
!pip install keras==2.6.0

In [None]:
import nltk
# Fetch the NLTK resources this notebook downloads up front
# (presumably used by the augmentation helpers - confirm).
for _nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(_nltk_resource)

In [None]:
# Download and unpack the GloVe 840B / 300-dimensional vectors into the
# current working directory.
# NOTE(review): the later call to gen_vocab_dicts reads
# '../../eda_nlp/word2vec/glove.840B.300d.txt' - confirm that the unzipped
# file ends up at that location.
!wget https://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip glove.840B.300d.zip

## General Setup

In [None]:
from methods import *

from methods import *
from b_2_train_eval import run_model
from collections import defaultdict
from antonym_aug import AR, eda_5

# Augmentation

In [None]:
# Un-augmented training split and the held-out test split.
train_orig = 'data/subj/train_orig.txt'
test_path = 'data/subj/test.txt'

# Output files, one per augmentation method:
#   *_st     - written by gen_standard_aug
#   *_ar     - written by gen_ar_aug
#   *_eda_ar - written by gen_eda_ar_aug
train_aug_st = 'data/subj/train_aug_st.txt'
train_aug_ar = 'data/subj/train_aug_ar.txt'
train_aug_eda_ar = 'data/subj/train_aug_eda_ar.txt'

In [None]:
# Peek at the first lines of the original training file.
!head $train_orig

In [None]:
def gen_ar_aug(traing_orig, output_file, alpha=0.3, num_aug=9):
    """Write AR-augmented sentences for every line of a labelled dataset.

    Each non-empty input line must have the form ``label<TAB>sentence``.
    The sentence is passed to ``AR`` and every sentence it returns is written
    to ``output_file`` as ``label<TAB>augmented_sentence``.

    Args:
        traing_orig: Path of the tab-separated file to read. The (misspelled)
            parameter name is kept so existing keyword callers keep working.
        output_file: Path of the file to create or overwrite.
        alpha: Forwarded unchanged to ``AR`` as ``alpha``.
        num_aug: Forwarded unchanged to ``AR`` as ``num_aug``.

    Raises:
        IndexError: If a non-empty line contains no tab separator.
    """
    # Read from the *argument*; the previous version ignored it and silently
    # used the notebook-level global ``train_orig`` instead.
    with open(traing_orig, 'r') as reader, open(output_file, 'w') as writer:
        for line in reader:
            # Strip only the newline so that a final line without a trailing
            # newline does not lose its last character.
            line = line.rstrip('\n')
            if not line:
                continue
            parts = line.split('\t')
            label = parts[0]
            sentence = parts[1]
            for aug_sentence in AR(sentence, alpha=alpha, num_aug=num_aug):
                writer.write(label + '\t' + aug_sentence + '\n')
    print('finished AR for', traing_orig, 'to', output_file, 'with alpha', alpha)

def gen_eda_ar_aug(traing_orig, output_file, num_aug=9):
    """Write ``eda_5``-augmented sentences for every line of a labelled dataset.

    Each non-empty input line must have the form ``label<TAB>sentence``.
    The sentence is passed to ``eda_5`` and every sentence it returns is
    written to ``output_file`` as ``label<TAB>augmented_sentence``.

    Args:
        traing_orig: Path of the tab-separated file to read. The (misspelled)
            parameter name is kept so existing keyword callers keep working.
        output_file: Path of the file to create or overwrite.
        num_aug: Forwarded unchanged to ``eda_5`` as ``num_aug``.

    Raises:
        IndexError: If a non-empty line contains no tab separator.
    """
    # Read from the *argument*; the previous version ignored it and silently
    # used the notebook-level global ``train_orig`` instead.
    with open(traing_orig, 'r') as reader, open(output_file, 'w') as writer:
        for line in reader:
            # Strip only the newline so that a final line without a trailing
            # newline does not lose its last character.
            line = line.rstrip('\n')
            if not line:
                continue
            parts = line.split('\t')
            label = parts[0]
            sentence = parts[1]
            for aug_sentence in eda_5(sentence, num_aug=num_aug):
                writer.write(label + '\t' + aug_sentence + '\n')
    print('finished EDA + AR for', traing_orig, 'to', output_file)

In [None]:
# Produce one augmented file per method from the original training split,
# each with num_aug=1.
# gen_standard_aug is not defined in this notebook (presumably provided by
# `from methods import *` - confirm).
gen_standard_aug(train_orig, train_aug_st, num_aug=1)
gen_ar_aug(train_orig, train_aug_ar, alpha=0.3, num_aug=1)
gen_eda_ar_aug(train_orig, train_aug_eda_ar, num_aug=1)

# Build the word-vector pickle that the training cells later load.
# NOTE(review): the Colab setup cell unzips GloVe into the working directory,
# while this path points at ../../eda_nlp/word2vec/ - confirm the file
# actually exists at this location.
word2vec_pickle = 'data/subj/word2vec.p'
gen_vocab_dicts('data/subj', word2vec_pickle, '../../eda_nlp/word2vec/glove.840B.300d.txt')

In [None]:
# Peek at the first lines of the AR-augmented training file.
!head $train_aug_ar

# Training

In [None]:
# Values read as globals by run_model: num_classes is passed in by the
# sweep loop, input_size and word2vec_len go to build_model and get_x_y.
num_classes = 2
input_size = 40
word2vec_len = 300

# Result dictionaries, keyed by the training-set fraction used in the sweep.
orig_accs, aug_accs, ar_accs, eda_ar_accs = {}, {}, {}, {}

# Word-vector lookup written earlier by gen_vocab_dicts.
word2vec = load_pickle(word2vec_pickle)

In [None]:
def run_model(train_file, test_file, num_classes, percent_dataset):
    """Train a fresh model on ``train_file`` and return its test accuracy.

    Relies on the notebook-level globals ``input_size``, ``word2vec_len`` and
    ``word2vec``. Defining this function shadows the ``run_model`` imported
    from ``b_2_train_eval`` earlier in the notebook.

    Args:
        train_file: Path handed to ``get_x_y`` for the training data.
        test_file: Path handed to ``get_x_y`` for the evaluation data.
        num_classes: Forwarded to ``build_model`` and ``get_x_y``.
        percent_dataset: Forwarded to ``get_x_y`` for the training file; the
            test file is always loaded with ``1``.

    Returns:
        The value of ``accuracy_score`` on the test set.
    """
    # A new model per call, so runs do not share weights.
    model = build_model(input_size, word2vec_len, num_classes)

    # Load the requested share of the training data and the full test set.
    train_x, train_y = get_x_y(
        train_file, num_classes, word2vec_len, input_size, word2vec, percent_dataset
    )
    test_x, test_y = get_x_y(
        test_file, num_classes, word2vec_len, input_size, word2vec, 1
    )

    # The epoch count is effectively unbounded; early stopping on the
    # validation loss (patience 3) is what ends training.
    early_stop = EarlyStopping(monitor='val_loss', patience=3)
    model.fit(
        train_x,
        train_y,
        epochs=100000,
        callbacks=[early_stop],
        validation_split=0.1,
        batch_size=1024,
        shuffle=True,
        verbose=0,
    )

    # Score predictions against the gold labels.
    y_pred = model.predict(test_x)
    acc = accuracy_score(
        one_hot_to_categorical(test_y),
        one_hot_to_categorical(y_pred),
    )

    # Drop the references to the training arrays before collecting garbage.
    train_x = train_y = None
    gc.collect()

    return acc

In [None]:
# Fractions of the training set to train on, from 1% up to the full set.
increments = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]
for increment in increments:

    # accuracy with the gen_standard_aug training file
    aug_acc = run_model(train_aug_st, test_path, num_classes, increment)
    aug_accs[increment] = aug_acc

    # accuracy with the AR training file
    ar_acc = run_model(train_aug_ar, test_path, num_classes, increment)
    ar_accs[increment] = ar_acc

    # accuracy with the EDA + AR training file
    eda_ar_acc = run_model(train_aug_eda_ar, test_path, num_classes, increment)
    eda_ar_accs[increment] = eda_ar_acc

    # Report the three scalar accuracies for this fraction. The previous
    # version printed the whole eda_ar_accs dict here instead of eda_ar_acc.
    print(increment, aug_acc, ar_acc, eda_ar_acc)

    gc.collect()

In [None]:
import os

import matplotlib.pyplot as plt

# X axis: training-set fractions as percentages. All three result dicts are
# filled with the same keys in the same order by the sweep loop.
x = [100*p for p in aug_accs.keys()]
y1 = list(aug_accs.values())
y2 = list(ar_accs.values())
y3 = list(eda_ar_accs.values())

plt.plot(x, y1)
plt.plot(x, y2)
plt.plot(x, y3)
plt.ylim(0.4, 1)
# Legend entries follow the order of the plot calls above.
plt.legend(['EDA', 'AR', 'EDA+AR'])
plt.xlabel('Percent of Dataset (%)')
plt.ylabel('Accuracy')

# savefig raises if the target directory is missing, so create it first.
os.makedirs('img', exist_ok=True)
plt.savefig('img/eda-ar-result.pdf')
plt.show()