In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
import os

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

from util.train import embed_train_model
from util.wordvec_load import LoadGlove, get_glove_embeddings

In [3]:
# Number of times a model is trained (and its accuracy averaged) per input file.
NUM_TRAIN = 5

# Number of target classes for each supported dataset, keyed by dataset name.
nclass = dict(imdb=2, ag_news=4)

In [4]:
# Load the results saved by an earlier run of this notebook, if any;
# otherwise start with an empty mapping.
results_file = Path("train.json")
train = json.loads(results_file.read_text()) if results_file.is_file() else {}

In [None]:
# For every perturbed training CSV: train NUM_TRAIN models on it, evaluate each
# on the matching preprocessed test split, and store the mean accuracy in
# `train` (written to train.json after every file so partial progress is kept).

# CHANGE file paths for GloVe
GLOVE_PATHS = {
    50: "/path/to/glove.6B.50d.txt",
    100: "/path/to/glove.6B.100d.txt",
    300: "/path/to/glove.6B.300d.txt",
}
MAXLEN = 500  # sequence length passed to pad_sequences and to the model
MODEL_SAVEPATH = "models/"

os.makedirs(MODEL_SAVEPATH, exist_ok=True)

# Sorted so that files are processed in a reproducible order (rglob order is unspecified).
for csv_path in sorted(Path("Data/perturbed").rglob("*.csv")):
    # The embedding dimension is the second-to-last "_"-separated field of the stem.
    try:
        dim = int(csv_path.stem.split('_')[-2])
    except (IndexError, ValueError):
        print("Skipping {}: cannot parse embedding dimension from file name".format(csv_path))
        continue

    # Skip explicitly instead of falling through with an unbound (or stale,
    # from the previous iteration) GloVe path of the wrong dimension.
    glove_path = GLOVE_PATHS.get(dim)
    if glove_path is None:
        print("Skipping {}: no GloVe file configured for dim={}".format(csv_path, dim))
        continue

    # Same guard for the dataset: never reuse the previous iteration's frames.
    dataset = next((name for name in nclass if name in csv_path.name), None)
    if dataset is None:
        print("Skipping {}: file name matches none of {}".format(csv_path, list(nclass)))
        continue
    num_classes = nclass[dataset]

    df_train_dp = pd.read_csv(csv_path)
    df_pre_train = pd.read_csv("Data/{}_preprocessed_train.csv".format(dataset))
    df_pre_test = pd.read_csv("Data/{}_preprocessed_test.csv".format(dataset))

    # The vocabulary is fitted on the preprocessed (non-perturbed) training text.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df_pre_train['text'].values)
    vocab_size = len(tokenizer.word_index) + 1  # +1 as in the original sizing

    wv_model = LoadGlove(glove_path)
    embedding_matrix = get_glove_embeddings(embeddings_index=wv_model, dim=dim, tokenizer=tokenizer)

    # Fix the one-hot width to num_classes so the train and test targets always
    # have the same shape, even if a split happens to miss the highest label.
    y_train = to_categorical(df_train_dp['label'].values, num_classes=num_classes)
    y_test = to_categorical(df_pre_test['label'].values, num_classes=num_classes)

    # Models are trained on the perturbed text and evaluated on the test text.
    training_padded = pad_sequences(
        tokenizer.texts_to_sequences(df_train_dp['text'].values), maxlen=MAXLEN
    )
    testing_padded = pad_sequences(
        tokenizer.texts_to_sequences(df_pre_test['text'].values), maxlen=MAXLEN
    )

    accuracies = []
    for _ in range(NUM_TRAIN):
        model, _history = embed_train_model(
            MODEL_SAVEPATH, num_classes, embedding_matrix,
            training_padded, y_train, testing_padded, y_test,
            vocab_size, MAXLEN, dim,
        )
        # evaluate() is unpacked as (loss, accuracy), as in the original cell.
        _loss, accuracy = model.evaluate(testing_padded, y_test)
        accuracies.append(accuracy)

    # Plain float keeps the stored value JSON-serialisable regardless of dtype.
    mean_accuracy = float(np.mean(accuracies))
    train[csv_path.stem] = mean_accuracy
    print("{}: {}".format(csv_path.stem, mean_accuracy))

    # Checkpoint after every file so an interrupted run keeps finished results.
    with open("train.json", 'w') as out:
        json.dump(train, out, indent=3)