In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import json

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

from util.train import embed_train_model
from util.wordvec_load import LoadGlove, get_glove_embeddings

from sklearn.model_selection import train_test_split
import random

In [None]:
# How many models are trained (and their accuracies averaged) per setting.
NUM_TRAIN = 5

# Number of target classes, keyed by dataset name.
nclass = dict(ag_news=4)

In [None]:
# Accumulates the per-ratio accuracy statistics written by the experiment loop.
train = dict()

In [None]:
def get_train(orig, full, per):
  """Build a training frame by swapping a fraction of each text's words.

  Rows of ``orig`` and ``full`` are paired by position. For every pair, a
  random subset of ``int(per * n_words)`` word positions of the ``orig`` text
  is replaced by the word at the same position in the ``full`` text. A
  selected position that lies past the end of the ``full`` text is dropped
  from the output, so a result can be shorter than its original.

  Args:
    orig: DataFrame with "text" and "label" columns.
    full: DataFrame with a "text" column supplying the replacement words.
    per: Fraction of word positions to replace (0 keeps the original text,
      1 selects every position).

  Returns:
    DataFrame with "text" and "label" columns, one row per paired input row;
    labels are copied from ``orig``.
  """
  # Reseed the global RNG on every call so the selected positions are
  # reproducible for a given input.
  random.seed(42)

  new_data = []
  rows = zip(orig["text"].values, full["text"].values, orig["label"].values)
  for orig_text, full_text, label in rows:
    orig_words = orig_text.split()
    full_words = full_text.split()
    n_words = len(orig_words)
    # A set gives O(1) membership tests in the per-word loop below.
    chosen = set(random.sample(range(n_words), int(per * n_words)))

    new_sent = []
    for i, word in enumerate(orig_words):
      if i not in chosen:
        new_sent.append(word)
      elif i < len(full_words):
        new_sent.append(full_words[i])
      # Otherwise the replacement text has no word at this position: skip it.
    new_data.append({"text": " ".join(new_sent), "label": label})
  return pd.DataFrame(new_data)

In [None]:
# GloVe settings: `dim` is the embedding size passed to the embedding helpers
# and should agree with the vector file named below.
dim = 300
glove_path = "glove.6B.300d.txt" # Must download!

# CSV whose "text" column supplies the replacement words used by get_train.
train_path = "data/ls_dropout_ag_news.csv"

# Reproducible 50% sample of the preprocessed AG News training data.
orig = (
    pd.read_csv("data/ag_news_preprocessed_train.csv")
    .sample(frac=0.5, random_state=42)
)
df = pd.read_csv(train_path)

In [None]:
# For each replacement ratio: build the mixed training texts, train NUM_TRAIN
# models, and record their test accuracies. The accumulated results are
# rewritten to ls_train.json after every ratio so partial progress is kept.

# Settings that do not change between ratios.
maxlen = 500
model_savepath = "models/"
wv_model = None  # GloVe vectors; loaded once, on first use (see below)

for p in [0.25, 0.5, 0.75, 1]:
  # Results are stored under the numeric ratio. Also accept its string form,
  # which is what the keys turn into after a JSON round trip. (Checking only
  # str(p) never matched the numeric keys, so finished ratios were retrained.)
  if p in train or str(p) in train:
    continue

  print(p)

  tdf = get_train(orig, df, p)
  X = tdf['text'].values
  y = tdf['label'].values

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  num_classes = nclass["ag_news"]

  # NOTE(review): the tokenizer is fitted on all texts, including the held-out
  # split, so the vocabulary covers test words too — confirm this is intended.
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(X)
  vocab_size = len(tokenizer.word_index)+1

  # Loading the GloVe file does not depend on the ratio, so do it only once
  # instead of on every iteration.
  if wv_model is None:
    wv_model = LoadGlove(glove_path)
  embedding_matrix = get_glove_embeddings(embeddings_index=wv_model, dim=dim, tokenizer=tokenizer)

  y_train = to_categorical(y_train)
  y_test = to_categorical(y_test)

  training_sequences = tokenizer.texts_to_sequences(X_train)
  training_padded = pad_sequences(training_sequences, maxlen=maxlen)

  testing_sequences = tokenizer.texts_to_sequences(X_test)
  testing_padded = pad_sequences(testing_sequences, maxlen=maxlen)

  X_train, X_test = training_padded, testing_padded

  # Repeat training to estimate the spread of the test accuracy.
  accuracies = []
  for i in range(NUM_TRAIN):
    model, _ = embed_train_model(model_savepath, num_classes, embedding_matrix, X_train, y_train, X_test, y_test, vocab_size, maxlen, dim)
    _, accuracy = model.evaluate(testing_padded, y_test)
    accuracies.append(accuracy)

  acc_array = np.array(accuracies)
  mean_acc = np.mean(acc_array)
  train[p] = {
    "accs": accuracies,
    "mean": mean_acc,
    "std": np.std(acc_array),
  }
  print("{}: {}".format(p, mean_acc))

  # Checkpoint everything gathered so far.
  with open("ls_train.json", 'w') as out:
    json.dump(train, out, indent=3)