In [1]:
import os
from glob import glob
import json
from collections import defaultdict
import random
import numpy as np
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.tokenize import word_tokenize
from src.dataio.utils import is_number, get_word_from_text, find_nth
from collections import Counter

### Split data

In [5]:
# Group the annotated documents by company and split the *companies* (not the
# individual documents) 60/20/20 into train/val/test, so every document of a
# given company ends up in exactly one split.
text_files = sorted(glob(os.path.join("clean_data/ie_data", "*.txt")))
split_dir = "splits"
# exist_ok avoids the check-then-create race of isdir() followed by makedirs().
os.makedirs(split_dir, exist_ok=True)

# company name -> list of document ids (file names without extension)
templates = defaultdict(list)

for text_path in text_files:
    # Each file is parsed as JSON and must carry a "company" key.
    with open(text_path, "r", encoding="utf-8") as f:
        record = json.load(f)
    # splitext drops only the final ".txt", so ids containing extra dots are
    # kept intact and still match "<id>.txt" when the id is turned back into
    # a file name later in the notebook.
    doc_id = os.path.splitext(os.path.basename(text_path))[0]
    templates[record["company"]].append(doc_id)

data_size = len(templates)
n_train = int(0.6 * data_size)
n_val = int(0.2 * data_size)
n_test = data_size - n_train - n_val  # remainder, so the three sizes sum to data_size

# Sort before shuffling so the seeded shuffle does not depend on the order in
# which companies were first encountered.
companies = sorted(templates.keys())
random.seed(0)
random.shuffle(companies)

train_companies = companies[:n_train]
val_companies = companies[n_train:n_train + n_val]
# Everything after train+val: the same n_test companies as companies[-n_test:],
# without relying on a negative slice bound.
test_companies = companies[n_train + n_val:]

def distribute_files(templates, companies, max_samples):
    """Pick at most ``max_samples`` documents for each company.

    Args:
        templates: Mapping from company name to a list of document ids.
        companies: Iterable of company names to draw documents for.
        max_samples: Maximum number of documents kept per company.

    Returns:
        A flat list of document ids, concatenated in the order of
        ``companies``. Within a company the ids are in a seeded shuffled
        order (seed 0, restarted for every company) and truncated to
        ``max_samples``.

    The lists stored in ``templates`` are not modified and the state of the
    global ``random`` module is left untouched, so repeated calls with the
    same arguments return the same result.
    """
    files = []
    for company in companies:
        # Shuffle a copy with a private generator: Random(0) yields the same
        # permutation as random.seed(0) + random.shuffle(), but neither
        # reorders the caller's list nor resets the module-level RNG.
        docs = list(templates[company])
        random.Random(0).shuffle(docs)
        files += docs[:max_samples]
    return files

# Upper bound on the number of documents drawn from any single company.
max_samples = 10

# Draw the documents for each split, in train -> val -> test order.
train_files, val_files, test_files = (
    distribute_files(templates, group, max_samples)
    for group in (train_companies, val_companies, test_companies)
)

# Report the size of each split, then the overall number of documents.
split_sizes = [len(train_files), len(val_files), len(test_files)]
print(*split_sizes)
print(sum(split_sizes))

# Persist every split as a newline-separated list of document ids.
for list_name, file_names in (
    ("train_list.txt", train_files),
    ("val_list.txt", val_files),
    ("test_list.txt", test_files),
):
    with open(os.path.join(split_dir, list_name), "w") as f:
        f.write("\n".join(file_names))

In [59]:
# Count the tokens that occur in the OCR output of the training documents.
split_dir = "splits"
max_vocab = 512  # number of most frequent tokens kept in the vocabulary
vocabs = Counter()
words = []

with open(os.path.join(split_dir, "train_list.txt"), "r") as f:
    train_files = f.read().splitlines()

# Distinct names for the document id and the file handle: the handle no
# longer rebinds the loop variable.
for file_name in train_files:
    with open(os.path.join("ocr_results", "%s.txt" % file_name), "r") as ocr_file:
        lines = ocr_file.read().splitlines()
    for line in lines:
        # Everything after the position reported by find_nth(line, ",", 8) is
        # treated as the text field. Presumably that is the 8th comma, i.e.
        # eight leading comma-separated fields precede the text -- TODO
        # confirm against src.dataio.utils.
        sep = find_nth(line, ",", 8)
        word = get_word_from_text(line[sep+1:]).lower()
        # All numeric tokens share a single "[NUMBER]" vocabulary entry.
        words.append("[NUMBER]" if is_number(word) else word)
vocabs.update(words)

In [60]:
# Keep the max_vocab most frequent tokens (the constant set next to the
# counting code, instead of a second hard-coded 512) and reserve the first
# two vocabulary slots for the padding and unknown-word markers.
words = [word for word, _ in vocabs.most_common(max_vocab)]
words = ["[PAD]", "[UNK]"] + words
# NOTE(review): written with the platform default encoding; confirm that
# whatever reads vocab.txt uses the same one if tokens can be non-ASCII.
with open(os.path.join(split_dir, "vocab.txt"), "w") as f:
    f.write("\n".join(words))

In [15]:
# Load the Stanford GloVe vectors, extend them with the special tokens used
# by the vocabulary ("[PAD]", "[UNK]", "[NUMBER]") and save the result.
size = 300
glove_input_file = 'word_embedding/glove.6B/glove.6B.%dd.txt'%size
word2vec_output_file = 'word_embedding/glove.6B.%dd.txt.word2vec'%size
# Convert the GloVe text file to word2vec text format so that
# load_word2vec_format can read it.
glove2word2vec(glove_input_file, word2vec_output_file)

model = KeyedVectors.load_word2vec_format(word2vec_output_file, binary=False)

# "[NUMBER]" is embedded as the mean of the vectors of every GloVe entry that
# is_number() accepts.
# NOTE(review): `model.vocab` and `model.add` are the gensim 3.x API; gensim 4
# is expected to rename them (`key_to_index`, `add_vectors`) -- confirm
# against the migration guide before upgrading.
numberVectors = {word: model[word] for word in model.vocab.keys() if is_number(word)}
numberVector = np.mean(list(numberVectors.values()), axis=0)

# Single definition of the special-token embeddings; padding and unknown
# words get all-zero vectors.
added_embeddings = {"[PAD]": np.zeros((size), dtype=np.float32),
                   "[UNK]": np.zeros((size), dtype=np.float32),
                   "[NUMBER]": numberVector}
# Explicit list so the insertion order of the new tokens is fixed.
added_words = ["[PAD]", "[UNK]", "[NUMBER]"]
added_vectors = [added_embeddings[word] for word in added_words]
model.add(added_words, added_vectors)
# `model` already is the KeyedVectors object, so save it directly instead of
# going through the deprecated `.wv` self-alias.
model.save("splits/w2v-%d"%size)

