## Additional preprocessing for LSTM, which uses GloVe instead of BERT embeddings

In [4]:
# import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import pickle
import re
import spacy
import warnings

warnings.filterwarnings("ignore")

In [7]:
# Load the "en_core_web_sm" spaCy pipeline with the parser and NER components
# disabled; preprocess() only reads each token's lemma_.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [8]:
def preprocess(rev):
    """extra (but minimal) preprocessing for the LSTM baseline,
    which uses GloVe vectors rather than BERT embeddings

    The text is run through the module-level spaCy pipeline ``nlp``,
    every token is replaced by its lemma, and the result is lower-cased
    and reduced to runs of the letters a-z separated by single spaces.

    Args:
        rev: a ``(text, label)`` pair.

    Returns:
        A ``(text, label)`` tuple holding the cleaned text; the label is
        passed through untouched.
    """
    text, label = rev
    doc = nlp(text)
    text = " ".join([word.lemma_ for word in doc]).lower()
    # Raw strings: "\s" in a plain literal is an invalid escape sequence.
    # Anything that is not a-z (digits, punctuation, whitespace) becomes a
    # single space.
    text = re.sub(r"[^a-z]+", " ", text)
    # Defensive: collapse any remaining whitespace run to one space.
    text = re.sub(r"\s+", " ", text)
    return (text, label)


def preprocess_corpus(corpus):
    """apply preprocess to every review, in order,
    and return the results as a new list"""
    cleaned = []
    for review in corpus:
        cleaned.append(preprocess(review))
    return cleaned


def make_corpus(df):
    """pair each entry of the "text" column with the
    matching entry of the "labels" column, then hand
    the pairs to preprocess_corpus"""
    pairs = zip(df["text"].values, df["labels"].values)
    return preprocess_corpus(list(pairs))


def check_just_ascii(key):
    """checks whether a word is non-empty and composed of
    only ascii characters; for glove dict only

    Args:
        key: the word to inspect.

    Returns:
        True when ``key`` has at least one character and every
        character's code point is below 128; False otherwise
        (including for the empty string).
    """
    # Empty keys are rejected explicitly instead of relying on a bare
    # ``except`` to swallow the ValueError raised by ``max([])``.
    if not key:
        return False
    return all(ord(c) < 128 for c in key)


def glove2dict(glove_path):
    """helper function to retrieve the GloVe embeddings

    Each non-blank line of the file is expected to be a token followed
    by its vector components, all separated by single spaces.

    Args:
        glove_path: path to the GloVe text file (read as UTF-8).

    Returns:
        dict mapping each token to a 1-D numpy array of floats.
    """
    # Imported here because the file's import cell does not bring in csv.
    import csv

    with open(glove_path, encoding="utf-8") as f:
        # QUOTE_NONE: tokens may themselves contain quote characters, so
        # quotes must not be treated specially.
        reader = csv.reader(f, delimiter=" ", quoting=csv.QUOTE_NONE)
        embed = {
            line[0]: np.array(list(map(float, line[1:])))
            for line in reader
            if line  # blank lines come back as [] and carry no token
        }
    return embed

In [9]:
# Load the train / dev / test splits, then report each frame's shape
# (one per line).
train_df = pd.read_json("train_df_final.json")
dev_df = pd.read_json("dev_df_final.json")
test_df = pd.read_json("test_df_final.json")

print(train_df.shape, dev_df.shape, test_df.shape, sep="\n")

(392000, 5)
(98000, 5)
(48419, 5)


In [11]:
# Preprocess every split for the LSTM baseline and save each one as JSON.
# The printed row counts allow a quick check that no review was dropped.
cols = ["text", "labels"]

train_corpus = pd.DataFrame(make_corpus(train_df), columns=cols)
print("train: ", train_df.shape, len(train_corpus))
train_corpus.to_json("train_df_final_LSTM.json")

dev_corpus = pd.DataFrame(make_corpus(dev_df), columns=cols)
print("dev: ", dev_df.shape, len(dev_corpus))
dev_corpus.to_json("dev_df_final_LSTM.json")

test_corpus = pd.DataFrame(make_corpus(test_df), columns=cols)
print("test: ", test_df.shape, len(test_corpus))
test_corpus.to_json("test_df_final_LSTM.json")

print("fin")

train:  (392000, 5) 392000
dev:  (98000, 5) 98000
test:  (48419, 5) 48419
fin


In [6]:
# List the entries of the "glove.42B.300d" directory; the result is only
# displayed as the cell's value, it is not stored.
os.listdir("glove.42B.300d")

In [None]:
# Build a reduced GloVe lookup keyed by letters-only (a-z) words and pickle it.
glove_path = "glove.42B.300d/glove.42B.300d.txt"
glove_base = glove2dict(glove_path)

# Keep only non-empty, pure-ASCII tokens.
glove_ascii_keys = [key for key in glove_base if check_just_ascii(key)]

# Pair every token with its letters-only form, dropping tokens that contain
# no a-z characters at all.
# NOTE(review): uppercase letters are stripped rather than lower-cased, which
# is only harmless if the vocabulary is already lowercase - confirm.
non_alpha = re.compile(r"[^a-z]+")
glove_reduced_keys = []
for key in glove_ascii_keys:
    reduced = non_alpha.sub("", key)
    if reduced:
        glove_reduced_keys.append((key, reduced))

# Several tokens can reduce to the same word (e.g. '#the' and 'the'); later
# entries overwrite earlier ones. Exact matches are sorted last so the vector
# of the clean token itself wins, rather than whichever noisy variant happens
# to sort after it (e.g. '~the' or 'the.').
glove_reduced = {
    reduced: glove_base[key]
    for key, reduced in sorted(
        glove_reduced_keys, key=lambda pair: (pair[0] == pair[1], pair[0])
    )
}

# Context manager so the output file is flushed and closed deterministically.
with open("glove_lower_lg.d", "wb") as f:
    pickle.dump(glove_reduced, f)