Generates a training set for IMDb pretraining
- Each row of the generated .csv is a review, with each word represented by its BERT embedding.

In [1]:
from __future__ import division, print_function, absolute_import

import tensorflow.compat.v1 as tf
import tflearn
from tflearn.layers.core import input_data, dropout, fully_connected
from tflearn.layers.conv import conv_1d, global_max_pool
from tflearn.layers.merge_ops import merge
from tflearn.layers.estimator import regression
from tflearn.data_utils import to_categorical, pad_sequences
from tflearn.datasets import imdb

import os
from transformers import BertTokenizer
import pandas as pd
import numpy as np



Instructions for updating:
non-resource variables are not supported in the long term


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [3]:
# Read the word -> embedding table and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier export — confirm).
embeddings_csv = 'BERT_embeddings.csv'
bert_embeddings = (
    pd.read_csv(embeddings_csv, index_col=False)
    .drop(columns=['Unnamed: 0'])
)


In [4]:
# Try the tokenizer on a single review: read one file, tokenize it and keep
# only the first 100 tokens.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
sample_review_path = '/Users/kalliehuynh/compound-word-embeddings/data_processing/aclImdb/test/neg/3_4.txt'
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default encoding (assumes the review files are UTF-8 — confirm).
with open(sample_review_path, 'r', encoding='utf-8') as file:
    file_str = file.read()
# Tokenizing does not need the open handle, so it runs after the file is closed.
tokens = tokenizer.tokenize(file_str)
tokens = tokens[:100]

In [5]:
def review_array(tokens, embeddings=None, max_tokens=100):
    """Look up the embedding of each token and return a fixed-length nested list.

    Only the first ``max_tokens`` tokens are considered. Tokens that have no
    row in the embedding table are skipped (they are not replaced by zeros),
    so the vectors of the remaining tokens are packed at the front and all
    zero padding sits at the end.

    Args:
        tokens (list): A list of token strings.
        embeddings (pandas.DataFrame, optional): Lookup table with a 'word'
            column; every column after the first is treated as an embedding
            component (assumes 'word' is the first column — confirm).
            Defaults to the notebook-level ``bert_embeddings``.
        max_tokens (int, optional): Number of token slots in the result.
            Defaults to 100.

    Returns:
        list: Nested list of shape (max_tokens, 1, embedding_dim), where
        embedding_dim is the number of table columns after the first. The
        length-1 middle axis exists because each embedding is taken as a
        one-row 2-D slice of the table; it is kept so existing callers
        receive the same layout as before.
    """
    table = bert_embeddings if embeddings is None else embeddings
    # Mirrors the `.iloc[:, 1:]` slice used below: all columns but the first.
    embedding_dim = table.shape[1] - 1

    vectors = []
    for token in tokens[:max_tokens]:
        # Filter the table once per token instead of once for the emptiness
        # check and a second time for the lookup.
        matches = table.loc[table['word'] == token]
        if matches.empty:
            continue
        # Keep only the first matching row so that a word listed more than
        # once cannot produce a ragged (unstackable) set of arrays.
        vectors.append(matches.iloc[:1, 1:].to_numpy())

    if not vectors:
        # No token had an embedding: np.array([]) would be 1-D and could not
        # be padded along three axes, so build the all-zero result directly.
        return np.zeros((max_tokens, 1, embedding_dim)).tolist()

    stacked = np.array(vectors)
    missing = max_tokens - len(vectors)
    padded = np.pad(stacked, [(0, missing), (0, 0), (0, 0)], 'constant')
    return padded.tolist()
            

In [6]:
# Convert every review file under the negative test folder into its padded
# embedding list (one review_array result per file).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
test_neg_dir = '/Users/kalliehuynh/compound-word-embeddings/data_processing/aclImdb/test/neg'
test_neg = []
for root_dir_path, sub_dirs, files in os.walk(test_neg_dir):
    for file in files:
        # os.path.join builds the path portably; the explicit UTF-8 decoding
        # keeps the text independent of the platform's default encoding
        # (assumes the review files are UTF-8 — confirm).
        with open(os.path.join(root_dir_path, file), encoding='utf-8') as f:
            review_text = f.read()
        test_neg.append(review_array(tokenizer.tokenize(review_text)))

In [7]:
# Sanity-check the nesting of test_neg: reviews -> token slots -> per-slot entry.
print('Number of reviews:', len(test_neg))
first_review = test_neg[0]
print('Number of words per review:', len(first_review))
# NOTE(review): this is the length of one slot's entry; the recorded run printed
# 1 here, i.e. each slot wraps its vector in a length-1 list rather than
# exposing the embedding components directly.
print('Length of each word vector:', len(first_review[0]))

Number of reviews: 12500
Number of words per review: 100
Length of each word vector: 1


In [8]:
# One column label per token slot: 'word_00' through 'word_99'.
array_labels = []
for position in range(100):
    array_labels.append('word_%02d' % position)

In [9]:
# Build the output frame with one review per row: row i receives the entries
# of test_neg[i], spread across the array_labels columns.
# NOTE(review): each .loc assignment targets a row label that does not exist
# yet, so the frame is enlarged on every iteration; this is presumably slow for
# thousands of rows. Each cell presumably ends up holding one nested list per
# token slot — confirm before replacing this with a single bulk construction.
test_data = pd.DataFrame(columns=[*array_labels])
for i in range(len(test_neg)):
    test_data.loc[i, array_labels] = test_neg[i]

In [10]:
# Label column: one 0 per row of test_data (the rows were built from the
# negative-review list above).
negative_labels = [0 for _ in range(len(test_data))]
test_data['positive'] = negative_labels

In [11]:
# Write the labelled reviews to disk. No index option is passed, so pandas'
# default index handling applies to the written file.
output_csv = 'test_neg.csv'
test_data.to_csv(output_csv)