In [25]:
!pip install transformers



In [18]:
from pathlib import Path

# Directory holding both the input CSV and the generated sample file.
DATA_DIR = Path('data')
# Input: CSV of reviews to sample from.
IN_PATH = DATA_DIR.joinpath('amazon-reviews-balanced.csv')
# Output: text file with the preprocessed sample.
OUT_PATH = DATA_DIR.joinpath('amazon-reviews-sample.txt')
# Seed for the random sampling, so the sample is reproducible.
SEED = 42
# How many reviews to sample (random_sample also accepts a fraction such as 0.1).
SAMPLE_SIZE = 1000

In [19]:
def count_lines(path):
    """Return how many lines the text file at ``path`` contains.

    The file is decoded as UTF-8; bytes that cannot be decoded are ignored
    rather than raising an error.
    """
    total = 0
    with open(path, mode='r', encoding='utf8', errors='ignore') as handle:
        for _ in handle:
            total += 1
    return total
    
# Number of physical lines in the input file; used below as the population
# size from which row indices are sampled.
lines_count = count_lines(path=IN_PATH)
print(lines_count)

1067020


In [21]:
import random
import math

def random_sample(arr, sample_size, seed):
    """Draw a reproducible random sample, without replacement, from a sequence.

    Args:
        arr: Sequence to sample from (e.g. a ``range`` of row indices).
        sample_size: An ``int`` is the absolute number of items to draw.
            A ``float`` in ``[0, 1]`` is the fraction of ``len(arr)`` to
            draw, rounded down.
        seed: Value passed to ``random.seed``. Note that this reseeds the
            module-level random generator.

    Returns:
        A ``set`` containing the sampled items.

    Raises:
        ValueError: If a fractional ``sample_size`` lies outside ``[0, 1]``,
            or (raised by ``random.sample``) if the resulting count is
            negative or larger than ``len(arr)``.
    """
    random.seed(seed)
    # Only floats are treated as fractions. The former `sample_size <= 1`
    # check turned a request for exactly one item (the int 1) into a sample
    # of the whole population.
    if isinstance(sample_size, float):
        if not 0.0 <= sample_size <= 1.0:
            raise ValueError(
                "fractional sample_size must be within [0, 1], got %r" % (sample_size,)
            )
        sample_size = math.floor(sample_size * len(arr))
    return set(random.sample(arr, sample_size))

# Pick the row indices that will make up the sample.
idx_sample = random_sample(
    arr=range(lines_count),
    sample_size=SAMPLE_SIZE,
    seed=SEED,
)
# Show the sample size and a few of the chosen indices as a sanity check.
print(len(idx_sample), list(idx_sample)[:5])

1000 [40961, 806915, 28677, 233478, 227336]


In [22]:
import csv
from tqdm.auto import tqdm

def preprocess_data(in_path, out_path, idx_sample):
    """Write the sampled CSV rows to ``out_path`` in the text format used for GPT-2.

    Every selected row is written as
    ``|<product category>|<star rating>|<review body><|endoftext|>`` followed
    by a newline.

    Args:
        in_path: Path of the input CSV file (UTF-8, comma-delimited). Each
            selected row must have exactly three columns: product category,
            review body, star rating.
        out_path: Path of the output text file; it is overwritten if it
            already exists.
        idx_sample: Collection of zero-based row indices to keep. The indices
            refer to records as parsed by ``csv.reader``, which can differ
            from physical line numbers when a quoted field contains line
            breaks.

    Raises:
        ValueError: If a selected row does not have exactly three columns.
    """
    wanted = set(idx_sample)  # constant-time membership test for any input collection
    remaining = len(wanted)

    # newline='' lets the csv module interpret line endings itself, as its
    # documentation requires. Mode 'w' truncates an existing output file, so no
    # separate delete step is needed. The progress bar is a context manager so
    # it is always closed, even if a row fails to parse.
    with open(in_path, 'r', encoding='utf8', newline='') as inf, \
            open(out_path, 'w', encoding='utf8') as outf, \
            tqdm(total=remaining) as pbar:
        reader = csv.reader(inf, delimiter=",")
        for i, row in enumerate(reader):
            if remaining == 0:
                # Every requested row has been written; skip the rest of the file.
                break
            if i not in wanted:
                continue
            product_category, review_body, star_rating = row
            # Wrap product category and star rating into special symbols
            # for the GPT-2 model to distinguish them from the body text
            text = '|' + product_category.replace('_', ' ')  # otherwise '_' is treated as token
            text += '|' + str(star_rating)
            text += '|' + review_body
            text += "<|endoftext|>"  # use a special token to mark the end of the review
            text += "\n"  # for convenience and being able to do fp.readline() on text file
            outf.write(text)
            pbar.update()
            remaining -= 1
                    
# Build the sample file from the rows selected above.
preprocess_data(in_path=IN_PATH, out_path=OUT_PATH, idx_sample=idx_sample)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [26]:
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer published under the pretrained name 'gpt2'.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [28]:
# Read only the first preprocessed review from the sample file.
with open(OUT_PATH, 'r', encoding='utf8') as f:
    row = f.readline()

# Show the raw line and how the tokenizer splits it.
print(row)
print(tokenizer.tokenize(row))

|Digital Software|2|Product advertises Blu-Ray support but then after you buy it there is no Blu-Ray support.  You can however buy a $20 plugin for Blu-Ray support but that is an additional cost and the way the product description reads there is no mention the need to buy this extra plugin.<br /><br />Also there appears to be no telephone support.  If you have any issues with getting it to work or issues with anything including billing, etc. you have to go through there web page support which doesn't appear to have anyone on the other end and no one returns my inquiries.<br /><br />Product does work on the Mac however.  Not as good as Nero but only solution for the Mac.  I did buy the plugin and Blu-rays burn fine though I was billed twice for the plugin.<br /><br />I wish there was an alternative seeing as how it's so difficult to contact someone if you need support.<|endoftext|>

['|', 'Digital', 'ĠSoftware', '|', '2', '|', 'Product', 'Ġadvert', 'ises', 'ĠBlu', '-', 'Ray', 'Ġsupport'

In [30]:
!python run_lm_finetuning.py \
    --output_dir=output \
    --model_type=gpt2 \
    --model_name_or_path=gpt2 \
    --do_train \
    --train_data_file=$OUT_PATH

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
100%|███████████████████████████████████████| 176/176 [00:00<00:00, 45976.43B/s]
100%|█████████████████████████| 548118077/548118077 [00:57<00:00, 9476142.47B/s]
Epoch:   0%|                                              | 0/1 [00:00<?, ?it/s]
Iteration:   0%|                                         | 0/16 [00:00<?, ?it/s][A^C
