# Prepares the dataset for pre-training. This includes cleaning the dataset, creating the vocabulary, and defining the encoder and decoder functions.

In [1]:
import os, time, math, pickle
import numpy as np, pandas as pd

In [2]:
# Directory prefix for the input CSV and for every file this notebook writes.
DATA_DIR = "data/"
# Seed passed as random_state when sampling the training split.
seed = 999

Cleaning the data

In [3]:
# Load the raw table and discard every row that has a missing value in any column.
# NOTE(review): read_csv's default NA parsing turns strings such as "nan" or
# "null" into missing values, so names spelled that way would be dropped here;
# confirm that is acceptable.
df = pd.read_csv(DATA_DIR + "dataset.csv").dropna()

# Record the character count of each name in a "length" column.
df = df.assign(length=df["name"].map(len))

# Keep first names whose length is between 3 and 14 characters inclusive.
is_firstname = df["nametype"] == "firstname"
short_enough = df["length"] < 15
long_enough = df["length"] > 2
df = df[is_firstname & short_enough & long_enough]

names = list(df["name"])
print("length of dataset in characters =", sum(map(len, names)))

length of dataset in characters = 1579033


Vocabulary and encoding/decoding

In [4]:
# Reserved tokens appended after the data characters. Their individual
# meanings are not defined in this notebook.
SPECIAL_TOKENS = ["*", "{", "}", "=", "S", "E", "G", "M", "F", "C", "I", "O"]

# Every distinct character that occurs in the cleaned names, in sorted order.
data_chars = sorted(set("".join(names)))

# A reserved token that also occurs in the names would be listed twice in
# `chars`, inflating vocab_size and leaving stoi/itos no longer inverse to
# each other, so fail loudly instead of building a corrupt vocabulary.
collisions = sorted(set(data_chars) & set(SPECIAL_TOKENS))
assert not collisions, f"special tokens also occur in the names: {collisions}"

chars = data_chars + SPECIAL_TOKENS
vocab_size = len(chars)
print("all the unique characters:", "".join(chars))
print(f"vocab size: {vocab_size:,}")

# Lookup tables: character -> integer id, and integer id -> character.
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Map each character of ``s`` to its integer id using ``stoi``.

    Returns a list of ids in the same order as the characters of ``s``.
    Raises ``KeyError`` if a character is not a key of ``stoi``.
    """
    ids = []
    for ch in s:
        ids.append(stoi[ch])
    return ids


def decode(l):
    """Turn an iterable of integer ids back into a string using ``itos``.

    Raises ``KeyError`` if an id is not a key of ``itos``.
    """
    return "".join(itos[idx] for idx in l)

all the unique characters: abcdefghijklmnopqrstuvwxyz*{}=SEGMFCIO
vocab size: 38


Splitting the data into training and validation sets

In [5]:
# Draw 80% of the cleaned rows for training; the rows not drawn form the
# validation set.
train_df = df.sample(frac=0.8, random_state=seed)
val_df = df.drop(train_df.index)
# Shuffle the validation rows with the same seed so that the row order written
# to disk is identical on every run (an unseeded shuffle changes each time).
val_df = val_df.sample(frac=1, random_state=seed)
# Both files are written as CSV text even though they carry a ".bin" extension.
train_df.to_csv(DATA_DIR + "train.bin", index=False)
val_df.to_csv(DATA_DIR + "val.bin", index=False)
print("training names:", len(train_df), "validation names:", len(val_df))

training names: 208306 validation names: 52077


Saving the metadata

In [6]:
# Bundle the vocabulary size and both lookup tables, then pickle the bundle to
# meta.pkl inside DATA_DIR.
meta = dict(vocab_size=vocab_size, itos=itos, stoi=stoi)
with open(DATA_DIR + "meta.pkl", "wb") as f:
    pickle.dump(meta, file=f)

In [7]:
# Display the id -> character table for a visual check.
itos

{0: 'a',
 1: 'b',
 2: 'c',
 3: 'd',
 4: 'e',
 5: 'f',
 6: 'g',
 7: 'h',
 8: 'i',
 9: 'j',
 10: 'k',
 11: 'l',
 12: 'm',
 13: 'n',
 14: 'o',
 15: 'p',
 16: 'q',
 17: 'r',
 18: 's',
 19: 't',
 20: 'u',
 21: 'v',
 22: 'w',
 23: 'x',
 24: 'y',
 25: 'z',
 26: '*',
 27: '{',
 28: '}',
 29: '=',
 30: 'S',
 31: 'E',
 32: 'G',
 33: 'M',
 34: 'F',
 35: 'C',
 36: 'I',
 37: 'O'}

In [8]:
# Display the character -> id table for a visual check.
stoi

{'a': 0,
 'b': 1,
 'c': 2,
 'd': 3,
 'e': 4,
 'f': 5,
 'g': 6,
 'h': 7,
 'i': 8,
 'j': 9,
 'k': 10,
 'l': 11,
 'm': 12,
 'n': 13,
 'o': 14,
 'p': 15,
 'q': 16,
 'r': 17,
 's': 18,
 't': 19,
 'u': 20,
 'v': 21,
 'w': 22,
 'x': 23,
 'y': 24,
 'z': 25,
 '*': 26,
 '{': 27,
 '}': 28,
 '=': 29,
 'S': 30,
 'E': 31,
 'G': 32,
 'M': 33,
 'F': 34,
 'C': 35,
 'I': 36,
 'O': 37}