In [6]:
import sys
sys.path.append("../../")

import pandas as pd
import string
import re
import unicodedata
import importlib

import src.tokenizer.tokenizer as t

from tqdm import tqdm




In [51]:
# Load the raw dataset splits.
# Forward slashes are used because they resolve on Windows as well as on
# macOS/Linux, whereas backslash-separated paths only work on Windows.
RAW_DIR = "../../data/raw"

# The training split is stored as two parquet shards.
train_0 = pd.read_parquet(f"{RAW_DIR}/train-0.parquet")
train_1 = pd.read_parquet(f"{RAW_DIR}/train-1.parquet")
# ignore_index gives the combined frame a fresh 0..n-1 index instead of
# repeating each shard's own index labels.
train = pd.concat([train_0, train_1], ignore_index=True)

test = pd.read_parquet(f"{RAW_DIR}/test.parquet")

validation = pd.read_parquet(f"{RAW_DIR}/validation.parquet")

In [52]:
def clean_rows(df, random_state=None):
    """Drop unusable rows from a split and shuffle what remains.

    A row is removed when its ``text`` is missing, is empty or
    whitespace-only, or starts with ``" ="`` (header rows).

    Args:
        df: DataFrame with a ``text`` column of strings.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (the default) keeps the
            previous unseeded behaviour.

    Returns:
        A new, shuffled DataFrame with a fresh ``0..n-1`` index.
    """
    text = df['text']
    is_blank = text.str.strip() == ""
    # na=False: a missing text must produce False rather than NaN, otherwise
    # negating the mask with ~ fails on the resulting object-dtype Series.
    is_header = text.str.startswith(" =", na=False)
    kept = df[text.notna() & ~is_blank & ~is_header]
    return kept.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [53]:
# Characters that survive clean_text: ASCII letters, digits, punctuation and
# space / newline / tab. Everything else is dropped.
allowed_chars = set(string.ascii_letters + string.digits + string.punctuation + " \n\t")

def clean_text(text):
    """Normalise one raw text row into plain, conventionally spaced ASCII.

    Steps, in order:
      1. NFKC-normalise and map curly quotes to their ASCII equivalents.
      2. Drop every character that is not in ``allowed_chars`` (this also
         removes accented letters and other non-ASCII symbols).
      3. Collapse the tokenised joiners `` @-@ ``, `` @,@ `` and `` @.@ ``
         back to ``-``, ``,`` and ``.``.
      4. Remove whitespace before closing punctuation and after opening
         brackets, and make sure sentence punctuation is followed by a space.
      5. Unescape ``\\'`` and strip surrounding whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    text = unicodedata.normalize("NFKC", text)
    text = text.replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'")
    text = "".join(c for c in text if c in allowed_chars)
    text = text.replace(" @-@ ", "-")
    text = text.replace(" @,@ ", ",")
    # Decimal points are tokenised the same way as the two joiners above.
    text = text.replace(" @.@ ", ".")
    text = re.sub(r"\s+([.,!?;':)\]])", r"\1", text)
    # Insert a space after punctuation that is glued to the next character,
    # except for '.' or ',' sitting between two digits: otherwise numbers that
    # were just re-joined ("1,000", "3.14") would be split apart again.
    text = re.sub(r"([.,!?;:])(?!(?<=\d[.,])\d)(\S)", r"\1 \2", text)
    # '[' is escaped inside the set to avoid the "possible nested set" warning;
    # the pattern still matches '[' or '(' followed by whitespace.
    text = re.sub(r"([\[(])\s+", r"\1", text)
    text = text.replace(r"\'", "'")
    return text.strip()

In [72]:
def get_bin(text, bin_size=16):
    """Return the length bin a text falls into.

    Args:
        text: Any sized object; its ``len`` is what gets binned.
        bin_size: Number of length units per bin. Defaults to 16, the value
            that was previously hard-coded.

    Returns:
        ``len(text) // bin_size`` as an int (0 for the shortest texts).
    """
    return len(text) // bin_size

def remove_short_rows(df, min_bin=4):
    """Keep only the rows whose ``bin`` value is at least ``min_bin``.

    Args:
        df: DataFrame with a numeric ``bin`` column.
        min_bin: Smallest bin to keep. Defaults to 4, the value that was
            previously hard-coded.

    Returns:
        The filtered DataFrame; the original index labels are preserved.
    """
    return df[df['bin'] >= min_bin]

In [None]:
def _clean_split(split):
    """Filter and shuffle one split with clean_rows, then clean each text with clean_text."""
    cleaned = clean_rows(split)
    cleaned['text'] = cleaned['text'].apply(clean_text)
    return cleaned


# Same order as before (train, validation, test) so the shuffles consume the
# random state in the same sequence.
train = _clean_split(train)
validation = _clean_split(validation)
test = _clean_split(test)

In [73]:
# Tag every row of each split with its length bin.
for split in (train, validation, test):
    split['bin'] = split['text'].apply(get_bin)

# Drop the rows that fall below the minimum bin in each split.
train, validation, test = map(remove_short_rows, (train, validation, test))

In [106]:
# Persist the cleaned splits as parquet.
# Forward slashes resolve on Windows as well as on macOS/Linux, whereas
# backslash-separated paths only work on Windows.
for name, split in (("train", train), ("validation", validation), ("test", test)):
    split.to_parquet(f"../../data/clean/{name}.parquet")

In [4]:
# Reload the cleaned splits from disk.
# Forward slashes resolve on Windows as well as on macOS/Linux, whereas
# backslash-separated paths only work on Windows.
train = pd.read_parquet("../../data/clean/train.parquet")
validation = pd.read_parquet("../../data/clean/validation.parquet")
test = pd.read_parquet("../../data/clean/test.parquet")

In [5]:
# Export the cleaned splits as CSV (the DataFrame index is written as the
# first column, as before).
# Forward slashes resolve on Windows as well as on macOS/Linux, whereas
# backslash-separated paths only work on Windows.
for name, split in (("train", train), ("test", test), ("validation", validation)):
    split.to_csv(f"../../data/clean/{name}.csv")

In [6]:
# Re-execute the tokenizer module (imported above as `t`) so that edits made to
# its source file are picked up without restarting the kernel.
importlib.reload(t)

def tokenize(text, tokenizer):
    """Encode ``text`` with ``tokenizer`` and return whatever ``tokenizer.encode`` yields."""
    return tokenizer.encode(text)

# Build the tokenizer from its saved vocabulary and merge table.
# Forward slashes resolve on Windows as well as on macOS/Linux, whereas
# backslash-separated paths only work on Windows.
tokenizer = t.Tokenizer(
    token_to_id_path="../tokenizer/token_to_id.json",
    merges_path="../tokenizer/merges.json",
)

# The splits themselves are tokenized in the cells below, one split per cell,
# with a tqdm progress bar (encoding is slow).









Tokenizer loaded with 21779 tokens


In [8]:
# Encode every validation text, showing progress, then swap the raw strings
# for their token sequences.
tokenized_val = [
    tokenize(row_text, tokenizer)
    for row_text in tqdm(validation['text'], desc="Tokenizing validation")
]
validation['text'] = tokenized_val

Tokenizing validation: 100%|██████████| 1687/1687 [14:14<00:00,  1.97it/s]


In [9]:
# Encode every test text, showing progress, then swap the raw strings for
# their token sequences.
tokenized_test = [
    tokenize(row_text, tokenizer)
    for row_text in tqdm(test['text'], desc="Tokenizing test")
]
test['text'] = tokenized_test

Tokenizing test: 100%|██████████| 1880/1880 [16:41<00:00,  1.88it/s]


In [None]:
# Encode every training text, showing progress, then swap the raw strings for
# their token sequences. Requires the cell defining `tokenize` and `tokenizer`
# to have been run in the current kernel.
tokenized_train = [
    tokenize(row_text, tokenizer)
    for row_text in tqdm(train['text'], desc="Tokenizing train")
]
train['text'] = tokenized_train

Tokenizing train:   0%|          | 0/770634 [00:00<?, ?it/s]


NameError: name 'tokenize' is not defined