In [18]:
import requests
import hashlib
import tarfile
import re

from pathlib import Path
from tqdm.notebook import tqdm

## Download and Extract the Data

The dataset is downloaded from the [Data Asset Exchange](https://developer.ibm.com/exchanges/data/) content delivery network, and the tarball is then extracted.

In [8]:
# Download the dataset
fname = 'wikitext-103.tar.gz'
output_dir = './wikidata/'
url = 'https://dax-cdn.cdn.appdomain.cloud/dax-wikitext-103/1.0.1/' + fname
chunk_len = 8192

# Create the target directory up front; open() below fails if it is missing.
Path(output_dir).mkdir(parents=True, exist_ok=True)

with requests.get(url, stream=True) as r:
    # Fail fast on HTTP errors instead of saving an error page as the archive.
    r.raise_for_status()
    # Content-Length is optional; without it tqdm shows a plain counter.
    content_length = r.headers.get('Content-Length')
    # Ceiling division so the final, partial chunk is counted as well.
    n_chunks = -(-int(content_length) // chunk_len) if content_length else None
    with open(output_dir + fname, 'wb') as f:
        for chunk in tqdm(r.iter_content(chunk_size=chunk_len), total=n_chunks):
            # Skip empty chunks so nothing spurious is written.
            if chunk:
                f.write(chunk)

HBox(children=(FloatProgress(value=0.0, max=23144.0), HTML(value='')))




In [10]:
# Verify the file was downloaded properly by comparing sha512 checksums
sha512sum = 'c8186919aa1840af6b734ea41abc580574ea8efe2fafda220f5d01002464d17566d84be5199b875136c9593f0e0678fb5d7c84bb2231de8b4151cb9c83fa2109'

# Hash the archive in 1 MiB blocks so the whole tarball never sits in memory.
sha512_hasher = hashlib.sha512()
with open(output_dir + fname, 'rb') as archive:
    for block in iter(lambda: archive.read(1 << 20), b''):
        sha512_hasher.update(block)
sha512sum_computed = sha512_hasher.hexdigest()

# Stop a "Run All" here on a corrupt download instead of extracting it anyway.
assert sha512sum == sha512sum_computed, \
    f"Checksum mismatch for {output_dir + fname}: got {sha512sum_computed}"
sha512sum == sha512sum_computed

True

In [13]:
# Extract the dataset (into the current working directory)
with tarfile.open(output_dir + fname) as tar:
    # Use the 'data' extraction filter where the running Python provides it:
    # it rejects members that would escape the destination (absolute paths,
    # "..", links pointing outside). Older interpreters fall back to the
    # unfiltered call.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        tar.extractall()

## Read the Data
Let's read our data into Python lists.

In [137]:
# Scratch check of a whitespace regex.
# NOTE(review): `re.match` only matches at the very start of the string, and
# "r " starts with a non-whitespace character, so this condition should be
# falsy as written. The recorded "Wow!!" output presumably comes from an
# earlier edit of this cell — confirm by re-running.
if re.match(r"[\s\n\t]+", "r "):
    print("Wow!!")

Wow!!


In [159]:
import os
import re
import csv

from multiprocessing import Pool
from tqdm.notebook import tqdm
from pathlib import Path
import pickle

def is_empty(elm):
    """Tell whether ``elm`` carries no text.

    Args:
        elm: A string, or ``None``.

    Returns:
        ``True`` if ``elm`` is ``None``, the empty string, or consists only of
        whitespace characters; ``False`` otherwise.
    """
    # None has to be ruled out before any regex call, which would raise on it.
    if elm is None:
        return True
    # fullmatch (not match): text that merely *starts* with whitespace but has
    # content after it must not be reported as empty.
    return elm == "" or re.fullmatch(r"[\s\n\t]+", elm) is not None

class Wikitext103Handler():
    """Load the WikiText-103 token files and expose them as sentence lists.

    Each split is stored as a dict with three keys:

    * ``headings`` - list of top-level heading strings,
    * ``articles`` - list of sentence lists, one list per heading,
    * ``type``     - ``"train"``, ``"valid"`` or ``"test"``.

    ``self.data`` points at the currently active split (train by default) and
    ``self.return_headings`` controls what ``handler[idx]`` returns.
    """

    def __init__(self, wiki103_dir):
        """Read and preprocess the three token files found in ``wiki103_dir``.

        Args:
            wiki103_dir: Directory containing ``wiki.train.tokens``,
                ``wiki.valid.tokens`` and ``wiki.test.tokens``.

        Raises:
            AssertionError: If the directory or one of the files is missing.
        """
        assert os.path.isdir(wiki103_dir), "The specified directory was not found!"
        for label, file_name in (
            ("Train", "wiki.train.tokens"),
            ("Valid", "wiki.valid.tokens"),
            ("Test", "wiki.test.tokens"),
        ):
            assert os.path.isfile(os.path.join(wiki103_dir, file_name)), \
                f"Wikitext103 {label} Tokens file Was Not Found in the path {wiki103_dir}!"

        # Reading data from disk
        train_data = self._read_tokens(wiki103_dir, "wiki.train.tokens")
        print("Train data was loaded successfuly! ...")
        valid_data = self._read_tokens(wiki103_dir, "wiki.valid.tokens")
        print("Validation data was loaded successfuly! ...")
        test_data = self._read_tokens(wiki103_dir, "wiki.test.tokens")
        print("Test data was loaded successfuly! ...")

        # Preprocessing data
        ## Split out train headings and articles
        self.train_data = self._preprocess_data(train_data)
        self.train_data.update(dict(type='train'))
        ## Split out valid headings and articles
        self.valid_data = self._preprocess_data(valid_data)
        self.valid_data.update(dict(type="valid"))
        ## Split out test headings and articles
        self.test_data = self._preprocess_data(test_data)
        self.test_data.update(dict(type="test"))

        # Set mode to "training" by default
        self.data = self.train_data

        # Returns headings by default
        self.return_headings = True

    @staticmethod
    def _read_tokens(wiki103_dir, file_name):
        """Return the text of one token file with a blank-line prefix added."""
        # Explicit UTF-8: the corpus contains non-ASCII characters, and the
        # platform default encoding is not guaranteed to be UTF-8.
        text = Path(os.path.join(wiki103_dir, file_name)).read_text(encoding="utf-8")
        # The heading pattern requires two " \n" lines before " = Title = ";
        # the prefix presumably lets a heading at the very top of the file
        # match as well (verify against the raw files).
        return " \n" + text

    def _preprocess_data(self, data):
        """Split raw text into top-level headings and per-article sentence lists.

        Args:
            data: Full text of one split.

        Returns:
            Dict with ``headings`` (list of str) and ``articles`` (list of
            sentence lists, aligned with ``headings``).
        """
        # Store regular expression pattern to search for wikipedia article headings
        heading_pattern = '( \n \n = [^=]*[^=] = \n \n )'

        # Because the pattern is a capture group, re.split returns
        # [prefix, heading, body, heading, body, ...].
        split = re.split(heading_pattern, data)
        # Drop the 7-character " \n \n = " prefix and " = \n \n " suffix.
        headings = [x[7:-7] for x in split[1::2]]
        articles = [
            str(re.sub(r"(\n [= ]+ .*[ =]+ \n)", "", x)) # Removing subheadings
            for x in split[2::2]
        ]
        articles = [
            self.split_doc_to_sent(doc) for doc in tqdm(articles)
        ]
        return dict(
            headings = headings,
            articles = articles
        )

    def __getitem__(self, idx):
        """Return article ``idx`` of the active split.

        Returns ``(sentences, heading)`` when ``return_headings`` is set,
        otherwise just ``sentences``.
        """
        if self.return_headings:
            return self.data['articles'][idx], self.data['headings'][idx]
        return self.data['articles'][idx]

    def parallel_process(self, iterator, func, ncores=None, message=None):
        """Apply ``func`` to every item of ``iterator`` in a process pool.

        Args:
            iterator: Sized collection of inputs (``len()`` is used for the
                progress bar total).
            func: Callable applied to each item.
            ncores: Number of worker processes; ``None`` lets ``Pool`` decide.
            message: Progress-bar description.

        Returns:
            List of results in input order.
        """
        # from pathos.multiprocessing import ProcessPool as Pool
        # The context manager shuts the workers down once the results have
        # been collected, so no pool is left behind.
        with Pool(ncores) as pool:
            iter_multi = tqdm(
                pool.imap(func, iterator),
                total=len(iterator),
                desc=message
            )
            return list(iter_multi)

    def split_doc_to_sent(self, doc, min_len=10):
        """Split one article body into cleaned lines.

        Args:
            doc: Article text.
            min_len: Minimum number of whitespace-separated tokens a line
                needs to be kept.

        Returns:
            List of stripped, non-empty lines with at least ``min_len`` tokens.
        """
        sentences = []
        for paragraph in re.split("\n  \n", doc):
            for line in re.split('\n', paragraph.strip()):
                line = line.strip()
                if not is_empty(line) and len(line.split()) >= min_len:
                    sentences.append(line)
        return sentences

    def set_mode(self, mode=None, return_headings=None):
        """Select the active split and/or the ``__getitem__`` return shape.

        Args:
            mode: ``"train"``, ``"eval"`` or ``"test"``; ``None`` keeps the
                current split.
            return_headings: New value for ``self.return_headings``; ``None``
                keeps the current one.

        Raises:
            TypeError: If ``mode`` is not one of the accepted values.
        """
        if mode is not None:
            if mode == 'train':
                self.data = self.train_data
            elif mode == 'eval':
                self.data = self.valid_data
            elif mode == 'test':
                self.data = self.test_data
            else:
                raise TypeError('Parameter `mode` can only get one of the following values: "train", "eval", "test".')

        if return_headings is not None:
            self.return_headings = return_headings

    def get_mode(self):
        """Return the active mode name: ``"train"``, ``"eval"`` or ``"test"``."""
        # The validation split is stored as type "valid" but exposed as "eval".
        mode = "eval" if self.data['type']=="valid" else self.data['type']
        return mode

    @staticmethod
    def load_handler(handler_path):
        """Load a handler previously written by ``save_handler``."""
        # SECURITY: unpickling executes arbitrary code; only load files you
        # created yourself or otherwise trust.
        with open(handler_path, "rb") as f:
            return pickle.load(f)

    def save_handler(self, handler_path):
        """Pickle this handler (all three preprocessed splits) to ``handler_path``."""
        with open(handler_path, "wb") as f:
            pickle.dump(self, f)

    def export_for_bert(self, dir_path, unk_token="[UNK]"):
        """Write each split as a tab-separated file of (sentence, flag) rows.

        Creates ``wikitext103_bert_train.csv``, ``wikitext103_bert_valid.csv``
        and ``wikitext103_bert_test.csv`` inside ``dir_path``.

        Args:
            dir_path: Existing directory to write into.
            unk_token: Replacement for every ``<unk>`` in the sentences.
        """
        def export(path, data, msg):
            # newline='' is what the csv module asks for, so the writer's own
            # line terminator is not translated a second time; UTF-8 keeps
            # the output independent of the platform default encoding.
            with open(path, 'w', newline='', encoding='utf-8') as f:
                writer = csv.writer(f, delimiter='\t')
                # Forward unk_token so the caller's choice is honoured.
                for sent_col in self._process_for_bert(data, unk_token=unk_token, msg=msg):
                    writer.writerow(sent_col)

        exports = (
            ("wikitext103_bert_train.csv", self.train_data,
             "Export Train Data...", "Train data export completed successfully!\n"),
            ("wikitext103_bert_valid.csv", self.valid_data,
             "Export Valid Data...", "Validation data export completed successfully!\n"),
            ("wikitext103_bert_test.csv", self.test_data,
             "Export Test Data...", "Test data export completed successfully!\n"),
        )
        for file_name, split, msg, done_msg in exports:
            export(os.path.join(dir_path, file_name), split['articles'], msg=msg)
            print(done_msg)

    def _process_for_bert(self, articles, unk_token="[UNK]", msg=None):
        """Yield ``(sentence, flag)`` for every non-empty sentence.

        ``<unk>`` is replaced by ``unk_token``. ``flag`` is 0 for the sentence
        at index 0 of an article and 1 for every later one.
        """
        for article in tqdm(articles, total=len(articles), desc=msg):
            for i, sent in enumerate(article):
                if not is_empty(sent):
                    yield (sent.replace("<unk>", unk_token), 0 if i==0 else 1)

In [160]:
# Build the handler: reads and preprocesses the train/valid/test token files
# found under ./wikitext-103 (one progress bar per split).
wikitext = Wikitext103Handler(wiki103_dir="./wikitext-103")

Train data was loaded successfuly! ...
Validation data was loaded successfuly! ...
Test data was loaded successfuly! ...


HBox(children=(FloatProgress(value=0.0, max=28471.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=60.0), HTML(value='')))




In [161]:
# Write one tab-separated file per split
# (wikitext103_bert_{train,valid,test}.csv) into ./wikitext-103.
wikitext.export_for_bert(dir_path="./wikitext-103")

HBox(children=(FloatProgress(value=0.0, description='Export Train Data...', max=28471.0, style=ProgressStyle(d…


Train data export completed successfully!



HBox(children=(FloatProgress(value=0.0, description='Export Valid Data...', max=60.0, style=ProgressStyle(desc…


Validation data export completed successfully!



HBox(children=(FloatProgress(value=0.0, description='Export Test Data...', max=60.0, style=ProgressStyle(descr…


Test data export completed successfully!



In [162]:
# Pickle the preprocessed handler so it can be restored later with
# `Wikitext103Handler.load_handler(handler_path)`.
handler_path = "./wikitext-103/wiki_handler.pkl"
# wikitext.set_mode(mode="eval", return_headings=False)
wikitext.save_handler(handler_path)

In [70]:
# wikitext = Wikitext103Handler.load_handler(handler_path)

## Prepare For BERT WordPiece Tokenizer Training

In [163]:
import pandas as pd

In [164]:
# Load the exported training split: tab-separated, no header row, so the
# columns are named 0 (sentence) and 1 (flag).
file_path = "./wikitext-103/wikitext103_bert_train.csv"
train_df = pd.read_csv(file_path, header=None, sep="\t")

# Row count with thousands separators, then a two-row preview.
print(f"{train_df.shape[0]:,}")
train_df.head(2)

794,232


Unnamed: 0,0,1
0,Senjō no Valkyria 3 : [UNK] Chronicles ( Japan...,0
1,"The game began development in 2010 , carrying ...",1


In [171]:
# Dump the sentence column, one sentence per line, as plain text.
# UTF-8 is set explicitly because the corpus contains non-ASCII characters.
with open("./wikitext-103/wikitext103_bert_train_for_tokenizer.txt", 'w', encoding='utf-8') as f:
    # Iterate the Series values directly; `Series.iteritems()` was removed in
    # pandas 2.0 and the index was never used here.
    for sentence in tqdm(train_df[0], total=len(train_df[0])):
        f.write(sentence + '\n')

HBox(children=(FloatProgress(value=0.0, max=794232.0), HTML(value='')))


