## This is the notebook used to create a line-by-line dataset to pre-train language models for the Feedback Prize competition.

See pre-training notebook here: https://www.kaggle.com/nbroad/feedback-prize-pre-training-pt-gpu  
Dataset available here: https://www.kaggle.com/nbroad/feedback-prize-linebyline-text-dataset

In [None]:
import re
from pathlib import Path

from tqdm.auto import tqdm


# Simple sentence splitting function

# Regex building blocks for the rule-based sentence splitter below.
# Raw strings are used throughout so that escapes such as ``\s`` reach the
# regex engine unchanged instead of being treated as (invalid) string escapes.
alphabets = r"([A-Za-z])"
prefixes = re.compile(r"(Mr|St|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|Mt)[.]")
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = re.compile(r"[.](co|net|org|io|gov|edu|us)")
etal = re.compile(r"(\bet al)[.]")
urls = re.compile(r"(www)[.]")
digits = re.compile(r"[.]([0-9])")


def split_into_sentences(text):
    """Split *text* into a list of sentences using simple regex rules.

    Periods that do not end a sentence (titles, initials, acronyms, company
    suffixes, URLs, decimals, "et al.", "Ph.D.") are temporarily replaced by
    the placeholder ``<prd>``. Every remaining ``.``, ``?`` and ``!`` is then
    followed by a ``<stop>`` marker, the placeholders are turned back into
    periods, and the text is split on ``<stop>``.

    Args:
        text: The text to split. Newlines are treated as spaces.

    Returns:
        A list of sentences with surrounding whitespace removed. Empty
        fragments (e.g. the remainder after the final punctuation mark) are
        omitted, so an empty or whitespace-only input yields ``[]``.
    """
    # Pad so the patterns that require a neighbouring space also match at
    # the very start and end of the text.
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # --- Protect periods that are not sentence boundaries -----------------
    text = prefixes.sub(r"\1<prd>", text)       # "Dr." -> "Dr<prd>"
    text = websites.sub(r"<prd>\1", text)       # ".org" -> "<prd>org"
    text = urls.sub(r"\1<prd>", text)           # "www." -> "www<prd>"
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    # Single-letter initial followed by a space: " J. " -> " J<prd> "
    text = re.sub(r"\s" + alphabets + "[.] ", r" \1<prd> ", text)
    # An acronym directly before a typical sentence starter does end a sentence.
    text = re.sub(acronyms + " " + starters, r"\1<stop> \2", text)
    # Dotted abbreviations of three, then two, letters ("U.S.A.", "p.m.").
    text = re.sub(
        alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]",
        r"\1<prd>\2<prd>\3<prd>",
        text,
    )
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", r"\1<prd>\2<prd>", text)
    # Company suffix before a sentence starter ends the sentence; otherwise
    # its period is just part of the abbreviation.
    text = re.sub(" " + suffixes + "[.] " + starters, r" \1<stop> \2", text)
    text = re.sub(" " + suffixes + "[.]", r" \1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", r" \1<prd>", text)
    text = etal.sub(r"\1<prd>", text)
    text = digits.sub(r"<prd>\1", text)         # decimals: "3.5" -> "3<prd>5"

    # --- Move terminal punctuation outside closing quotes -----------------
    # so the closing quote stays attached to the sentence it belongs to.
    text = text.replace(".”", "”.")
    text = text.replace('."', '".')
    text = text.replace('!"', '"!')
    text = text.replace('?"', '"?')

    # --- Mark real sentence boundaries and restore protected periods ------
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    # Strip first, then drop empties: because of the padding added above, the
    # fragment after the last <stop> is whitespace rather than '', so testing
    # for '' before stripping would let an empty "sentence" through.
    stripped = [fragment.strip() for fragment in text.split("<stop>")]
    return [sentence for sentence in stripped if sentence]

#### Use both train and test documents. Split each document into sentences, merge short sentences (fewer than 20 characters) into the preceding sentence, and write the result to `train.txt`, one sentence per line.

In [None]:
def _merge_short_sentences(sentences, min_length=20):
    """Append every sentence shorter than *min_length* to the one before it.

    The first sentence is always kept as its own entry, since there is
    nothing before it to merge into.

    Args:
        sentences: Sentences of one document, in order.
        min_length: Sentences with fewer characters than this are merged
            into the preceding entry.

    Returns:
        A new list of sentences in which short ones have been merged.
    """
    merged = []
    for sent in sentences:
        if merged and len(sent) < min_length:
            merged[-1] = merged[-1].strip() + " " + sent
        else:
            merged.append(sent)
    return merged


# Both the train and the test essays go into the same line-by-line file.
with open("train.txt", "w", encoding="utf-8") as train_file:
    for subset in ("train", "test"):
        # Materialise the listing so tqdm can show a real total instead of
        # relying on a hard-coded file count.
        essay_paths = list(Path("../input/feedback-prize-2021", subset).glob("*.txt"))

        for filename in tqdm(essay_paths, desc=subset):
            with open(filename, encoding="utf-8") as current_file:
                text = current_file.read().replace("\n", " ")

            cleaned_sentences = _merge_short_sentences(split_into_sentences(text))

            # One sentence per line; the trailing newline separates documents.
            train_file.write("\n".join(cleaned_sentences) + "\n")

#### Load the file into a `datasets` Dataset object and remove short lines (20 characters or fewer)

In [None]:
from datasets import load_dataset

def _has_enough_text(example):
    """Return True when the example's text is longer than 20 characters."""
    return len(example["text"]) > 20


# Load train.txt with the "text" loader, then drop the short entries.
dataset = load_dataset("text", data_files="train.txt", split="train").filter(
    _has_enough_text
)
dataset

#### remove duplicates

In [None]:
# Texts (with surrounding whitespace removed) that have already been kept.
seen = set()


def remove_duplicates(example):
    """Filter predicate that keeps only the first occurrence of each text.

    Texts are compared after stripping surrounding whitespace, so two lines
    that differ only by leading or trailing spaces count as the same line.
    This matches how the lines are stripped when the final file is written;
    comparing the raw text would let such lines through as duplicates.

    Args:
        example: A mapping with a ``"text"`` entry.

    Returns:
        True if this text has not been seen before (and records it in the
        module-level ``seen`` set), False if it is a repeat.
    """
    key = example["text"].strip()
    if key in seen:
        return False
    seen.add(key)
    return True

# Drop repeated lines: remove_duplicates returns False for any text it has
# already recorded in the module-level `seen` set.
# NOTE(review): because the state lives in `seen`, this presumably has to run
# in a single process — confirm before passing num_proc to filter().
dataset = dataset.filter(remove_duplicates)
dataset  # bare expression so the notebook displays the dataset summary

#### writing final file

In [None]:
# Overwrite train.txt with the filtered, de-duplicated lines.
# Joining with "\n" puts a newline between lines but none after the last one,
# so the file does not end with an empty line. The encoding is fixed to UTF-8
# so the output does not depend on the platform's default locale.
with open("train.txt", "w", encoding="utf-8") as fp:
    fp.write("\n".join(line.strip() for line in dataset["text"]))

#### checking final file

In [None]:
# Print the first 101 lines (indices 0-100) of the final file as a sanity check.
with open("train.txt", encoding="utf-8") as fp:
    # Iterate over the file object directly so only the previewed lines are
    # read, instead of loading the whole file with readlines().
    for i, line in enumerate(fp):
        if i > 100:
            break
        # Each line still carries its own newline, so the printed output is
        # separated by blank lines.
        print(line)