<h1><span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [4]:
import collections
import random
import time
from concurrent.futures import ProcessPoolExecutor

import torch
import torch.nn as nn
import torch.nn.functional as F

import spacy
from datasets import load_dataset, list_metrics, load_metric
# from transformers import GPT2Tokenizer, GPT2LMHeadModel



# wikibio = load_dataset('wiki_bio', cache_dir="/Volumes/External HD/Dev/datasets/wikibio", split='train[:1%]')
# wikibio[0]


In [5]:
# Load the first 1% of the training split of the 'wikitext-103-raw-v1'
# configuration of the 'wikitext' dataset.
# NOTE(review): cache_dir is an absolute, machine-specific path on an external
# volume; presumably this cell only works where that volume is mounted —
# consider moving it to a configurable constant.
wikitext = load_dataset(
    'wikitext', 
    'wikitext-103-raw-v1', 
    cache_dir="/Volumes/External HD/Dev/datasets/wikitext", 
    split='train[:1%]'
)

Reusing dataset wikitext (/Volumes/External HD/Dev/datasets/wikitext/wikitext/wikitext-103-raw-v1/1.0.0/47c57a6745aa5ce8e16a5355aaa4039e3aa90d1adad87cef1ad4e0f29e74ac91)


In [6]:
# Notebook-level spaCy pipeline. The tagger, parser, attribute ruler and
# lemmatizer components are excluded at load time.
nlp = spacy.load(
    "en_core_web_sm", 
    exclude=['tagger', 'parser', 'attribute_ruler', 'lemmatizer']
) #Note: use larger model for production
# The parser is excluded above, so a 'sentencizer' component is added;
# later cells iterate over doc.sents.
nlp.add_pipe('sentencizer')

<spacy.pipeline.sentencizer.Sentencizer at 0x135fabf00>

In [7]:
# !python -m spacy download en_core_web_sm

When processing large volumes of text, the statistical models are usually more efficient if you let them work on batches of texts. spaCy’s `nlp.pipe` method takes an iterable of texts and yields processed `Doc` objects; the batching is done internally.

In [8]:
def runNER(model, texts):
    """Print every sentence that contains entities, followed by those entities.

    For each sentence with at least one entity, one line of the form
    ``<sentence text>|<entity tuples>`` is printed, followed by blank lines.
    Each entity tuple is ``(text, start, end, label)`` with character offsets
    relative to the start of the sentence.

    Args:
        model: Pipeline object exposing ``pipe(texts)`` that yields documents
            with ``sents``. When ``None``, the notebook-level ``nlp`` pipeline
            is used instead.
        texts: Iterable of raw text strings.
    """
    # The original ignored ``model`` and always used the global ``nlp``;
    # honour the argument and keep the global only as a fallback.
    pipeline = model if model is not None else nlp
    for doc in pipeline.pipe(texts):
        for sent in doc.sents:
            if not sent.ents:
                continue
            spans = [
                (
                    e.text,
                    e.start_char - sent.start_char,
                    e.end_char - sent.start_char,
                    e.label_,
                )
                for e in sent.ents
            ]
            # str(list) minus the surrounding brackets, as in the output format above.
            ents = str(spans).strip('[]')
            out = f"{sent.text}|{ents}"
            print(out)
            print("\n")

In [39]:
class DataProcessor:
    """Run NER over raw texts and build sentences with one entity swapped out.

    Typical use: construct, optionally narrow ``keep_ents``, call
    ``processEnts()`` and then ``permuteEnts()``.
    """

    def __init__(
        self,
        text,
        write_dir=None,
        parallel=False
    ):
        """Store the inputs and load the spaCy pipeline.

        Args:
            text: Collection of raw text strings (``__repr__`` calls ``len``
                on it).
            write_dir: Optional directory for output files; nothing is written
                when it is falsy.
            parallel: Only consulted by ``run``.
        """
        self.text = text
        self.write_dir = write_dir
        self.parallel = parallel

        # (sentence_text, [(ent_text, start, end, label), ...]) pairs from runNER.
        self.ner_texts = []
        # label -> every entity text seen with that label (duplicates kept).
        self.ents = collections.defaultdict(list)
        # Sentences produced by permuteEnts.
        self.permuted = []

        self.model = spacy.load(
            "en_core_web_sm",
            exclude=['tagger', 'parser', 'attribute_ruler', 'lemmatizer']
        )
        self.model.add_pipe('sentencizer')

        # Entity labels eligible for swapping / required for a sentence to be kept.
        self.keep_ents = ['PERSON', 'ORG', 'GPE']

    @classmethod
    def fromFile(cls, file_loc):
        """Placeholder; not implemented yet and currently returns ``None``."""
        pass

    def run(self, func, args):
        """Yield the outputs of ``func``, optionally through a process pool.

        With ``parallel`` falsy, yields every item of ``func(*args)``.
        With ``parallel`` truthy, maps ``func`` over the items of ``args`` in a
        ``ProcessPoolExecutor`` and yields each result.
        NOTE(review): the two modes interpret ``args`` differently (unpacked
        arguments vs. an iterable of single arguments); this asymmetry is kept
        from the original — confirm the intended contract.
        """
        if self.parallel:
            with ProcessPoolExecutor() as executor:
                # executor.map yields plain results, not futures, so there is
                # no .result() to call; yield them so both modes are generators.
                for output in executor.map(func, args):
                    yield output
        else:
            for output in func(*args):
                yield output

    def _chooseSwap(self, ents, distinct):
        """Pick a swappable entity tuple and a different replacement text.

        Args:
            ents: Entity tuples ``(text, start, end, label)`` of one sentence.
            distinct: Mapping label -> set of distinct known entity texts.

        Returns:
            ``(orig_ent, replace_ent)`` where ``replace_ent`` is a string that
            differs from ``orig_ent[0]``.

        Raises:
            ValueError: If no entity with a label in ``keep_ents`` has a
                different known entity text of the same label.
        """
        swappable = []
        for ent in ents:
            if ent[3] not in self.keep_ents:
                continue
            known = distinct.get(ent[3], ())
            # True when at least one known text of this label differs from ent.
            if len(known) - (ent[0] in known) > 0:
                swappable.append(ent)
        if not swappable:
            raise ValueError(
                "no replacement entity available for any eligible entity in: "
                f"{ents!r}"
            )
        orig_ent = random.choice(swappable)
        pool = self.ents[orig_ent[3]]
        # Rejection sampling keeps frequency-weighted choice; it terminates
        # because the check above guarantees a differing text exists in pool.
        while True:
            replace_ent = random.choice(pool)
            if replace_ent != orig_ent[0]:
                return orig_ent, replace_ent

    def permuteEnts(self):
        """Replace one eligible entity per NER sentence with another entity.

        For every ``(sentence, entities)`` pair in ``ner_texts`` one entity
        whose label is in ``keep_ents`` is replaced by a different entity text
        of the same label drawn from ``self.ents``. Results are appended to
        ``self.permuted`` and, when ``write_dir`` is set, written one per line
        to ``permuted_entities.<time.time()>`` in that directory.

        Raises:
            ValueError: If a sentence has no entity that can be swapped.
        """
        timestamp = time.time()
        distinct = {label: set(values) for label, values in self.ents.items()}

        handler = None
        if self.write_dir:
            handler = open(self.write_dir + f'/permuted_entities.{timestamp}', 'w')
        try:
            for (sent, ents) in self.ner_texts:
                orig_ent, replace_ent = self._chooseSwap(ents, distinct)
                start, end = orig_ent[1:3]
                new_sent = sent[:start] + replace_ent + sent[end:]
                if handler is not None:
                    handler.write(new_sent + "\n")
                self.permuted.append(new_sent)
        finally:
            # Close the file even if a sentence cannot be permuted.
            if handler is not None:
                handler.close()

    def processEnts(self):
        """Run NER over ``self.text`` and collect the results in ``ner_texts``.

        When ``write_dir`` is set, each result is also written as
        ``<sentence>|<entity tuples>`` to ``processed_entities.<time.time()>``
        and the label -> entity list mapping to ``all_entities.<time.time()>``.
        """
        timestamp = time.time()

        if self.write_dir:
            main_loc = self.write_dir + f'/processed_entities.{timestamp}'
            ent_loc = self.write_dir + f'/all_entities.{timestamp}'
            with open(main_loc, 'w') as f:
                for output in self.runNER(self.text):
                    text, ents = output
                    f.write(f"{text}|{str(ents).strip('[]')}\n")
                    self.ner_texts.append(output)
            # Written after NER finishes because runNER fills self.ents as it goes.
            with open(ent_loc, 'w') as f:
                for ent_type, ent_value in self.ents.items():
                    f.write(f"{ent_type}|{ent_value}\n")
        else:
            for output in self.runNER(self.text):
                self.ner_texts.append(output)

    def runNER(self, texts):
        """Yield ``(sentence_text, entities)`` for sentences with a kept entity.

        A sentence is yielded when at least one of its entities has a label in
        ``keep_ents``. ``entities`` lists all entities of that sentence as
        ``(text, start, end, label)`` with offsets relative to the sentence.
        Every listed entity text is also appended to ``self.ents[label]``.
        """
        for doc in self.model.pipe(texts):
            for sent in doc.sents:
                if any(e.label_ in self.keep_ents for e in sent.ents):
                    ents = []
                    for e in sent.ents:
                        ents.append((e.text, e.start_char - sent.start_char, e.end_char - sent.start_char, e.label_))
                        self.ents[e.label_].append(e.text)
                    yield (sent.text, ents)

    def __repr__(self):
        """Summarise the number of raw texts, NER sentences, permutations and entities."""
        return (f"DataProcessor:<{len(self.text)} RAW>"
                f"<{len(self.ner_texts)} NER>"
                f"<{len(self.permuted)} PERM>"
                f"<{sum(len(v) for v in self.ents.values())} ENTS>")

In [40]:

def is_ascii(s):
    """Return True when every character of ``s`` has a code point below 128."""
    for ch in s:
        if ord(ch) >= 128:
            return False
    return True

def filterText(iterator):
    """Return the texts that are at least 50 characters long and pure ASCII.

    The order of the input is preserved; the length check runs before the
    ASCII check.
    """
    return [
        candidate
        for candidate in iterator
        if len(candidate) >= 50 and is_ascii(candidate)
    ]


In [41]:
# Filter the slice of wikitext rows [1:10000) with filterText.
sampleText = filterText(wikitext['text'][1:10000])

# Build a processor that writes its output files under ./data.
# NOTE(review): the output files are opened directly under write_dir, so
# presumably ./data has to exist already — confirm.
dp = DataProcessor(sampleText, write_dir='./data')
# Override the default label list so only PERSON entities are considered.
dp.keep_ents = ['PERSON']

In [42]:
# Run NER over the filtered texts; fills dp.ner_texts and dp.ents.
dp.processEnts()

In [43]:
# Swap one entity per NER sentence; fills dp.permuted.
dp.permuteEnts()

In [44]:
# Display the processor summary produced by DataProcessor.__repr__.
dp

DataProcessor:<3160 RAW><4343 NER><4343 PERM><15773 ENTS>

In [117]:
class DataProcessor:
    """Run NER over raw texts and build entity-swapped sentences per text.

    One sentence containing a kept entity is sampled per input text; that
    sentence then gets one entity replaced by a different entity of the same
    label. Typical use: construct, optionally narrow ``keep_ents``, call
    ``processEnts()`` and then ``permuteEnts()``.
    """

    def __init__(
        self,
        text,
        write_dir=None,
        parallel=False
    ):
        """Store the inputs and load the spaCy pipeline.

        Args:
            text: Collection of raw text strings (``__repr__`` calls ``len``
                on it).
            write_dir: Optional directory for output files; nothing is written
                when it is falsy.
            parallel: Only consulted by ``run``.
        """
        self.text = text

        self.write_dir = write_dir
        self.parallel = parallel

        # Full text of every document that contributed a sentence (see runNER).
        self.raw_texts = []
        # (sentence_text, [(ent_text, start, end, label), ...]), one per entry
        # of raw_texts and in the same order.
        self.ner_texts = []
        # Sentences produced by permuteEnts.
        self.permuted = []
        # (original_text, replacement_text) per permuted sentence.
        self.changed_ents = []

        # label -> every entity text seen with that label (duplicates kept).
        self.ents = collections.defaultdict(list)

        self.model = spacy.load(
            "en_core_web_sm",
            exclude=['tagger', 'parser', 'attribute_ruler', 'lemmatizer']
        )
        self.model.add_pipe('sentencizer')

        # Entity labels eligible for swapping / required for a sentence to be kept.
        self.keep_ents = ['PERSON', 'ORG', 'GPE']

    @classmethod
    def fromFile(cls, file_loc):
        """Placeholder; not implemented yet and currently returns ``None``."""
        pass

    def run(self, func, args):
        """Yield the outputs of ``func``, optionally through a process pool.

        With ``parallel`` falsy, yields every item of ``func(*args)``.
        With ``parallel`` truthy, maps ``func`` over the items of ``args`` in a
        ``ProcessPoolExecutor`` and yields each result.
        NOTE(review): the two modes interpret ``args`` differently (unpacked
        arguments vs. an iterable of single arguments); this asymmetry is kept
        from the original — confirm the intended contract.
        """
        if self.parallel:
            with ProcessPoolExecutor() as executor:
                # executor.map yields plain results, not futures, so there is
                # no .result() to call; yield them so both modes are generators.
                for output in executor.map(func, args):
                    yield output
        else:
            for output in func(*args):
                yield output

    def _chooseSwap(self, ents, distinct):
        """Pick a swappable entity tuple and a different replacement text.

        Args:
            ents: Entity tuples ``(text, start, end, label)`` of one sentence.
            distinct: Mapping label -> set of distinct known entity texts.

        Returns:
            ``(orig_ent, replace_ent)`` where ``replace_ent`` is a string that
            differs from ``orig_ent[0]``.

        Raises:
            ValueError: If no entity with a label in ``keep_ents`` has a
                different known entity text of the same label.
        """
        swappable = []
        for ent in ents:
            if ent[3] not in self.keep_ents:
                continue
            known = distinct.get(ent[3], ())
            # True when at least one known text of this label differs from ent.
            if len(known) - (ent[0] in known) > 0:
                swappable.append(ent)
        if not swappable:
            raise ValueError(
                "no replacement entity available for any eligible entity in: "
                f"{ents!r}"
            )
        orig_ent = random.choice(swappable)
        pool = self.ents[orig_ent[3]]
        # Rejection sampling keeps frequency-weighted choice; it terminates
        # because the check above guarantees a differing text exists in pool.
        while True:
            replace_ent = random.choice(pool)
            if replace_ent != orig_ent[0]:
                return orig_ent, replace_ent

    def permuteEnts(self):
        """Replace one eligible entity per NER sentence with another entity.

        For every ``(sentence, entities)`` pair in ``ner_texts`` one entity
        whose label is in ``keep_ents`` is replaced by a different entity text
        of the same label drawn from ``self.ents``. The new sentence goes to
        ``self.permuted`` and the ``(original, replacement)`` pair to
        ``self.changed_ents``. When ``write_dir`` is set, three line-aligned
        files suffixed with the current ``time.time()`` value are written:
        ``permuted_entities`` (new sentence), ``original_entities`` (the
        matching entry of ``raw_texts``, stripped) and ``entity_swaps``
        (``original|replacement``).

        Raises:
            ValueError: If a sentence has no entity that can be swapped.
        """
        timestamp = time.time()
        distinct = {label: set(values) for label, values in self.ents.items()}

        handles = []
        try:
            if self.write_dir:
                # Opened inside the try so that a failure on a later open
                # still closes the files that were already opened.
                for stem in ('permuted_entities', 'original_entities', 'entity_swaps'):
                    handles.append(open(self.write_dir + f'/{stem}.{timestamp}', 'w'))

            for idx, (sent, ents) in enumerate(self.ner_texts):
                orig_ent, replace_ent = self._chooseSwap(ents, distinct)
                start, end = orig_ent[1:3]
                new_sent = sent[:start] + replace_ent + sent[end:]

                if handles:
                    permuteFile, origFile, entFile = handles
                    permuteFile.write(new_sent + "\n")
                    origFile.write(self.raw_texts[idx].strip('\n').strip(" ") + "\n")
                    entFile.write(f"{orig_ent[0]}|{replace_ent}\n")

                self.permuted.append(new_sent)
                self.changed_ents.append((orig_ent[0], replace_ent))
        finally:
            for handle in handles:
                handle.close()

    def processEnts(self):
        """Run NER over ``self.text`` and collect the results in ``ner_texts``."""
        for output in self.runNER(self.text):
            self.ner_texts.append(output)

    def runNER(self, texts):
        """Yield one randomly chosen qualifying sentence per document.

        A sentence qualifies when at least one of its entities has a label in
        ``keep_ents``. For each qualifying sentence all of its entities are
        recorded as ``(text, start, end, label)`` (offsets relative to the
        sentence) and appended to ``self.ents[label]``. Documents with at least
        one qualifying sentence have their full text appended to
        ``self.raw_texts`` just before the sampled sentence is yielded.
        """
        for doc in self.model.pipe(texts):
            processed = []
            for sent in doc.sents:
                if any(e.label_ in self.keep_ents for e in sent.ents):
                    ents = []
                    for e in sent.ents:
                        ents.append((e.text, e.start_char - sent.start_char, e.end_char - sent.start_char, e.label_))
                        self.ents[e.label_].append(e.text)
                    processed.append((sent.text, ents))
            if processed:
                self.raw_texts.append(doc.text)
                yield random.choice(processed)

    def __repr__(self):
        """Summarise the number of raw texts, NER sentences, permutations and entities."""
        return (f"DataProcessor:<{len(self.text)} RAW>"
                f"<{len(self.ner_texts)} NER>"
                f"<{len(self.permuted)} PERM>"
                f"<{sum(len(v) for v in self.ents.values())} ENTS>")

In [118]:
# Filter the slice of wikitext rows [1:1000) with filterText.
sampleText = filterText(wikitext['text'][1:1000])

# Rebuild the processor with the DataProcessor class defined in the cell above,
# writing output files under ./data and considering only PERSON entities.
dp = DataProcessor(sampleText, write_dir='./data')
dp.keep_ents = ['PERSON']
# NOTE(review): both steps below use random.choice and no seed is set in this
# notebook, so the sampled sentences and swaps differ between runs.
dp.processEnts()
dp.permuteEnts()


In [119]:
# Display the processor summary produced by DataProcessor.__repr__.
dp

DataProcessor:<275 RAW><160 NER><160 PERM><1095 ENTS>

In [120]:
# First sentence after the entity swap.
dp.permuted[0]

' The game began development in 2010 , carrying over a large portion of the work done on Dissident Aggressor .'

In [121]:
# Full source text that the first sampled sentence was taken from.
dp.raw_texts[0]

" The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making the game more forgiving for series newcomers . Character designer Raita Honjou and composer Hitoshi Sakimoto both returned from previous entries , along with Valkyria Chronicles II director Takeshi Ozawa . A large team of writers handled the script . The game 's opening theme was sung by May 'n . \n"

In [122]:
# (original entity text, replacement entity text) for the first swap.
dp.changed_ents[0]

('Valkyria Chronicles II', 'Dissident Aggressor')