# Text preprocessing from scratch

In [406]:
import torch
from torch.utils import data
import torch.nn as nn
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
from torch.utils.tensorboard import SummaryWriter
from IPython import embed
import pandas as pd
import re
from pathlib import Path
from collections import Counter
import logging
from torch.utils.data import Dataset, DataLoader
import torchtext
from torchtext import vocab, data
import torchtext.datasets
from torchtext.datasets import language_modeling, LanguageModelingDataset, UDPOS

## Load text data

simply remove non alphabetical characters and lowercase everything

In [274]:
t = [['a', 'b'],['c','d']]
[ c for item in t for c in item]

['a', 'b', 'c', 'd']

In [267]:
class DataReader(object):
    def __init__(self, fn:str, mode:str='word'):
        self.mode = mode
        self.path = Path(fn)
        self.data = self.read_data(self.path)

    def read_data(self, path:Path):
        data = []
        with open(path, mode='r') as f:
            lines = f.readlines()
        for line in lines:
            data.append(self.tokenizer(line, self.mode))
        if self.mode == 'char':
            return [ c for row in data for c in row ]
        else:
            return data
    
    @staticmethod
    def tokenizer(sentence:str, mode:str='word'):
        
        if mode == 'word':
            l = re.sub('[^A-Za-z]+',' ', sentence.strip().lower()).split()
        elif mode == 'char':
            l = list(re.sub('[^A-Za-z]+',' ', sentence.strip().lower()))
        else:
            logging.error('unknown token type mode', mode)
        return(l)
    
    def __getitem__(self, index:int):
        return(self.data[index])
    
    def __len__(self):
        return(len(self.data))

In [269]:
d = DataReader('data/35.txt', mode='char')

In [270]:
print(d[:20])

['p', 'r', 'o', 'j', 'e', 'c', 't', ' ', 'g', 'u', 't', 'e', 'n', 'b', 'e', 'r', 'g', ' ', 's', ' ']


In [259]:
print(d.tokenizer('this is true!\n 34 at laest!', mode='char'))

['t', 'h', 'i', 's', ' ', 'i', 's', ' ', 't', 'r', 'u', 'e', ' ', 'a', 't', ' ', 'l', 'a', 'e', 's', 't', ' ']


In [260]:
print(d[0], len(d))

p 189604


## Define vocabulary

In [261]:
cnt = Counter(d)

In [262]:
cnt

Counter({'p': 2870,
         'r': 8835,
         'o': 11083,
         'j': 185,
         'e': 19670,
         'c': 4027,
         't': 15042,
         ' ': 33709,
         'g': 3491,
         'u': 4310,
         'n': 10945,
         'b': 2172,
         's': 9244,
         'h': 8787,
         'i': 11257,
         'm': 4411,
         'a': 12703,
         'y': 3001,
         'w': 3496,
         'l': 6629,
         'k': 1219,
         'f': 3735,
         'd': 6860,
         'v': 1407,
         'x': 264,
         'z': 146,
         'q': 106})

In [264]:
cnt.most_common(10)

[(' ', 33709),
 ('e', 19670),
 ('t', 15042),
 ('a', 12703),
 ('i', 11257),
 ('o', 11083),
 ('n', 10945),
 ('s', 9244),
 ('r', 8835),
 ('h', 8787)]

In [None]:
class Vocab(Object):
    def __init__(self):
        pass

    @staticmethod
    def numericalize(corpus:list):
        pass
    
    def __len__(self):
        pass
    
    def __len__(self):
        pass

# Text preprocessing using Torchtext

In [277]:
!head data/ptb.test.txt

 no it was n't black monday 
 but while the new york stock exchange did n't fall apart friday as the dow jones industrial average plunged N points most of it in the final hour it barely managed to stay this side of chaos 
 some circuit breakers installed after the october N crash failed their first test traders say unable to cool the selling panic in both stocks and futures 
 the N stock specialist firms on the big board floor the buyers and sellers of last resort who were criticized after the N crash once again could n't handle the selling pressure 
 big investment banks refused to step up to the plate to support the beleaguered floor traders by buying big blocks of stock traders say 
 heavy selling of standard & poor 's 500-stock index futures in chicago <unk> beat stocks downward 
 seven big board stocks ual amr bankamerica walt disney capital cities\/abc philip morris and pacific telesis group stopped trading and never resumed 
 the <unk> has already begun 
 the equity market was <

In [383]:
def tok(sentence:str):
    return(d.tokenizer(sentence, mode='word'))

In [384]:
TEXT = data.Field(lower=True, \
                  sequential=True, \
                  use_vocab=True, \
                  eos_token="<eos>", \
                  init_token="<bos>", \
                  tokenize=tok)

In [385]:
fields = [('text', TEXT)]

In [386]:
sentence = "toto is here! 42"
print(TEXT.preprocess(sentence))

['toto', 'is', 'here']


In [387]:
ds = data.TabularDataset(path='data/ptb.test.txt',format='csv',skip_header=False, fields=fields)

In [388]:
print(ds.examples[0].text)

['no', 'it', 'was', 'n', 't', 'black', 'monday']


In [396]:
TEXT.build_vocab(ds)
print(TEXT.vocab.freqs.most_common(10))

[('unk', 4794), ('the', 4541), ('n', 2873), ('of', 2195), ('to', 2043), ('a', 1855), ('in', 1640), ('and', 1540), ('s', 1162), ('that', 831)]


In [399]:
print(TEXT.vocab.stoi['unk'])
print(TEXT.vocab.itos[4])

4
unk


In [412]:
?LanguageModelingDataset

[0;31mInit signature:[0m
[0mLanguageModelingDataset[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtext_field[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mnewline_eos[0m[0;34m=[0m[0;32mTrue[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mencoding[0m[0;34m=[0m[0;34m'utf-8'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m      Defines a dataset for language modeling.
[0;31mInit docstring:[0m
Create a LanguageModelingDataset given a path and a field.

Arguments:
    path: Path to the data file.
    text_field: The field that will be used for text data.
    newline_eos: Whether to add an <eos> token for every newline in the
        data file. Default: True.
    Remaining keyword arguments: Passed to the constructor of
        data.Dataset.
[0;31mFile:[0m           ~/anaconda3/envs/rnnlm/lib/python3.7/site-package

In [413]:
lm = LanguageModelingDataset('data/ptb.test.txt', TEXT)

In [418]:
train, test = lm.splits(TEXT)

ValueError: not enough values to unpack (expected 2, got 0)

In [419]:
?lm.splits

[0;31mSignature:[0m
[0mlm[0m[0;34m.[0m[0msplits[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mroot[0m[0;34m=[0m[0;34m'.data'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtrain[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mvalidation[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtest[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m**[0m[0mkwargs[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Create Dataset objects for multiple splits of a dataset.

Arguments:
    path (str): Common prefix of the splits' file paths, or None to use
        the result of cls.download(root).
    root (str): Root dataset storage directory. Default is '.data'.
    train (str): Suffix to add to path for the train set, or None for no
        train set. Default is None.
    validation (str)