## IMDb

At Fast.ai we have introduced a new module called fastai.text which replaces the torchtext library that was used in our 2018 dl1 course. The fastai.text module also supersedes the fastai.nlp library but retains many of the key functions.

In [1]:
from fastai.text import *
from fastai.core import num_cpus, partition_by_cores
import html
from pathlib import Path
import numpy as np
import csv
import pandas as pd
from collections import Counter, defaultdict
from itertools import chain
from nltk.corpus import brown
import os, re

from gensim.corpora import Dictionary
from gensim.models import Word2Vec
from typing import Callable, List, Collection
from concurrent.futures.process import ProcessPoolExecutor

The Fastai.text module introduces several custom tokens.

We need to download the IMDB large movie reviews from this site: http://ai.stanford.edu/~amaas/data/sentiment/
Direct link : [Link](http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz) and untar it into the PATH location. We use pathlib which makes directory traveral a breeze.

In [2]:
PATH=Path('/media/discoD/repositorios/1-billion-word-language-modeling-benchmark/')

## Standardize format

The imdb dataset has 3 classes. positive, negative and unsupervised(sentiment is unknown). 
There are 75k training reviews(12.5k pos, 12.5k neg, 50k unsup)
There are 25k validation reviews(12.5k pos, 12.5k neg & no unsup)

Refer to the README file in the imdb corpus for further information about the dataset.

In [1]:
class VocabularyTokenizer():
    "Put together rules, a tokenizer function and a language to tokenize text with multiprocessing."
    def __init__(self, tok_func:Callable=SpacyTokenizer, lang:str='pt', n_cpus:int=None):
        self.tok_func,self.lang = tok_func,lang
        self.n_cpus = n_cpus or num_cpus()//2

    def process_text(self, t:str, tok:BaseTokenizer) -> List[str]:
        "Processe one text `t` with tokenizer `tok`."
        return tok.tokenizer(t)

    def _process_all_1(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts` in one process."
        tok = self.tok_func(self.lang)
        return [self.process_text(t, tok) for t in texts]

    def process_all(self, texts:Collection[str]) -> List[List[str]]:
        "Process a list of `texts`."
        if self.n_cpus <= 1: return self._process_all_1(texts)
        with ProcessPoolExecutor(self.n_cpus) as e:
            return sum(e.map(self._process_all_1, partition_by_cores(texts, self.n_cpus)), [])

NameError: name 'SpacyTokenizer' is not defined

In [4]:
def save_texts(paths, filename, lang):
    CLASSES = ['unsup']
    file_count = 0
    filename = filename + '_' + lang + '.csv'
    if os.path.isfile(filename):
        os.remove(filename)
    with open(filename, 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_NONE, escapechar='\\')
        for idx,label in enumerate(CLASSES):
            for path in paths:
                for fname in (path).glob('*'):
                    file_count += 1
                    print('writing from %s' % fname)
                    [writer.writerow([line, idx]) for line in fname.open('r', encoding='utf-8').read().split('\n')]
    print('%d texts saved to %s' % (file_count, filename))

In [5]:
save_texts([PATH/'training-harem/'], 'train_harem', 'pt')
save_texts([PATH/'heldout-harem/'], 'test_harem', 'pt')
save_texts([PATH/'training-harem/',PATH/'heldout-harem/'], 'full_harem', 'pt')

writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00001-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00002-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00003-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00004-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00005-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00006-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00007-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00008-of-00100
writing from /media/discoD/repositorios/1-billio

writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00050-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00051-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00052-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00053-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00054-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00055-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00056-of-00100
writing from /media/discoD/repositorios/1-billion-word-language-modeling-benchmark/training-harem/harem-00057-of-00100
writing from /media/discoD/repositorios/1-billio

In [6]:
def get_tokens(filename):
    data = pd.read_csv(filename, header=None, escapechar='\\', chunksize=500000)
    for idx, df in enumerate(data):
        print(idx)
        yield VocabularyTokenizer().process_all(df[0].astype(str))

In [7]:
freq_full = Counter(p for o in chain.from_iterable(get_tokens('full_harem_pt.csv')) for p in o)
freq_full.most_common()

0


[(',', 39537),
 ('"', 39255),
 ('de', 24784),
 ('.', 21202),
 ('a', 18659),
 ('e', 14020),
 ('o', 13513),
 ('que', 12737),
 ('do', 8234),
 ('da', 7947),
 ('em', 6072),
 ('para', 5626),
 ('os', 4893),
 ('um', 4565),
 ('com', 4399),
 ('uma', 4118),
 ('no', 3561),
 ('é', 3558),
 ('não', 3472),
 ('na', 3263),
 ('-', 3053),
 ('as', 2934),
 ('dos', 2831),
 ('se', 2781),
 (':', 2745),
 ('por', 2650),
 (')', 2641),
 ('(', 2571),
 ('à', 2445),
 ('O', 2381),
 ('mais', 2361),
 ('A', 2296),
 ('como', 2017),
 ('das', 1854),
 ('?', 1803),
 ('d', 1579),
 ('«', 1520),
 ('»', 1515),
 ('foi', 1501),
 ('/', 1411),
 ('sua', 1250),
 ('ou', 1221),
 (';', 1193),
 ('mas', 1164),
 ('E', 1138),
 ('ser', 1119),
 ('era', 1104),
 ('muito', 1073),
 ('nos', 993),
 ('eu', 986),
 ('seu', 977),
 ('pela', 944),
 ('também', 934),
 ('já', 931),
 ('anos', 908),
 ('sobre', 894),
 ('pelo', 892),
 ('!', 889),
 ('tem', 879),
 ('n', 873),
 ('este', 870),
 ('ele', 824),
 ('...', 821),
 ('esta', 763),
 ('são', 758),
 ('entre', 74

In [8]:
palavras = [palavra for palavra, contagem in freq_full.most_common()]
palavras

[',',
 '"',
 'de',
 '.',
 'a',
 'e',
 'o',
 'que',
 'do',
 'da',
 'em',
 'para',
 'os',
 'um',
 'com',
 'uma',
 'no',
 'é',
 'não',
 'na',
 '-',
 'as',
 'dos',
 'se',
 ':',
 'por',
 ')',
 '(',
 'à',
 'O',
 'mais',
 'A',
 'como',
 'das',
 '?',
 'd',
 '«',
 '»',
 'foi',
 '/',
 'sua',
 'ou',
 ';',
 'mas',
 'E',
 'ser',
 'era',
 'muito',
 'nos',
 'eu',
 'seu',
 'pela',
 'também',
 'já',
 'anos',
 'sobre',
 'pelo',
 '!',
 'tem',
 'n',
 'este',
 'ele',
 '...',
 'esta',
 'são',
 'entre',
 'ainda',
 's',
 'está',
 'até',
 'isso',
 'todos',
 'porque',
 'Os',
 'quando',
 'Em',
 'tinha',
 'Não',
 'seus',
 'bem',
 'ter',
 'onde',
 'depois',
 'fazer',
 'mesmo',
 'aqui',
 'só',
 'nas',
 'É',
 'grande',
 'me',
 "'",
 'dia',
 'Brasil',
 'há',
 'tempo',
 'lá',
 'eles',
 '--',
 'São',
 'ela',
 'ano',
 'outros',
 'pessoas',
 'vez',
 'sem',
 'Mas',
 'dois',
 'essa',
 'R',
 'assim',
 'esse',
 'minha',
 'vida',
 'meu',
 'casa',
 'parte',
 'sempre',
 'No',
 'hoje',
 'P',
 'gente',
 'tudo',
 'nós',
 'pode',
 

In [9]:
len(palavras)

54645

In [10]:
palavras.insert(0, '<UNK>')
palavras.insert(0, '<S>')
palavras.insert(0, '</S>')
palavras

['</S>',
 '<S>',
 '<UNK>',
 ',',
 '"',
 'de',
 '.',
 'a',
 'e',
 'o',
 'que',
 'do',
 'da',
 'em',
 'para',
 'os',
 'um',
 'com',
 'uma',
 'no',
 'é',
 'não',
 'na',
 '-',
 'as',
 'dos',
 'se',
 ':',
 'por',
 ')',
 '(',
 'à',
 'O',
 'mais',
 'A',
 'como',
 'das',
 '?',
 'd',
 '«',
 '»',
 'foi',
 '/',
 'sua',
 'ou',
 ';',
 'mas',
 'E',
 'ser',
 'era',
 'muito',
 'nos',
 'eu',
 'seu',
 'pela',
 'também',
 'já',
 'anos',
 'sobre',
 'pelo',
 '!',
 'tem',
 'n',
 'este',
 'ele',
 '...',
 'esta',
 'são',
 'entre',
 'ainda',
 's',
 'está',
 'até',
 'isso',
 'todos',
 'porque',
 'Os',
 'quando',
 'Em',
 'tinha',
 'Não',
 'seus',
 'bem',
 'ter',
 'onde',
 'depois',
 'fazer',
 'mesmo',
 'aqui',
 'só',
 'nas',
 'É',
 'grande',
 'me',
 "'",
 'dia',
 'Brasil',
 'há',
 'tempo',
 'lá',
 'eles',
 '--',
 'São',
 'ela',
 'ano',
 'outros',
 'pessoas',
 'vez',
 'sem',
 'Mas',
 'dois',
 'essa',
 'R',
 'assim',
 'esse',
 'minha',
 'vida',
 'meu',
 'casa',
 'parte',
 'sempre',
 'No',
 'hoje',
 'P',
 'gente',


In [11]:
sum(freq_full.values())

642977

In [12]:
def write_list(array, filename):
    with open(filename, 'w') as file:
        for item in array:
            file.write(item + '\n')
    file.close()

In [13]:
write_list(array=palavras, filename='vocabulario_harem.txt')

In [14]:
!tail -n 10 vocabulario_harem.txt

Belenenses
osTorneios
Partiram
Dorado
senta-se
levá-las
magras
recordar-lhe
Lafaiete
1370


In [15]:
singletons = [palavra for palavra, contagem in freq_full.most_common() if contagem == 1]
print(len(singletons))
singletons

29086


['intervenham',
 'ciber-mundo',
 'éextremamente',
 'suportando',
 'Agencias',
 'consultam',
 'acessores',
 'presunção',
 'anteparo',
 'tacto',
 'olfato',
 'Orgulhosos',
 'Valli',
 'Artplan',
 'cumprir-se',
 'CSP',
 'apetência',
 '4.º',
 'tabacaria',
 'esgotar',
 'toto',
 'dispendido',
 'Dou-me',
 'suspendia',
 'CLIENTE',
 'Aponta-se',
 'nomeia-se',
 'açucareira',
 'silvícola',
 'concentrara',
 'Hás-de',
 'maispróximo',
 'desburocratização',
 'Markaz',
 'Al-Dawat',
 'wahabita',
 'FERMIONS',
 'latina',
 'tabernas',
 'Negativo',
 '21º',
 'distribuíram',
 'X-15',
 'procedamos',
 'dividirei',
 'louvar-vos-ei',
 'repreender-vos-ei',
 'cosidas',
 'formarem',
 'Piñero',
 'Pratt',
 'Vegetariana',
 'Ulla',
 'Sandbæk',
 'tolos',
 'Jürgen',
 'Israelita',
 '105º',
 'Imagina-se',
 'proveniência',
 'Beiras',
 'RESUMO',
 'master-franqueado',
 'rosbife',
 'batatinhas',
 'protótipo',
 'porcento',
 'voltaremos',
 'incorporados',
 'Surpreende-o',
 'descaramento',
 'obgetivos',
 'aparencia',
 'Joaquín',
 '

In [16]:
currency_pattern = r"\b(?<![.,-])[0-9]{1,3}(?:,?[0-9]{3})*\.[0-9]{2}(?![.,-])\b|\b(?<![.,-])[0-9]{1,3}(?:.?[0-9]{3})*\,[0-9]{2}(?![.,-])\b"

In [17]:
valores = [singleton for singleton in singletons if re.match(pattern=currency_pattern, string=singleton)]
print(len(valores))
valores

183


['2,02',
 '3,41',
 '310,00',
 '44,16',
 '1,81',
 '1.368,23',
 '12,07',
 '11,00',
 '78,00',
 '0,30',
 '0,40',
 '0,70',
 '2.409,66',
 '21,00',
 '2,99',
 '2.694,218,00',
 '56,00',
 '66,00',
 '2.176,26',
 '3.00',
 '51.569,30',
 '61,87',
 '62,84',
 '63,09',
 '73,22',
 '48,16',
 '89,00',
 '80.000,00',
 '55,20',
 '44,00',
 '55,00',
 '2,30',
 '3,12',
 '7,90',
 '2,75',
 '0.10',
 '0.20',
 '0.50',
 '54,00',
 '25,00',
 '25,90',
 '1,60',
 '81,00',
 '13,56',
 '63,70',
 '72,00',
 '0,55',
 '2,05',
 '376,82',
 '456,00',
 '74,00',
 '51,00',
 '175.50',
 '147.50',
 '57,00',
 '4,49',
 '0,71',
 '29,30',
 '68,00',
 '368,00',
 '23,00',
 '148.00',
 '49.879,79',
 '33,53',
 '5,30',
 '2,54',
 '5,36',
 '428,00',
 '376,00',
 '4,51',
 '285,00',
 '196,00',
 '0,20',
 '630,00',
 '73,60',
 '63,00',
 '100,00',
 '33,00',
 '862,00',
 '288,00',
 '287,00',
 '49,99',
 '1.921.598,00',
 '8666,91',
 '12.00',
 '592.00',
 '39,00',
 '59,00',
 '129,00',
 '1,40',
 '175,00',
 '292,00',
 '68.00',
 '118.00',
 '300.000,00',
 '1.000,00',
