In [1]:
import operator

from collections import Counter

# Get text

In [2]:
# Read the full contents of data/one_txt/blogger.txt into a single string.
# The encoding is passed explicitly: the combined corpus contains non-ASCII characters
# (accented letters, Cyrillic, CJK, emoji) and open() otherwise falls back to a
# locale-dependent default, which would mis-decode or fail on non-UTF-8 platforms.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('data/one_txt/blogger.txt', encoding='utf-8') as f:
    blogger = f.read()

In [3]:
# Read the full contents of data/one_txt/wordpress.txt into a single string.
# The encoding is passed explicitly: the combined corpus contains non-ASCII characters
# (accented letters, Cyrillic, CJK, emoji) and open() otherwise falls back to a
# locale-dependent default, which would mis-decode or fail on non-UTF-8 platforms.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
with open('data/one_txt/wordpress.txt', encoding='utf-8') as f:
    wordpress = f.read()

In [4]:
# Single working corpus: the WordPress text followed directly by the Blogger text
txt = ''.join([wordpress, blogger])

# Explore vocabulary

In [5]:
# Number of occurrences of each distinct character in the corpus, as a plain dict
vocab_count = {char: count for char, count in Counter(txt).items()}

In [6]:
# Relative frequency of each character: its count divided by the total corpus length
total_chars = len(txt)
vocab_freq = {char: count / total_chars for char, count in vocab_count.items()}

In [7]:
# (character, count, frequency) triples, listed from the rarest character to the most common
sorted(
    zip(vocab_count, vocab_count.values(), vocab_freq.values()),
    key=lambda triple: triple[1],  # order by raw count
)

[('¼', 1, 3.1114351152211095e-07),
 ('ū', 1, 3.1114351152211095e-07),
 ('ǎ', 1, 3.1114351152211095e-07),
 ('Э', 1, 3.1114351152211095e-07),
 ('Н', 1, 3.1114351152211095e-07),
 ('О', 1, 3.1114351152211095e-07),
 ('э', 1, 3.1114351152211095e-07),
 ('Р', 1, 3.1114351152211095e-07),
 ('Е', 1, 3.1114351152211095e-07),
 ('ф', 1, 3.1114351152211095e-07),
 ('В', 1, 3.1114351152211095e-07),
 ('С', 1, 3.1114351152211095e-07),
 ('中', 1, 3.1114351152211095e-07),
 ('华', 1, 3.1114351152211095e-07),
 ('共', 1, 3.1114351152211095e-07),
 ('和', 1, 3.1114351152211095e-07),
 ('国', 1, 3.1114351152211095e-07),
 ('世', 1, 3.1114351152211095e-07),
 ('界', 1, 3.1114351152211095e-07),
 ('大', 1, 3.1114351152211095e-07),
 ('团', 1, 3.1114351152211095e-07),
 ('结', 1, 3.1114351152211095e-07),
 ('¾', 1, 3.1114351152211095e-07),
 ('ů', 1, 3.1114351152211095e-07),
 ('½', 1, 3.1114351152211095e-07),
 ('😂', 1, 3.1114351152211095e-07),
 ('Ô', 1, 3.1114351152211095e-07),
 ('≈', 1, 3.1114351152211095e-07),
 ('‰', 1, 3.11143511

In [8]:
# Every distinct character of the corpus as one string, most frequent first
full_vocab = ''.join(
    sorted(vocab_count, key=lambda char: vocab_count[char], reverse=True)
)
full_vocab

' esanrutioldpmcév,gq.f’bhjàyèx-\'0Aê\xa0LCJ)(kzPEM12!ISN:D…´B53T»«çùôwUFO4VR/?G"HQ6âK87—9ûW€îïZ$YœÇ–‘%=X_À;°>+~É“”ëо*íе[]тани×&срмукñлв<пóбá²дь`″℅úйö„#чы¿гя′юх@ºü|шзЯşăìÊãÎж→\tщИМ人民万岁¡￼¼ūǎЭНОэРЕфВС中华共和国世界大团结¾ů½😂Ô≈‰Čý😞șš™⅓³'

## Normalize some of the text characters

In [9]:
def normalize_txt(txt):
    """Replace typographic character variants in ``txt`` with plain ASCII equivalents.

    Substitutions applied:
      * non-breaking space (``\\xa0``) -> regular space
      * “ ” » «                        -> "
      * ‘ ` ´ ’                        -> '
      * …                              -> ...
      * – —                            -> -

    Returns the normalized string; all other characters are left untouched.
    """
    # One translation table covering every substitution, applied in a single call.
    substitutions = {'\xa0': ' ', '…': '...'}
    substitutions.update(dict.fromkeys('“”»«', '"'))
    substitutions.update(dict.fromkeys('‘`´’', "'"))
    substitutions.update(dict.fromkeys('–—', '-'))

    return txt.translate(str.maketrans(substitutions))

In [10]:
# Rebind `txt` to its normalized form; every later cell works on the normalized text
txt = normalize_txt(txt)

In [11]:
# Recount characters on the current `txt` and list them again, most frequent first
vocab_count = {char: count for char, count in Counter(txt).items()}
chars_by_count = sorted(vocab_count.items(), key=operator.itemgetter(1), reverse=True)
full_vocab = ''.join(char for char, _ in chars_by_count)
full_vocab

' esanrutioldpmcév.\',gqfbhjàyèx-0AêLC"J)(kzPEM12!ISN:DB53TçùôwUFO4VR/?GHQ6âK879ûW€îïZ$YœÇ%=X_À;°>+~Éëо*íе[]тани×&срмукñлв<пóбá²дь″℅úйö„#чы¿гя′юх@ºü|шзЯşăìÊãÎж→\tщИМ人民万岁¡￼¼ūǎЭНОэРЕфВС中华共和国世界大团结¾ů½😂Ô≈‰Čý😞șš™⅓³'

## Restrict text to a sensible vocabulary

In [12]:
# Characters to keep in the text. The string is listed in sorted code-point order with no
# duplicates, which the final check `''.join(sorted(set(txt))) == vocab` relies on.
# NOTE(review): the 'о' between 'œ' and '€' looks like Cyrillic U+043E rather than Latin 'o'
# (Latin 'o' is already covered by a-z) — confirm it is meant to be kept.
vocab = ' !"$%\'()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~°àâçèéêëîïôùûœо€'

In [13]:
# Restrict text to vocabulary
def restrict_to_vocab(txt, vocab):
    """Return ``txt`` with every character that is not found in ``vocab`` removed.

    Characters that are kept stay in their original order.
    """
    return ''.join(filter(lambda symbol: symbol in vocab, txt))

In [14]:
# Rebind `txt` to the filtered text: characters not listed in `vocab` are dropped
txt = restrict_to_vocab(txt, vocab)

In [15]:
# Sanity check: the distinct characters left in the text, in sorted order, must spell `vocab` exactly
remaining_chars = sorted(set(txt))
assert vocab == ''.join(remaining_chars)