In [1]:
import operator

from collections import Counter

# Get text

In [2]:
with open('data/one_txt/wordpress.txt') as f:
    blogger = f.read()

In [3]:
with open('data/one_txt/wordpress.txt') as f:
    wordpress = f.read()

In [4]:
txt = wordpress + blogger

# Explore vocabulary

In [5]:
vocab_count = dict(Counter(txt))

In [6]:
vocab_freq = {char: count / len(txt) for char, count in vocab_count.items()}

In [7]:
sorted(zip(vocab_count.keys(), vocab_count.values(), vocab_freq.values()), key=operator.itemgetter(1))

[('À', 2, 3.6090724864163534e-07),
 ('¼', 2, 3.6090724864163534e-07),
 ('ū', 2, 3.6090724864163534e-07),
 ('ǎ', 2, 3.6090724864163534e-07),
 ('Î', 2, 3.6090724864163534e-07),
 ('Э', 2, 3.6090724864163534e-07),
 ('Н', 2, 3.6090724864163534e-07),
 ('О', 2, 3.6090724864163534e-07),
 ('э', 2, 3.6090724864163534e-07),
 ('Р', 2, 3.6090724864163534e-07),
 ('Е', 2, 3.6090724864163534e-07),
 ('ф', 2, 3.6090724864163534e-07),
 ('В', 2, 3.6090724864163534e-07),
 ('С', 2, 3.6090724864163534e-07),
 ('中', 2, 3.6090724864163534e-07),
 ('华', 2, 3.6090724864163534e-07),
 ('共', 2, 3.6090724864163534e-07),
 ('和', 2, 3.6090724864163534e-07),
 ('国', 2, 3.6090724864163534e-07),
 ('世', 2, 3.6090724864163534e-07),
 ('界', 2, 3.6090724864163534e-07),
 ('大', 2, 3.6090724864163534e-07),
 ('团', 2, 3.6090724864163534e-07),
 ('结', 2, 3.6090724864163534e-07),
 ('¾', 2, 3.6090724864163534e-07),
 ('ů', 2, 3.6090724864163534e-07),
 ('½', 2, 3.6090724864163534e-07),
 ('ì', 4, 7.218144972832707e-07),
 ('→', 4, 7.218144972

In [8]:
full_vocab = sorted(vocab_count.keys(), key=vocab_count.get, reverse=True)
full_vocab = ''.join(full_vocab)
full_vocab

' esanrutioldpcmvé,’gqfb.hjàyèx-0A\xa0êLCzk)(P!JEMN12SI:…D´»«BT53ùwôFUçV4O/RGKH?Qâ678—W9€ïûZî$Yœ‘_X–=%“~;”>+ëо°е*É\'тани×&срм[]уклвп<бдь`²″йúö„#"чыгяÇ′юх@ºáüñó|шзЯşăÊжì→\tíãщИМ人民万岁À¼ūǎÎЭНОэРЕфВС中华共和国世界大团结¾ů½'

## Normalize some of the text characters

In [9]:
def normalize(txt):

    # Non breaking spaces -> regular spaces
    txt = txt.replace('\xa0', ' ')

    # Double quotes
    double_quotes_chars = '“”»«'
    for double_quotes_char in double_quotes_chars:
        txt = txt.replace(double_quotes_char, '"')

    # Single quotes
    single_quote_chars = '‘`´’'
    for single_quote_char in single_quote_chars:
        txt = txt.replace(single_quote_char, "'")

    # Triple dots
    txt = txt.replace('…', '...')

    # Hyphens
    hyphen_chars = '–—'
    for hyphen_char in hyphen_chars:
        txt = txt.replace(hyphen_char, '-')

    return txt

In [10]:
txt = normalize(txt)

In [11]:
vocab_count = dict(Counter(txt))
full_vocab = sorted(vocab_count.keys(), key=vocab_count.get, reverse=True)
full_vocab = ''.join(full_vocab)
full_vocab

' esanrutioldpcmvé.\',gqfbhjàyèx-0AêLC"zk)(P!JEMN12SI:DBT53ùwôFUçV4O/RGKH?Qâ678W9€ïûZî$Yœ_X=%~;>+ëо°е*Éтани×&срм[]уклвп<бдь²″йúö„#чыгяÇ′юх@ºáüñó|шзЯşăÊжì→\tíãщИМ人民万岁À¼ūǎÎЭНОэРЕфВС中华共和国世界大团结¾ů½'

## Restrict text to a sensible vocabulary

In [12]:
vocab = ' !"$%\'()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~°àâçèéêëîïôùûœо€'

In [13]:
# Restrict text to vocabulary
def restrict_to_vocab(txt, vocab):
    txt = ''.join(char for char in txt if char in vocab)
    return txt

In [14]:
txt = restrict_to_vocab(txt, vocab)

In [15]:
# Double check new vocabulary
assert ''.join(sorted(set(txt))) == vocab

# Sanitize text

In [16]:
vocab

' !"$%\'()+,-./0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~°àâçèéêëîïôùûœо€'

In [17]:
blogger = restrict_to_vocab(normalize(blogger), vocab)

with open('data/one_txt/sanitized_blogger.txt', 'w') as f:
    f.write(blogger)

In [18]:
wordpress = restrict_to_vocab(normalize(wordpress), vocab)

with open('data/one_txt/sanitized_wordpress.txt', 'w') as f:
    f.write(wordpress)