# Baby Babel

DS 5001 Text as Data


## Purpose 

Implements the Library of Babel with a small symbol set and message length.

## Set Up

In [4]:
import re
import pandas as pd
import numpy as np
import seaborn as sns
from IPython.core.display import HTML

In [5]:
sns.set()

### Import Config

In [7]:
import configparser
config = configparser.ConfigParser()

In [8]:
# Load machine-specific paths from the shared env.ini.
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so check that list and fail here with a clear
# message instead of with a KeyError on the lookups below.
env_ini = "../../../env.ini"
if not config.read(env_ini):
    raise FileNotFoundError(f"Config file not found or unreadable: {env_ini}")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [9]:
data_home, output_dir

('/Users/Samantha/Desktop/MSDS/DS5001/data',
 '/Users/Samantha/Desktop/MSDS/DS5001/output')

## Mini Babel

We create a miniature Library of Babel, one based on only four characters, and a message length of six. 

### The Symbol Set

In [12]:
mini_alpha = list('abt ')

### All possible Messages of Length 6

This is a clumsy but visually effective way to demonstrate how the Library of Babel might have been constructed. It is essentially the Cartesian product of the alphabet with itself, repeated once for each character position in the message (six times here).

In [14]:
# Enumerate every 6-character string over mini_alpha using one explicit loop
# per character position (L1 is the first character, L6 the last).
# The outermost loop varies slowest, so the books are appended in the order of
# mini_alpha with the first character as the most significant position.
mini_library_list = []
for L1 in mini_alpha:
    for L2 in mini_alpha:
        for L3 in mini_alpha:
            for L4 in mini_alpha:
                for L5 in mini_alpha:
                    for L6 in mini_alpha:
                        # Join the six chosen characters into one book string.
                        mini_library_list.append(''.join((L1,L2,L3,L4,L5,L6)))

In [15]:
df1 = pd.DataFrame(mini_library_list, columns=['book'])

In [16]:
df1.sample(10)

Unnamed: 0,book
2185,tatatb
3667,tbba
757,at bb
859,a bbt
3914,batt
2223,tatt
2484,tbt ba
3526,b abt
3854,aa t
2069,taabbb


How many books are in the library?

In [18]:
len(mini_library_list), len(mini_alpha) ** 6, df1.shape[0]

(4096, 4096, 4096)

Can we find a specific book?

In [20]:
my_book = 'at bat'

In [21]:
mini_library_list.index(my_book)

722

In [22]:
df1[df1.book == my_book].index[0]

722

### The Pandas Way

Pandas provides a method -- `pd.MultiIndex.from_product()` -- to create a cartesian product of an arbitrary list of lists.

Let's create a library based on a book length $L = 6$.

In [24]:
L = 6

In [25]:
# One copy of the alphabet per character position; their cartesian product
# enumerates every possible book as a tuple of L characters.
alpha_lists = [mini_alpha] * L
book_idx = pd.MultiIndex.from_product(alpha_lists)

# Join each tuple into a single string explicitly rather than relying on a
# row-wise sum over string columns. The result is a one-column frame named
# 'book' with a fresh 0..N-1 integer index, in the same order as book_idx.
mini_library = book_idx.map(''.join).to_frame(index=False, name='book')

In [26]:
mini_library

Unnamed: 0,book
0,aaaaaa
1,aaaaab
2,aaaaat
3,aaaaa
4,aaaaba
...,...
4091,t
4092,a
4093,b
4094,t


The number of books should equal $|a|^L$, where $|a|$ is the size of the symbol set and $L$ is the (fixed) message length.

In [28]:
len(mini_library) == len(mini_alpha)**L

True

In [29]:
mini_library[mini_library.book == 'at bat']

Unnamed: 0,book
722,at bat


### Probability of a book

In [31]:
N = len(mini_library)

In [32]:
assert N == len(mini_alpha)**L # types**tokens

In [33]:
p_book = 1 / N

In [34]:
p_book

0.000244140625

### Entropy of `mini_library`

Max Entropy (every book equally likely, so $p = \frac{1}{N}$): $H_{max} = \sum_{i=1}^{N}\frac{1}{N}\log_2\left(\frac{N}{1}\right) = N\cdot\frac{1}{N}\log_2(N) = \log_2(N)$

In [36]:
H_max = np.log2(N)

In [37]:
H_max

12.0

### Sample text

In [39]:
mini_text = mini_library.sample(100, replace=True).book.str.cat(sep=' ')

In [40]:
mini_text

'tt a   baba t b batt t tbaa t attb aaba a t a a  bat  b aatatb  a  b  aa a   b  b   b bata   aatb aaat a tbb tt bbta b b att  tatb b abtb b tab bt ba   a ttbbaa ataa   aabb a    abt atttbb t  tat bb  t  tattta  abbta b a ta a b ta tababb     t  atttb   atbaa abb b  atattt btat b baat t ttabbb t t bt a btab aa a a a b tb    att attb t b t a  ab aab  b atb b ba t bbt  t a aa t  bb b   batb  tta tt atabbb bb       a a  a  b a tb aat a t  t b tb a bbtba  t t ab taaabt  b  tt b ab t   tt t tttbaa baattt t tb    bb tt batb a t ba b btt b  a t t    bttt bata   a tb   bataab t  bbb aaa  b at att ttatbt bb b b baatbb tbt at bbb  t  t atb t   aa t tt a btabbb taaa   btt bb  b btt a  tab a  bta tbt bb'

In [41]:
display(HTML(mini_text))

## A Bigger Babel

In [43]:
class UnigramModel():
    """A simple character level language model. A language model is just
    the sample space of the symbol system with associated probabilities.

    Attributes:
        alpha: The symbol set (a space plus the lowercase letters a-z).
        model: DataFrame indexed by `char` (one row per symbol in `alpha`)
            with a count column `n` and a probability column `p_x`.
    """

    alpha: list = list(' abcdefghijklmnopqrstuvwxyz')

    def __init__(self):
        # Start from a uniform distribution: one pseudo-count per symbol.
        self.model = pd.DataFrame(index=self.alpha)
        self.model.index.name = 'char'
        self.model['n'] = 1
        self.model['p_x'] = 1 / len(self.alpha)

    def update_weights(self, char_str=''):
        """Re-estimate `n` and `p_x` from the characters of `char_str`.

        The text is lowercased and characters outside `alpha` are ignored.
        Symbols that never occur get a count (and probability) of 0 rather
        than NaN. If the text contains no symbol from `alpha` at all, the
        current weights are left unchanged, since no distribution can be
        estimated from zero observations.

        Args:
            char_str: The text to count characters from.
        """
        self.char_str = char_str.lower()
        # dtype=object keeps an empty input from producing a float Series.
        self.chars = pd.Series(list(self.char_str), dtype=object)
        self.chars = self.chars[self.chars.isin(self.alpha)]

        # Align the observed counts with the full symbol set so that unseen
        # symbols appear explicitly with a count of zero.
        counts = self.chars.value_counts().reindex(self.model.index, fill_value=0)
        total = counts.sum()
        if total == 0:
            return

        self.model['n'] = counts
        self.model['p_x'] = counts / total

In [44]:
UGM = UnigramModel()

In [45]:
UGM.model

Unnamed: 0_level_0,n,p_x
char,Unnamed: 1_level_1,Unnamed: 2_level_1
,1,0.037037
a,1,0.037037
b,1,0.037037
c,1,0.037037
d,1,0.037037
e,1,0.037037
f,1,0.037037
g,1,0.037037
h,1,0.037037
i,1,0.037037


In [46]:
class Babel():
    """Produce random messages by sampling characters from a unigram model.

    Attributes:
        msg_len: Number of characters drawn for each message.
        use_html: When true, the message is wrapped in a styled <div> and
            rendered as HTML; otherwise it is kept and shown as plain text.
    """

    msg_len:int = 40 * 80
    use_html = True

    def __init__(self, UGM:UnigramModel=UGM):
        self.UGM = UGM

    def get_message(self):
        """Draw `msg_len` characters (with replacement, weighted by `p_x`)
        and store the resulting text in `self.msg`."""
        draws = self.UGM.model.sample(n=self.msg_len, replace=True, weights='p_x')
        text = draws.index.str.cat()
        if not self.use_html:
            self.msg = text
            return
        self.msg = f"<div style='width:6in;font-size:14pt;font-family:monospace;'>{text}</div>"

    def print_message(self):
        """Display `self.msg`, as rendered HTML when `use_html` is set."""
        payload = HTML(self.msg) if self.use_html else self.msg
        display(payload)

In [47]:
B1 = Babel(UGM)

In [48]:
B1.get_message()
B1.print_message()

## Add Data to Model

### Import corpus

In [51]:
text_csv = f'{output_dir}/austen-combo-TOKENS.csv'

In [52]:
text_df = pd.read_csv(text_csv)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/Samantha/Desktop/MSDS/DS5001/output/austen-combo-TOKENS.csv'

In [61]:
text_df.head()

NameError: name 'text_df' is not defined

### Convert to one big string

In [None]:
text_str = text_df.token_str.str.cat(sep=' ')

In [None]:
len(text_str)

In [None]:
text_str[:80]

In [None]:
# One row per character occurrence in the corpus string, together with its
# lowercased form.
CHARS = pd.DataFrame({'char_token': list(text_str)})
CHARS = CHARS.assign(char_type=CHARS['char_token'].str.lower())

# Frequency table of the lowercased characters, in a column named 'n'.
CHARSET = CHARS['char_type'].value_counts().to_frame(name='n')

In [None]:
CHARSET.plot.bar(rot=0, figsize=(15,5));

### Update weights in model

In [None]:
UGM.update_weights(text_str)

In [None]:
UGM.model.head()

In [None]:
UGM.model.p_x.sort_values().plot(kind='barh', figsize=(5,10));

In [None]:
B1.get_message()
B1.print_message()

## Create a Bigram Model

The index is the sample space.

In [None]:
class BigramModel():
    """A character-level bigram language model over a fixed alphabet.

    Attributes:
        alpha: The symbol set (a space plus the lowercase letters a-z).
        idx: MultiIndex of every ordered pair (`char_x`, `char_y`) of symbols.
        model: DataFrame indexed by those pairs (the sample space) with a
            count column `n` and a joint probability column `p_xy`;
            `add_conditional_probs()` adds the column `p_yGx`.
        H: Set by `get_conditional_entropy()`; one row per `char_x` with the
            column `h_yGx`.
    """

    alpha: list = list(' abcdefghijklmnopqrstuvwxyz')

    def __init__(self):
        self.idx = pd.MultiIndex.from_product([self.alpha, self.alpha], names=['char_x', 'char_y'])
        # Uniform starting point: every pair has a count of 1.
        self.model = pd.DataFrame(dict(n=1, p_xy=(1/len(self.idx))), index=self.idx).sort_index()

    def update_weights(self, text_str):
        """Re-estimate `n` and `p_xy` from the character pairs in `text_str`.

        The text is lowercased (as `UnigramModel.update_weights` does) and
        characters outside `alpha` are dropped before pairing. A space is
        padded on both ends so the first and last characters also take part
        in a pair. Every count is incremented by one (Laplace smoothing), so
        no pair ends up with zero probability.

        Args:
            text_str: The text to count character bigrams from.
        """
        symbols = set(self.alpha)  # constant-time membership checks
        char_list = [char for char in text_str.lower() if char in symbols]
        pairs = pd.DataFrame(dict(
            char_x = [' '] + char_list,
            char_y = char_list + [' ']
        ))
        # Align observed pair counts with the full sample space; pairs that
        # never occur get an explicit 0 before smoothing.
        counts = pairs.value_counts().reindex(self.model.index, fill_value=0)
        self.model['n'] = counts + 1 # Laplace smoothing
        self.model['p_xy'] = self.model['n'] / self.model['n'].sum()

    def add_conditional_probs(self):
        """Add `p_yGx`, the probability of `char_y` given the preceding `char_x`.

        Each count is divided by the total count of its `char_x` group, so
        the values for a given `char_x` sum to 1.
        """
        row_totals = self.model.groupby(level='char_x')['n'].transform('sum')
        self.model['p_yGx'] = self.model['n'] / row_totals

    def get_conditional_entropy(self):
        """Computes the entropy for each character of the distribution of following characters.

        The conditional probabilities are recomputed first, so the result
        always reflects the current counts. The result is stored in `self.H`.
        """
        self.add_conditional_probs()
        p_cond = self.model['p_yGx']
        self.H = (p_cond * np.log2(1 / p_cond))\
            .groupby(level='char_x').sum()\
            .to_frame('h_yGx')

In [None]:
BGM = BigramModel()

### Get Data to Estimate Model

In [None]:
BGM.update_weights(text_str)

In [None]:
BGM.model.n.unstack()

### Add Conditional Probabilities

In [None]:
BGM.add_conditional_probs()

In [None]:
BGM.model.p_yGx.sort_values(ascending=False).head(10).plot.barh();

### Get conditional entropy of characters as antecedents

Note that all the vowels have high entropy rates.

Interestingly, so does n.

In [None]:
BGM.get_conditional_entropy()

In [None]:
BGM.H.h_yGx.sort_values().plot.barh(figsize=(10,10));

### Look at Examples

In [None]:
X = BGM.model.p_yGx.unstack()
X = round(X * 100, 2)

In [None]:
X.style.format("{:.2f}").background_gradient(cmap='YlGnBu', axis=1)

In [None]:
X.style.format("{:.2f}").background_gradient(cmap='YlGnBu', axis=0)

In [None]:
X.style.format("{:.2f}").background_gradient(cmap='YlGnBu', axis=None)

In [None]:
# sns.set(rc = {'figure.figsize':(15,8)})
# sns.heatmap(data=BGM.model.p_yGx.unstack(), 
#     cmap='YlGnBu', 
#     square=True, 
#     vmin=0, 
#     vmax=1, 
#     cbar=False);

In [None]:
def plot_char(char, model=None):
    """Bar-plot the distribution of characters that follow `char`.

    The bars are the `p_yGx` values for `char`, sorted from most to least
    likely, and the title shows the entropy `h_yGx` of `char` rounded to two
    decimals.

    Args:
        char: The preceding character (a `char_x` value of the model).
        model: An object exposing `.model` (with a `p_yGx` column) and `.H`
            (with an `h_yGx` column). Defaults to the notebook-level `BGM`.
    """
    # Reading a global needs no `global` statement; fall back to the
    # notebook's BGM only when no model is passed explicitly.
    bgm = BGM if model is None else model
    h = bgm.H.loc[char].h_yGx.round(2)
    title = f"Char {char}, H={h}"
    bgm.model.loc[char].p_yGx.sort_values(ascending=False).plot.bar(rot=0, figsize=(10,2), title=title);

In [None]:
plot_char('q')

In [None]:
plot_char('v')

In [None]:
plot_char('h')

In [None]:
plot_char('p')

In [None]:
plot_char('a')

In [None]:
plot_char(' ')

## Generate text

In [None]:
# Load the reference word list (one entry per line) as a set of lowercased,
# whitespace-stripped strings. The context manager guarantees the file handle
# is closed once the set has been built.
with open(f"{data_home}/misc/english-words.txt", 'r') as word_file:
    english_words = {word.strip().lower() for word in word_file}

In [None]:
class Babel2():
    """Generate messages based on a character level bigram language model.

    Attributes:
        msg_len: Number of characters generated after the initial space.
        use_html: When true, `print_message()` renders the message as HTML
            with recognised words highlighted; otherwise it shows plain text.
        msg: The raw generated text (set by `get_message()`).
        tokens: DataFrame of whitespace-separated tokens with an `en` flag.
        vocab: DataFrame of distinct tokens with count `n`, the `en` flag
            and the token length `len`.
        html_msg: The HTML rendering produced by the last `print_message()`
            call when `use_html` is true.
    """

    msg_len:int = 80 * 40
    use_html = True

    def __init__(self, BGM:'BigramModel', english_words):
        self.BGM = BGM
        self.english_words = english_words

    def get_message(self):
        """Generate a message and derive its token and vocabulary tables.

        Starting from a space, each next character is sampled from the rows
        of `BGM.model` for the previous character, weighted by `p_yGx`.
        """
        cond_tables = {}  # rows of the model per previous char, looked up once
        prev = ' '
        chars = [prev]
        for _ in range(self.msg_len):
            if prev not in cond_tables:
                cond_tables[prev] = self.BGM.model.loc[prev]
            prev = cond_tables[prev].sample(weights='p_yGx').index.values[0]
            chars.append(prev)
        self.msg = ''.join(chars)

        self.tokens = pd.DataFrame(self.msg.split(), columns=['token_str'])
        self.vocab = self.tokens.token_str.value_counts().to_frame('n')
        # Use the word list given to this instance, not a notebook global.
        self.vocab['en'] = self.vocab.index.isin(self.english_words)
        self.vocab['len'] = self.vocab.index.str.len()
        self.tokens['en'] = self.tokens.token_str.map(self.vocab.en)

    def _render_html(self):
        """Return the message as an HTML paragraph with known words in bold red."""
        html_msg = ''
        for token in self.msg.split():
            if token in self.english_words:
                token = f"<b style='color:red;'>{token}</b>"
            html_msg += ' ' + token
        return f"<p style='color:gray;width:6in;font-size:14pt;font-family:monospace;'>{html_msg}</p>"

    def print_message(self):
        """Display the current message.

        The HTML rendering is kept in `self.html_msg`; `self.msg` stays the
        raw text, so this method can be called repeatedly with the same
        result.
        """
        if self.use_html:
            self.html_msg = self._render_html()
            display(HTML(self.html_msg))
        else:
            display(self.msg)

In [None]:
B2 = Babel2(BGM, english_words)
B2.get_message()
B2.print_message()

## Extra: Look at Babel Vocab Stats

### English words

In [None]:
B2.vocab.loc[B2.vocab.en == True, ['n','len']]\
    .sort_values('n', ascending=False)

### Type and token ratios

In [None]:
# Share of distinct generated words (types) and of all generated tokens that
# are real English words.
en_vocab = B2.vocab[B2.vocab.en == True]
type_share = en_vocab.n.count() / B2.vocab.n.count()
token_share = en_vocab.n.sum() / B2.vocab.n.sum()

type_rate = round(type_share, 2)
token_rate = round(token_share, 2)

# Take the ratio of the unrounded shares: dividing two values already rounded
# to two decimals can distort the ratio noticeably when the shares are small.
type_rate, token_rate, round(type_share / token_share, 2)

### Long words

In [None]:
B2.vocab.query("en == False").sort_values('len', ascending=False).head(20)

### Word lengths

In [None]:
B2.vocab.len.value_counts().sort_index().plot.bar(rot=0);

In [None]:
B2.vocab.query("en == True").len.value_counts().sort_index().plot.bar();

In [None]:
B2.vocab.len.mean()

In [None]:
B2.vocab[B2.vocab.en == True].len.mean()

## Memorable passwords?

In [None]:
B2.vocab[(B2.vocab.en == False) & B2.vocab.len.isin([6,7,8])].sample(10)

## Challenge

Build a language model using word lengths. Requires training a word length model from real English.