# Homework 3

```yaml
Course:   DS 5001 
Module:   03 Language Models
Topic:    Homework 3
Author:   Ryan Lipps
Date:     2/1/2024
```

In [1]:
import numpy as np
import pandas as pd
import textimporter

In [2]:
import configparser
config = configparser.ConfigParser()
config.read("../../../env.ini")
data_home = config['DEFAULT']['data_home']
output_dir = config['DEFAULT']['output_dir']

In [3]:
OHCO = ['chap_id','para_num','sent_num','token_num']

In [4]:
text_file = f"{data_home}/gutenberg/pg42324.txt"

In [5]:
ohco_pats = [('chap', r'^(?:LETTER|CHAPTER|PREFACE)\b', 'm')]
clip_pats = [r'START', r'END']
timporter = textimporter.TextImporter(src_file=text_file, ohco_pats=ohco_pats, clip_pats=clip_pats)
timporter.import_source().parse_tokens(special_tokens=['_'])
print(timporter.TOKENS.head())
print(timporter.gather_tokens(1))

Importing  /Users/ryanlipps/Documents/MSDS/DS5001/data/gutenberg/pg42324.txt
Clipping text
Parsing OHCO level 0 chap_id by milestone ^(?:LETTER|CHAPTER|PREFACE)\b
Parsing OHCO level 1 para_num by delimiter \n\n
Parsing OHCO level 2 sent_num by delimiter [.?!;:]+
Parsing OHCO level 3 token_num by delimiter [\s',-,_,--]+
                                    token_str term_str
chap_id para_num sent_num token_num                   
1       0        0        0               The      the
                          1             event    event
                          2                on       on
                          3             which    which
                          4              this     this
                                                       para_num_str
chap_id para_num                                                   
1       0         the event on which this fiction is founded has...
        1         i have thus endeavoured to preserve the truth ...
        2         the 

  new_pat = re.compile(pat)


In [6]:
TOKENS = timporter.TOKENS
TOKENS.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str
chap_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,0,The,the
1,0,0,1,event,event
1,0,0,2,on,on
1,0,0,3,which,which
1,0,0,4,this,this


In [7]:
VOCAB = timporter.extract_vocab().VOCAB
VOCAB

Unnamed: 0_level_0,n,n_chars,p,s,i,h
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
the,4248,3,0.055695,17.955038,4.166317,0.232042
and,2991,3,0.039214,25.500836,4.672473,0.183228
i,2858,1,0.037471,26.687544,4.738095,0.177540
of,2683,2,0.035176,28.428252,4.829253,0.169875
to,2118,2,0.027769,36.011804,5.170398,0.143575
...,...,...,...,...,...,...
execrated,1,9,0.000013,76273.000000,16.218885,0.000213
spectators,1,10,0.000013,76273.000000,16.218885,0.000213
constrained,1,11,0.000013,76273.000000,16.218885,0.000213
attest,1,6,0.000013,76273.000000,16.218885,0.000213


In [8]:
def get_ngrams(TOKEN, n=2, sent_key='sent_num'):

    OHCO = TOKEN.index.names
    grouper = list(OHCO)[:OHCO.index(sent_key)+1]

    PADDED = TOKEN.groupby(grouper)\
        .apply(lambda x: '<s> ' + ' '.join(x.term_str) + ' <s>')\
        .apply(lambda x: pd.Series(x.split()))\
        .stack().to_frame('term_str')
    PADDED.index.names = grouper + ['token_num']

    for i in range(1, n):
        PADDED = PADDED.join(PADDED.term_str.shift(-i), rsuffix=i)

    PADDED.columns = [f'w{j}' for j in range(n)]

    return PADDED

In [9]:
unigrams = get_ngrams(TOKENS, 1)
unigrams.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,w0
chap_id,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,<s>
1,0,0,1,the
1,0,0,2,event
1,0,0,3,on
1,0,0,4,which


In [10]:
bigrams = get_ngrams(TOKENS, 2)
bigrams.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,w0,w1
chap_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0,0,0,<s>,the
1,0,0,1,the,event
1,0,0,2,event,on
1,0,0,3,on,which
1,0,0,4,which,this


In [11]:
trigrams = get_ngrams(TOKENS, 3)
trigrams.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,w0,w1,w2
chap_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,<s>,the,event
1,0,0,1,the,event,on
1,0,0,2,event,on,which
1,0,0,3,on,which,this
1,0,0,4,which,this,fiction


## Question 1:
List six words that precede the word "monster," excluding stop words (and sentence boundary markers).

### Answer 1:

In [12]:
stopwords = [
    '<s>',
    'a',
    'an',
    'and',
    'are',
    'as',
    'at',
    'be',
    'but',
    'by',
    'for',
    'if',
    'in',
    'into',
    'is',
    'it',
    'no',
    'not',
    'of',
    'on',
    'or',
    'such',
    'that',
    'the',
    'their',
    'then',
    'there',
    'these',
    'they',
    'this',
    'to',
    'was',
    'will',
    'with'
]

trigrams.query('(w1 == "monster" or w2 == "monster")\
             and ~((w0.isin(@stopwords) or w1.isin(@stopwords)))')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,w0,w1,w2
chap_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12,3,17,25,miserable,monster,whom
17,8,0,1,abhorred,monster,<s>
22,25,4,23,detestable,monster,<s>
23,28,0,1,hideous,monster,<s>
31,4,9,5,hellish,monster,drink
31,17,6,2,gigantic,monster,they


In [13]:
def get_ngram_counts(NGRAM):
    "Compress the sequences into counts"
    
    n = len(NGRAM.columns)
    C = [None for i in range(n)]
    
    for i in range(n):

        # Count distinct ngrams
        C[i] = NGRAM.iloc[:, :i+1].value_counts().to_frame('n').sort_index()
    
        # Get joint probabilities (MLE)
        C[i]['p'] = C[i].n / C[i].n.sum()
        C[i]['i'] = np.log2(1/C[i].p)

        # Get conditional probabilities (MLE)
        if i > 0:
            C[i]['cp'] = C[i].n / C[i-1].n
            C[i]['ci'] = np.log2(1/C[i].cp)
            
    return C

In [14]:
tgcounts = get_ngram_counts(trigrams)
tgcounts[2].sort_values('n')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,n,p,i,cp,ci
w0,w1,w2,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
life,which,from,1,0.000012,16.396337,0.333333,1.584963
paper,it,is,1,0.000012,16.396337,1.000000,0.000000
paper,signs,for,1,0.000012,16.396337,1.000000,0.000000
papers,<s>,<s>,1,0.000012,16.396337,1.000000,0.000000
papers,can,come,1,0.000012,16.396337,1.000000,0.000000
...,...,...,...,...,...,...,...
<s>,<s>,<s>,366,0.004243,7.880637,0.068411,3.869623
<s>,<s>,the,373,0.004324,7.853305,0.069720,3.842291
<s>,<s>,and,421,0.004881,7.678661,0.078692,3.667647
<s>,<s>,but,457,0.005298,7.560287,0.085421,3.549273


## Question 2:
List the following sentences in ascending order of bigram perplexity according to the language model generated from the text: 

The monster is on the ice.
Flowers are happy things.
I have never seen the aurora borealis.
He never knew the love of a family.

### Answer 2:

In [15]:
ngrams = 3
widx = [f"w{i}" for i in range(ngrams)]

In [16]:
def ngrams_to_models(ngrams):
    global widx
    n = len(ngrams.columns)
    model = [None for i in range(n)]
    for i in range(n):
        if i == 0:
            model[i] = ngrams.value_counts('w0').to_frame('n')
            model[i]['p'] = model[i].n / model[i].n.sum()
            model[i]['i'] = np.log2(1/model[i].p)
        else:
            model[i] = ngrams.value_counts(widx[:i+1]).to_frame('n')    
            model[i]['cp'] = model[i].n / model[i-1].n
            model[i]['i'] = np.log2(1/model[i].cp)
        model[i] = model[i].sort_index()
    return model

#### Build model from Frankenstein text

In [17]:
frank_model = ngrams_to_models(trigrams)

In [18]:
VOCAB['n_chars'] = VOCAB.index.str.len()
VOCAB['modified_term_str'] = VOCAB.index
VOCAB.loc[(VOCAB.n == 1) & (VOCAB.n_chars < 3), 'modified_term_str'] = "<UNK>"

In [19]:
VOCAB.loc[VOCAB['modified_term_str'] == '<UNK>']

Unnamed: 0_level_0,n,n_chars,p,s,i,h,modified_term_str
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2d,1,2,1.3e-05,76273.0,16.218885,0.000213,<UNK>
3,1,1,1.3e-05,76273.0,16.218885,0.000213,<UNK>
du,1,2,1.3e-05,76273.0,16.218885,0.000213,<UNK>
la,1,2,1.3e-05,76273.0,16.218885,0.000213,<UNK>
dr,1,2,1.3e-05,76273.0,16.218885,0.000213,<UNK>
19,1,2,1.3e-05,76273.0,16.218885,0.000213,<UNK>
w,1,1,1.3e-05,76273.0,16.218885,0.000213,<UNK>
ne,1,2,1.3e-05,76273.0,16.218885,0.000213,<UNK>
er,1,2,1.3e-05,76273.0,16.218885,0.000213,<UNK>
n,1,1,1.3e-05,76273.0,16.218885,0.000213,<UNK>


In [20]:
TOKENS['modified_term_str'] = TOKENS.term_str.map(VOCAB.modified_term_str)
TOKENS.loc[TOKENS['modified_term_str'] == '<UNK>']

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,token_str,term_str,modified_term_str
chap_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,12,Dr,dr,<UNK>
6,4,1,1,W,w,<UNK>
7,34,0,2,19,19,<UNK>
13,16,4,2,n,n,<UNK>
17,3,8,5,ne,ne,<UNK>
17,3,8,6,er,er,<UNK>
25,17,9,7,La,la,<UNK>
25,20,0,1,3,3,<UNK>
28,45,5,7,du,du,<UNK>
31,39,0,1,2d,2d,<UNK>


In [21]:
def token_to_padded(token, grouper=['sent_num'], term_str='term_str'):
    ohco = token.index.names # We preserve these since they get lost in the shuffle
    padded = token.groupby(grouper)\
        .apply(lambda x: '<s> ' + ' '.join(x[term_str]) + ' </s>')\
        .apply(lambda x: pd.Series(x.split()))\
        .stack().to_frame('term_str')
    padded.index.names = ohco
    return padded

In [22]:
PADDED = token_to_padded(TOKENS, grouper=OHCO[:3], term_str='modified_term_str')
PADDED

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,term_str
chap_id,para_num,sent_num,token_num,Unnamed: 4_level_1
1,0,0,0,<s>
1,0,0,1,the
1,0,0,2,event
1,0,0,3,on
1,0,0,4,which
...,...,...,...,...
31,82,1,11,in
31,82,1,12,darkness
31,82,1,13,and
31,82,1,14,distance


In [23]:
def padded_to_ngrams(padded, grouper=['sent_num'], n=2):
    
    ohco = padded.index.names
    ngrams = padded.groupby(grouper)\
        .apply(lambda x: pd.concat([x.shift(0-i) for i in range(n)], axis=1))\
        .reset_index(drop=True)
    ngrams.index = padded.index
    ngrams.columns = widx

    # ngrams = pd.concat([padded.shift(0-i) for i in range(n)], axis=1)
    # ngrams.index.name = 'ngram_num'
    # ngrams.columns = widx
    # ngrams = ngrams.fillna('<EOF>')
    
    return ngrams

In [24]:
NGRAMS = padded_to_ngrams(PADDED, OHCO[:3], ngrams)
NGRAMS

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,w0,w1,w2
chap_id,para_num,sent_num,token_num,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,0,0,<s>,the,event
1,0,0,1,the,event,on
1,0,0,2,event,on,which
1,0,0,3,on,which,this
1,0,0,4,which,this,fiction
...,...,...,...,...,...,...
31,82,1,11,in,darkness,and
31,82,1,12,darkness,and,distance
31,82,1,13,and,distance,</s>
31,82,1,14,distance,</s>,


In [25]:
def ngrams_to_models(ngrams):
    global widx
    n = len(ngrams.columns)
    model = [None for i in range(n)]
    for i in range(n):
        if i == 0:
            model[i] = ngrams.value_counts('w0').to_frame('n')
            model[i]['p'] = model[i].n / model[i].n.sum()
            model[i]['i'] = np.log2(1/model[i].p)
        else:
            model[i] = ngrams.value_counts(widx[:i+1]).to_frame('n')    
            model[i]['cp'] = model[i].n / model[i-1].n
            model[i]['i'] = np.log2(1/model[i].cp)
        model[i] = model[i].sort_index()
    return model

In [26]:
M = ngrams_to_models(NGRAMS)

In [27]:
qsents = """
The monster is on the ice
Flowers are happy things
I have never seen the aurora borealis
He never knew the love of a family
""".split('\n')[1:-1]

In [28]:
QUEST_SENTS = pd.DataFrame({'sent_str':qsents})
QUEST_SENTS.index.name = 'sent_num'
QUEST_SENTS

Unnamed: 0_level_0,sent_str
sent_num,Unnamed: 1_level_1
0,The monster is on the ice
1,Flowers are happy things
2,I have never seen the aurora borealis
3,He never knew the love of a family


In [29]:
TEST_TOKENS = QUEST_SENTS.sent_str.str.split(expand=True).stack().to_frame('token_str')
TEST_TOKENS.index.names = ['sent_num', 'token_num']
TEST_TOKENS['term_str'] = TEST_TOKENS.token_str.str.replace(r'[\W_]+', '').str.lower()
TEST_TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,token_str,term_str
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,The,the
0,1,monster,monster
0,2,is,is
0,3,on,on
0,4,the,the
0,5,ice,ice
1,0,Flowers,flowers
1,1,are,are
1,2,happy,happy
1,3,things,things


In [30]:
TEST_TOKENS.loc[~TEST_TOKENS.term_str.isin(M[0].index), 'term_str'] = "<UNK>"
TEST_TOKENS

Unnamed: 0_level_0,Unnamed: 1_level_0,token_str,term_str
sent_num,token_num,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0,The,the
0,1,monster,monster
0,2,is,is
0,3,on,on
0,4,the,the
0,5,ice,ice
1,0,Flowers,flowers
1,1,are,are
1,2,happy,happy
1,3,things,things


In [31]:
TEST_PADDED = token_to_padded(TEST_TOKENS)
TEST_NGRAMS = padded_to_ngrams(TEST_PADDED, 'sent_num', ngrams)

In [32]:
def test_model(model, ngrams, sents):
    
    global widx
    
    assert len(model) == len(ngrams.columns)
    
    n = len(model)
    ohco = ngrams.index.names
    
    R = []
    for i in range(n):
        T = ngrams.merge(M[i], on=widx[:i+1], how='left')
        T.index = ngrams.index
        T = T.reset_index().set_index(ohco + widx).i #.to_frame(f"i{i}")
        
        # This how we handle unseen combos
        T[T.isna()] = T.max()
        R.append(T.to_frame(f"i{i}"))
                
    return pd.concat(R, axis=1)

In [33]:
R = test_model(M,TEST_NGRAMS, QUEST_SENTS)

In [34]:
def compute_perplexity(results, test_sents, n=3):
    for i in range(n):
        test_sents[f"pp{i}"] = np.exp2(results.groupby('sent_num')[f"i{i}"].mean())
    return test_sents

In [35]:
PP = compute_perplexity(R, QUEST_SENTS)
PP.sort_values('pp1', ascending=True)

Unnamed: 0_level_0,sent_str,pp0,pp1,pp2
sent_num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,I have never seen the aurora borealis,293.194117,38.781269,82.690599
0,The monster is on the ice,116.027537,81.175781,70.041494
3,He never knew the love of a family,171.288239,137.555757,65.905218
1,Flowers are happy things,590.352464,538.146514,187.0


## Question 3:
Using the bigram model represented as a matrix, explore the relationship between bigram pairs using the following lists. Hint: use the .unstack() method on the feature n and then use .loc[] to select the first list from the index, and the second list from the columns.

1) ['he','she'] to select the indices.
2) ['said','heard'] to select the columns.

### Answer 3:

In [36]:
M[1].n.unstack()

w1,1,11th,12th,13th,17,1816,1817,18th,2,26th,...,younger,youngest,youngster,your,yours,yourself,yourselves,youth,youthful,zeal
w0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
11th,,,,,1.0,,,,,,...,,,,,,,,,,
12th,,,,,1.0,,,,,,...,,,,,,,,,,
13th,,,,,1.0,,,,,,...,,,,,,,,,,
17,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yourself,,,,,,,,,,,...,,,,,,,,,,
yourselves,,,,,,,,,,,...,,,,,,,,,,
youth,,,,,,,,,,,...,,,,,,,,,,
youthful,,,,,,,,,,,...,,,,,,,,,,


In [37]:
M[1].n.unstack().loc[['he', 'she'],['said', 'heard']]

w1,said,heard
w0,Unnamed: 1_level_1,Unnamed: 2_level_1
he,21.0,5.0
she,3.0,3.0


'He said' occurrs more than 'he heard', 'she said', or 'she heard'. This seems to indicate that there is more speaking than listening, but this makes sense as it is generally understood that people hear what is spoken in books — it does not need to be explicitly said unless it is some kind of gossip situation.

## Question 4:
Generate 20 sentences using the generate_text() function. Display the results.

### Answer 4

In [38]:
def generate_text(M, n=250):
    
    if len(M) < 3:
        raise ValueError("Must have trigram model generated.")
    
    # Start list of words
    first_word = M[1].loc['<s>'].sample(weights='cp').index[0]
    
    words = ['<s>', first_word]
    
    for i in range(n):
        
        bg = tuple(words[-2:])

        # Try trigram model
        try:
            next_word = M[2].loc[bg].sample(weights='cp').index[0]

        # If not found in model, back off ...
        except KeyError as e1:
            try:
                # Get the last word in the bigram
                ug = bg[1]
                next_word = M[1].loc[ug].sample(weights='cp').index[0]
            
            except KeyError as e2:
                next_word = M[0].sample(weights='p').index[0]
                
        words.append(next_word)
    
    
    text = ' '.join(words[2:])
    print('\n\n'.join([str(i+1) + ' ' + line.replace('<s>','')\
        .strip().upper() for i, line in enumerate(text.split('</s>'))]))

In [39]:
generate_text(M, n=300)

1 SYMPATHY WAS OURS

2 THEY ARE PREJUDICED AGAINST ME A WIDE FIELD FOR THE MOMENT

3 THE VARIOUS LAKES OF LUCERNE AND URI WHERE THE AFFECTIONS OF A RELATION

4 DRAUGHT AND ORDERED US TO PROLONG OUR JOURNEY AS FAR AS WAS IN REALITY VERY ILL

5 SUBSTANCE OF THEM

6 UNDERTAKING SUCH AS NO LANGUAGE CAN DESCRIBE THEIR HORROR AND DESPAIR

7 TO EVERY PART OF IT WAS DARK WHEN I AM AT LENGTH SHE FORMED HER DETERMINATION

8 DEMANDED ADMISSION INTO THE MERITS OF THE JUDGES FROM THEIR SETTLED CONVICTION IN THE HABITATION I HAD JUST PASSED OF SUCH SCENES TO FORGET THE WORLD AND ITS INHABITANTS

9 

10 A TENDENCY TO WEAKEN YOUR AFFECTIONS AND HIS FATHER SISTER AND ALL WILL BE COOL PERSEVERING AND PRUDENT

11 MONSTER WOULD DEPART FOR THE PRECARIOUS STATE OF MIND I BETOOK MYSELF TO THE SCIENCE OF WORDS OR LETTERS

12 

13 WHICH YOUR EYES WAS THE UTMOST SADNESS AND DESPONDENCY

14 I AM ACQUAINTED

15 I DID CONFESS

16 SUPPOSE SOME ASTONISHMENT WAS EXHIBITED IN MY DESTRUCTION AND AS HE SHOULD SPEEDILY B

## Question 5
Compute the redundancy for each of the n-gram models using the MLE of the joint probability of each ngram type. In other words, for each model, just use the `.mle` feature as $p$ in computing $H = \sum{p(ng)log_2(1/p(ng))}$. Does $R$ increase, decrease, or remain the same as the choice of n-gram increases in length? Hint: Remember that $R = 1-\frac{H}{H_{max}}$, where $H$ is the actual entropy of the model and $H_{max}$ is its maximum entropy.

* If mle is not a feature in your models, just use p for the unigram model and compute p for the other two models by dividing n by the sum of n, i.e.

    ```python
    M[1]['p'] = M[1].n /  M[1].n.sum()
    M[2]['p'] = M[2].n /  M[2].n.sum()
    ```

* N is computed as the number of all possible combinations for each ngram. So, for the bigram model N is the number of unigrams (i.e. the vocabulary size plus the sentence boundary signs) squared, and for the trigram model the value is cubed, i.e.

    ```python
    N = len(M[0].index)**{i+1}
    ```

### Answer 5:


In [40]:
H_uni = sum(M[0].p*np.log2(1/M[0].p))
H_uni


8.834341350295313

In [41]:
M[1]['p'] = M[1].n /  M[1].n.sum()
M[2]['p'] = M[2].n /  M[2].n.sum()

In [42]:
H_bi = sum(M[1].p*np.log2(1/M[1].p))
H_tri = sum(M[2].p*np.log2(1/M[2].p))

In [43]:
H_max_uni = np.log2(len(M[0].index))
H_max_uni


12.776021715252812

In [44]:
H_max_bi = np.log2(len(M[0].index)**2)
H_max_bi

25.552043430505623

In [45]:
H_max_tri = np.log2(len(M[0].index)**3)
H_max_tri

38.32806514575844

In [46]:
R_uni = 1 - (H_uni/H_max_uni)
R_bi = 1 - (H_bi/H_max_bi)
R_tri = 1 - (H_tri/H_max_tri)

In [47]:
R_uni, R_bi, R_tri

(0.3085217333539497, 0.4466058378572145, 0.5876206389812104)

#### The redundancy increases as the n-gram increases in length