In [1]:
import requests
from os import path
import csv
import numpy as np
import sys
import os
import codecs
import re
from IPython.display import Markdown
sys.path.insert(0, os.path.abspath('../..'))
from probability.distributions import FrequencyTable, DiscreteDistribution

%matplotlib inline

# Local cache location for the plain-text copy of Moby Dick.
fname = 'data/Moby.Dick.txt'
# exist_ok makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs('data', exist_ok=True)
if not path.exists(fname):
    url = 'http://www.gutenberg.org/files/2701/2701-0.txt'
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, allow_redirects=True, timeout=30)
    # Fail loudly on an HTTP error instead of caching an error page as the book text.
    r.raise_for_status()
    # The context manager flushes and closes the handle, so the cached file is
    # complete on disk before a later cell opens it for reading.
    with codecs.open(fname, 'w', "utf-8") as file:
        file.write(r.content.decode('utf-8'))


In [2]:
# Load the text; the context manager closes the handle even if reading fails.
with codecs.open(fname, 'r', 'utf-8') as file:
    # Strip the trailing line terminator from every line. rstrip handles both
    # '\r\n' and a bare '\n' or '\r', so the result does not depend on the
    # line-ending convention of the downloaded file.
    lines = [l.rstrip('\r\n') for l in file.readlines()]

In [3]:
# Turn the list of line strings into a NumPy array with one element per line.
# NOTE: despite the name, this is not one concatenated string; later cells
# iterate over it line by line.
single_line = np.r_[lines]

In [4]:
def replace_char(line):
    """Normalise one line of text before it is split into words.

    Steps, in order:
      1. collapse every run of spaces into a single space;
      2. delete the punctuation characters ``, ; : ’ * # [ ] ( ) ! ? “ ” _ / -``
         (they are removed, not replaced by a space, so ``whale-ship``
         becomes ``whaleship``);
      3. replace every ``.`` with the marker `` STOP_WORD `` (surrounded by
         spaces) so it becomes a token of its own.

    Args:
        line: the text of one line.

    Returns:
        The normalised line as a ``str``. It may still contain consecutive
        spaces (introduced by steps 2 and 3); callers are expected to drop
        empty tokens after splitting.
    """
    # convert more-than-one spaces to one
    line = re.sub(r" +", " ", line)
    # remove punctuation; a raw string avoids invalid-escape warnings, and
    # inside a character class only '[', ']' and '-' need escaping
    line = re.sub(r"[,;:’*#\[\]()!?“”_/\-]", "", line)
    # replace . by STOP_WORD
    line = line.replace(".", " STOP_WORD ")
    return line
# Tokenise every non-empty line on single spaces and keep only the non-empty
# tokens, preserving the order in which they appear in the text.
words = [
    token
    for raw_line in single_line
    if len(raw_line) > 0
    for token in replace_char(raw_line).split(" ")
    if len(token) > 0
]
# Build the frequency table over all tokens (including the STOP_WORD marker).
dist = FrequencyTable(words)

In [5]:
# Ten most common tokens, rendered as an un-normalised, sorted Markdown table.
r = dict(dist.most_common(10))
top_words_table = FrequencyTable(r).to_table(normalised=False, sort=True)
Markdown(top_words_table)

|X1       |frequency|
|---------|---------|
|      the|13873.0  |
|STOP_WORD|8188.0   |
|       of|6670.0   |
|      and|6078.0   |
|       to|4589.0   |
|        a|4556.0   |
|       in|3972.0   |
|     that|2886.0   |
|      his|2464.0   |
|       it|2104.0   |
|**total**|55380.0    |

In [6]:
from itertools import tee

def to_tuple(iterable, n=2):
    """Return an iterator over the sliding windows of ``n`` consecutive items.

    For ``n=2``: s -> (s0, s1), (s1, s2), (s2, s3), ...

    Args:
        iterable: any iterable; it is consumed lazily through ``itertools.tee``.
        n: window length (default 2).

    Returns:
        A ``zip`` iterator of ``n``-tuples. It yields nothing when the input
        has fewer than ``n`` items.
    """
    iterables = tee(iterable, n)
    # Copy number k must start k items in. Advancing every copy (including
    # copy 0) one step too far would silently drop the first window.
    for offset, it in enumerate(iterables):
        for _ in range(offset):
            next(it, None)

    return zip(*iterables)

# Sliding pairs of consecutive tokens. Pairs that start with the sentence
# marker are dropped; pairs that merely end with it are kept.
two_words = to_tuple(words)
dist2 = DiscreteDistribution(
    [(first, second) for first, second in two_words if first != "STOP_WORD"]
)

In [7]:
# Ten most common word pairs, rendered as an un-normalised, sorted Markdown table.
r = dict(dist2.most_common(10))
top_pairs_table = DiscreteDistribution(r).to_table(normalised=False, sort=True)
Markdown(top_pairs_table)

|X1       |X2       |title
|---------|---------|---------
|       of|      the|1896.0   |
|       in|      the|1139.0   |
|       to|      the|733.0    |
|     from|      the|437.0    |
|       of|      his|371.0    |
|      and|      the|365.0    |
|       of|        a|337.0    |
|       on|      the|336.0    |
|     with|      the|325.0    |
|       to|       be|320.0    |


In [8]:
# Sliding triples of consecutive tokens. A triple is dropped when its first or
# second token is the sentence marker; a marker in third position is kept.
three_words = to_tuple(words, 3)
dist3 = DiscreteDistribution(
    [
        (first, second, third)
        for first, second, third in three_words
        if not (first == "STOP_WORD" or second == "STOP_WORD")
    ]
)

In [9]:
# Twenty most common word triples, rendered as an un-normalised, sorted Markdown table.
r = dict(dist3.most_common(20))
top_triples_table = DiscreteDistribution(r).to_table(normalised=False, sort=True)
Markdown(top_triples_table)

|X1       |X2       |X3       |title
|---------|---------|---------|---------
|       of|      the|    whale|73.0     |
|      the|    Sperm|    Whale|62.0     |
|      one|       of|      the|61.0     |
|       of|      the|      sea|57.0     |
|      the|    White|    Whale|53.0     |
|     part|       of|      the|53.0     |
|      out|       of|      the|53.0     |
|        a|     sort|       of|49.0     |
|      the|      sea|STOP_WORD|45.0     |
|      the|    whale|STOP_WORD|31.0     |
|       of|      the|    Sperm|31.0     |
|       in|      the|      sea|31.0     |
|       of|      the|     boat|29.0     |
|       to|      the|     deck|27.0     |
|       of|      the|     ship|27.0     |
|      for|        a|   moment|26.0     |
|       of|      the|   whales|26.0     |
|       by|       no|    means|25.0     |
|      for|      the|     time|25.0     |
|       in|    order|       to|24.0     |


In [10]:
# Condition the trigram distribution on its first variable, X1.
# Presumably this yields P(X2, X3 | X1), indexable by an X1 value; confirm
# against the DiscreteDistribution.condition_on implementation.
cond = dist3.condition_on(["X1"])

In [11]:
# Pick the conditional entry for X1 == "sea". Presumably this is the
# distribution of the two words (X2, X3) that follow "sea"; confirm against
# the library.
dist4 = cond["sea"]

In [12]:
# Show the first ten (X2, X3) keys of the conditional together with their probability.
first_ten_keys = list(dist4)[:10]
for word2, word3 in first_ten_keys:
    probability = dist4.prob(X2=word2, X3=word3)
    print(word2, word3, "\t\t%.5f" % probability)

Bonapartes and 		0.00340
Both were 		0.00340
But I 		0.00340
But hardly 		0.00340
But suddenly 		0.00340
Gods voice 		0.00340
I go 		0.00340
I once 		0.00340
I protested 		0.00340
Look see 		0.00340


In [13]:
# Reduce the trigram distribution to the rows where X1 == "sea".
# Presumably X1 is dropped and the remaining (X2, X3) counts are kept
# un-normalised; confirm against DiscreteDistribution.reduce.
reduced_dist = dist3.reduce({"X1":"sea"})

In [14]:
# Twenty most common continuations of "sea", rendered as a normalised, sorted
# Markdown table.
r = dict(reduced_dist.most_common(20))
sea_followers_table = DiscreteDistribution(r).to_table(normalised=True, sort=True)
Markdown(sea_followers_table)

|X1         |X2       |title
|-----------|---------|-----------
|        and|      the|0.15       |
|         as|        a|0.125      |
|         as|       if|0.075      |
|         on|      the|0.05       |
|         in|        a|0.05       |
|       even|       as|0.05       |
|disappeared|       in|0.05       |
|         by|      the|0.05       |
|         as|      the|0.05       |
|        and|   though|0.05       |
|        and|       in|0.05       |
|        and|  finally|0.05       |
|          I|     once|0.025      |
|          I|       go|0.025      |
|       Gods|    voice|0.025      |
|        But| suddenly|0.025      |
|        But|   hardly|0.025      |
|        But|        I|0.025      |
|       Both|     were|0.025      |
| Bonapartes|      and|0.025      |


In [15]:
# Marginalise the trigram distribution over X2 and X3.
# NOTE(review): the table rendered in the next cell has a single variable
# column, so the listed names appear to be the variables summed out (leaving
# X1); confirm against DiscreteDistribution.marginal.
marginalised_dist = dist3.marginal(["X2", "X3"])

In [16]:
# Twenty most common entries of the marginal, rendered as an un-normalised,
# sorted Markdown table.
r = dict(marginalised_dist.most_common(20))
marginal_table = DiscreteDistribution(r).to_table(normalised=False, sort=True)
Markdown(marginal_table)

|X1       |frequency|
|---------|---------|
|  the|13873.0  |
|   of|6663.0   |
|   and|6078.0   |
|   to|4572.0   |
|   a|4556.0   |
|   in|3961.0   |
|   that|2861.0   |
|   his|2453.0   |
|   it|1911.0   |
|   I|1906.0   |
|   with|1703.0   |
|   is|1700.0   |
|   was|1620.0   |
|   as|1609.0   |
|   he|1534.0   |
|   for|1420.0   |
|   all|1400.0   |
|   this|1281.0   |
|   at|1238.0   |
|   by|1156.0   |
