In [1]:
import requests
from os import path
import csv
import numpy as np
import sys
import os
import codecs
import re
from IPython.display import Markdown
sys.path.insert(0, os.path.abspath('../..'))
from probability.empirical_distributions import FrequencyTable, DiscreteDistribution

%matplotlib inline

# Local cache location for the downloaded plain-text book.
fname = 'data/Moby.Dick.txt'
# exist_ok=True makes this a no-op when the directory is already present and
# avoids the check-then-create race of `exists` + `mkdir`.
os.makedirs('data', exist_ok=True)
# Download only once; later runs reuse the cached copy.
if not path.exists(fname):
    url = 'http://www.gutenberg.org/files/2701/2701-0.txt'
    # A timeout keeps a stalled connection from hanging the kernel forever.
    r = requests.get(url, allow_redirects=True, timeout=60)
    # Fail loudly on an HTTP error instead of caching an error page as the book.
    r.raise_for_status()
    # The context manager flushes and closes the handle, so the complete text
    # is on disk before the file is re-opened for reading.
    with codecs.open(fname, 'w', "utf-8") as file:
        file.write(r.content.decode('utf-8'))


In [2]:
# Load the text, one list entry per line of the file.
# 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading byte-order mark
# when one is present, so it cannot end up glued to the first word.
with codecs.open(fname, 'r', 'utf-8-sig') as file:
    # Strip the trailing terminator whether it is '\r\n', '\n' or '\r'
    # (the previous replace('\r\n', '') left bare '\n' / '\r' in place).
    lines = [l.rstrip('\r\n') for l in file.readlines()]

In [3]:
# One array entry per line of text (despite the name, the lines are not joined).
single_line = np.array(lines)

In [4]:
def replace_char(line):
    """Normalise one line of text so it can be tokenised by splitting on spaces.

    Steps, in order:
      1. collapse every run of spaces into a single space;
      2. delete the characters , ; : ’ * # [ ] ( ) ! ? “ ” _ / and -;
      3. lower-case the text;
      4. replace every '.' with ' STOP_WORD '.

    Args:
        line: the text to normalise (str).

    Returns:
        The normalised string. It can still contain consecutive spaces
        (around STOP_WORD, or where punctuation between two spaces was
        deleted), so callers splitting on " " should drop empty tokens.
    """
    # convert more-than-one spaces to one
    line = re.sub(r" +", " ", line)
    # remove punctuations; a raw string keeps the regex escapes (\*, \#, \[ ...)
    # from being parsed as invalid string-literal escape sequences
    line = re.sub(r"[,;:’\*\#\[\]()!?“”_/\-]", "", line)
    # lower case -- done before the marker is inserted so STOP_WORD stays upper-case
    line = line.lower()
    # replace . by STOP_WORD
    line = line.replace(".", " STOP_WORD ")
    return line
# Flat token list for the whole text: skip empty lines, normalise each remaining
# line, split it on single spaces and drop the empty strings that repeated
# spaces leave behind.
words = [
    token
    for text_line in single_line
    if len(text_line) > 0
    for token in replace_char(text_line).split(" ")
    if len(token) > 0
]
# Frequency table over the individual tokens.
dist = FrequencyTable(words)

In [5]:
# Ten most common tokens, shown as a Markdown table (normalised=False).
r = dict(dist.most_common(10))
Markdown(FrequencyTable(r).to_table(sort=True, normalised=False))

|X1       |frequency|
|---------|---------|
|      the|14594.0  |
|STOP_WORD|8188.0   |
|       of|6711.0   |
|      and|6448.0   |
|        a|4705.0   |
|       to|4659.0   |
|       in|4210.0   |
|     that|2951.0   |
|      his|2522.0   |
|       it|2383.0   |
|**total**|57371.0    |

In [6]:
from itertools import tee

def to_tuple(iterable, n=2):
    """Yield overlapping n-tuples: s -> (s0, s1), (s1, s2), (s2, s3), ...

    Args:
        iterable: any iterable; it is consumed lazily through `tee` copies.
        n: size of the sliding window (default 2).

    Returns:
        A `zip` iterator of n-tuples of consecutive items, starting with the
        very first item. It is empty when the input has fewer than n items.
    """
    iterables = tee(iterable, n)
    # Copy number k must start k items in. The previous nested loop advanced
    # copy k by k + 1 items, which silently dropped the first window.
    for offset, shifted in enumerate(iterables):
        for _ in range(offset):
            next(shifted, None)

    return zip(*iterables)

# Consecutive word pairs; pairs whose first element is the STOP_WORD marker are
# left out of the distribution.
two_words = to_tuple(words)
dist2 = DiscreteDistribution(
    [(first, second) for first, second in two_words if first != "STOP_WORD"]
)

In [7]:
# Ten most common word pairs as a Markdown table (normalised=False).
r = dict(dist2.most_common(10))
Markdown(DiscreteDistribution(r).to_table(sort=True, normalised=False))

|X1 |X2 |title
|---|---|----
| of|the|1911.0|
| in|the|1189.0|
| to|the|743.0|
|from|the|444.0|
|and|the|372.0|
| of|his|371.0|
| on|the|356.0|
| of|  a|338.0|
| at|the|332.0|
| to| be|329.0|


In [8]:
# Consecutive word triples; a triple is kept only when neither its first nor
# its second element is the STOP_WORD marker (the third may be).
three_words = to_tuple(words, 3)
dist3 = DiscreteDistribution(
    [
        trigram
        for trigram in three_words
        if "STOP_WORD" not in trigram[:2]
    ]
)

In [9]:
# Twenty most common word triples as a Markdown table (normalised=False).
r = dict(dist3.most_common(20))
Markdown(DiscreteDistribution(r).to_table(sort=True, normalised=False))

|X1   |X2   |X3   |title
|-----|-----|-----|--
|  the|sperm|whale|86.0|
|   of|  the|whale|78.0|
|  the|white|whale|71.0|
|  one|   of|  the|64.0|
|  out|   of|  the|57.0|
|   of|  the|  sea|57.0|
| part|   of|  the|53.0|
|    a| sort|   of|51.0|
|  the|  sea|STOP_WORD|45.0|
|   of|  the|sperm|43.0|
|  the|whale|STOP_WORD|35.0|
|   it|  was|    a|33.0|
|   in|  the|  sea|32.0|
|  the|sperm|whales|31.0|
|  for|    a|moment|29.0|
|   of|  the| boat|29.0|
|   it|   is|    a|29.0|
|   of|  the| ship|28.0|
|   of|  the|whales|28.0|
|   to|  the| deck|27.0|


In [10]:
# Condition the word-triple distribution on its "X1" column; the result is
# indexed by an X1 value below (cond["sea"]).
# NOTE(review): presumably this yields P(X2, X3 | X1) -- confirm against
# DiscreteDistribution.condition_on.
cond = dist3.condition_on("X1")

In [11]:
# Pick the entry of the conditioned distribution for X1 == "sea"; it is
# iterated as (X2, X3) pairs and queried with prob(X2=..., X3=...) below.
dist4 = cond["sea"]

In [12]:
# Show the first ten (X2, X3) pairs of dist4 with their probability, 5 decimals.
for word2, word3 in list(dist4)[:10]:
    pair_probability = dist4.prob(X2=word2, X3=word3)
    print(word2, word3, "\t\t%.5f" % pair_probability)

a bouncing 		0.00324
adding largely 		0.00324
after them 		0.00324
again in 		0.00324
air as 		0.00324
all whose 		0.00324
almost perpetually 		0.00324
alternate with 		0.00324
among waves 		0.00324
and a 		0.00324


In [13]:
# Reduce the word-triple distribution with X1 fixed to "sea".
# NOTE(review): presumably this keeps only triples whose first word is "sea"
# and leaves the remaining two columns -- confirm against
# DiscreteDistribution.reduce.
reduced_dist = dist3.reduce(X1="sea")

In [14]:
# Twenty most common entries of the reduced distribution, this time rendered
# with normalised=True.
r = dict(reduced_dist.most_common(20))
Markdown(DiscreteDistribution(r).to_table(sort=True, normalised=True))

|X1 |X2|title
|---|--|-
|and|the|0.14285714285714285|
| as| a|0.11904761904761904|
| in| a|0.07142857142857142|
| as|if|0.07142857142857142|
|such| a|0.047619047619047616|
| on|the|0.047619047619047616|
|even|as|0.047619047619047616|
|disappeared|in|0.047619047619047616|
| by|the|0.047619047619047616|
| as|the|0.047619047619047616|
|and|though|0.047619047619047616|
|and|in|0.047619047619047616|
|and|finally|0.047619047619047616|
|almost|perpetually|0.023809523809523808|
|all|whose|0.023809523809523808|
|air|as|0.023809523809523808|
|again|in|0.023809523809523808|
|after|them|0.023809523809523808|
|adding|largely|0.023809523809523808|
|  a|bouncing|0.023809523809523808|


In [15]:
# Marginal of the word-triple distribution with respect to "X2" and "X3".
# NOTE(review): the saved output of the following cell shows a single X1
# column, which suggests the named columns are the ones summed out -- confirm
# against DiscreteDistribution.marginal.
marginalised_dist = dist3.marginal("X2", "X3")

In [16]:
# Twenty most common entries of the marginal as a Markdown table
# (normalised=False).
r = dict(marginalised_dist.most_common(20))
Markdown(DiscreteDistribution(r).to_table(sort=True, normalised=False))

|X1 |frequency|
|---|-----|
|the|14594.0|
|of|6704.0|
|and|6448.0|
|a|4691.0|
|to|4642.0|
|in|4197.0|
|that|2926.0|
|his|2511.0|
|it|2190.0|
|i|1906.0|
|but|1775.0|
|with|1763.0|
|he|1742.0|
|as|1730.0|
|is|1720.0|
|was|1630.0|
|for|1621.0|
|all|1457.0|
|this|1396.0|
|at|1322.0|


In [17]:
# Number of leading tokens used for the product benchmark below; named once
# here instead of repeating the literal in both constructor calls.
SAMPLE_SIZE = 100000
# Two frequency tables over the same leading slice of `words`, built with
# different second arguments ("X1" / "X2").
# NOTE(review): the second argument presumably names the table's column --
# confirm against FrequencyTable.
dist_ten_1 = FrequencyTable(words[:SAMPLE_SIZE], "X1")
dist_ten_2 = FrequencyTable(words[:SAMPLE_SIZE], "X2")

In [20]:
# Profile the `*` product of the two frequency tables with %prun; the result is
# bound to prod_1.
# NOTE(review): this cell's saved execution count (20) is higher than that of
# the following cell (19), so the cells were not last run top to bottom.
%prun prod_1 = dist_ten_1 * dist_ten_2

 

In [19]:
# Profile the same product through product_multi_proc, passing 4 as its second
# argument; this rebinds prod_1.
# NOTE(review): 4 is presumably the number of worker processes -- confirm
# against FrequencyTable.product_multi_proc.
%prun prod_1 = dist_ten_1.product_multi_proc(dist_ten_2, 4)

 

In [20]:
# Number of keys in the product table. The saved output (166435801) equals
# 12901 ** 2, the square of the key count reported for dist_ten_1 below.
len(prod_1.keys())

166435801

In [21]:
# Number of keys in one of the two input tables, for comparison with the size
# of the product table.
len(dist_ten_1.keys())

12901