# Prototype of Minimal Google NGram Viewer  
Bo Shen  
11/26/2017

In [1]:
from os import listdir
from os.path import isfile, join

In [3]:
# Directory holding the sample text used to prototype the counter.
mypath = 'test'
# List the files that will be read. The call form of print works on both
# Python 2 and Python 3 (and once print_function has been imported).
print(listdir(mypath))

['tiny.txt']


## Build ngram counter on a tiny text file

In [22]:
# Build an n-gram counter over every text file found in `mypath`.
from collections import deque

counter = {}  # maps "w1 w2 ... wn" -> number of occurrences across all files

n = 2  # n-gram size
for fileName in listdir(mypath):
    path = join(mypath, fileName)
    # Skip sub-directories and other non-regular entries: open() would fail on them.
    if not isfile(path):
        continue
    with open(path) as f:
        # Sliding window over the most recent words. It is recreated for each
        # file so that an n-gram never spans two files.
        ngram_temp = deque()
        for word in f.read().split():
            print(word)
            ngram_temp.append(word)
            if len(ngram_temp) == n:
                print(ngram_temp)
                ngram = ' '.join(ngram_temp)
                # Drop the oldest word so the window is back to n-1 words.
                ngram_temp.popleft()
                counter[ngram] = counter.get(ngram, 0) + 1
print(counter)

to
be
deque(['to', 'be'])
or
deque(['be', 'or'])
not
deque(['or', 'not'])
to
deque(['not', 'to'])
be
deque(['to', 'be'])
just
deque(['be', 'just'])
be
deque(['just', 'be'])
who
deque(['be', 'who'])
you
deque(['who', 'you'])
want
deque(['you', 'want'])
to
deque(['want', 'to'])
be
deque(['to', 'be'])
or
deque(['be', 'or'])
not
deque(['or', 'not'])
okay
deque(['not', 'okay'])
you
deque(['okay', 'you'])
want
deque(['you', 'want'])
okay
deque(['want', 'okay'])
{'not okay': 1, 'who you': 1, 'you want': 2, 'want to': 1, 'be or': 2, 'be who': 1, 'be just': 1, 'okay you': 1, 'just be': 1, 'to be': 3, 'not to': 1, 'or not': 2, 'want okay': 1}


In [27]:
# Sanity check: the per-n-gram counts must add up to the total number of
# n-grams seen (18 for the tiny sample file).
print(sum(counter.values()))

18


In [34]:
# Turn the counter dict into a pandas Series (index: n-gram, value: count),
# labelled with the source directory name, then display it.
import pandas as pd
s = pd.Series(counter, name=mypath)
s

be just      1
be or        2
be who       1
just be      1
not okay     1
not to       1
okay you     1
or not       2
to be        3
want okay    1
want to      1
who you      1
you want     2
Name: test, dtype: int64

## Consider how to store ngram counter

### Save the merged results as a dataframe and write it to CSV files

In [35]:
# Start from an empty frame and outer-join the Series onto it as a column.
# The joined frame is only displayed here: the result is not assigned,
# so `df` itself stays empty after this cell.
df = pd.DataFrame()
df.join(s, how = 'outer')

Unnamed: 0,test
be just,1
be or,2
be who,1
just be,1
not okay,1
not to,1
okay you,1
or not,2
to be,3
want okay,1


### Save the counter as pickle files?

## Test 2-gram on given input files

### Obtain the folders containing text files

In [1]:
from os import getcwd, listdir

# Entries of the current working directory; the year-named ones are kept below.
cwd = getcwd()
folders = listdir(cwd)
# Folder names that count as a year: "1912" .. "2017" inclusive.
candidate_folders = {str(year) for year in range(1912, 2018)}
# listdir() returns entries in arbitrary order; sort so that anything built
# from this list (e.g. one dataframe column per year) is in chronological
# order on every run.
text_folders = sorted(folder for folder in folders if folder in candidate_folders)
print(text_folders)

['2012', '2013', '2014', '2015', '2016']


### Create a dataframe for 2-gram

In [2]:
import pandas as pd
from collections import deque

# One row per n-gram, one column per year folder; filled in folder by folder.
df = pd.DataFrame()

n = 2  # n-gram size
# Each folder contains the text files of one specific year.
for folder in text_folders:
    counter = {}  # n-gram -> number of occurrences within this folder
    # Read all files from the folder of that year.
    for fileName in listdir(folder):
        path = folder + '/' + fileName
        with open(path) as f:
            # maxlen=n turns the deque into a sliding window: appending to a
            # full deque discards the oldest word, so no explicit popleft is
            # needed. A fresh window per file keeps n-grams from spanning files.
            ngram_temp = deque(maxlen=n)
            for word in f.read().split():
                ngram_temp.append(word)
                if len(ngram_temp) == n:
                    ngram = ' '.join(ngram_temp)
                    counter[ngram] = counter.get(ngram, 0) + 1
        print(path)  # progress log: this file has been processed
    # Each folder becomes one column of the dataframe.
    s = pd.Series(counter, name=folder)
    # The outer join keeps n-grams that are missing from other years (those
    # cells become NaN). A plain `df[folder] = s` would align to df's existing
    # index and drop n-grams that are not already present.
    df = df.join(s, how = 'outer')

df.head()

(615027, 5)


Unnamed: 0,2012,2013,2014,2015,2016
! !,,6.0,,,
! Judge,,2.0,,,
""" That",1.0,,,,
"""$200 reward""",,1.0,,,
"""& factus",1.0,,,,


In [3]:
# An n-gram that is missing from a year was simply never seen there, so NaN -> 0.
# Rebind instead of using inplace=True: the cell then produces its result from
# its input explicitly rather than mutating a frame displayed by an earlier cell.
df = df.fillna(value = 0)
df.head()

Unnamed: 0,2012,2013,2014,2015,2016
! !,0.0,6.0,0.0,0.0,0.0
! Judge,0.0,2.0,0.0,0.0,0.0
""" That",1.0,0.0,0.0,0.0,0.0
"""$200 reward""",0.0,1.0,0.0,0.0,0.0
"""& factus",1.0,0.0,0.0,0.0,0.0


In [4]:
# Relative frequency of the 2-gram '" That' per column: its count in each
# column divided by that column's total count over all n-grams.
results = df.loc['" That',:] / df.sum()

In [5]:
# Inspect the index of the result (it carries the dataframe's column labels).
results.index

Index([u'2012', u'2013', u'2014', u'2015', u'2016'], dtype='object')

In [48]:
# Persist the n-gram count table to a CSV file in the working directory.
df.to_csv("2gram.csv")

In [51]:
# Reload the saved table; the first CSV column holds the n-gram labels,
# so it is used as the index.
df = pd.read_csv("2gram.csv", index_col = 0)
df.head()

Unnamed: 0,2012,2013,2014,2015,2016
! !,0.0,6.0,0.0,0.0,0.0
! Judge,0.0,2.0,0.0,0.0,0.0
""" That",1.0,0.0,0.0,0.0,0.0
"""$200 reward""",0.0,1.0,0.0,0.0,0.0
"""& factus",1.0,0.0,0.0,0.0,0.0


In [54]:
# Check whether the CSV file exists (path is relative to the working directory).
import os.path
os.path.isfile("2gram.csv") 

True

### Test the ngram_counter module

In [2]:
from ngram_counter import ngramCounter

In [3]:
# Call the project's ngramCounter helper with 1 and preview the first rows.
# NOTE(review): ngramCounter is defined in ngram_counter.py, not visible here;
# presumably the argument is the n-gram size and the result is the per-year
# count table - confirm against that module.
df1 = ngramCounter(1)
df1.head()

Unnamed: 0,2012,2013,2014,2015,2016
!,0.0,8.0,0.0,0.0,0.0
"""",1.0,0.0,0.0,0.0,0.0
"""$200",0.0,1.0,0.0,0.0,0.0
"""&",2.0,0.0,0.0,0.0,0.0
"""'A",0.0,0.0,0.0,1.0,0.0


In [6]:
# Same helper with 2 (presumably 2-grams - confirm in ngram_counter.py); preview the first rows.
df2 = ngramCounter(2)
df2.head()

Unnamed: 0,2012,2013,2014,2015,2016
! !,0.0,6.0,0.0,0.0,0.0
! Judge,0.0,2.0,0.0,0.0,0.0
""" That",1.0,0.0,0.0,0.0,0.0
"""$200 reward""",0.0,1.0,0.0,0.0,0.0
"""& factus",1.0,0.0,0.0,0.0,0.0


In [6]:
# Same helper with 3 (presumably 3-grams - confirm in ngram_counter.py); preview the first rows.
df3 = ngramCounter(3)
df3.head()

Unnamed: 0,2012,2013,2014,2015,2016
! ! !,0.0,4.0,0.0,0.0,0.0
! ! Judge,0.0,2.0,0.0,0.0,0.0
! Judge Thatcher,0.0,2.0,0.0,0.0,0.0
""" That in",1.0,0.0,0.0,0.0,0.0
"""$200 reward"" under",0.0,1.0,0.0,0.0,0.0


In [9]:
# Same helper with 4 (presumably 4-grams - confirm in ngram_counter.py); preview the first rows.
df4 = ngramCounter(4)
df4.head()

Unnamed: 0,2012,2013,2014,2015,2016
! ! ! !,0.0,2.0,0.0,0.0,0.0
! ! ! Judge,0.0,2.0,0.0,0.0,0.0
! ! Judge Thatcher,0.0,2.0,0.0,0.0,0.0
! Judge Thatcher surprised,0.0,2.0,0.0,0.0,0.0
""" That in Revenges,",1.0,0.0,0.0,0.0,0.0


In [8]:
# Same helper with 5 (presumably 5-grams - confirm in ngram_counter.py); preview the first rows.
df5 = ngramCounter(5)
df5.head()

Unnamed: 0,2012,2013,2014,2015,2016
! ! ! ! Judge,0.0,2.0,0.0,0.0,0.0
! ! ! Judge Thatcher,0.0,2.0,0.0,0.0,0.0
! ! Judge Thatcher surprised,0.0,2.0,0.0,0.0,0.0
! Judge Thatcher surprised Jim,0.0,2.0,0.0,0.0,0.0
""" That in Revenges, (that",1.0,0.0,0.0,0.0,0.0


In [7]:
# Look up the row labelled 'That is' in df2, i.e. its value in every column.
df2.loc['That is',:]

2012     4.0
2013     0.0
2014     9.0
2015    51.0
2016     3.0
Name: That is, dtype: float64

## Plot the result using Bokeh

In [4]:
from __future__ import print_function
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import numpy as np

from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [5]:
# Base figure with a numeric x axis spanning the years of interest.
p = figure(title="Ngram Frequency", plot_height=300, plot_width=600, x_range=(2010,2018))
# Relative frequency of the 2-gram '" That' per year column.
results = df.loc['" That',:] / df.sum()
# The column labels are year *strings* (e.g. '2012'). Convert them to ints so
# the points are placed on the numeric x_range above rather than being handed
# to Bokeh as categorical values it cannot map onto a numeric axis.
years = [int(year) for year in results.index]
r = p.line(years, results.values, color="#2222aa", line_width=3)

In [9]:
def update(ngram):
    """Refresh the plot for `ngram`.

    If `ngram` is a row label of the global `df`, the figure title becomes the
    n-gram and the line's y data becomes its count divided by each column's
    total. Otherwise only the title changes, to a not-found message. In both
    cases the notebook output is refreshed with push_notebook().
    """
    known = ngram in df.index
    if known:
        p.title.text = ngram
        frequencies = df.loc[ngram,:] / df.sum()
        r.data_source.data['y'] = frequencies.values
    else:
        p.title.text = "Can not find the ngram in the texts"
    # Both outcomes end by pushing the modified figure state to the notebook.
    push_notebook()

In [10]:
# Render the figure; notebook_handle=True requests a handle so that later
# push_notebook() calls can refresh this output.
show(p, notebook_handle=True)

In [11]:
# Bind update() to an interactive control for `ngram`, starting from '" That'.
interact(update, ngram = '" That')

<function __main__.update>