In [1]:
#import nltk
#nltk.download('popular', halt_on_error=False)

In [2]:
import nltk as nltk
import pandas as pd
import re
from collections import Counter, defaultdict

## Edit Distance (a.k.a. Levenshtein Distance)

### Edit Distance between words

In [3]:
nltk.edit_distance('commuter', 'computer')

1

In [5]:
nltk.edit_distance('Computer', 'computer')

1

In [6]:
nltk.edit_distance('acorn', 'corn')

1

In [7]:
nltk.edit_distance('aacorn', 'corn')

2

In [7]:
nltk.edit_distance('soup', 'potato')

5

In [8]:
word_1 = 'chicago'

word_n = ['Chicago', 'university', 'composed', 'undergraduate', 'college', 'various',  'graduate', 'programs']
 
for word in word_n:
    editDistance = nltk.edit_distance(word_1, word)
    print(word, editDistance)

Chicago 1
university 9
composed 7
undergraduate 12
college 5
various 7
graduate 7
programs 7


### Edit distance between sentences

In [9]:
sent1 = "I love cookies"
sent2 = "i love cookies"
sent3 = "cookies I love"
sent4 = "I love cookies with tea"
sent5 = "I love tea with cookies"

ed_sent_1_2 = nltk.edit_distance(sent1, sent2)
ed_sent_1_3 = nltk.edit_distance(sent1, sent3)
ed_sent_1_4 = nltk.edit_distance(sent1, sent4)
ed_sent_1_5 = nltk.edit_distance(sent1, sent5)

print(ed_sent_1_2, 'Edit Distance between sent1 and sent2')
print(ed_sent_1_3, 'Edit Distance between sent1 and sent3')
print(ed_sent_1_4, 'Edit Distance between sent1 and sent4')
print(ed_sent_1_5, 'Edit Distance between sent1 and sent5')

1 Edit Distance between sent1 and sent2
12 Edit Distance between sent1 and sent3
9 Edit Distance between sent1 and sent4
9 Edit Distance between sent1 and sent5


## Jaccard Distance

Unlike Edit Distance, you cannot just run Jaccard Distance on the strings directly; you must first convert them to the set type.

### Jaccard Distance between words

In [11]:
nltk.jaccard_distance(set('commuter'), set('computer'))

0.125

In [10]:
nltk.jaccard_distance(set('Commuter'), set('computer'))

0.3333333333333333

In [12]:
nltk.jaccard_distance(set('acorn'), set('corn'))

0.2

In [13]:
nltk.jaccard_distance(set('aacorn'), set('corn'))

0.2

In [14]:
nltk.jaccard_distance(set('soup'), set('potato'))

0.6666666666666666

In [15]:
word_1 = 'chicago'

word_n = ['Chicago', 'university', 'composed', 'undergraduate', 'college', 'various',  'graduate', 'programs']
 
for word in word_n:
    jaccardDistance = nltk.jaccard_distance(set(word_1), set(word))
    print(word, jaccardDistance)

Chicago 0.14285714285714285
university 0.9285714285714286
composed 0.8181818181818182
undergraduate 0.8333333333333334
college 0.625
various 0.7
graduate 0.8181818181818182
programs 0.7


### Jaccard distance between sentences

In [9]:
sent1 = "I love cookies"
sent2 = "i love cookies"
sent3 = "cookies I love"
sent4 = "I love cookies with tea"
sent5 = "I love tea with cookies"

jd_sent_1_2 = nltk.jaccard_distance(set(sent1), set(sent2))
jd_sent_1_3 = nltk.jaccard_distance(set(sent1), set(sent3))
jd_sent_1_4 = nltk.jaccard_distance(set(sent1), set(sent4))
jd_sent_1_5 = nltk.jaccard_distance(set(sent1), set(sent5))

print(jd_sent_1_2, 'Jaccard Distance between sent1 and sent2')
print(jd_sent_1_3, 'Jaccard Distance between sent1 and sent3')
print(jd_sent_1_4, 'Jaccard Distance between sent1 and sent4')
print(jd_sent_1_5, 'Jaccard Distance between sent1 and sent5')

0.1 Jaccard Distance between sent1 and sent2
0.0 Jaccard Distance between sent1 and sent3
0.2857142857142857 Jaccard Distance between sent1 and sent4
0.2857142857142857 Jaccard Distance between sent1 and sent5


### Experimenting with Book and distances

In [18]:
from textblob import TextBlob

In [17]:
directory = 'C://Users//IBM_ADMIN//Documents//Teaching//Data Projects//Text//Books//'
book = '3boat10.txt'

f = open(directory+book)
book_text = f.read()

In [20]:
blob = TextBlob(book_text)

In [21]:
b_words = blob.words
print (b_words[1020:1080])

['only', 'one', 'or', 'two', 'diseases', 'each', 'So', 'I', 'went', 'straight', 'up', 'and', 'saw', 'him', 'and', 'he', 'said', 'Well', 'what', "'s", 'the', 'matter', 'with', 'you', 'I', 'said', 'I', 'will', 'not', 'take', 'up', 'your', 'time', 'dear', 'boy', 'with', 'telling', 'you', 'what', 'is', 'the', 'matter', 'with', 'me', 'Life', 'is', 'brief', 'and', 'you', 'might', 'pass', 'away', 'before', 'I', 'had', 'finished', 'But', 'I', 'will', 'tell']


In [22]:
lenWords = len(b_words)
print(lenWords)

67696


In [23]:
b_sentences = blob.sentences
print (b_sentences[10:15])

[Sentence("- 
MONTMORENCY LODGES AN OBJECTION."), Sentence("- ORIGINAL MOTION CARRIED BY MAJORITY OF 
THREE TO ONE."), Sentence("THERE were four of us - George, and William Samuel Harris, and myself, 
and Montmorency."), Sentence("We were sitting in my room, smoking, and talking about 
how bad we were - bad from a medical point of view I mean, of course."), Sentence("We were all feeling seedy, and we were getting quite nervous about it.")]


In [24]:
lenSentences = len(b_sentences)
print(lenSentences)

3507


### Defining prior, next and current words

In [25]:
from itertools import tee, islice, chain

def previous_and_next(some_iterable):
    prevs, items, nexts = tee(some_iterable, 3)
    prevs = chain([None], prevs)
    nexts = chain(islice(nexts, 1, None), [None])
    return zip(prevs, items, nexts)
    print(count)

In [26]:
count = 0
for previous, item, nxt in previous_and_next(b_words):
    print ("Item is now", item, "next is", nxt, "previous is", previous)
    count += 1
    if count >= 10:
        break

Item is now THREE next is MEN previous is None
Item is now MEN next is IN previous is THREE
Item is now IN next is A previous is MEN
Item is now A next is BOAT previous is IN
Item is now BOAT next is TO previous is A
Item is now TO next is SAY previous is BOAT
Item is now SAY next is NOTHING previous is TO
Item is now NOTHING next is OF previous is SAY
Item is now OF next is THE previous is NOTHING
Item is now THE next is DOG previous is OF


### Edit distance betwen current and next words

In [27]:
count = 0
for previous, item, nxt in previous_and_next(b_words):
    print ("Edit Distance between: ", item, "& ", nxt, "is: ", nltk.edit_distance(item,nxt))
    count += 1
    if count >= 10:
        break

Edit Distance between:  THREE &  MEN is:  4
Edit Distance between:  MEN &  IN is:  2
Edit Distance between:  IN &  A is:  2
Edit Distance between:  A &  BOAT is:  3
Edit Distance between:  BOAT &  TO is:  3
Edit Distance between:  TO &  SAY is:  3
Edit Distance between:  SAY &  NOTHING is:  7
Edit Distance between:  NOTHING &  OF is:  6
Edit Distance between:  OF &  THE is:  3
Edit Distance between:  THE &  DOG is:  3


### Jaccard distance betwen current and next words

In [28]:
count = 0
for previous, item, nxt in previous_and_next(b_words):
    print ("Jaccard Distance between: ", item, "& ", nxt, "is: ", nltk.jaccard_distance(set(item),set(nxt)))
    count += 1
    if count >= 10:
        break

Jaccard Distance between:  THREE &  MEN is:  0.8333333333333334
Jaccard Distance between:  MEN &  IN is:  0.75
Jaccard Distance between:  IN &  A is:  1.0
Jaccard Distance between:  A &  BOAT is:  0.75
Jaccard Distance between:  BOAT &  TO is:  0.5
Jaccard Distance between:  TO &  SAY is:  1.0
Jaccard Distance between:  SAY &  NOTHING is:  1.0
Jaccard Distance between:  NOTHING &  OF is:  0.8571428571428571
Jaccard Distance between:  OF &  THE is:  1.0
Jaccard Distance between:  THE &  DOG is:  1.0


### Edit distance betwen current and next sentences

In [29]:
count = 0
for previous, item, nxt in previous_and_next(b_sentences):
    print ("Edit Distance between: ", item, "& ", nxt, "is: ", nltk.edit_distance(item,nxt))
    count += 1
    if count >= 10:
        break

Edit Distance between:  THREE MEN IN A BOAT
(TO SAY NOTHING OF THE DOG). &  Three Men in a Boat by Jerome K. Jerome





CHAPTER I. is:  45
Edit Distance between:  Three Men in a Boat by Jerome K. Jerome





CHAPTER I. &  THREE INVALIDS. is:  51
Edit Distance between:  THREE INVALIDS. &  - SUFFERINGS OF GEORGE AND HARRIS. is:  26
Edit Distance between:  - SUFFERINGS OF GEORGE AND HARRIS. &  - A VICTIM TO ONE 
HUNDRED AND SEVEN FATAL MALADIES. is:  36
Edit Distance between:  - A VICTIM TO ONE 
HUNDRED AND SEVEN FATAL MALADIES. &  - USEFUL PRESCRIPTIONS. is:  43
Edit Distance between:  - USEFUL PRESCRIPTIONS. &  - CURE FOR 
LIVER COMPLAINT IN CHILDREN. is:  28
Edit Distance between:  - CURE FOR 
LIVER COMPLAINT IN CHILDREN. &  - WE AGREE THAT WE ARE OVERWORKED, AND NEED 
REST. is:  37
Edit Distance between:  - WE AGREE THAT WE ARE OVERWORKED, AND NEED 
REST. &  - A WEEK ON THE ROLLING DEEP? is:  35
Edit Distance between:  - A WEEK ON THE ROLLING DEEP? &  - GEORGE SUGGESTS THE RIVER. is:

### Jaccard distance betwen current and next sentences

In [30]:
count = 0
for previous, item, nxt in previous_and_next(b_sentences):
    print ("Jaccard Distance between: ", item, "& ", nxt, "is: ", nltk.jaccard_distance(set(item),set(nxt)))
    count += 1
    if count >= 10:
        break

Jaccard Distance between:  THREE MEN IN A BOAT
(TO SAY NOTHING OF THE DOG). &  Three Men in a Boat by Jerome K. Jerome





CHAPTER I. is:  0.6857142857142857
Jaccard Distance between:  Three Men in a Boat by Jerome K. Jerome





CHAPTER I. &  THREE INVALIDS. is:  0.7419354838709677
Jaccard Distance between:  THREE INVALIDS. &  - SUFFERINGS OF GEORGE AND HARRIS. is:  0.4444444444444444
Jaccard Distance between:  - SUFFERINGS OF GEORGE AND HARRIS. &  - A VICTIM TO ONE 
HUNDRED AND SEVEN FATAL MALADIES. is:  0.3333333333333333
Jaccard Distance between:  - A VICTIM TO ONE 
HUNDRED AND SEVEN FATAL MALADIES. &  - USEFUL PRESCRIPTIONS. is:  0.3333333333333333
Jaccard Distance between:  - USEFUL PRESCRIPTIONS. &  - CURE FOR 
LIVER COMPLAINT IN CHILDREN. is:  0.3333333333333333
Jaccard Distance between:  - CURE FOR 
LIVER COMPLAINT IN CHILDREN. &  - WE AGREE THAT WE ARE OVERWORKED, AND NEED 
REST. is:  0.48
Jaccard Distance between:  - WE AGREE THAT WE ARE OVERWORKED, AND NEED 
REST. &  - A W

# N Grams and Tokenization in Python

### Approaching Tokenization

In [31]:
directory = 'C://Users//IBM_ADMIN//Documents//Teaching//Data Projects//Text//Books//'

#book = 'Book_2.txt'
book = '3boat10.txt'
#book_short = '3boat10_short.txt'
#book_out = '3boat10_out.txt'

### Using Re

In [32]:
# Find the N most common words in book
top_N = 10

words = re.findall(r'\w+', open(directory+book, encoding="utf8").read().lower())
# \w -- matches a "word" character: a letter or digit or underbar [a-zA-Z0-9_].
# + pattern must appear at least once. 

word_freq = Counter(words).most_common(top_N)
word_freq

[('the', 3607),
 ('and', 3395),
 ('to', 1790),
 ('a', 1714),
 ('of', 1496),
 ('it', 1422),
 ('i', 1213),
 ('in', 977),
 ('that', 950),
 ('he', 920)]

In [33]:
word_freq = Counter(words).most_common()

word_freq_df = pd.DataFrame(word_freq,
                    columns=['Word', 'Frequency']).set_index('Word')

word_freq_df.sort_values('Frequency', ascending=False, inplace=True)

word_freq_df.head(n=10)

Unnamed: 0_level_0,Frequency
Word,Unnamed: 1_level_1
the,3607
and,3395
to,1790
a,1714
of,1496
it,1422
i,1213
in,977
that,950
he,920


In [34]:
word_freq_df.shape

(6579, 1)

### Using NLTK

In [35]:
f = open(directory+book, encoding="utf8")
raw = f.read()

words = nltk.tokenize.word_tokenize(raw)
fdist = nltk.FreqDist(words)

print(fdist)

#fdist.items() - will give all words
fdist.most_common(10)

<FreqDist with 7773 samples and 79641 outcomes>


[(',', 5702),
 ('the', 3338),
 ('and', 3215),
 ('.', 3081),
 ('to', 1748),
 ('a', 1621),
 ('of', 1425),
 ('I', 1208),
 ('it', 1159),
 ('in', 931)]

### NLTK also has embedded RegexpTokenizer

In [36]:
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokenizer.tokenize("Flu season hitting earlier, with dozens more outbreaks — and more severe symptoms")

['Flu',
 'season',
 'hitting',
 'earlier',
 'with',
 'dozens',
 'more',
 'outbreaks',
 'and',
 'more',
 'severe',
 'symptoms']

In [37]:
fdist_df = pd.DataFrame(fdist.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,",",5702
1,the,3338
2,and,3215
3,.,3081
4,to,1748
5,a,1621
6,of,1425
7,I,1208
8,it,1159
9,in,931


In [38]:
fdist_df.shape

(7773, 2)

### Cleaning-up tokenization

In [39]:
#from nltk.corpus import stopwords

#default_stopwords = set(nltk.corpus.stopwords.words('english'))

words = nltk.tokenize.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

# Remove single-character tokens (mostly punctuation)
words = [word for word in words if len(word) > 1]

# Remove numbers
#words = [word for word in words if not word.isnumeric()]

# Remove punctuation
words = [word for word in words if word.isalpha()]

# Lowercase all words (default_stopwords are lowercase too)
words = [word.lower() for word in words]

# Remove stopwords
words = [word for word in words if word not in stopwords]

fdist = nltk.FreqDist(words)

print(fdist)

#fdist.items() - will give all words
fdist.most_common(10)

<FreqDist with 6240 samples and 29842 outcomes>


[('said', 378),
 ('would', 362),
 ('harris', 316),
 ('george', 308),
 ('one', 246),
 ('us', 228),
 ('boat', 186),
 ('get', 179),
 ('could', 175),
 ('got', 163)]

In [40]:
fdist_df = pd.DataFrame(fdist.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,said,378
1,would,362
2,harris,316
3,george,308
4,one,246
5,us,228
6,boat,186
7,get,179
8,could,175
9,got,163


# N Grams
### Basic N-Gramming

In [42]:
sentence = 'quick brown fox jumps over the lazy dog'
n = 3
kgrams = nltk.ngrams(sentence.split(), n)
for grams in kgrams:
  print (grams)

('quick', 'brown', 'fox')
('brown', 'fox', 'jumps')
('fox', 'jumps', 'over')
('jumps', 'over', 'the')
('over', 'the', 'lazy')
('the', 'lazy', 'dog')


In [41]:
tokens = nltk.word_tokenize(raw)

#Create your bigrams or trigrams
bgs = nltk.bigrams(tokens)
tgs = nltk.trigrams(tokens)

#compute frequency distribution for all the bigrams in the text
fdist_2 = nltk.FreqDist(bgs)
fdist_3 = nltk.FreqDist(tgs)

#for k,v in fdist.items():
#    print (k,v)

In [43]:
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

print(fdist_df.shape)

fdist_df.head(n=10)

(37516, 2)


Unnamed: 0,Word,Frequency
0,"(,, and)",1859
1,"(of, the)",318
2,"(., I)",293
3,"(in, the)",278
4,"(;, and)",233
5,"(., We)",227
6,"(., It)",220
7,"(., ``)",207
8,"(,, '')",182
9,"(and, the)",180


In [44]:
fdist_df.iloc[10000:10005]

Unnamed: 0,Word,Frequency
10000,"(from, affectation)",1
10001,"(affectation, -)",1
10002,"(often, wished)",1
10003,"(able, .)",1
10004,"(us, anecdotes)",1


In [45]:
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

print(fdist_df.shape)

fdist_df.head(n=10)

(64316, 2)


Unnamed: 0,Word,Frequency
0,"(,, and, the)",121
1,"(,, and, then)",83
2,"(it, ,, and)",72
3,"(., It, was)",69
4,"(,, and, he)",66
5,"(,, and, we)",57
6,"(,, and, I)",55
7,"(,, and, ,)",54
8,"(., He, said)",53
9,"(., It, is)",47


In [46]:
fdist_df.iloc[10000:10005]

Unnamed: 0,Word,Frequency
10000,"(pipes, go, out)",1
10001,"(go, out, -)",1
10002,"(out, -, till)",1
10003,"(till, we, ,)",1
10004,"(we, ,, common-place)",1


## Cleaning-up  N-Grams

#### Eliminating puctuation and case sensitivity from N-Grams

In [47]:
#tokens = nltk.tokenize.word_tokenize(raw)
tokens = nltk.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = [w.lower() for w in tokens if w.isalnum()]

bgs = [b for b in nltk.bigrams(cleaned_words)]
tgs = [b for b in nltk.trigrams(cleaned_words)]

In [48]:
fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(of, the)",329
1,"(in, the)",294
2,"(it, was)",261
3,"(and, the)",191
4,"(on, the)",181
5,"(to, the)",175
6,"(and, i)",145
7,"(and, then)",140
8,"(to, be)",130
9,"(the, river)",127


In [49]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(it, was, a)",52
1,"(the, boat, and)",32
2,"(george, and, i)",30
3,"(said, it, was)",28
4,"(he, said, he)",27
5,"(a, bit, of)",27
6,"(one, of, the)",25
7,"(harris, and, i)",25
8,"(that, it, was)",24
9,"(and, then, he)",21


#### Eliminating puctuation, case sensitivity and stop-words from N-Grams

In [50]:
#tokens = nltk.tokenize.word_tokenize(raw)
tokens = nltk.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = [w.lower() for w in tokens if w.isalnum()]

bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords]
tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and b[2] not in stopwords]

In [51]:
fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(harris, said)",41
1,"(george, said)",34
2,"(said, george)",22
3,"(would, go)",17
4,"(one, another)",17
5,"(five, minutes)",16
6,"(said, oh)",16
7,"(said, harris)",15
8,"(young, men)",14
9,"(would, come)",13


In [53]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(rule, tom, and)",1
1,"(tom, tom, you)",1
2,"(right, tom, we)",1
3,"(cheeses, tom, bought)",1
4,"(tom, bought, them)",1
5,"(tom, say, about)",1
6,"(hi, tom, dick)",1
7,"(tom, dick, ca)",1
8,"(matter, tom, replied)",1
9,"(tom, replied, joe)",1


#### Creating targeted N-Grams

In [52]:
#tokens = nltk.tokenize.word_tokenize(raw)
tokens = nltk.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = [w.lower() for w in tokens if w.isalnum()]

bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'tom' or b[1] == 'tom')]

tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'tom' or b[1] == 'tom' or b[2] == 'tom')]

In [54]:
#tokens = nltk.tokenize.word_tokenize(raw)
tokens = nltk.word_tokenize(raw)

#stopwords = stopwords.words('english')
stopwords = set(nltk.corpus.stopwords.words('english'))

word_list = []

# Filter out words that have punctuation and make everything lower-case
cleaned_words = [w.lower() for w in tokens if w.isalnum()]

bgs = [b for b in nltk.bigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'harris' or b[1] == 'harris')]

tgs = [b for b in nltk.trigrams(cleaned_words) if b[0] not in stopwords and b[1] not in stopwords and \
       (b[0] == 'harris' or b[1] == 'harris' or b[2] == 'harris')]

In [55]:
fdist_2 = nltk.FreqDist(bgs)
fdist_df = pd.DataFrame(fdist_2.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(harris, said)",41
1,"(said, harris)",15
2,"(harris, would)",5
3,"(harris, told)",4
4,"(george, harris)",4
5,"(harris, never)",3
6,"(time, harris)",3
7,"(met, harris)",3
8,"(harris, sat)",3
9,"(cried, harris)",3


In [57]:
fdist_3 = nltk.FreqDist(tgs)
fdist_df = pd.DataFrame(fdist_3.most_common(),
                    columns=['Word', 'Frequency'])

fdist_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(harris, said, he)",11
1,"(harris, said, that)",7
2,"(harris, said, it)",5
3,"(said, harris, and)",4
4,"(harris, said, oh)",3
5,"(ago, harris, said)",2
6,"(harris, said, how)",2
7,"(harris, sat, on)",2
8,"(said, harris, then)",2
9,"(harris, said, i)",2


### Creating N-Grams of custom length

In [56]:
n = 4
fourgrams = nltk.ngrams(raw.split(), n)

n = 5
fivegrams = nltk.ngrams(raw.split(), n)

n = 6
sixgrams = nltk.ngrams(raw.split(), n)

In [58]:
fdist_4 = nltk.FreqDist(fourgrams)
fdist_4_df = pd.DataFrame(fdist_4.most_common(),
                    columns=['Word', 'Frequency'])

fdist_4_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(the, bottom, of, the)",12
1,"(at, the, bottom, of)",11
2,"(in, the, middle, of)",10
3,"(the, middle, of, the)",10
4,"(said, it, was, a)",9
5,"(and, Harris, and, I)",8
6,"(a, good, deal, of)",8
7,"(He, said, he, had)",8
8,"(and, George, and, I)",8
9,"(the, end, of, the)",6


In [60]:
fdist_5 = nltk.FreqDist(fivegrams)
fdist_5_df = pd.DataFrame(fdist_5.most_common(),
                    columns=['Word', 'Frequency'])

fdist_5_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(at, the, bottom, of, the)",10
1,"(in, the, middle, of, the)",6
2,"(He, said, he, had, never)",5
3,"(we, were, going, to, have)",4
4,"(the, opposite, side, of, the)",4
5,"(in, the, nose, of, the)",3
6,"(the, nose, of, the, boat,)",3
7,"(the, other, side, of, the)",3
8,"(and, asked, him, if, he)",3
9,"(I, don't, think, I, ever)",3


In [59]:
fdist_6 = nltk.FreqDist(sixgrams)
fdist_6_df = pd.DataFrame(fdist_6.most_common(),
                    columns=['Word', 'Frequency'])

fdist_6_df.head(n=10)

Unnamed: 0,Word,Frequency
0,"(in, the, nose, of, the, boat,)",3
1,"(if, it, had, not, been, for)",3
2,"(at, the, bottom, of, the, boat,)",3
3,"(when, we, had, given, up, all)",3
4,"(it, was, my, liver, that, was)",2
5,"(was, my, liver, that, was, out)",2
6,"(my, liver, that, was, out, of)",2
7,"(you, are, going, to, have, a)",2
8,"(him, at, the, bottom, of, the)",2
9,"(know, a, place, round, the, corner)",2


In [61]:
writer = pd.ExcelWriter(directory+'n_grams_out.xlsx')
fdist_4_df.to_excel(writer,'FourGrams')
fdist_5_df.to_excel(writer,'FiveGrams')
fdist_6_df.to_excel(writer,'SixGrams')
writer.save()