Name: Sathish Kumar Rajendiran
Chapter :  Regular Expressions and Tokenization
Date: 10/18/2020
Week: 3

In [1]:
#import libraries

# standard library
import os
import sys
from datetime import datetime
import time

# csv, xls, pandas & json
import pandas as pd
import json
import csv
import xlrd

# Language Processing
import nltk
from nltk import FreqDist

# web requests
from urllib import request


##  Regular Expression to match non-alphabetic characters
import re

os.getcwd()

'/Users/sathishrajendiran/ist664-nlp'

In [2]:
# get the book Emma from the Gutenberg collection and keep as raw text
file0 = nltk.corpus.gutenberg.fileids( ) [0]
emmatext = nltk.corpus.gutenberg.raw(file0)

print(type(emmatext))
print(len(emmatext))
# display the first 150 characters of the str emmatext
emmatext[:150]

<class 'str'>
887071


'[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to'

In [5]:
print(emmatext[:20])
for char in emmatext[:20]:
    print(char)

[Emma by Jane Austen
[
E
m
m
a
 
b
y
 
J
a
n
e
 
A
u
s
t
e
n


In [6]:
emmatext[:150]
newemmatext = emmatext.replace('\n', ' ')
newemmatext[:150]


'[Emma by Jane Austen 1816]  VOLUME I  CHAPTER I   Emma Woodhouse, handsome, clever, and rich, with a comfortable home and happy disposition, seemed to'

![image.png](attachment:image.png)

************************************************************************************

# Part 5:  Exploration of Regular Expressions for Tokenization

In [9]:
shorttext = 'That book is interesting.'
pword = re.compile('\w+')
print(re.findall(pword, shorttext))

['That', 'book', 'is', 'interesting']


In [11]:
specialtext = 'That U.S.A. poster-print costs $12.40, but with 10% off.'
print(re.findall(pword, specialtext))

['That', 'U', 'S', 'A', 'poster', 'print', 'costs', '12', '40', 'but', 'with', '10', 'off']


In [15]:
ptoken = re.compile('(\w+(-\w+)*)')
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

[('That', ''), ('U', ''), ('S', ''), ('A', ''), ('poster-print', '-print'), ('costs', ''), ('12', ''), ('40', ''), ('but', ''), ('with', ''), ('10', ''), ('off', '')]
[('end-of-line', '-line'), ('character', '')]


re.findall has reported both the whole matched text and the internal matched text, i.e. it reports the last match of any part of the regular expression in parentheses.  We could fix this by looking at the parts of the re.groups function to access only the outer match.  But let’s assume that we only want to look at outer matches and not at any of the internal matches.  We can instead make the internal parentheses into non-capturing subgroups.  
This regular expression matches the same strings, but the findall function doesn’t report the subgroups.

In [16]:
ptoken = re.compile('(\w+(?:-\w+)*)') #
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

['That', 'U', 'S', 'A', 'poster-print', 'costs', '12', '40', 'but', 'with', '10', 'off']
['end-of-line', 'character']


Now we try to make a pattern to match abbreviations that might have a “.” inside, like U.S.A.  We only allow capitalized letters, and we make a simple pattern that matches alternating capital letters and dots.

In [18]:
pabbrev = re.compile('(([A-Z]\.)+)')
print(re.findall(pabbrev, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

[('U.S.A.', 'A.')]
['end-of-line', 'character']


In [19]:
ptoken = re.compile('(\w+(-\w+)*|([A-Z]\.)+)')
print(re.findall(ptoken, specialtext))

[('That', '', ''), ('U', '', ''), ('S', '', ''), ('A', '', ''), ('poster-print', '-print', ''), ('costs', '', ''), ('12', '', ''), ('40', '', ''), ('but', '', ''), ('with', '', ''), ('10', '', ''), ('off', '', '')]


In [22]:
ptoken = re.compile(' (([A-Z]\.)+|\w+(-\w+)*|\$?\d+(\.\d+)?)')
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

[('U.S.A.', 'A.', '', ''), ('poster-print', '', '-print', ''), ('costs', '', '', ''), ('$12.40', '', '', '.40'), ('but', '', '', ''), ('with', '', '', ''), ('10', '', '', ''), ('off', '', '', '')]
[('character', '', '', '')]


In [23]:
ptoken = re.compile(' (([?:A-Z]\.)+|\w+(?:-\w+)*|\$?\d+(\.\d+)?)')
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

[('U.S.A.', 'A.', ''), ('poster-print', '', ''), ('costs', '', ''), ('$12.40', '', '.40'), ('but', '', ''), ('with', '', ''), ('10', '', ''), ('off', '', '')]
[('character', '', '')]


In [25]:
ptoken = re.compile(r'''(([A-Z]\.)+) # abbreviations, e.g. U.S.A.
| (\w+(-\w+)*)# words with internal hyphens
| (\$?\d+(\.\d+)?)    # currency, like $12.40
''', re.X)# verbose flag
print(re.findall(ptoken, specialtext))
print(re.findall(ptoken, 'end-of-line character'))

[('', '', 'That', '', '', ''), ('U.S.A.', 'A.', '', '', '', ''), ('', '', 'poster-print', '-print', '', ''), ('', '', 'costs', '', '', ''), ('', '', '', '', '$12.40', '.40'), ('', '', 'but', '', '', ''), ('', '', 'with', '', '', ''), ('', '', '10', '', '', ''), ('', '', 'off', '', '', '')]
[('', '', 'end-of-line', '-line', '', ''), ('', '', 'character', '', '', '')]


# More about the function findall()

In [26]:
email_text = "For more information, send a request to info@ischool.syr.edu. Or you can directly contact our information staff at HelpfulHenry@syr.edu and SageSue@syr.edu."

In [29]:
pemail = re.compile('([a-zA-Z]+)@([a-z.]+)')
print(re.findall(pemail, email_text))
print(re.findall(pemail, 'end-of-line character'))

[('info', 'ischool.syr.edu.'), ('HelpfulHenry', 'syr.edu'), ('SageSue', 'syr.edu.')]
[]


In [30]:
matches = re.findall(pemail, email_text)

for m in matches:
    # format function puts each argument into the output string where the {} is
    email = 'User: {}, Domain:{}'.format(m[0],m[1])
    print(email)


User: info, Domain:ischool.syr.edu.
User: HelpfulHenry, Domain:syr.edu
User: SageSue, Domain:syr.edu.


# Part 6:  NLTK Tokenizer

Regular Expression Tokenizer using NLTK Tokenizer
(From section 3.7 in the NLTK book.)

NLTK has built a tokenizing function that helps you write tokenizers by giving it the compiled pattern.  Regular expressions can also be written down in the “verbose” version, using the (?x) flag that allows the alternatives to be on different lines with comments, and it also alleviates the need to put extra parentheses.


In [31]:
pattern = r''' (?x) # set flag to allow verbose regexps
        (?:[A-Z]\.)+    # abbreviations, e.g. U.S.A.
        | \$?\d+(?:\.\d+)?%?  # currency and percentages, $12.40, 50%
        | \w+(?:-\w+)*  # words with internal hyphens
        | \.\.\.        # ellipsis
        | [][.,;"'?():-_%#']  # separate tokens
        '''


In [34]:
print(nltk.regexp_tokenize(shorttext, pattern))

print(nltk.regexp_tokenize(specialtext, pattern))


['That', 'book', 'is', 'interesting', '.']
['That', 'U.S.A.', 'poster-print', 'costs', '$12.40', ',', 'but', 'with', '10%', 'off', '.']


In [36]:
print(nltk.word_tokenize(specialtext))

['That', 'U.S.A.', 'poster-print', 'costs', '$', '12.40', ',', 'but', 'with', '10', '%', 'off', '.']


This word tokenizer has chosen to separate the $ in currency and the % sign from the percentage.  This choice must depend on what later processing is desired.

Next, we’ll try to make a regular expression tokenizer appropriate for tweet text or other social media text.  Some of the patterns in this tokenizer are taken from tweetmotif, a Python regular expression tokenizer written for tweets by Brendan O’Connor. Here is the original description, http://tweetmotif.com/about and the later inclusion into the ARK tools for social media, http://www.cs.cmu.edu/~ark/TweetNLP/  

In [37]:
tweetPattern = r''' (?x) # set flag to allow verbose regexps
      (?:https?://|www)\S+    # simple URLs
      | (?::-\)|;-\)) # small list of emoticons
      | &(?:amp|lt|gt|quot);  # XML or HTML entity
      | \#\w+                 # hashtags
      | @\w+                  # mentions   
      | \d+:\d+               # timelike pattern
      | \d+\.\d+              # number with a decimal
      | (?:\d+,)+?\d{3}(?=(?:[^,]|$))   # number with a comma
      | (?:[A-Z]\.)+                    # simple abbreviations
      | (?:--+)               # multiple dashes
      | \w+(?:-\w+)*          # words with internal hyphens or apostrophes
      | ['\".?!,:;/]+         # special characters
      '''


In [38]:
tweet1 = "@natalieohayre I agree #hc09 needs reform- but not by crooked politicians who r clueless about healthcare! #tcot #fishy NO GOV'T TAKEOVER!"

In [39]:
tweet2 = "To Sen. Roland Burris: Affordable, quality health insurance can't wait http://bit.ly/j63je #hc09 #IL #60660"

In [40]:
tweet3 = "RT @karoli: RT @Seriou: .@whitehouse I will stand w/ Obama on #healthcare,  I trust him. #p2 #tlot"

In [41]:
print(nltk.regexp_tokenize(tweet1,tweetPattern))

['@natalieohayre', 'I', 'agree', '#hc09', 'needs', 'reform', 'but', 'not', 'by', 'crooked', 'politicians', 'who', 'r', 'clueless', 'about', 'healthcare', '!', '#tcot', '#fishy', 'NO', 'GOV', "'", 'T', 'TAKEOVER', '!']


In [45]:
from nltk.tokenize import TweetTokenizer
ttokenizer = TweetTokenizer()
print(ttokenizer.tokenize(tweet1))

['@natalieohayre', 'I', 'agree', '#hc09', 'needs', 'reform', '-', 'but', 'not', 'by', 'crooked', 'politicians', 'who', 'r', 'clueless', 'about', 'healthcare', '!', '#tcot', '#fishy', 'NO', "GOV'T", 'TAKEOVER', '!']


In [297]:
sent = "Mr. Black and Mrs. Brown attended the lecture by Dr. Gray, but Gov. White wasn’t there."
print(nltk.word_tokenize(sent))

['Mr.', 'Black', 'and', 'Mrs.', 'Brown', 'attended', 'the', 'lecture', 'by', 'Dr.', 'Gray', ',', 'but', 'Gov', '.', 'White', 'wasn', '’', 't', 'there', '.']


In [414]:
pattern = r''' (?x) # set flag to allow verbose regexps
        [^ a-z]\w+[.]|\w+(?:’\w+)*   
        |(?:https?://|www)\S+    # simple URLs
        |(?:\[A-z]+[''])   # abbreviations, e.g. U.S.A.
        | \w+(?:-\w+)*  # words with internal hyphens
        | ['\".?!,:;/]+ # special characters
        | \d+\.\d+              # number with a decimal
        | (?:\d+,)+?\d{3}(?=(?:[^,]|$))   # number with a comma
        | (?:[A-Z]\.)+                    # simple abbreviations
        | (?:--+)               # multiple dashes
        '''

print(nltk.regexp_tokenize(sent,pattern))

['Mr.', 'Black', 'and', 'Mrs.', 'Brown', 'attended', 'the', 'lecture', 'by', 'Dr.', 'Gray', ',', 'but', 'Gov.', 'White', 'wasn’t', 'there', '.']


In [412]:
textbook ='''The required textbook is available online:
Bird, S., Klein, E., & Loper, E. Natural language processing with Python. Available from http://www.nltk.org/book/
Please refer to this online version for reading (instead of the older version) as it is updated with Python 3 and NLTK 3.
The following textbook will be referred to but is not required:
Jurafsky, D., & Martin, J. H. Speech and language processing (3rd ed. draft). Available from https://web.stanford.edu/~jurafsky/slp3/
Additional supplementary readings will be assigned during the semester and will be available online.
Mr. Black and Mrs. Brown attended the lecture by Dr. Gray, but Gov. White wasn’t there.
'''
pattern = r''' (?x) # set flag to allow verbose regexps
      (?:https?://|www)\S+      # simple URLs
      |[^ a-z]\w+[.]|\w+(?:’\w+)*  
      | (?::-\)|;-\))		# small list of emoticons
      | &(?:amp|lt|gt|quot);    # XML or HTML entity
      | \#\w+                 # hashtags
      | @\w+                  # mentions   
      | \d+:\d+               # timelike pattern
      | \d+\.\d+              # number with a decimal
      | (?:\d+,)+?\d{3}(?=(?:[^,]|$))   # number with a comma
      | (?:[A-Z]\.)+                    # simple abbreviations
      | (?:--+)               # multiple dashes
      | \w+(?:-\w+)*          # words with internal hyphens or apostrophes
      | ['\".?!,:;/]+         # special characters
        '''

print(nltk.regexp_tokenize(textbook,pattern))

['The', 'required', 'textbook', 'is', 'available', 'online', ':', 'Bird', ',', 'S', '.,', 'Klein', ',', 'E', '.,', 'Loper', ',', 'E', '.', 'Natural', 'language', 'processing', 'with', 'Python.', 'Available', 'from', 'http://www.nltk.org/book/', 'Please', 'refer', 'to', 'this', 'online', 'version', 'for', 'reading', 'instead', 'of', 'the', 'older', 'version', 'as', 'it', 'is', 'updated', 'with', 'Python', '3', 'and', 'NLTK', '3', '.', 'The', 'following', 'textbook', 'will', 'be', 'referred', 'to', 'but', 'is', 'not', 'required', ':', 'Jurafsky', ',', 'D', '.,', 'Martin', ',', 'J', '.', 'H', '.', 'Speech', 'and', 'language', 'processing', '3rd', 'ed', '.', 'draft', '.', 'Available', 'from', 'https://web.stanford.edu/~jurafsky/slp3/', 'Additional', 'supplementary', 'readings', 'will', 'be', 'assigned', 'during', 'the', 'semester', 'and', 'will', 'be', 'available', 'online', '.', '\nMr.', 'Black', 'and', 'Mrs.', 'Brown', 'attended', 'the', 'lecture', 'by', 'Dr.', 'Gray', ',', 'but', 'Gov.'

# Part 1:  Session Setup and Reading Text from Files

In [422]:
# Working with file, list and sorting
try: 
    f = open('/Users/sathishrajendiran/ist664-nlp/CrimeAndPunishment.txt', 'r')
    rawtext = f.read()
    print('length of raw text is:',len(rawtext))
    
except:
    print("Is the file in correct directory?")
    
crimetokens = nltk.word_tokenize(rawtext)
text = nltk.Text(crimetokens)
text.concordance('pass')
#When we are done, we close the file.
f.close()

length of raw text is: 1135052
Displaying 25 of 42 matches:
y time he went out he was obliged to pass her kitchen , the door of which inva
at would it be if it somehow came to pass that I were really going to do it ? 
oom , she said , letting her visitor pass in front of her : '' Step in , my go
d Katerina Ivanovna would not let it pass , she stood up for her ... and so th
ng into the next room , as he had to pass through hers to get there . Taking n
this scandal , and it came to such a pass that Dounia and I dared not even go 
for you . Oh , if only this comes to pass ! This would be such a benefit that 
irst place , because it will come to pass of itself , later on , and he will n
hen the hour struck , it all came to pass quite differently , as it were accid
g in the doorway not allowing him to pass , he advanced straight upon her . Sh
im -- all was lost ; if they let him pass -- all was lost too ; they would rem
t once that it would be loathsome to pass that seat on which after the 

# Part 2:  Stemming and Lemmatization

In [426]:
print(len(crimetokens),'\n')
print(crimetokens[:100])

250985 

['Produced', 'by', 'John', 'Bickers', ';', 'and', 'Dagny', 'CRIME', 'AND', 'PUNISHMENT', 'By', 'Fyodor', 'Dostoevsky', 'Translated', 'By', 'Constance', 'Garnett', 'TRANSLATOR', "'S", 'PREFACE', 'A', 'few', 'words', 'about', 'Dostoevsky', 'himself', 'may', 'help', 'the', 'English', 'reader', 'to', 'understand', 'his', 'work', '.', 'Dostoevsky', 'was', 'the', 'son', 'of', 'a', 'doctor', '.', 'His', 'parents', 'were', 'very', 'hard-working', 'and', 'deeply', 'religious', 'people', ',', 'but', 'so', 'poor', 'that', 'they', 'lived', 'with', 'their', 'five', 'children', 'in', 'only', 'two', 'rooms', '.', 'The', 'father', 'and', 'mother', 'spent', 'their', 'evenings', 'in', 'reading', 'aloud', 'to', 'their', 'children', ',', 'generally', 'from', 'books', 'of', 'a', 'serious', 'character', '.', 'Though', 'always', 'sickly', 'and', 'delicate', 'Dostoevsky', 'came', 'out', 'third']


NLTK has two stemmers, Porter and Lancaster, described in section 3.6 of the NLTK book.  To use these stemmers, you first create them.

In [427]:
#NLTK  stemmers
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

In [428]:
crimePstem = [porter.stem(t) for t in crimetokens]
print(crimePstem[:200])


['produc', 'by', 'john', 'bicker', ';', 'and', 'dagni', 'crime', 'and', 'punish', 'By', 'fyodor', 'dostoevski', 'translat', 'By', 'constanc', 'garnett', 'translat', "'S", 'prefac', 'A', 'few', 'word', 'about', 'dostoevski', 'himself', 'may', 'help', 'the', 'english', 'reader', 'to', 'understand', 'hi', 'work', '.', 'dostoevski', 'wa', 'the', 'son', 'of', 'a', 'doctor', '.', 'hi', 'parent', 'were', 'veri', 'hard-work', 'and', 'deepli', 'religi', 'peopl', ',', 'but', 'so', 'poor', 'that', 'they', 'live', 'with', 'their', 'five', 'children', 'in', 'onli', 'two', 'room', '.', 'the', 'father', 'and', 'mother', 'spent', 'their', 'even', 'in', 'read', 'aloud', 'to', 'their', 'children', ',', 'gener', 'from', 'book', 'of', 'a', 'seriou', 'charact', '.', 'though', 'alway', 'sickli', 'and', 'delic', 'dostoevski', 'came', 'out', 'third', 'in', 'the', 'final', 'examin', 'of', 'the', 'petersburg', 'school', 'of', 'engin', '.', 'there', 'he', 'had', 'alreadi', 'begun', 'hi', 'first', 'work', ',', '`

In [429]:
crimeLstem = [lancaster.stem(t) for t in crimetokens]
print(crimeLstem[:200])


['produc', 'by', 'john', 'bick', ';', 'and', 'dagny', 'crim', 'and', 'pun', 'by', 'fyod', 'dostoevsky', 'transl', 'by', 'const', 'garnet', 'transl', "'s", 'prefac', 'a', 'few', 'word', 'about', 'dostoevsky', 'himself', 'may', 'help', 'the', 'engl', 'read', 'to', 'understand', 'his', 'work', '.', 'dostoevsky', 'was', 'the', 'son', 'of', 'a', 'doct', '.', 'his', 'par', 'wer', 'very', 'hard-working', 'and', 'deeply', 'religy', 'peopl', ',', 'but', 'so', 'poor', 'that', 'they', 'liv', 'with', 'their', 'fiv', 'childr', 'in', 'on', 'two', 'room', '.', 'the', 'fath', 'and', 'moth', 'spent', 'their', 'ev', 'in', 'read', 'aloud', 'to', 'their', 'childr', ',', 'gen', 'from', 'book', 'of', 'a', 'sery', 'charact', '.', 'though', 'alway', 'sick', 'and', 'del', 'dostoevsky', 'cam', 'out', 'third', 'in', 'the', 'fin', 'examin', 'of', 'the', 'petersburg', 'school', 'of', 'engin', '.', 'ther', 'he', 'had', 'already', 'begun', 'his', 'first', 'work', ',', '``', 'poor', 'folk', '.', "''", 'thi', 'story',

The NLTK has a lemmatizer that uses the WordNet on-line thesaurus as a dictionary to look up roots and find the word.

In [430]:
wnl = nltk.WordNetLemmatizer()
crimeLemma = [wnl.lemmatize(t) for t in crimetokens]
print(crimeLemma[:200])

['Produced', 'by', 'John', 'Bickers', ';', 'and', 'Dagny', 'CRIME', 'AND', 'PUNISHMENT', 'By', 'Fyodor', 'Dostoevsky', 'Translated', 'By', 'Constance', 'Garnett', 'TRANSLATOR', "'S", 'PREFACE', 'A', 'few', 'word', 'about', 'Dostoevsky', 'himself', 'may', 'help', 'the', 'English', 'reader', 'to', 'understand', 'his', 'work', '.', 'Dostoevsky', 'wa', 'the', 'son', 'of', 'a', 'doctor', '.', 'His', 'parent', 'were', 'very', 'hard-working', 'and', 'deeply', 'religious', 'people', ',', 'but', 'so', 'poor', 'that', 'they', 'lived', 'with', 'their', 'five', 'child', 'in', 'only', 'two', 'room', '.', 'The', 'father', 'and', 'mother', 'spent', 'their', 'evening', 'in', 'reading', 'aloud', 'to', 'their', 'child', ',', 'generally', 'from', 'book', 'of', 'a', 'serious', 'character', '.', 'Though', 'always', 'sickly', 'and', 'delicate', 'Dostoevsky', 'came', 'out', 'third', 'in', 'the', 'final', 'examination', 'of', 'the', 'Petersburg', 'school', 'of', 'Engineering', '.', 'There', 'he', 'had', 'alre

# 3.6 Lab Session: Processing Text and Stemming

In [431]:
# Working with file, list and sorting
try: 
    d = open('/Users/sathishrajendiran/ist664-nlp/desert.txt', 'r')
    destext = d.read()
    print('length of raw text is:',len(destext))
    
except:
    print("Is the file in correct directory?")
    
deserttokens = nltk.word_tokenize(destext)
dtext = nltk.Text(deserttokens)
dtext.concordance('pass')
#When we are done, we close the file.
f.close()

length of raw text is: 6932
Displaying 2 of 2 matches:
 Shaiba range of mountainous dunes , pass by the quicksand of Umm al Samim ( M
 Shaiba range of mountainous dunes , pass by the quicksand of Umm al Samim ( M


In [432]:
#NLTK  stemmers
dporter = nltk.PorterStemmer()
dlancaster = nltk.LancasterStemmer()

In [445]:
print('the length of the tokens:',len(deserttokens),'\n')
print('top 10 words:\n',deserttokens[:10])

the length of the tokens: 1364 

top 10 words:
 ['Three', 'Calgarians', 'have', 'found', 'a', 'rather', 'unusual', 'way', 'of', 'leaving']


In [446]:
dDesertPstem = [dporter.stem(t) for t in deserttokens]
print('Top 10 tokens from PorterStemmer:\n')
print(dDesertPstem[:10])

Top 10 tokens from PorterStemmer:

['three', 'calgarian', 'have', 'found', 'a', 'rather', 'unusu', 'way', 'of', 'leav']


In [448]:
desertLstem = [dlancaster.stem(t) for t in deserttokens]
print('Top 10 tokens from LancasterStemmer:\n')
print(desertLstem[:10])

Top 10 tokens from LancasterStemmer:

['three', 'calg', 'hav', 'found', 'a', 'rath', 'unus', 'way', 'of', 'leav']


# Part 3:  Python Computation File

In [449]:
def alpha_filter(w):
  # pattern to match word of non-alphabetical characters
  pattern = re.compile('^[^a-z]+$')
  if (pattern.match(w)):
    return True
  else:
    return False

In [450]:
# Working with file, list and sorting
try: 
    f = open('/Users/sathishrajendiran/ist664-nlp/CrimeAndPunishment.txt', 'r')
    rawtext = f.read()
    print('length of raw text is:',len(rawtext))
    
except:
    print("Is the file in correct directory?")
    
crimetokens = nltk.word_tokenize(rawtext)
text = nltk.Text(crimetokens)
text.concordance('pass')
#When we are done, we close the file.
f.close()

length of raw text is: 1135052
Displaying 25 of 42 matches:
y time he went out he was obliged to pass her kitchen , the door of which inva
at would it be if it somehow came to pass that I were really going to do it ? 
oom , she said , letting her visitor pass in front of her : '' Step in , my go
d Katerina Ivanovna would not let it pass , she stood up for her ... and so th
ng into the next room , as he had to pass through hers to get there . Taking n
this scandal , and it came to such a pass that Dounia and I dared not even go 
for you . Oh , if only this comes to pass ! This would be such a benefit that 
irst place , because it will come to pass of itself , later on , and he will n
hen the hour struck , it all came to pass quite differently , as it were accid
g in the doorway not allowing him to pass , he advanced straight upon her . Sh
im -- all was lost ; if they let him pass -- all was lost too ; they would rem
t once that it would be loathsome to pass that seat on which after the 

In [451]:
# choose to treat upper and lower case the same
#    by putting all tokens in lower case
filewords = [w.lower() for w in crimetokens]

In [452]:
# display the first words
print ("Display first 50 words from file:")
print (filewords[:50])

Display first 50 words from file:
['produced', 'by', 'john', 'bickers', ';', 'and', 'dagny', 'crime', 'and', 'punishment', 'by', 'fyodor', 'dostoevsky', 'translated', 'by', 'constance', 'garnett', 'translator', "'s", 'preface', 'a', 'few', 'words', 'about', 'dostoevsky', 'himself', 'may', 'help', 'the', 'english', 'reader', 'to', 'understand', 'his', 'work', '.', 'dostoevsky', 'was', 'the', 'son', 'of', 'a', 'doctor', '.', 'his', 'parents', 'were', 'very', 'hard-working', 'and']


In [454]:
# read a stop word file
fstop = open('Smart.English.stop', 'r')
stoptext = fstop.read()
fstop.close()

stopwords = nltk.word_tokenize(stoptext)
print ("Display first 50 Stopwords:")
print (stopwords[:50])

Display first 50 Stopwords:
['’', 's', 'a', "a's", 'able', 'about', 'above', 'according', 'accordingly', 'across', 'actually', 'after', 'afterwards', 'again', 'against', "ain't", 'all', 'allow', 'allows', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'an', 'and', 'another', 'any', 'anybody', 'anyhow', 'anyone', 'anything', 'anyway', 'anyways', 'anywhere', 'apart', 'appear', 'appreciate', 'appropriate', 'are', "aren't", 'around', 'as', 'aside', 'ask']


In [455]:
# setup to process bigrams
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [456]:
finder = BigramCollocationFinder.from_words(filewords)
# choose to use both the non-alpha word filter and a stopwords filter
finder.apply_word_filter(alpha_filter)
finder.apply_word_filter(lambda w: w in stopwords)

In [457]:
# score by frequency and display the top 50 bigrams
scored = finder.score_ngrams(bigram_measures.raw_freq)
print ()
print ("Bigrams from file with top 50 frequencies")
for item in scored[:20]:
        print (item)


Bigrams from file with top 50 frequencies
(('katerina', 'ivanovna'), 0.0008446719923501405)
(('pyotr', 'petrovitch'), 0.0006813156164711038)
(('wo', "n't"), 0.0004900691276371097)
(('ca', "n't"), 0.00048608482578640157)
(('pulcheria', 'alexandrovna'), 0.00048210052393569334)
(('avdotya', 'romanovna'), 0.0004581947128314441)
(('rodion', 'romanovitch'), 0.000342649959160906)
(('porfiry', 'petrovitch'), 0.000322728449907365)
(('marfa', 'petrovna'), 0.00030679124250453213)
(('sofya', 'semyonovna'), 0.0002828854314002829)
(('raskolnikov', "'s"), 0.00021913660178895153)
(('amalia', 'ivanovna'), 0.00021515229993824333)
(('young', 'man'), 0.0002071836962368269)
(('great', 'deal'), 0.00018726218698328587)
(("n't", 'understand'), 0.00013945056477478735)
(('ilya', 'petrovitch'), 0.00013148196107337092)
(('ivanovna', "'s"), 0.0001235133573719545)
(('sonia', "'s"), 0.00011554475367053808)
(('make', 'haste'), 0.00010757614996912166)
(('good', 'heavens'), 0.00010359184811841345)


In [458]:
# score by PMI and display the top 50 bigrams
# only use frequently occurring words in mutual information
finder.apply_freq_filter(5)
scored = finder.score_ngrams(bigram_measures.pmi)

In [459]:
print ("\nBigrams from file with top 50 mutual information scores")
for item in scored[:20]:
        print (item)


Bigrams from file with top 50 mutual information scores
(('praskovya', 'pavlovna'), 14.767316617759132)
(('de', 'cristal'), 14.352279118480286)
(('palais', 'de'), 14.352279118480286)
(('explosive', 'lieutenant'), 14.252743444929372)
(('semyon', 'zaharovitch'), 14.252743444929372)
(('assistant', 'superintendent'), 13.91487380617299)
(('arkady', 'ivanovitch'), 13.767316617759132)
(('madame', 'resslich'), 13.570919404955628)
(('afanasy', 'ivanovitch'), 13.352279118480286)
(('andrey', 'semyonovitch'), 13.352279118480286)
(('madame', 'lippevechsel'), 13.352279118480286)
(('nikodim', 'fomitch'), 13.352279118480284)
(('examining', 'lawyer'), 13.030351023592925)
(('flushed', 'crimson'), 12.919319711204182)
(('hay', 'market'), 12.914873806172988)
(('chapter', 'iii'), 12.392921102977633)
(('chapter', 'iv'), 12.392921102977633)
(('chapter', 'vi'), 12.352279118480286)
(('dmitri', 'prokofitch'), 12.352279118480284)
(('canal', 'bank'), 12.325806907119096)
