In [12]:
import spacy
import nltk
from pprint import pprint

# Importing a corpus

In [2]:
from nltk.corpus import gutenberg

# Show the file ids exposed by the Gutenberg corpus reader.
# A parenthesised single-argument print produces the same output as a Python 2
# print statement and is also valid as the Python 3 print function.
print(gutenberg.fileids())

[u'austen-emma.txt', u'austen-persuasion.txt', u'austen-sense.txt', u'bible-kjv.txt', u'blake-poems.txt', u'bryant-stories.txt', u'burgess-busterbrown.txt', u'carroll-alice.txt', u'chesterton-ball.txt', u'chesterton-brown.txt', u'chesterton-thursday.txt', u'edgeworth-parents.txt', u'melville-moby_dick.txt', u'milton-paradise.txt', u'shakespeare-caesar.txt', u'shakespeare-hamlet.txt', u'shakespeare-macbeth.txt', u'whitman-leaves.txt']


In [25]:
hamlet_id = 'shakespeare-hamlet.txt'
# Index of the sentence displayed at the end of this cell (the recorded output
# for it is the "To be, or not to be" passage).
quote_sent_index = 1303

# Build the sentence view once and reuse it for both the count and the lookup,
# instead of requesting it twice through two different spellings of the reader.
hamlet_sents = gutenberg.sents(hamlet_id)

# Each line is formatted into one string first so that print(...) emits the
# same "label value" text on Python 2 and Python 3.
print("%s %s" % ("num_chars :", len(gutenberg.raw(hamlet_id))))
print("%s %s" % ("num_words :", len(gutenberg.words(hamlet_id))))
print("%s %s" % ("num_sents :", len(hamlet_sents)))
print(hamlet_sents[quote_sent_index])

num_chars : 162881
num_words : 37360
num_sents : 3106
[u'To', u'be', u',', u'or', u'not', u'to', u'be', u',', u'that', u'is', u'the', u'Question', u':', u'Whether', u"'", u'tis', u'Nobler', u'in', u'the', u'minde', u'to', u'suffer', u'The', u'Slings', u'and', u'Arrowes', u'of', u'outragious', u'Fortune', u',', u'Or', u'to', u'take', u'Armes', u'against', u'a', u'Sea', u'of', u'troubles', u',', u'And', u'by', u'opposing', u'end', u'them', u':', u'to', u'dye', u',', u'to', u'sleepe', u'No', u'more', u';', u'and', u'by', u'a', u'sleepe', u',', u'to', u'say', u'we', u'end', u'The', u'Heart', u'-', u'ake', u',', u'and', u'the', u'thousand', u'Naturall', u'shockes', u'That', u'Flesh', u'is', u'heyre', u'too', u'?']


# Basic Preprocessing

## Case Conversion and Replacements
Case conversion is useful when parsing text in which the difference between upper case and lower case carries little significance. <br>
It also helps when the same word appears in several variants that differ only in case.

Consider the following text : 
> "The **General Data Protection Regulation** (EU) 2016/679 is a regulation in EU law on data protection and privacy for all individuals within the European Union and the European Economic Area. **Gdpr** also addresses the export of personal data outside the EU and EEA. The **GDPR** aims primarily to give control to citizens and residents over their personal data and to simplify the regulatory environment for international business by unifying the regulation within the EU."

In [38]:
sample = "The General Data Protection Regulation (EU) 2016/679 is a regulation in EU law on data protection and privacy for all individuals within the European Union and the European Economic Area. Gdpr also addresses the export of personal data outside the EU and EEA. The GDPR aims primarily to give control to citizens and residents over their personal data and to simplify the regulatory environment for international business by unifying the regulation within the EU."

# 1. Exact, case-sensitive match on the space-separated tokens.
gdpr_exact = [tok for tok in sample.split(' ') if tok == 'GDPR']

# 2. Lower-case the text first so that 'Gdpr' and 'GDPR' both match.
gdpr_any_case = [tok for tok in sample.lower().split(' ') if tok == 'gdpr']

# 3. Replace the spelled-out name with its abbreviation before lower-casing,
#    so the long form is counted as well.
sample_rep = sample.replace('General Data Protection Regulation','GDPR')
gdpr_after_replace = [tok for tok in sample_rep.lower().split(' ') if tok == 'gdpr']

# Label and value are joined into one string so that print(...) produces the
# same text on Python 2 and Python 3 (the labels keep their trailing space,
# hence the double space before each list in the output).
print("%s %s" % ("Checking for 'GDPR' only : ", gdpr_exact))
print("%s %s" % ("Checking for 'GDPR' irrespective of case : ", gdpr_any_case))
print("%s %s" % ("Checking for 'GDPR' irrespective of case after replacing abbreviations : ", gdpr_after_replace))

Checking for 'GDPR' only :  ['GDPR']
Checking for 'GDPR' irrespective of case :  ['gdpr', 'gdpr']
Checking for 'GDPR' irrespective of case after replacing abbreviations :  ['gdpr', 'gdpr', 'gdpr']


## Tokenisation : NLTK


In [45]:
from nltk.tokenize import sent_tokenize

# Five short sentences ending in '.', '?' and '!' to exercise the sentence splitter.
sample2 = "this's a sent tokenize test. this is sent two. is this sent three? sent 4 is cool! Now it's your turn."

# Keep the result in a name and leave it as the last expression so the
# notebook displays the list of sentences.
sample2_sentences = sent_tokenize(sample2)
sample2_sentences

["this's a sent tokenize test.",
 'this is sent two.',
 'is this sent three?',
 'sent 4 is cool!',
 "Now it's your turn."]

In [46]:
from nltk.tokenize import word_tokenize

# Split sample2 into sentences, take only the first one, and break that
# sentence into word-level tokens (displayed as the cell output).
first_sentence = sent_tokenize(sample2)[0]
word_tokenize(first_sentence)

['this', "'s", 'a', 'sent', 'tokenize', 'test', '.']

## Tokenisation : Spacy
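Note : 
1. Unicode : spaCy works on unicode text (under Python 2, pass a `unicode` string rather than a byte `str`)
2. Single function : one call to `nlp(text)` tokenises the text and also produces sentence boundaries, POS tags and lemmas
3. Generator : `doc.sents` is a generator, so iterate over it (or wrap it in `list(...)`) rather than indexing it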

In [51]:
# Load the spaCy pipeline named 'en_core_web_sm' and bind it to `nlp`.
spacy_model_name = 'en_core_web_sm'
nlp = spacy.load(spacy_model_name)

In [83]:
# `x` was never assigned anywhere in the notebook, so this cell failed with a
# NameError on a fresh kernel. The recorded output corresponds to `sample2`,
# so the document is built from it here.
# spaCy needs unicode text: decode only when `sample2` is a byte string
# (Python 2 `str`); on Python 3 it is already unicode and is passed through.
sample2_text = sample2 if isinstance(sample2, type(u"")) else sample2.decode("utf-8")
x = nlp(sample2_text)

# Print every token with its coarse POS tag and lemma, one sentence at a time.
for s in x.sents:
    for w in s:
        # Formatting into a single string keeps the output identical on
        # Python 2 and Python 3.
        print("%s %s %s" % (w.text, w.pos_, w.lemma_))
    print("====")

this DET this
's VERB be
a DET a
sent ADJ sent
tokenize NOUN tokenize
test NOUN test
. PUNCT .
====
this DET this
is VERB be
sent VERB send
two NUM two
. PUNCT .
====
is VERB be
this DET this
sent VERB send
three NUM three
? PUNCT ?
====
sent VERB send
4 NUM 4
is VERB be
cool ADJ cool
! PUNCT !
====
Now ADV now
it PRON -PRON-
's VERB be
your ADJ -PRON-
turn NOUN turn
. PUNCT .
====


## POS Tagging : NLTK

In [85]:
# Tokenise a sentence containing contractions, then tag every token with its
# part of speech. The tagged pairs are left as the last expression so the
# notebook displays them.
sent = "I'm gonna make him an offer he can't refuse"
words = nltk.word_tokenize(sent)
tagged_words = nltk.pos_tag(words)
tagged_words

[('I', 'PRP'),
 ("'m", 'VBP'),
 ('gon', 'VBG'),
 ('na', 'TO'),
 ('make', 'VB'),
 ('him', 'PRP'),
 ('an', 'DT'),
 ('offer', 'NN'),
 ('he', 'PRP'),
 ('ca', 'MD'),
 ("n't", 'RB'),
 ('refuse', 'VB')]

In [66]:
from nltk.stem.porter import PorterStemmer
porter_stemmer = PorterStemmer()

# Inflected and derived forms of "presume"; the loop prints the stem the
# Porter stemmer produces for each of them.
words = ["presumably","presume","presuming","presumed","presumes"]
for w in words:
    # Single-string print(...) gives the same "word --> stem" line on
    # Python 2 and Python 3.
    print("%s %s %s" % (w, "-->", porter_stemmer.stem(w)))

presumably --> presum
presume --> presum
presuming --> presum
presumed --> presum
presumes --> presum


In [82]:
from nltk.stem import WordNetLemmatizer

# The lemmatizer instance was never created in the notebook, so this cell
# raised a NameError on a fresh kernel.
wordnet_lemmatizer = WordNetLemmatizer()

words = ["presumably","presume","presuming","presumed","presumes"]
# WordNet POS codes: "r" = adverb, "v" = verb. "presumably" is an adverb
# (it was previously tagged "a", i.e. adjective).
pos_tags = ["r","v","v","v","v"]

# All print(...) calls take one pre-formatted string so the output is the same
# on Python 2 and Python 3.
print("Lemmatizing without POS Tags")
for w in words:
    print("%s %s %s" % (w, "-->", wordnet_lemmatizer.lemmatize(w)))

print("\nLemmatizing with POS Tags")
for w,p in zip(words,pos_tags):
    print("%s %s %s" % (w, "-->", wordnet_lemmatizer.lemmatize(w, pos=p)))

# Lemmatizer vs. stemmer on an irregular plural. The leading "\n\n" leaves two
# blank lines before the comparison, with no space before the first word.
print("\n\n" + "%s %s %s" % ('aardwolves', '--lemma-->', wordnet_lemmatizer.lemmatize('aardwolves')))
print("%s %s %s" % ('aardwolves', '--stem-->', porter_stemmer.stem('aardwolves')))

Lemmatizing without POS Tags
presumably --> presumably
presume --> presume
presuming --> presuming
presumed --> presumed
presumes --> presumes

Lemmatizing with POS Tags
presumably --> presumably
presume --> presume
presuming --> presume
presumed --> presume
presumes --> presume


aardwolves --lemma--> aardwolf
aardwolves --stem--> aardwolv


u'aardwolf'