In [2]:
import nltk

##PORTER STEMMER

In [3]:
from nltk.stem.porter import PorterStemmer

In [4]:
p_stemmer = PorterStemmer()

In [5]:
words = ['have', 'having','had', 'hang', 'hanging', 'run', 'ran', 'runner', 'fair', 'fairly', 'nationality', 'nationalism', 'national', 'fairness']

In [6]:
for word in words:
  print(word + '----->' + p_stemmer.stem(word))

have----->have
having----->have
had----->had
hang----->hang
hanging----->hang
run----->run
ran----->ran
runner----->runner
fair----->fair
fairly----->fairli
nationality----->nation
nationalism----->nation
national----->nation
fairness----->fair


##SNOWBALL STEMMER

In [7]:
from nltk.stem.snowball import SnowballStemmer

In [8]:
s_stemmer = SnowballStemmer(language = 'english')

In [9]:
for word in words:
  print(word + '----->' + s_stemmer.stem(word))

have----->have
having----->have
had----->had
hang----->hang
hanging----->hang
run----->run
ran----->ran
runner----->runner
fair----->fair
fairly----->fair
nationality----->nation
nationalism----->nation
national----->nation
fairness----->fair


##LEMMANTIZATION

In [10]:
import spacy

In [11]:
nlp = spacy.load('en_core_web_sm')

In [12]:
doc = nlp(u'Peter Piper picked a peck of pickled peppers')

In [13]:
for token in doc:
  print(token, '\t', token.dep_, '\t', token.pos_, '\t', token.lemma, '\t', token.lemma_)

Peter 	 compound 	 PROPN 	 12811822816318728671 	 Peter
Piper 	 nsubj 	 PROPN 	 16272889730469212282 	 Piper
picked 	 ROOT 	 VERB 	 14020768407998353649 	 pick
a 	 det 	 DET 	 11901859001352538922 	 a
peck 	 dobj 	 NOUN 	 17237996080690907035 	 peck
of 	 prep 	 ADP 	 886050111519832510 	 of
pickled 	 amod 	 VERB 	 15206147607577397413 	 pickle
peppers 	 pobj 	 NOUN 	 7406717370878359568 	 pepper


##MATCHING AND VOCABULARY

##Rule-based Matching
spaCy offers a rule-matching tool called Matcher that allows you to build a library of token patterns, then match those patterns against a Doc object to return a list of found matches. You can match on any part of the token including text and annotations, and you can add multiple patterns to the same matcher.

In [14]:
from spacy.matcher import Matcher #IMPORT THE MATCHER LIBRARY
matcher = Matcher(nlp.vocab)

#Creating patterns
In literature, the phrase 'solar power' might appear as one word or two, with or without a hyphen. In this section we'll develop a matcher named 'SolarPower' that finds all three:

In [20]:
pattern1 = [{'ORTH': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'LOWER': 'power'}]
pattern3 = [{'LOWER': 'solar'}, {'IS_PUNCT': True}, {'LOWER': 'power'}]

matcher.add('SolarPower', None, pattern1, pattern2, pattern3)

#Applying the matcher to a Doc object


In [21]:
doc = nlp(u'The Solar Power industry continues to grow as demand \
for solarpower increases. Solar-power cars are gaining popularity.')

In [22]:
found_matches = matcher(doc)  #matcher returns a list of tuples. Each tuple contains an ID for the match, with start & end tokens that map to the span doc[start:end]

for match_id, start, end in found_matches:
   print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]
[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]
[(8656102463236116519, 1, 3), (8656102463236116519, 10, 11), (8656102463236116519, 13, 16)]


##ON LEMMAS
If we wanted to match on both 'solar power' and 'solar powered', it might be tempting to look for the lemma of 'powered' and expect it to be 'power'. This is not always the case! The lemma of the adjective 'powered' is still 'powered':

In [23]:
pattern1 = [{'LOWER': 'solarpower'}]
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP':'*'}, {'LEMMA': 'power'}] # CHANGE THIS PATTERN

# Remove the old patterns to avoid duplication:
matcher.remove('SolarPower')

# Add the new set of patterns to the 'SolarPower' matcher:
matcher.add('SolarPower', None, pattern1, pattern2)

In [24]:
doc2 = nlp(u'Solar-powered energy runs solar-powered cars.')

In [25]:
found_matches = matcher(doc2)
print(found_matches)

[(8656102463236116519, 0, 3), (8656102463236116519, 5, 8)]


##PhraseMatcher
In the above section we used token patterns to perform rule-based matching. An alternative - and often more efficient - method is to match on terminology lists. In this case we use PhraseMatcher to create a Doc object from a list of phrases, and pass that into matcher instead.

In [27]:
from spacy.matcher import PhraseMatcher #IMPORTING PHRASE MATCHER LIBRARY
matcher = PhraseMatcher(nlp.vocab)

#For this exercise we're going to import a Wikipedia article on Reaganomics

In [29]:
with open('/content/Reaganomics - Wikipedia.html') as f:
    doc3 = nlp(f.read())

In [30]:
# First, create a list of match phrases:
phrase_list = ['voodoo economics', 'supply-side economics', 'trickle-down economics', 'free-market economics']

# Next, convert each phrase to a Doc object:
phrase_patterns = [nlp(text) for text in phrase_list]

# Pass each Doc object into matcher (note the use of the asterisk!):
matcher.add('VoodooEconomics', None, *phrase_patterns)

# Build a list of matches:
matches = matcher(doc3)

In [32]:
             # (match_id, start, end)
             matches

[(3473369816841043438, 12690, 12692)]

In [33]:
doc3[:70] #HERE WE ARE GETTING THE OUTPUT AS ELEMENTS OF AN HTML PAGE. THIS IS BEACUSE WE HAVE LOADED AN HTML DOC

<!DOCTYPE html>
<!-- saved from url=(0041)https://en.wikipedia.org/wiki/Reaganomics -->
<html class="client-js ve-available" lang="en" dir="ltr"><head><meta http-equiv="Content-Type" content="text/html; charset=UTF-8">

<title>Reaganomics - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YEr0P89Zdk4iy-DL7UMSPwAAAEk","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Reaganomics","wgTitle":"Reaganomics","wgCurRevisionId":1011394188,"wgRevisionId":1011394188,"wgArticleId":26529,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["CS1 errors: missing periodical","Webarchive template wayba

##Viewing Matches
here are a few ways to fetch the text surrounding a match. The simplest is to grab a slice of tokens from the doc that is wider than the match:

In [34]:
doc3[665:685]  # Note that the fifth match starts at doc3[673]

-color:transparent;padding:0}.mw-ui-button:active,.mw-ui-button.is-on{background-color:#c8ccd1;color:#000000;border-color:#72777d;box

In [35]:
doc3[2975:2995]  # The sixth match starts at doc3[2985]

mode:both;-moz-animation-fill-mode:both;-o-animation-fill-mode:both;animation-