<a href="https://colab.research.google.com/github/sujayrittikar/NLP_Basics/blob/main/PreProcessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Stemming -> Porter's Algorithm
Porter's algorithm reduces a word to its stem by applying suffix-stripping rules in 5 phases. Two examples of the kinds of rules involved:

1. Simple suffix mapping, e.g. SSES -> SS (caresses -> caress) and IES -> I (ponies -> poni).
2. Rules that also depend on the length/complexity of the word, e.g. ATIONAL -> ATE (relational -> relate).

The Snowball stemmer (also known as the Porter2 stemmer) is an improved version of the original Porter stemmer.

In [1]:
import nltk

In [2]:
from nltk.stem.porter import PorterStemmer

In [3]:
# Instantiate NLTK's Porter stemmer; it is reused by the stemming loop below.
p_stemmer = PorterStemmer()

In [4]:
# Sample vocabulary: inflected forms of "run" plus "-ly"/"-ness" derivations,
# used to compare the output of the stemmers.
words = [
    'run', 'runs', 'runner', 'ran',
    'easily', 'fairly', 'fairness',
]

In [5]:
# Show each sample word next to its Porter stem.
for term in words:
  print(term, p_stemmer.stem(term), sep='------->')

run------->run
runs------->run
runner------->runner
ran------->ran
easily------->easili
fairly------->fairli
fairness------->fair


In [6]:
from nltk.stem.snowball import SnowballStemmer

In [7]:
# Snowball stemmer configured for English; reused by the loops below.
s_stemmer = SnowballStemmer(language='english')

In [8]:
# Same sample words, this time through the Snowball stemmer.
for term in words:
  stem = s_stemmer.stem(term)
  print(f'{term}------->{stem}')

run------->run
runs------->run
runner------->runner
ran------->ran
easily------->easili
fairly------->fair
fairness------->fair


In [9]:
# Second sample: words sharing the "gener-" prefix but not the same meaning.
words_ = [
    'generous',
    'generation',
    'generously',
    'generate',
]

In [10]:
# Print "<word>-------><Snowball stem>" for every entry of the second sample.
for term in words_:
  print('------->'.join((term, s_stemmer.stem(term))))

generous------->generous
generation------->generat
generously------->generous
generate------->generat


# Lemmatization

In [11]:
import spacy 



In [12]:
# Load the spaCy pipeline package 'en_core_web_sm'; `nlp` is reused by all spaCy cells below.
nlp = spacy.load('en_core_web_sm')

In [13]:
# Sample text containing several inflections of "fight", "run" and "story".
# Adjacent string literals are concatenated into one string before the call.
doc1 = nlp(
    'I am a fighter fighting in a fight because I love to fight since I fought a tiger! '
    'I ran and ran but running never caught me as a runner, the tiger did. '
    'This is a story narrated by story-tellers whose stories are as glamorous as their own stories.'
)

In [14]:
# One row per token: text, part-of-speech tag, lemma hash (int) and lemma string.
for tok in doc1:
  columns = (tok.text, tok.pos_, str(tok.lemma), tok.lemma_)
  print(' \t '.join(columns))

I 	 PRON 	 4690420944186131903 	 I
am 	 AUX 	 10382539506755952630 	 be
a 	 DET 	 11901859001352538922 	 a
fighter 	 NOUN 	 4089770865968377353 	 fighter
fighting 	 NOUN 	 13293374023388930237 	 fighting
in 	 ADP 	 3002984154512732771 	 in
a 	 DET 	 11901859001352538922 	 a
fight 	 NOUN 	 15486772212226780222 	 fight
because 	 SCONJ 	 16950148841647037698 	 because
I 	 PRON 	 4690420944186131903 	 I
love 	 VERB 	 3702023516439754181 	 love
to 	 PART 	 3791531372978436496 	 to
fight 	 VERB 	 15486772212226780222 	 fight
since 	 SCONJ 	 10066841407251338481 	 since
I 	 PRON 	 4690420944186131903 	 I
fought 	 VERB 	 15486772212226780222 	 fight
a 	 DET 	 11901859001352538922 	 a
tiger 	 NOUN 	 5423999730010037932 	 tiger
! 	 PUNCT 	 17494803046312582752 	 !
I 	 PRON 	 4690420944186131903 	 I
ran 	 VERB 	 12767647472892411841 	 run
and 	 CCONJ 	 2283656566040971221 	 and
ran 	 VERB 	 12767647472892411841 	 run
but 	 CCONJ 	 14560795576765492085 	 but
running 	 VERB 	 12767647472892411841 	

# Stop Words

Stop words are words such as 'a' and 'the' that appear so frequently that they carry little information, so they usually do not need to be tagged and can be filtered out.

In [15]:
# Show the pipeline's default stop-word set.
print(nlp.Defaults.stop_words)

{'still', 'never', 'their', 'only', 'out', 'serious', 'say', 'noone', 'part', 'above', 'anywhere', 'an', 'anything', 'afterwards', 'between', 'per', 'hers', 'her', 'sometimes', 'toward', 'n’t', 'me', "'d", 'mostly', 'doing', 'they', 'beyond', 'hereby', 'who', 'whose', 'whence', 'seem', 'several', 'former', 'latterly', 'some', 'as', "'m", "'ve", 'him', 'less', 'be', 'and', 'whoever', 'anyhow', 'get', 'somewhere', 'about', 'latter', 'everyone', 'go', 'whereupon', 'either', 'seemed', 'behind', 'almost', 'these', 'those', 'into', 'yours', '’s', 'nobody', 'therein', 'whereby', 'does', 'at', 'could', 'became', 'or', "'s", 'can', 'must', 'in', 'that', 'even', 'will', 'full', 'no', 'thence', 'myself', 'back', 'unless', 'whither', 'across', 'now', 'any', 'such', 'there', '‘ll', 'against', 'among', 'namely', 'too', 'twelve', 'may', 'whereafter', 'wherever', 'own', '’ll', 'keep', 'regarding', 'than', 'becomes', 'much', 'from', 'over', 'he', 'often', 'onto', 'been', 'alone', '‘m', 'whatever', 'alw

In [16]:
# Number of entries in the default stop-word set.
len(nlp.Defaults.stop_words)

326

In [17]:
# Look up the vocabulary entry for 'is' and read its stop-word flag.
nlp.vocab['is'].is_stop

True

In [18]:
# Same check for 'amazing'.
nlp.vocab['amazing'].is_stop

False

In [19]:
# Add a stop word: register it in the default set and flag its vocabulary
# entry, the two places this notebook inspects afterwards.
extra_stop_word = 'btw'
nlp.Defaults.stop_words.add(extra_stop_word)
nlp.vocab[extra_stop_word].is_stop = True

In [20]:
# Read back the stop-word flag on the 'btw' vocabulary entry.
nlp.vocab['btw'].is_stop

True

In [21]:
# Re-count the default stop-word set.
len(nlp.Defaults.stop_words)

327

In [22]:
# Remove a stop word: drop it from the default set and clear the flag on its
# vocabulary entry.
# discard() is used instead of remove() so that re-running this cell (when
# 'btw' is already gone) does not raise KeyError.
nlp.Defaults.stop_words.discard('btw')
nlp.vocab['btw'].is_stop = False

In [23]:
# Read back the stop-word flag on the 'btw' vocabulary entry.
nlp.vocab['btw'].is_stop

False

In [24]:
# Re-count the default stop-word set.
len(nlp.Defaults.stop_words)

326

# Vocabulary and Matching

In [25]:
from spacy.matcher import Matcher

In [26]:
# Token-pattern matcher constructed from the pipeline's vocabulary.
matcher = Matcher(nlp.vocab)

In [28]:
# Three token patterns, one per spelling of the term. Each dict describes a
# single token; a pattern is the list of consecutive token descriptions.

# SolarPower (single token)
pattern1 = [
    {'LOWER': 'solarpower'},
]

# Solar-power (two words separated by one punctuation token)
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True},
    {'LOWER': 'power'},
]

# Solar power (two adjacent words)
pattern3 = [
    {'LOWER': 'solar'},
    {'LOWER': 'power'},
]

In [30]:
# Register all three spellings under the single rule name 'SolarPower'.
solar_patterns = [pattern1, pattern2, pattern3]
matcher.add(key='SolarPower', patterns=solar_patterns)

In [31]:
# Sample text containing all three spellings of the term.
# The u"" prefix is dropped: it is a no-op in Python 3.
doc = nlp(
    "The Solar Power industry continues to grow a solarpower increases. "
    "Solar-power is an important resource for power."
)

In [32]:
# Run the matcher over the document; the result is unpacked below as
# (match_id, start, end) tuples.
found_matches = matcher(doc)

In [33]:
# Raw match tuples: rule hash plus start/end token indices.
print(found_matches)

[(8656102463236116519, 1, 3), (8656102463236116519, 8, 9), (8656102463236116519, 11, 14)]


In [34]:
# Resolve each rule hash back to its name and show the matched text.
for rule_hash, first, last in found_matches:
  rule_name = nlp.vocab.strings[rule_hash]
  matched_text = doc[first:last].text
  print(rule_hash, rule_name, first, last, matched_text)

8656102463236116519 SolarPower 1 3 Solar Power
8656102463236116519 SolarPower 8 9 solarpower
8656102463236116519 SolarPower 11 14 Solar-power


In [35]:
# Drop the 'SolarPower' rule so it can be redefined with new patterns below.
# The membership check keeps this cell re-runnable: removing a key that is no
# longer registered raises an error.
if 'SolarPower' in matcher:
  matcher.remove('SolarPower')

In [36]:
# Redefined patterns: the single-token spelling, plus one pattern whose
# punctuation token carries the '*' operator (zero or more occurrences).
pattern1 = [
    {'LOWER': 'solarpower'},
]
pattern2 = [
    {'LOWER': 'solar'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'power'},
]

In [38]:
# Register the two redefined patterns under the rule name 'SolarPower'.
solar_patterns = [pattern1, pattern2]
matcher.add(key='SolarPower', patterns=solar_patterns)

In [39]:
# Text with a double-hyphen spelling to exercise the '*' operator.
# The u"" prefix is dropped: it is a no-op in Python 3.
sample_text = "Solar--power is solarpower!"
doc2 = nlp(sample_text)

In [40]:
# Run the matcher over doc2; the result is unpacked below as
# (match_id, start, end) tuples.
found_matches = matcher(doc2)

In [42]:
# Resolve each rule hash back to its name and show the matched text in doc2.
for rule_hash, first, last in found_matches:
  rule_name = nlp.vocab.strings[rule_hash]
  matched_text = doc2[first:last].text
  print(rule_hash, rule_name, first, last, matched_text)

8656102463236116519 SolarPower 0 3 Solar--power
8656102463236116519 SolarPower 4 5 solarpower


# Phrase Matching

In [43]:
from spacy.matcher import PhraseMatcher

In [44]:
# NOTE: rebinds `matcher` (a token Matcher in the cells above) to a
# PhraseMatcher built from the same vocabulary.
matcher = PhraseMatcher(nlp.vocab)

In [45]:
# Read the whole text file, then run the pipeline on it once the file is
# closed. The path is relative to the notebook's working directory.
with open('reagonomics.txt') as corpus_file:
  corpus_text = corpus_file.read()
doc3 = nlp(corpus_text)

In [46]:
# Multi-word terms to search for in the document.
phrase_list = [
    'voodoo economics',
    'supply-side economics',
    'trickle-down economics',
    'free-market economics',
]

In [47]:
# Convert each phrase into a Doc to use as a PhraseMatcher pattern.
# nlp.make_doc runs only the tokenizer; the matcher is created with its
# default settings (verbatim token text), so the remaining pipeline
# components are unnecessary work for pattern phrases.
phrase_patterns = [nlp.make_doc(text) for text in phrase_list]

In [49]:
# Register all phrase Docs under the single rule name 'EconMatcher'.
matcher.add(key='EconMatcher', docs=phrase_patterns)

In [51]:
# Run the phrase matcher over doc3; the result is unpacked below as
# (match_id, start, end) tuples.
found_matches = matcher(doc3)

In [52]:
# Bare expression so the notebook displays the match list.
found_matches

[(3680293220734633682, 41, 45),
 (3680293220734633682, 49, 53),
 (3680293220734633682, 54, 56),
 (3680293220734633682, 61, 65),
 (3680293220734633682, 673, 677),
 (3680293220734633682, 2986, 2990)]

In [53]:
# Resolve each rule hash back to its name and show the matched phrase in doc3.
for rule_hash, first, last in found_matches:
  rule_name = nlp.vocab.strings[rule_hash]
  matched_text = doc3[first:last].text
  print(rule_hash, rule_name, first, last, matched_text)

3680293220734633682 EconMatcher 41 45 supply-side economics
3680293220734633682 EconMatcher 49 53 trickle-down economics
3680293220734633682 EconMatcher 54 56 voodoo economics
3680293220734633682 EconMatcher 61 65 free-market economics
3680293220734633682 EconMatcher 673 677 supply-side economics
3680293220734633682 EconMatcher 2986 2990 trickle-down economics
