### Installing Dependent Files

In [1]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\91991\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91991\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91991\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\91991\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Spacy model loading
import spacy

# Smaller pipeline and less accurate but faster
nlp_sm = spacy.load("en_core_web_sm")

# Larger transformer-based pipeline: more accurate but slower
nlp_trf = spacy.load("en_core_web_trf")

### Case Conversion

In [3]:
text = "I like Data Science doamin. Learning and coding in Python is my one of hobies."
text

'I like Data Science doamin. Learning and coding in Python is my one of hobies.'

In [4]:
# Lower case
text.lower()

'i like data science doamin. learning and coding in python is my one of hobies.'

In [5]:
# Upper case
text.upper()

'I LIKE DATA SCIENCE DOAMIN. LEARNING AND CODING IN PYTHON IS MY ONE OF HOBIES.'

In [6]:
text.title()

'I Like Data Science Doamin. Learning And Coding In Python Is My One Of Hobies.'

### Tokenization

In [7]:
sample_text = ("The leaders of Group of Seven (G7) debated how strongly they should respond to China's growing clout "
               "around the world and alleged forced labour practices in the Xinjiang region."
               " According to a CNN report, the G7 leaders aired serious differences over the approach and "
               "the disagreements reportedly became so sensitive at one point that all internet was shut off to the room."
               "While the G7 leaders unveiled an infrastructure plan for the developing world to counter China’s Belt"
               " and Road program, reports suggest that there was no immediate consensus on how forcefully they should "
               "call out Beijing over the alleged human rights abuses. ")
sample_text

"The leaders of Group of Seven (G7) debated how strongly they should respond to China's growing clout around the world and alleged forced labour practices in the Xinjiang region. According to a CNN report, the G7 leaders aired serious differences over the approach and the disagreements reportedly became so sensitive at one point that all internet was shut off to the room.While the G7 leaders unveiled an infrastructure plan for the developing world to counter China’s Belt and Road program, reports suggest that there was no immediate consensus on how forcefully they should call out Beijing over the alleged human rights abuses. "

In [8]:
# Sentence Token Identification using NLTK
nltk.sent_tokenize(sample_text)

["The leaders of Group of Seven (G7) debated how strongly they should respond to China's growing clout around the world and alleged forced labour practices in the Xinjiang region.",
 'According to a CNN report, the G7 leaders aired serious differences over the approach and the disagreements reportedly became so sensitive at one point that all internet was shut off to the room.While the G7 leaders unveiled an infrastructure plan for the developing world to counter China’s Belt and Road program, reports suggest that there was no immediate consensus on how forcefully they should call out Beijing over the alleged human rights abuses.']

In [9]:
# Word Token identification using NLTK
print(nltk.word_tokenize(sample_text))

['The', 'leaders', 'of', 'Group', 'of', 'Seven', '(', 'G7', ')', 'debated', 'how', 'strongly', 'they', 'should', 'respond', 'to', 'China', "'s", 'growing', 'clout', 'around', 'the', 'world', 'and', 'alleged', 'forced', 'labour', 'practices', 'in', 'the', 'Xinjiang', 'region', '.', 'According', 'to', 'a', 'CNN', 'report', ',', 'the', 'G7', 'leaders', 'aired', 'serious', 'differences', 'over', 'the', 'approach', 'and', 'the', 'disagreements', 'reportedly', 'became', 'so', 'sensitive', 'at', 'one', 'point', 'that', 'all', 'internet', 'was', 'shut', 'off', 'to', 'the', 'room.While', 'the', 'G7', 'leaders', 'unveiled', 'an', 'infrastructure', 'plan', 'for', 'the', 'developing', 'world', 'to', 'counter', 'China', '’', 's', 'Belt', 'and', 'Road', 'program', ',', 'reports', 'suggest', 'that', 'there', 'was', 'no', 'immediate', 'consensus', 'on', 'how', 'forcefully', 'they', 'should', 'call', 'out', 'Beijing', 'over', 'the', 'alleged', 'human', 'rights', 'abuses', '.']


In [10]:
text_spacy_sm = nlp_sm(sample_text)
text_spacy_trf = nlp_trf(sample_text)

In [11]:
[obj.text for obj in text_spacy_sm.sents]

["The leaders of Group of Seven (G7) debated how strongly they should respond to China's growing clout around the world and alleged forced labour practices in the Xinjiang region.",
 'According to a CNN report, the G7 leaders aired serious differences over the approach and the disagreements reportedly became so sensitive at one point that all internet was shut off to the room.',
 'While the G7 leaders unveiled an infrastructure plan for the developing world to counter China’s Belt and Road program, reports suggest that there was no immediate consensus on how forcefully they should call out Beijing over the alleged human rights abuses.']

In [12]:
[obj.text for obj in text_spacy_trf.sents]

["The leaders of Group of Seven (G7) debated how strongly they should respond to China's growing clout around the world and alleged forced labour practices in the Xinjiang region.",
 'According to a CNN report, the G7 leaders aired serious differences over the approach and the disagreements reportedly became so sensitive at one point that all internet was shut off to the room.',
 'While the G7 leaders unveiled an infrastructure plan for the developing world to counter China’s Belt and Road program, reports suggest that there was no immediate consensus on how forcefully they should call out Beijing over the alleged human rights abuses.']

### Removing HTML tags & noise

In [13]:
import requests

# Download the HTML page used as the noisy sample document.
# The timeout stops the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() fails loudly instead of treating an HTTP error page
# as the document to be cleaned.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html', timeout=30)
data.raise_for_status()
content = data.text
# Show a slice of the raw markup to illustrate the noise that has to be removed.
print(content[2745:3948])

 * Default rule centered and clear of floats; sized for thought-breaks
 * ********************************************************************** */
	hr {
		width:45%;			/* adjust to ape original work */
		margin-top: 1em;	/* space above &amp;amp; below */
		margin-bottom: 1em;
		margin-left: auto;  /* these two ensure a.. */
		margin-right: auto; /* ..centered rule */
		clear: both;		/* don't let sidebars &amp;amp; floats overlap rule */
	}
/* ************************************************************************
 * Images and captions
 * ********************************************************************** */
	img { /* the default inline image has */
		border: 1px solid black; /* a thin black line border.. */
		padding: 6px; /* ..spaced a bit out from the graphic */
		} </style><link rel="schema.DCTERMS" href="http://purl.org/dc/terms/"/>
<link rel="schema.MARCREL" href="http://id.loc.gov/vocabulary/relators/"/>
<meta name="DCTERMS.title" content="The Bible, King 

In [14]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    """Return the text content of an HTML document with line breaks collapsed.

    Parameters
    ----------
    text : str
        Raw HTML markup.

    Returns
    -------
    str
        The document text with ``<iframe>`` and ``<script>`` elements removed
        and every run of carriage returns / line feeds collapsed into a single
        line feed.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Detach embedded frames and scripts so their contents do not end up in the
    # extracted text. A plain loop is used because only the side effect matters.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Collapse consecutive line breaks. The earlier character class also held a
    # literal '|' (alternation has no meaning inside [...]), which silently
    # turned pipe characters in the document into newlines.
    return re.sub(r'[\r\n]+', '\n', stripped_text)

clean_content = strip_html_tags(content)
print(clean_content[1163:1957])

*** START OF THE PROJECT GUTENBERG EBOOK, THE BIBLE, KING JAMES, BOOK 1***
This eBook was produced by David Widger
with the help of Derek Andrew's text from January 1992
and the work of Bryan Taylor in November 2002.
Book 01        Genesis
01:001:001 In the beginning God created the heaven and the earth.
01:001:002 And the earth was without form, and void; and darkness was
           upon the face of the deep. And the Spirit of God moved upon
           the face of the waters.
01:001:003 And God said, Let there be light: and there was light.
01:001:004 And God saw the light, that it was good: and God divided the
           light from the darkness.
01:001:005 And God called the light Day, and the darkness he called
           Night. And the evening and the morning were the first day.



### Removing Accented Characters

In [15]:
import unicodedata

def remove_accented_chars(text):
    """Strip diacritics from ``text`` and return the ASCII-only result.

    NFKD normalization decomposes an accented letter into its base letter plus
    combining marks; encoding to ASCII with errors ignored then discards the
    marks together with any other character that has no ASCII form.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')


In [16]:
s = 'Sómě Áccěntěd těxt'
print("Before:\t", s)
print("After:\t", remove_accented_chars(s))

Before:	 Sómě Áccěntěd těxt
After:	 Some Accented text


### Removing Special Characters, Numbers and Symbols

In [17]:
import re

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    When ``remove_digits`` is true, digits are deleted as well, leaving only
    ASCII letters and whitespace.
    """
    if remove_digits:
        disallowed = r'[^a-zA-Z\s]'
    else:
        disallowed = r'[^a-zA-Z0-9\s]'
    return re.sub(disallowed, '', text)

In [18]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
print("Before:\t", s)
print("After:\t", remove_special_characters(s, remove_digits=False))

Before:	 Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂
After:	 Well this was fun See you at 730 What do you think 9318 


### Expanding Contractions

In [19]:
!pip install contractions
!pip install textsearch

Collecting contractions
  Using cached contractions-0.0.50-py2.py3-none-any.whl (7.2 kB)
Collecting textsearch>=0.0.21
  Using cached textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Using cached pyahocorasick-1.4.2.tar.gz (321 kB)
Collecting anyascii
  Using cached anyascii-0.2.0-py3-none-any.whl (283 kB)
Building wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py): started

  ERROR: Command errored out with exit status 1:
   command: 'c:\users\91991\anaconda3\python.exe' -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\91991\\AppData\\Local\\Temp\\pip-install-2s31x7ut\\pyahocorasick_c31693aa62f64a35820a1e4d702d9224\\setup.py'"'"'; __file__='"'"'C:\\Users\\91991\\AppData\\Local\\Temp\\pip-install-2s31x7ut\\pyahocorasick_c31693aa62f64a35820a1e4d702d9224\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d 'C:\Users\91991\AppData\Local\Temp\pip-wheel-ipvmnrq_'
       cwd: C:\Users\91991\AppData\Local\Temp\pip-install-2s31x7ut\pyahocorasick_c31693aa62f64a35820a1e4d702d9224\
  Complete output (5 lines):
  running bdist_wheel
  running build
  running build_ext
  building 'ahocorasick' extension
  er


  Building wheel for pyahocorasick (setup.py): finished with status 'error'
  Running setup.py clean for pyahocorasick
Failed to build pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
    Running setup.py install for pyahocorasick: started
    Running setup.py install for pyahocorasick: finished with status 'error'


  ERROR: Command errored out with exit status 1:
   command: 'c:\users\91991\anaconda3\python.exe' -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\91991\\AppData\\Local\\Temp\\pip-install-dfae_ual\\pyahocorasick_b2fd00c12b9b4b5bb311bca27e18da2f\\setup.py'"'"'; __file__='"'"'C:\\Users\\91991\\AppData\\Local\\Temp\\pip-install-dfae_ual\\pyahocorasick_b2fd00c12b9b4b5bb311bca27e18da2f\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d 'C:\Users\91991\AppData\Local\Temp\pip-wheel-2rtou_mx'
       cwd: C:\Users\91991\AppData\Local\Temp\pip-install-dfae_ual\pyahocorasick_b2fd00c12b9b4b5bb311bca27e18da2f\
  Complete output (5 lines):
  running bdist_wheel
  running build
  running build_ext
  building 'ahocorasick' extension
  er

Collecting textsearch
  Using cached textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting pyahocorasick
  Using cached pyahocorasick-1.4.2.tar.gz (321 kB)
Collecting anyascii
  Using cached anyascii-0.2.0-py3-none-any.whl (283 kB)
Building wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py): started
  Building wheel for pyahocorasick (setup.py): finished with status 'error'
  Running setup.py clean for pyahocorasick
Failed to build pyahocorasick
Installing collected packages: pyahocorasick, anyascii, textsearch
    Running setup.py install for pyahocorasick: started
    Running setup.py install for pyahocorasick: finished with status 'error'



    building 'ahocorasick' extension
    error: Microsoft Visual C++ 14.0 or greater is required. Get it with "Microsoft C++ Build Tools": https://visualstudio.microsoft.com/visual-cpp-build-tools/
    ----------------------------------------
ERROR: Command errored out with exit status 1: 'c:\users\91991\anaconda3\python.exe' -u -c 'import io, os, sys, setuptools, tokenize; sys.argv[0] = '"'"'C:\\Users\\91991\\AppData\\Local\\Temp\\pip-install-dfae_ual\\pyahocorasick_b2fd00c12b9b4b5bb311bca27e18da2f\\setup.py'"'"'; __file__='"'"'C:\\Users\\91991\\AppData\\Local\\Temp\\pip-install-dfae_ual\\pyahocorasick_b2fd00c12b9b4b5bb311bca27e18da2f\\setup.py'"'"';f = getattr(tokenize, '"'"'open'"'"', open)(__file__) if os.path.exists(__file__) else io.StringIO('"'"'from setuptools import setup; setup()'"'"');code = f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' install --record 'C:\Users\91991\AppData\Local\Temp\pip-record-e2h6mik4\install-r

In [20]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [21]:
import contractions

list(contractions.contractions_dict.items())[:10]

ModuleNotFoundError: No module named 'contractions'

In [24]:
import contractions

list(contractions.contractions_dict.items())[:10]

ModuleNotFoundError: No module named 'contractions'

In [25]:
print("Before:\t", s)
print("After:\t", contractions.fix(s))

Before:	 Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?


NameError: name 'contractions' is not defined

### Stemming

In [26]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

('jump', 'jump', 'jump')

In [27]:
ps.stem('lying')

'lie'

In [28]:
ps.stem('strange')

'strang'

### Try using the Lancaster stemmer on the same words as shown previously

In [29]:
# Lancaster Stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

('jump', 'jump', 'jump')

In [30]:
ls.stem('lying')

'lying'

In [31]:
ls.stem('strange')

'strange'

In [32]:
import nltk
# Stemmer instances shared by the helper below. Both are created through the
# public ``nltk.stem`` package so the two lines are consistent; the former
# ``nltk.porter`` path is not a documented entry point (presumably it only
# resolves as a by-product of NLTK's package-level imports).
ps = nltk.stem.PorterStemmer()
ls = nltk.stem.LancasterStemmer()

def simple_stemming(text, stemmer=ps):
    """Stem every whitespace-separated word of ``text``.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace, so punctuation stays attached
        to the neighbouring word.
    stemmer : object with a ``stem(word)`` method, optional
        Stemmer to apply to each word. Defaults to the Porter stemmer ``ps``.

    Returns
    -------
    str
        The stemmed words joined by single spaces.
    """
    return ' '.join(stemmer.stem(word) for word in text.split())

### Try calling the above-defined function with the Porter and Lancaster stemmers separately

In [33]:
s = "My system keeps crashing his crashed yesterday ours crashes daily and presumably we are not lying"
s

'My system keeps crashing his crashed yesterday ours crashes daily and presumably we are not lying'

In [34]:
simple_stemming(s, stemmer=ps)

'My system keep crash hi crash yesterday our crash daili and presum we are not lie'

In [35]:
simple_stemming(s, stemmer=ls)

'my system keep crash his crash yesterday our crash dai and presum we ar not lying'

### Lemmatization

In [36]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [37]:
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word, pos='n') method of nltk.stem.wordnet.WordNetLemmatizer instance



In [38]:
# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('boxes', 'n'))

car
box


In [39]:
# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

run
eat


In [40]:
# lemmatize verbs with the wrong (noun) POS tag - the words come back unchanged
print(wnl.lemmatize('running', 'n'))
print(wnl.lemmatize('ate', 'n'))

running
ate


In [41]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

sad
fancy


In [42]:
# ineffective lemmatization
print(wnl.lemmatize('ate', 'n'))
print(wnl.lemmatize('fancier', 'v'))
print(wnl.lemmatize('fancier'))

ate
fancier
fancier


In [43]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [44]:
tokens = nltk.word_tokenize(s)
print(tokens)

['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'dogs', '!']


In [45]:
lemmatized_text = ' '.join(wnl.lemmatize(token) for token in tokens)
lemmatized_text

'The brown fox are quick and they are jumping over the sleeping lazy dog !'

In [46]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

[('The', 'DT'), ('brown', 'JJ'), ('foxes', 'NNS'), ('are', 'VBP'), ('quick', 'JJ'), ('and', 'CC'), ('they', 'PRP'), ('are', 'VBP'), ('jumping', 'VBG'), ('over', 'IN'), ('the', 'DT'), ('sleeping', 'VBG'), ('lazy', 'JJ'), ('dogs', 'NNS'), ('!', '.')]


In [47]:
lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in tagged_tokens)
lemmatized_text

KeyError: 'DT'

In [48]:
from nltk.corpus import wordnet

wordnet.ADJ

'a'

In [49]:
def pos_tag_wordnet(tagged_tokens):
    """Convert ``(word, tag)`` pairs into ``(word, WordNet POS)`` pairs.

    The lower-cased first letter of each tag selects the WordNet category
    (``j`` -> adjective, ``v`` -> verb, ``n`` -> noun, ``r`` -> adverb). Any
    other tag falls back to noun.
    """
    first_letter_to_pos = {'j': wordnet.ADJ, 'v': wordnet.VERB, 'n': wordnet.NOUN, 'r': wordnet.ADV}
    converted = []
    for word, tag in tagged_tokens:
        initial = tag[0].lower()
        # Tags outside the four mapped categories are treated as nouns.
        converted.append((word, first_letter_to_pos.get(initial, wordnet.NOUN)))
    return converted

In [50]:
wordnet_tokens = pos_tag_wordnet(tagged_tokens)
print(wordnet_tokens)

[('The', 'n'), ('brown', 'a'), ('foxes', 'n'), ('are', 'v'), ('quick', 'a'), ('and', 'n'), ('they', 'n'), ('are', 'v'), ('jumping', 'v'), ('over', 'n'), ('the', 'n'), ('sleeping', 'v'), ('lazy', 'a'), ('dogs', 'n'), ('!', 'n')]


In [51]:
lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
lemmatized_text

'The brown fox be quick and they be jump over the sleep lazy dog !'

### Define a function that puts all the above steps together: tokenize, POS-tag, map the tags to WordNet categories, lemmatize each token and join the result

In [53]:
wnl = WordNetLemmatizer()

def wordnet_lemmatize_text(text):
    """Lemmatize ``text`` with WordNet, using POS tags to choose each lemma.

    Steps: tokenize the text, POS-tag the tokens, convert the tags with
    ``pos_tag_wordnet``, lemmatize every token with its converted tag, and
    join the lemmas with single spaces.
    """
    words = nltk.word_tokenize(text)
    word_pos_pairs = pos_tag_wordnet(nltk.pos_tag(words))
    lemmas = [wnl.lemmatize(word, pos) for word, pos in word_pos_pairs]
    return ' '.join(lemmas)

In [54]:
s

'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [55]:
wordnet_lemmatize_text(s)

'The brown fox be quick and they be jump over the sleep lazy dog !'

In [58]:

import spacy

def spacy_lemmatize_text(text):
    """Lemmatize ``text`` with the small spaCy pipeline ``nlp_sm``.

    Returns the token lemmas joined by single spaces. A token whose lemma is
    the placeholder ``'-PRON-'`` contributes its original text instead.
    """
    doc = nlp_sm(text)
    pieces = []
    for token in doc:
        if token.lemma_ == '-PRON-':
            # Keep the surface form rather than the placeholder lemma.
            pieces.append(token.text)
        else:
            pieces.append(token.lemma_)
    return ' '.join(pieces)

In [59]:
print("Before:\t", s)
print("After:\t", spacy_lemmatize_text(s))

Before:	 The brown foxes are quick and they are jumping over the sleeping lazy dogs!
After:	 the brown fox be quick and they be jump over the sleep lazy dog !


### Stopword Removal

In [60]:
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Remove stopword tokens from ``text``.

    Parameters
    ----------
    text : str
        Input text; it is tokenized with ``nltk.word_tokenize``.
    is_lower_case : bool, default False
        If true, tokens are compared with the stopwords as they are.
        Otherwise each token is lower-cased for the comparison only; tokens
        that are kept retain their original casing.
    stopwords : iterable of str, optional
        Custom stopwords. When omitted (or empty/falsy), NLTK's English
        stopword list is used.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # Falsy values (None, empty list) fall back to the default list; this
    # matches the previous behaviour and is kept for backward compatibility.
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    # Build a set once: membership tests become O(1) instead of scanning the
    # whole stopword list for every token.
    stopword_set = set(stopwords)

    tokens = [token.strip() for token in nltk.word_tokenize(text)]

    if is_lower_case:
        kept_tokens = [token for token in tokens if token not in stopword_set]
    else:
        kept_tokens = [token for token in tokens if token.lower() not in stopword_set]

    return ' '.join(kept_tokens)

In [61]:
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [63]:
print("Before:\t", s)
print("After:\t", remove_stopwords(s, is_lower_case=False))

Before:	 The brown foxes are quick and they are jumping over the sleeping lazy dogs!
After:	 brown foxes quick jumping sleeping lazy dogs !
