# Imports

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_colwidth', None)

import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

# Text Wrangling Examples

In [2]:
sample = '''<div><p> This will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine 
everywhere on Tuesday and temperatures of between 22 and 27 degrees. It will warmest in the midlands.&nbsp;
 Temperatures could reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 
 degrees recorded at Kildare’s Clongowes Wood College on September 1st, 1906. </p>
<p> Tuesday, however, will be the last day of the sunshine with rain arriving across the country on Wednesday morning. </p>
<p> Temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers. </p></div>
'''

## Tokenization

In [3]:
from nltk import sent_tokenize, word_tokenize

In [4]:
print(word_tokenize(sample))

['<', 'div', '>', '<', 'p', '>', 'This', 'will', 'be', 'followed', 'by', 'more', 'of', 'the', 'same', 'with', 'the', 'mist', 'and', 'fog', 'clearing', 'to', 'give', 'a', 'day', 'of', 'unbroken', 'sunshine', 'everywhere', 'on', 'Tuesday', 'and', 'temperatures', 'of', 'between', '22', 'and', '27', 'degrees', '.', 'It', 'will', 'warmest', 'in', 'the', 'midlands.', '&', 'nbsp', ';', 'Temperatures', 'could', 'reach', 'a', 'September', 'record', 'for', 'the', 'century', 'in', 'Ireland', ',', 'but', 'are', 'unlikely', 'to', 'surpass', 'the', '29.1', 'degrees', 'recorded', 'at', 'Kildare', '’', 's', 'Clongowes', 'Wood', 'College', 'on', 'September', '1st', ',', '1906', '.', '<', '/p', '>', '<', 'p', '>', 'Tuesday', ',', 'however', ',', 'will', 'be', 'the', 'last', 'day', 'of', 'the', 'sunshine', 'with', 'rain', 'arriving', 'across', 'the', 'country', 'on', 'Wednesday', 'morning', '.', '<', '/p', '>', '<', 'p', '>', 'Temperatures', 'will', 'remain', 'as', 'high', 'as', '24', 'degrees', 'with', 

In [5]:
len(word_tokenize(sample))

139

In [6]:
sent_tokenize(sample)

['<div><p> This will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine \neverywhere on Tuesday and temperatures of between 22 and 27 degrees.',
 'It will warmest in the midlands.&nbsp;\n Temperatures could reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 \n degrees recorded at Kildare’s Clongowes Wood College on September 1st, 1906.',
 '</p>\n<p> Tuesday, however, will be the last day of the sunshine with rain arriving across the country on Wednesday morning.',
 '</p>\n<p> Temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers.',
 '</p></div>']

In [7]:
len(sent_tokenize(sample))

5

## Unique Tokens

In [8]:
print(set(word_tokenize(sample)))

{'22', 'between', 'a', 'followed', 'Wood', 'fog', 'however', 'div', 'Wednesday', 'are', '29.1', 'clearing', 'last', 's', 'everywhere', 'be', '>', 'with', 'give', 'temperatures', 'unlikely', 'heavy', '&', 'same', '’', ';', 'recorded', 'day', 'as', 'warmest', 'warmth', 'at', 'to', 'punctuated', 'high', 'sunshine', 'across', 'the', '1st', 'country', 'degrees', 'arriving', 'by', 'on', 'record', 'rain', 'This', '.', 'more', '<', '27', '/div', 'but', 'It', '24', 'Ireland', 'mist', ',', '/p', 'reach', 'September', '1906', 'Tuesday', 'p', 'in', 'surpass', 'of', 'College', 'century', 'for', 'unbroken', 'midlands.', 'nbsp', 'morning', 'Clongowes', 'remain', 'Kildare', 'Temperatures', 'will', 'and', 'showers', 'could'}


In [9]:
print(len(set(word_tokenize(sample))))

82


## Casing

In [10]:
sample_lower = sample.lower()
sample_lower

'<div><p> this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine \neverywhere on tuesday and temperatures of between 22 and 27 degrees. it will warmest in the midlands.&nbsp;\n temperatures could reach a september record for the century in ireland, but are unlikely to surpass the 29.1 \n degrees recorded at kildare’s clongowes wood college on september 1st, 1906. </p>\n<p> tuesday, however, will be the last day of the sunshine with rain arriving across the country on wednesday morning. </p>\n<p> temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers. </p></div>\n'

In [11]:
print(word_tokenize(sample_lower))

['<', 'div', '>', '<', 'p', '>', 'this', 'will', 'be', 'followed', 'by', 'more', 'of', 'the', 'same', 'with', 'the', 'mist', 'and', 'fog', 'clearing', 'to', 'give', 'a', 'day', 'of', 'unbroken', 'sunshine', 'everywhere', 'on', 'tuesday', 'and', 'temperatures', 'of', 'between', '22', 'and', '27', 'degrees', '.', 'it', 'will', 'warmest', 'in', 'the', 'midlands.', '&', 'nbsp', ';', 'temperatures', 'could', 'reach', 'a', 'september', 'record', 'for', 'the', 'century', 'in', 'ireland', ',', 'but', 'are', 'unlikely', 'to', 'surpass', 'the', '29.1', 'degrees', 'recorded', 'at', 'kildare', '’', 's', 'clongowes', 'wood', 'college', 'on', 'september', '1st', ',', '1906', '.', '<', '/p', '>', '<', 'p', '>', 'tuesday', ',', 'however', ',', 'will', 'be', 'the', 'last', 'day', 'of', 'the', 'sunshine', 'with', 'rain', 'arriving', 'across', 'the', 'country', 'on', 'wednesday', 'morning', '.', '<', '/p', '>', '<', 'p', '>', 'temperatures', 'will', 'remain', 'as', 'high', 'as', '24', 'degrees', 'with', 

In [12]:
len(word_tokenize(sample_lower))

139

In [13]:
print(set(word_tokenize(sample_lower)))

{'22', 'between', 'a', 'followed', 'fog', 'however', 'div', 'are', '29.1', 'clearing', 'last', 's', 'everywhere', 'be', 'clongowes', '>', 'with', 'give', 'temperatures', 'unlikely', 'heavy', 'wednesday', '&', 'same', '’', ';', 'recorded', 'day', 'as', 'warmest', 'warmth', 'at', 'this', 'to', 'punctuated', 'high', 'kildare', 'sunshine', 'across', 'the', 'college', '1st', 'it', 'country', 'degrees', 'arriving', 'by', 'on', 'record', 'rain', 'tuesday', '.', 'more', '<', '27', '/div', 'but', '24', 'mist', ',', '/p', 'reach', '1906', 'p', 'in', 'surpass', 'of', 'wood', 'century', 'for', 'unbroken', 'midlands.', 'nbsp', 'morning', 'remain', 'september', 'will', 'and', 'showers', 'ireland', 'could'}


In [14]:
print(len(set(word_tokenize(sample_lower))))

81


### Observation:

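`Temperatures` and `temperatures` are treated as the same token only when their casing matches; otherwise they are counted as different tokens. Lowercasing the sample merges them, which is why the unique-token count drops from 82 to 81.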

## Remove HTML Tags

### Technique 1

In [15]:
import re

# Matches either an HTML tag (`<...>`, allowed to span several lines) or a
# character reference: named (`&nbsp;`), decimal (`&#160;`) or hex (`&#xa0;`).
# IGNORECASE lets the reference part also match upper-case names and hex
# digits such as `&Eacute;` or `&#XA0;`.
_HTML_TAG_OR_ENTITY_RE = re.compile(
    r'<[^>]*>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
    re.IGNORECASE,
)


def remove_html_entities(text):
    """Remove HTML tags and HTML character references from ``text``.

    Every match is replaced by the empty string, so no whitespace is inserted
    where a tag or reference used to be.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with all tags and character references deleted.
    """
    return _HTML_TAG_OR_ENTITY_RE.sub('', text)

In [16]:
print(remove_html_entities(sample_lower))

 this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine 
everywhere on tuesday and temperatures of between 22 and 27 degrees. it will warmest in the midlands.
 temperatures could reach a september record for the century in ireland, but are unlikely to surpass the 29.1 
 degrees recorded at kildare’s clongowes wood college on september 1st, 1906. 
 tuesday, however, will be the last day of the sunshine with rain arriving across the country on wednesday morning. 
 temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers. 



In [17]:
# sample_lower is already lower-cased and remove_html_entities only deletes
# characters, so no further .lower() call is needed here.
sample_lower_html_cleaned = remove_html_entities(sample_lower)
print(len(set(word_tokenize(sample_lower_html_cleaned))))

72


### Technique 2

In [18]:
import re
from bs4 import BeautifulSoup

In [19]:
soup = BeautifulSoup(sample_lower, "html.parser")
soup

<div><p> this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine 
everywhere on tuesday and temperatures of between 22 and 27 degrees. it will warmest in the midlands. 
 temperatures could reach a september record for the century in ireland, but are unlikely to surpass the 29.1 
 degrees recorded at kildare’s clongowes wood college on september 1st, 1906. </p>
<p> tuesday, however, will be the last day of the sunshine with rain arriving across the country on wednesday morning. </p>
<p> temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers. </p></div>

In [20]:
stripped_text = soup.get_text()
stripped_text

' this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine \neverywhere on tuesday and temperatures of between 22 and 27 degrees. it will warmest in the midlands.\xa0\n temperatures could reach a september record for the century in ireland, but are unlikely to surpass the 29.1 \n degrees recorded at kildare’s clongowes wood college on september 1st, 1906. \n tuesday, however, will be the last day of the sunshine with rain arriving across the country on wednesday morning. \n temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers. \n'

In [21]:
# Delete tabs, newlines and non-breaking spaces (\xa0) left over by get_text().
# Inside a character class '|' is a literal pipe, not alternation, so it is
# left out of the class to avoid deleting real '|' characters from the text.
stripped_text = re.sub(r'[\t\n\xa0]+', '', stripped_text)
stripped_text

' this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine everywhere on tuesday and temperatures of between 22 and 27 degrees. it will warmest in the midlands. temperatures could reach a september record for the century in ireland, but are unlikely to surpass the 29.1  degrees recorded at kildare’s clongowes wood college on september 1st, 1906.  tuesday, however, will be the last day of the sunshine with rain arriving across the country on wednesday morning.  temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers. '

In [22]:
def strip_html_tags(text):
    """Return the text content of an HTML string without tabs, newlines or NBSPs.

    The markup is parsed with BeautifulSoup's ``html.parser`` and reduced to
    its text via ``get_text()``. Runs of tab, newline and non-breaking-space
    (``\\xa0``) characters are then deleted (replaced by the empty string, not
    by a space), so words separated only by such characters end up joined.

    Args:
        text: HTML markup as a string.

    Returns:
        The extracted text with those characters removed.
    """
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    # Inside [...] a '|' is a literal pipe rather than alternation; keeping it
    # in the class would also delete every '|' character from the text.
    stripped_text = re.sub(r'[\t\n\xa0]+', '', stripped_text)
    return stripped_text

In [23]:
strip_html_tags(sample_lower)

' this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine everywhere on tuesday and temperatures of between 22 and 27 degrees. it will warmest in the midlands. temperatures could reach a september record for the century in ireland, but are unlikely to surpass the 29.1  degrees recorded at kildare’s clongowes wood college on september 1st, 1906.  tuesday, however, will be the last day of the sunshine with rain arriving across the country on wednesday morning.  temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers. '

## Strip

In [24]:
cleaned_text = strip_html_tags(sample_lower)

In [25]:
cleaned_text.strip()

'this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine everywhere on tuesday and temperatures of between 22 and 27 degrees. it will warmest in the midlands. temperatures could reach a september record for the century in ireland, but are unlikely to surpass the 29.1  degrees recorded at kildare’s clongowes wood college on september 1st, 1906.  tuesday, however, will be the last day of the sunshine with rain arriving across the country on wednesday morning.  temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers.'

## Remove Accented Characters


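`unicodedata.normalize(form, unistr)` returns the normal form `form` for the Unicode string `unistr`. Valid values for `form` are ‘NFC’, ‘NFKC’, ‘NFD’, and ‘NFKD’. The ‘NFKD’ form used below decomposes an accented character into its base letter plus a combining mark, so encoding the result to ASCII with errors ignored drops the mark and keeps the base letter.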

In [26]:
sample_accented_text = 'Baile Átha Cliath'

In [27]:
import unicodedata

# Step 1: NFKD-normalize the text.
decomposed_text = unicodedata.normalize('NFKD', sample_accented_text)
# Step 2: encode to ASCII, silently dropping anything that is not ASCII.
ascii_only_bytes = decomposed_text.encode('ascii', 'ignore')
# Step 3: turn the remaining bytes back into a str.
accent_removed_text = ascii_only_bytes.decode('utf-8', 'ignore')

accent_removed_text

'Baile Atha Cliath'

## Remove Special Characters

In [28]:
# Matches every character that is NOT an ASCII letter, a digit or whitespace (\s).
# Accented letters such as 'Á' are outside a-zA-Z, so substituting this pattern
# with '' deletes them outright rather than converting them to a base letter.

pattern = r'[^a-zA-Z0-9\s]' 

In [29]:
re.sub(pattern, '', sample_lower)

'divp this will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine \neverywhere on tuesday and temperatures of between 22 and 27 degrees it will warmest in the midlandsnbsp\n temperatures could reach a september record for the century in ireland but are unlikely to surpass the 291 \n degrees recorded at kildares clongowes wood college on september 1st 1906 p\np tuesday however will be the last day of the sunshine with rain arriving across the country on wednesday morning p\np temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers pdiv\n'

In [30]:
re.sub(pattern, '', sample_accented_text)

'Baile tha Cliath'

## Expanding Contractions

In [31]:
# !pip install contractions

In [32]:
import contractions

In [33]:
contractions.fix("I'm")

'I am'

In [34]:
list(contractions.contractions_dict.items())[:5]

[("I'm", 'I am'),
 ("I'm'a", 'I am about to'),
 ("I'm'o", 'I am going to'),
 ("I've", 'I have'),
 ("I'll", 'I will')]

In [35]:
len(list(contractions.contractions_dict.items()))

329

## Stemming

In [36]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

In [37]:
# Print "<token> : <Porter stem>" for the first 20 tokens of the cleaned sample.
first_tokens = word_tokenize(sample_lower_html_cleaned)[:20]
for token in first_tokens:
    stemmed = ps.stem(token)
    print("{} : {}".format(token, stemmed))

this : thi
will : will
be : be
followed : follow
by : by
more : more
of : of
the : the
same : same
with : with
the : the
mist : mist
and : and
fog : fog
clearing : clear
to : to
give : give
a : a
day : day
of : of


## Lemmatization

In [38]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [39]:
wnl.lemmatize('this')

'this'

In [40]:
wnl.lemmatize('clearing')

'clearing'

In [41]:
wnl.lemmatize('clearing', 'v')

'clear'

In [42]:
wnl.lemmatize('clearing', 'n')

'clearing'

In [43]:
# Print "<token> : <lemma>" for the first 20 tokens of the cleaned sample,
# calling lemmatize() without a POS argument.
first_tokens = word_tokenize(sample_lower_html_cleaned)[:20]
for token in first_tokens:
    lemma = wnl.lemmatize(token)
    print("{} : {}".format(token, lemma))

this : this
will : will
be : be
followed : followed
by : by
more : more
of : of
the : the
same : same
with : with
the : the
mist : mist
and : and
fog : fog
clearing : clearing
to : to
give : give
a : a
day : day
of : of


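When no POS argument is given, `lemmatize` treats every token as a noun, which is why `followed` and `clearing` are returned unchanged above while `wnl.lemmatize('clearing', 'v')` gives `clear`. To lemmatize correctly you need a POS tagger to get the POS tag for each token, and then pass that tag to the lemmatizer.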

## Stopwords Removal

In [44]:
print(nltk.corpus.stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [45]:
print(word_tokenize(sample_lower_html_cleaned))

['this', 'will', 'be', 'followed', 'by', 'more', 'of', 'the', 'same', 'with', 'the', 'mist', 'and', 'fog', 'clearing', 'to', 'give', 'a', 'day', 'of', 'unbroken', 'sunshine', 'everywhere', 'on', 'tuesday', 'and', 'temperatures', 'of', 'between', '22', 'and', '27', 'degrees', '.', 'it', 'will', 'warmest', 'in', 'the', 'midlands', '.', 'temperatures', 'could', 'reach', 'a', 'september', 'record', 'for', 'the', 'century', 'in', 'ireland', ',', 'but', 'are', 'unlikely', 'to', 'surpass', 'the', '29.1', 'degrees', 'recorded', 'at', 'kildare', '’', 's', 'clongowes', 'wood', 'college', 'on', 'september', '1st', ',', '1906.', 'tuesday', ',', 'however', ',', 'will', 'be', 'the', 'last', 'day', 'of', 'the', 'sunshine', 'with', 'rain', 'arriving', 'across', 'the', 'country', 'on', 'wednesday', 'morning', '.', 'temperatures', 'will', 'remain', 'as', 'high', 'as', '24', 'degrees', 'with', 'the', 'warmth', 'punctuated', 'by', 'heavy', 'showers', '.']


In [46]:
# Build the stopword lookup once. Calling stopwords.words('english') in the
# comprehension's condition would re-evaluate it for every token, and a set
# gives constant-time membership tests instead of scanning a list.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
print([
    token for token in word_tokenize(sample_lower_html_cleaned)
    if token not in english_stopwords
])

['followed', 'mist', 'fog', 'clearing', 'give', 'day', 'unbroken', 'sunshine', 'everywhere', 'tuesday', 'temperatures', '22', '27', 'degrees', '.', 'warmest', 'midlands', '.', 'temperatures', 'could', 'reach', 'september', 'record', 'century', 'ireland', ',', 'unlikely', 'surpass', '29.1', 'degrees', 'recorded', 'kildare', '’', 'clongowes', 'wood', 'college', 'september', '1st', ',', '1906.', 'tuesday', ',', 'however', ',', 'last', 'day', 'sunshine', 'rain', 'arriving', 'across', 'country', 'wednesday', 'morning', '.', 'temperatures', 'remain', 'high', '24', 'degrees', 'warmth', 'punctuated', 'heavy', 'showers', '.']


In [47]:
# Build the stopword lookup once rather than calling stopwords.words('english')
# for every token; a set also makes each membership test constant-time.
english_stopwords = set(nltk.corpus.stopwords.words('english'))
print(
    len([
        token for token in word_tokenize(sample_lower_html_cleaned)
        if token not in english_stopwords
    ]))

64


## POS Tagging using spaCy

In [48]:
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_lower_html_cleaned)

In [49]:
# Print the text, pos_ and tag_ attributes of every token in the parsed doc.
pos_row_template = "token: {}   pos: {}   tag: {}"
for tok in doc:
    print(pos_row_template.format(tok.text, tok.pos_, tok.tag_))

token:     pos: SPACE   tag: _SP
token: this   pos: DET   tag: DT
token: will   pos: AUX   tag: MD
token: be   pos: AUX   tag: VB
token: followed   pos: VERB   tag: VBN
token: by   pos: ADP   tag: IN
token: more   pos: ADJ   tag: JJR
token: of   pos: ADP   tag: IN
token: the   pos: DET   tag: DT
token: same   pos: ADJ   tag: JJ
token: with   pos: ADP   tag: IN
token: the   pos: DET   tag: DT
token: mist   pos: NOUN   tag: NN
token: and   pos: CCONJ   tag: CC
token: fog   pos: NOUN   tag: NN
token: clearing   pos: NOUN   tag: NN
token: to   pos: PART   tag: TO
token: give   pos: VERB   tag: VB
token: a   pos: DET   tag: DT
token: day   pos: NOUN   tag: NN
token: of   pos: ADP   tag: IN
token: unbroken   pos: ADJ   tag: JJ
token: sunshine   pos: NOUN   tag: NN
token: 
   pos: SPACE   tag: _SP
token: everywhere   pos: ADV   tag: RB
token: on   pos: ADP   tag: IN
token: tuesday   pos: PROPN   tag: NNP
token: and   pos: CCONJ   tag: CC
token: temperatures   pos: NOUN   tag: NNS
token: of   

## Named Entity Recognition using spaCy

In [50]:
# Print the text and label_ of every entity span in doc.ents.
entity_row_template = "token: {}   entity: {}"
for entity in doc.ents:
    print(entity_row_template.format(entity.text, entity.label_))

token: tuesday   entity: DATE
token: between 22   entity: CARDINAL
token: 27 degrees   entity: QUANTITY
token: september   entity: DATE
token: the century   entity: DATE
token: ireland   entity: GPE
token: 29.1 
    entity: QUANTITY
token: september 1st, 1906   entity: DATE
token: tuesday   entity: DATE
token: the last day   entity: DATE
token: wednesday   entity: DATE
token: morning   entity: TIME
token: as high as   entity: CARDINAL
token: 24 degrees   entity: QUANTITY
