# Imports

In [None]:
import pandas as pd
import numpy as np

# Show full cell contents instead of truncating long strings when pandas objects are displayed
pd.set_option('display.max_colwidth', None)

import nltk
# NLTK data needed by this notebook: tokenizer models, WordNet (lemmatizer),
# the stopword list and the perceptron POS tagger.
# Recent NLTK releases load the tokenizer/tagger from the 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' packages rather than the older pickled
# ones, so both generations are requested to keep a fresh-kernel run working
# regardless of the installed NLTK version.
for resource in (
    'punkt',
    'punkt_tab',
    'wordnet',
    'stopwords',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
):
    nltk.download(resource)

# Text Wrangling Examples

In [None]:
# Raw HTML snippet used as the running example in this notebook:
# three <p> paragraphs inside a <div>, including one &nbsp; entity.
sample = '''<div><p> This will be followed by more of the same with the mist and fog clearing to give a day of unbroken sunshine 
everywhere on Tuesday and temperatures of between 22 and 27 degrees. It will warmest in the midlands.&nbsp;
 Temperatures could reach a September record for the century in Ireland, but are unlikely to surpass the 29.1 
 degrees recorded at Kildare’s Clongowes Wood College on September 1st, 1906. </p>
<p> Tuesday, however, will be the last day of the sunshine with rain arriving across the country on Wednesday morning. </p>
<p> Temperatures will remain as high as 24 degrees with the warmth punctuated by heavy showers. </p></div>
'''

In [None]:
# Character count of the raw sample, HTML markup included
len(sample)

## Tokenization

In [None]:
from nltk import sent_tokenize, word_tokenize

In [None]:
# Word-level tokens of the raw sample; the HTML markup has not been removed yet
print(word_tokenize(sample))

In [None]:
# Number of word-level tokens in the raw sample
len(word_tokenize(sample))

In [None]:
# Sentence-level segmentation of the raw sample
sent_tokenize(sample)

In [None]:
# Number of sentences found in the raw sample
len(sent_tokenize(sample))

In [None]:
# Tokenize sentence by sentence, printing a divider line before each token list
for sentence in sent_tokenize(sample):
    print('-------', word_tokenize(sentence), sep='\n')

## Unique Tokens

In [None]:
# Distinct tokens in the raw sample (a set is unordered, so the printed order is arbitrary)
print(set(word_tokenize(sample)))

In [None]:
# Number of distinct tokens before any normalisation
print(len(set(word_tokenize(sample))))

## Casing

In [None]:
# Lowercase the whole sample so that differently cased forms of a word become the same token
sample_lower = sample.lower()
sample_lower

In [None]:
# Word-level tokens of the lowercased sample (markup still present)
print(word_tokenize(sample_lower))

In [None]:
# Total token count after lowercasing
len(word_tokenize(sample_lower))

In [None]:
# Distinct tokens after lowercasing
print(set(word_tokenize(sample_lower)))

In [None]:
# Number of distinct tokens after lowercasing; compare with the count for the raw sample
print(len(set(word_tokenize(sample_lower))))

### Observation:

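`Temperature` and `temperature` are treated as the same token only when their casing matches; otherwise they are counted as different tokens. Lowercasing the text before tokenizing therefore makes such variants collapse into a single token.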

## Remove HTML Tags

### Technique 1

In [None]:
import re

def remove_html_entities(text):
    '''Strip HTML tags and character entities from ``text``.

    Removes, without inserting any replacement text:
      * tags such as ``<p>`` or ``</div>``, including tags whose attributes
        wrap over several lines;
      * named entities (``&nbsp;``, ``&Eacute;``), decimal references
        (``&#160;``) and hexadecimal references (``&#xA0;``).

    Matching is case-insensitive so entities work on text that has not been
    lowercased first.

    Args:
        text: String that may contain HTML markup.

    Returns:
        ``text`` with every tag and entity deleted.
    '''
    # '<[^>]*>' runs from '<' to the next '>' and, unlike '<.*?>', is not
    # stopped by a newline inside the tag.
    html_tags_and_entities = (
        r'<[^>]*>|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return re.sub(html_tags_and_entities, '', text, flags=re.IGNORECASE)

In [None]:
# Lowercased sample after the regex-based cleanup
print(remove_html_entities(sample_lower))

In [None]:
# Cleaned, lowercased text reused by the stemming, lemmatization, stopword and spaCy cells.
# The input is already lowercase, so the trailing .lower() does not change anything here.
sample_lower_html_cleaned = remove_html_entities(sample_lower).lower()
# Number of distinct tokens once the markup is gone
print(len(set(word_tokenize(sample_lower_html_cleaned))))

### Technique 2

In [None]:
import re
from bs4 import BeautifulSoup

In [None]:
# Parse the lowercased sample as HTML using the "html.parser" backend
soup = BeautifulSoup(sample_lower, "html.parser")
soup

In [None]:
# Keep only the text content of the parsed document (markup dropped);
# tabs, newlines and \xa0 characters are dealt with in the next cell
stripped_text = soup.get_text()
stripped_text

In [None]:
# Replace each run of tabs, line breaks and non-breaking spaces with one space.
# Inside [...] a '|' is a literal pipe rather than alternation, so it is left
# out of the class (otherwise pipe characters in the text would be deleted).
# Substituting a space instead of '' keeps words that sit on adjacent lines
# from being glued together.
stripped_text = re.sub(r'[\t\r\n\xa0]+', ' ', stripped_text)
stripped_text

In [None]:
def strip_html_tags(text):
    '''Return the text content of an HTML string.

    The markup is parsed with BeautifulSoup's "html.parser" backend and only
    the text is kept. Each run of tabs, line breaks and non-breaking spaces
    (``\\xa0``) is then replaced by a single space.

    Args:
        text: String containing HTML markup.

    Returns:
        The extracted text with those whitespace runs collapsed to one space.
        Leading/trailing spaces are not removed; call ``.strip()`` if needed.
    '''
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    # No '|' in the class: inside [...] it is a literal pipe, not alternation.
    # A space (not '') is substituted so words on adjacent lines stay separate.
    return re.sub(r'[\t\r\n\xa0]+', ' ', stripped_text)

In [None]:
# Apply the BeautifulSoup-based helper to the lowercased sample
strip_html_tags(sample_lower)

## Strip

In [None]:
# Tag-free version of the lowercased sample, used to demonstrate strip()
cleaned_text = strip_html_tags(sample_lower)

In [None]:
# strip() removes leading and trailing whitespace; the result is only displayed,
# cleaned_text itself is not reassigned
cleaned_text.strip()

## Remove Accented Characters


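`unicodedata.normalize(form, unistr)` returns the normal form `form` of the Unicode string `unistr`. Valid values for `form` are 'NFC', 'NFKC', 'NFD', and 'NFKD'. The 'NFKD' form decomposes an accented character into its base letter followed by combining marks, so encoding the result to ASCII while ignoring errors keeps the base letter and drops the accent.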

In [None]:
# Example string containing an accented character ('Á')
sample_accented_text = 'Baile Átha Cliath'

In [None]:
import unicodedata

# Three steps: decompose accented characters (NFKD), keep only the bytes that
# are valid ASCII, then turn those bytes back into a str.
decomposed_text = unicodedata.normalize('NFKD', sample_accented_text)
ascii_only_bytes = decomposed_text.encode('ascii', 'ignore')
accent_removed_text = ascii_only_bytes.decode('utf-8', 'ignore')

accent_removed_text

## Remove Special Characters

In [None]:
# Matches any single character that is NOT an ASCII letter, a digit or
# whitespace (\s), i.e. everything the cells below should delete.

pattern = r'[^a-zA-Z0-9\s]' 

In [None]:
# Applied to text that still contains markup: only the punctuation ('<', '>', '/', '&', ';', ...)
# is deleted, so tag and entity names such as "div", "p" and "nbsp" remain as plain words.
re.sub(pattern, '', sample_lower)

In [None]:
# Characters outside a-z, A-Z, 0-9 and whitespace are deleted rather than transliterated,
# so remove accents first (see the previous section) if the base letter should be kept.
re.sub(pattern, '', sample_accented_text)

## Expanding Contractions

In [None]:
# Uncomment and run once if the `contractions` package is not installed:
# %pip install contractions

In [None]:
import contractions

In [None]:
# Expand a single contraction
contractions.fix("I'm")

In [None]:
# First five entries of the package's contractions_dict lookup table
list(contractions.contractions_dict.items())[:5]

In [None]:
# Number of entries in the lookup table
len(contractions.contractions_dict)

## Stemming

In [None]:
from nltk.stem import PorterStemmer

# Single PorterStemmer instance reused by the stemming cells below
ps = PorterStemmer()

In [None]:
# Stem of a single word
ps.stem("important")

In [None]:
# Another single-word example
ps.stem("bravery")

In [None]:
# Show "token : stem" for the first 20 tokens of the cleaned sample
first_tokens = word_tokenize(sample_lower_html_cleaned)[:20]
for token in first_tokens:
    print(f"{token} : {ps.stem(token)}")

## Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
# Single WordNetLemmatizer instance reused by the lemmatization cells below
wnl = WordNetLemmatizer()

In [None]:
# No POS argument: the lemmatizer falls back to its default part of speech
wnl.lemmatize('this')

In [None]:
# Same word lemmatized with the default part of speech; compare with the explicit 'v' and 'n' calls below
wnl.lemmatize('clearing')

In [None]:
# Lemmatize treating the word as a verb ('v')
wnl.lemmatize('clearing', 'v')

In [None]:
# Lemmatize treating the word as a noun ('n')
wnl.lemmatize('clearing', 'n')

In [None]:
# Show "token : lemma" for the first 20 tokens of the cleaned sample
# (no POS is passed, so every token is lemmatized with the default POS)
first_tokens = word_tokenize(sample_lower_html_cleaned)[:20]
for token in first_tokens:
    print(f"{token} : {wnl.lemmatize(token)}")

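When no part of speech is passed, `WordNetLemmatizer.lemmatize` treats every word as a noun, which is why `clearing` is left unchanged above unless `'v'` is given. For accurate lemmas you therefore need a POS tagger: tag each token first, then pass the corresponding POS to the lemmatizer.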

## Stopwords Removal

In [None]:
# The English stopword list from NLTK's stopwords corpus
print(nltk.corpus.stopwords.words('english'))

In [None]:
# Tokens of the cleaned, lowercased sample before stopword filtering
print(word_tokenize(sample_lower_html_cleaned))

In [None]:
# Keep every token of the cleaned sample that is not an English stopword.
# The stopword list is fetched once and stored in a set: calling
# stopwords.words('english') inside the comprehension rebuilds the list for
# every token and then scans it linearly.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

print([
    token for token in word_tokenize(sample_lower_html_cleaned)
    if token not in english_stopwords
])

In [None]:
# Number of tokens left after stopword removal.
# The stopword list is loaded once into a set instead of being rebuilt and
# scanned for every token.
english_stopwords = set(nltk.corpus.stopwords.words('english'))

print(
    sum(token not in english_stopwords
        for token in word_tokenize(sample_lower_html_cleaned)))

## POS Tagging using spaCy

In [None]:
!pip install spacy

In [None]:
# Download the small English pipeline. Use ONE of the two options below.

# Option 1: from Python
from spacy.cli.download import download
download('en_core_web_sm')

# Option 2: as a shell command. In a code cell the line needs the leading
# '!'; a bare `python -m ...` line is a Python syntax error. It is left
# commented out so the model is not downloaded twice.
# !python -m spacy download en_core_web_sm

In [None]:
import spacy
# Load the small English pipeline downloaded in the previous cell
nlp = spacy.load("en_core_web_sm")

In [None]:
# Run the spaCy pipeline over the cleaned sample.
# NOTE(review): this input is lowercased, which may affect tagging and entity
# recognition of proper nouns — confirm that is intended.
doc = nlp(sample_lower_html_cleaned)

In [None]:
# Display the processed document
doc

In [None]:
# One line per token: its text, coarse-grained POS (pos_) and fine-grained tag (tag_)
for tok in doc:
    print(f"token: {tok.text}   pos: {tok.pos_}   tag: {tok.tag_}")

## Named Entity Recognition using spaCy

In [None]:
# One line per named entity found in the document: its text and its label
for entity in doc.ents:
    print(f"token: {entity.text}   entity: {entity.label_}")