# Install Dependencies

In [None]:
import nltk
# Fetch the NLTK data packages used later in this notebook: tokenizer models,
# the WordNet corpus, the stop-word lists and the POS tagger model.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' — confirm against the installed version.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# Case Conversion

In [None]:
text = 'The quick brown fox jumped over The Big Dog'
text

'The quick brown fox jumped over The Big Dog'

In [None]:
text.lower()

'the quick brown fox jumped over the big dog'

In [None]:
text.upper()

'THE QUICK BROWN FOX JUMPED OVER THE BIG DOG'

In [None]:
text.title()

'The Quick Brown Fox Jumped Over The Big Dog'

# Tokenization

In [None]:
sample_text = ("US unveils world's most powerful supercomputer, beats China. "
               "The US has unveiled the world's most powerful supercomputer called 'Summit', "
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")
sample_text

"US unveils world's most powerful supercomputer, beats China. The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts."

In [None]:
import nltk

nltk.sent_tokenize(sample_text)

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [None]:
print(nltk.word_tokenize(sample_text))

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'Summit", "'", ',', 'beating', 'the', 'previous', 'record-holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


In [None]:
import spacy

# spaCy v3 removed the 'en' shortcut link, so spacy.load('en') raises OSError.
# Load the model by its package name instead. If it is not installed yet, run:
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# Parse the news sample defined earlier. sample_text is deliberately NOT
# reassigned here so the spaCy output can be compared with the NLTK output
# produced from the same text.
text_spacy = nlp(sample_text)


OSError: ignored

In [None]:
[obj.text for obj in text_spacy.sents]

In [None]:
print([obj.text for obj in text_spacy])

In [None]:
import requests

# Download the HTML page (hosted on gutenberg.org) used as raw input for the
# HTML-stripping demo below.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() fails loudly on an HTTP error instead of letting an
# error page flow into `content`.
data = requests.get('http://www.gutenberg.org/cache/epub/8001/pg8001.html', timeout=30)
data.raise_for_status()
content = data.text
print(content[2745:3948])

In [None]:
import re
from bs4 import BeautifulSoup

def strip_html_tags(text):
    """Return the text content of an HTML document with line breaks collapsed.

    Args:
        text: HTML markup as a string.

    Returns:
        The document text with iframe and script elements removed and every
        run of carriage-return / line-feed characters replaced by one newline.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Drop embedded frames and scripts so their contents do not end up in the text.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # Inside a character class '|' is a literal, not alternation: the previous
    # pattern '[\r|\n|\r\n]+' therefore also replaced pipe characters with
    # newlines. Only CR and LF belong in the class.
    return re.sub(r'[\r\n]+', '\n', stripped_text)

clean_content = strip_html_tags(content)
print(clean_content[1163:1957])

# Removing Accented Characters

In [None]:
import unicodedata

def remove_accented_chars(text):
    """Strip accents from ``text`` and drop anything that is not ASCII.

    The string is decomposed with NFKD normalization, encoded to ASCII with
    non-encodable code points ignored, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

In [None]:
# Sample with accented Latin letters; the previous literal was mojibake
# (UTF-8 bytes decoded with the wrong encoding).
s = 'Sómě Áccěntěd těxt'
s

In [None]:
remove_accented_chars(s)

# Removing Special Characters, Numbers and Symbols

In [None]:
import re

def remove_special_characters(text, remove_digits=False):
    """Delete every character that is not an ASCII letter, digit or whitespace.

    Args:
        text: Input string.
        remove_digits: When True, digits are deleted as well.

    Returns:
        ``text`` with the unwanted characters removed (nothing is substituted
        in their place).
    """
    if remove_digits:
        unwanted = r'[^a-zA-Z\s]'
    else:
        unwanted = r'[^a-zA-Z0-9\s]'
    return re.sub(unwanted, '', text)


In [None]:
# Sample containing punctuation, digits, symbols and emoji; the previous literal
# had the emoji stored as mojibake.
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

In [None]:
remove_special_characters(s, remove_digits=True)

In [None]:
remove_special_characters(s)

# Expanding Contractions

In [None]:
!pip install contractions
!pip install textsearch

In [None]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

In [None]:
import contractions

list(contractions.contractions_dict.items())[:10]

In [None]:
contractions.fix(s)

# Stemming

In [None]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

In [None]:
ps.stem('lying')

In [None]:
ps.stem('strange')

# Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [None]:
help(wnl.lemmatize)

In [None]:
# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('boxes', 'n'))

In [None]:
# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

In [None]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

In [None]:
# ineffective lemmatization
print(wnl.lemmatize('ate', 'n'))
print(wnl.lemmatize('fancier', 'v'))
print(wnl.lemmatize('fancier'))

In [None]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

### Tokenize

In [None]:
tokens = nltk.word_tokenize(s)
print(tokens)

In [None]:
lemmatized_text = ' '.join(wnl.lemmatize(token) for token in tokens)
lemmatized_text

### POS Tagging

In [None]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

Further reading on POS tagging and chunking with NLTK: https://www.guru99.com/pos-tagging-chunking-nltk.html

### Tag conversion to WordNet Tags

In [None]:
from nltk.corpus import wordnet

def pos_tag_wordnet(tagged_tokens):
    """Convert (word, tag) pairs into (word, WordNet POS) pairs.

    The lower-cased first letter of each tag selects the WordNet constant:
    'j' -> ADJ, 'v' -> VERB, 'n' -> NOUN, 'r' -> ADV. Any other initial
    falls back to NOUN. Input order is preserved.
    """
    initial_to_pos = {
        'j': wordnet.ADJ,
        'v': wordnet.VERB,
        'n': wordnet.NOUN,
        'r': wordnet.ADV,
    }
    converted = []
    for word, tag in tagged_tokens:
        wordnet_pos = initial_to_pos.get(tag[0].lower(), wordnet.NOUN)
        converted.append((word, wordnet_pos))
    return converted

In [None]:
wordnet_tokens = pos_tag_wordnet(tagged_tokens)
print(wordnet_tokens)

### Effective Lemmatization

In [None]:
lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in wordnet_tokens)
lemmatized_text

### Your turn: Define a function such that you put all the above steps together so that it does the following

- Function name is __`wordnet_lemmatize_text(...)`__
- Input is a variable __`text`__ which should take in a document (bunch of words)
- Call the earlier defined functions and utilize them
- Return lemmatized text as the output (as a string)

In [None]:
wnl = WordNetLemmatizer()

def wordnet_lemmatize_text(text):
    """Lemmatize ``text`` with the WordNet lemmatizer, using POS information.

    The text is tokenized with ``nltk.word_tokenize``, tagged with
    ``nltk.pos_tag``, the tags are mapped by ``pos_tag_wordnet``, and each word
    is lemmatized by the module-level ``wnl`` with its mapped tag.

    Returns:
        The lemmas joined by single spaces.
    """
    words = nltk.word_tokenize(text)
    pos_pairs = pos_tag_wordnet(nltk.pos_tag(words))
    lemmas = [wnl.lemmatize(word, pos) for word, pos in pos_pairs]
    return ' '.join(lemmas)

### Your Turn: Now call the function on the below sentence and test it

In [None]:
s

In [None]:
wordnet_lemmatize_text(s)

## Lemmatization with Spacy

In [None]:
import spacy
# spaCy v3 dropped both the 'en' shortcut link and the parse/tag/entity keyword
# flags. Load the model by package name and switch off the dependency parser and
# named-entity recognizer through `disable`; the tagger is left enabled because
# the lemmatizer uses part-of-speech information.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def spacy_lemmatize_text(text):
    """Lemmatize ``text`` with the module-level spaCy pipeline ``nlp``.

    Tokens whose lemma is the placeholder '-PRON-' keep their original text;
    every other token is replaced by its lemma.

    Returns:
        The resulting tokens joined by single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.lemma_ == '-PRON-':
            lemmas.append(token.text)
        else:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)

In [None]:
s

In [None]:
spacy_lemmatize_text(s)

# Stopword Removal

In [None]:
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    """Remove stop words from ``text`` and return the remaining tokens as a string.

    Args:
        text: Document to filter; it is split with ``nltk.word_tokenize``.
        is_lower_case: Pass True when ``text`` is already lower-cased, so tokens
            are compared as they are. Otherwise each token is lower-cased for
            the comparison only; the returned tokens keep their original case.
        stopwords: Iterable of words to drop. ``None`` selects NLTK's English
            stop-word list. An explicitly empty collection means "remove
            nothing" (previously it silently fell back to the default list).

    Returns:
        The surviving tokens joined by single spaces.
    """
    if stopwords is None:
        stopwords = nltk.corpus.stopwords.words('english')
    # A set gives constant-time membership tests instead of scanning the list
    # once per token.
    stopword_set = set(stopwords)

    tokens = [token.strip() for token in nltk.word_tokenize(text)]

    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_set]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_set]

    return ' '.join(filtered_tokens)

In [None]:
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10])

In [None]:
s

In [None]:
remove_stopwords(s, is_lower_case=False)

### Your turn: Remove the word 'the' from the stop_words list, add the word 'brown' to it, and call the function with this new list

In [None]:
# Customise the stop-word list: stop filtering 'the' and start filtering 'brown'.
# The membership guards make the cell safe to re-run: a bare remove() raises
# ValueError once 'the' is gone, and a bare append() would add duplicates.
if 'the' in stop_words:
    stop_words.remove('the')
if 'brown' not in stop_words:
    stop_words.append('brown')

In [None]:
remove_stopwords(s, is_lower_case=False, stopwords=stop_words)