# Case Conversion

In [2]:
text = 'The quick brown fox jumped over The Big Dog'
text

'The quick brown fox jumped over The Big Dog'

In [3]:
text.lower()

'the quick brown fox jumped over the big dog'

In [4]:
text.upper()

'THE QUICK BROWN FOX JUMPED OVER THE BIG DOG'

In [5]:
text.title()

'The Quick Brown Fox Jumped Over The Big Dog'

In [6]:
text.capitalize()

'The quick brown fox jumped over the big dog'

# Tokenization

In [1]:
sample_text = ("US unveils world's most powerful supercomputer, beats China. " 
               "The US has unveiled the world's most powerful supercomputer called 'Summit', " 
               "beating the previous record-holder China's Sunway TaihuLight. With a peak performance "
               "of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, "
               "which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, "
               "which reportedly take up the size of two tennis courts.")
sample_text

"US unveils world's most powerful supercomputer, beats China. The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight. With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second. Summit has 4,608 servers, which reportedly take up the size of two tennis courts."

In [2]:
import nltk

nltk.sent_tokenize(sample_text)

["US unveils world's most powerful supercomputer, beats China.",
 "The US has unveiled the world's most powerful supercomputer called 'Summit', beating the previous record-holder China's Sunway TaihuLight.",
 'With a peak performance of 200,000 trillion calculations per second, it is over twice as fast as Sunway TaihuLight, which is capable of 93,000 trillion calculations per second.',
 'Summit has 4,608 servers, which reportedly take up the size of two tennis courts.']

In [3]:
print(nltk.word_tokenize(sample_text))

['US', 'unveils', 'world', "'s", 'most', 'powerful', 'supercomputer', ',', 'beats', 'China', '.', 'The', 'US', 'has', 'unveiled', 'the', 'world', "'s", 'most', 'powerful', 'supercomputer', 'called', "'Summit", "'", ',', 'beating', 'the', 'previous', 'record-holder', 'China', "'s", 'Sunway', 'TaihuLight', '.', 'With', 'a', 'peak', 'performance', 'of', '200,000', 'trillion', 'calculations', 'per', 'second', ',', 'it', 'is', 'over', 'twice', 'as', 'fast', 'as', 'Sunway', 'TaihuLight', ',', 'which', 'is', 'capable', 'of', '93,000', 'trillion', 'calculations', 'per', 'second', '.', 'Summit', 'has', '4,608', 'servers', ',', 'which', 'reportedly', 'take', 'up', 'the', 'size', 'of', 'two', 'tennis', 'courts', '.']


# Removing Accented Characters

In [None]:
import unicodedata

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

In [None]:
s = 'Sómě Áccěntěd těxt'
s

In [None]:
remove_accented_chars(s)

# Removing Special Characters, Numbers and Symbols

In [None]:
import re

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', text)
    return text


In [None]:
s = "Well this was fun! See you at 7:30, What do you think!!? #$@@9318@ 🙂🙂🙂"
s

### Your Turn: Try both combinations first remove all special characters and then keep the digits

In [None]:
remove_special_characters(____________)

In [None]:
remove_special_characters(____________)

# Expanding Contractions

In [16]:
!pip install contractions



In [7]:
s = "Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"
s

"Y'all can't expand contractions I'd think! You wouldn't be able to. How'd you do it?"

In [8]:
import contractions

list(contractions.contractions_dict.items())[:10]

[("I'm", 'I am'),
 ("I'm'a", 'I am about to'),
 ("I'm'o", 'I am going to'),
 ("I've", 'I have'),
 ("I'll", 'I will'),
 ("I'll've", 'I will have'),
 ("I'd", 'I would'),
 ("I'd've", 'I would have'),
 ('Whatcha', 'What are you'),
 ("amn't", 'am not')]

In [13]:
list(contractions.contractions_dict.items())[-10:]

[('y’all’re', 'you all are'),
 ('y’all’ve', 'you all have'),
 ('y’all’d', 'you all would'),
 ('y’all’d’ve', 'you all would have'),
 ('you’re', 'you are'),
 ('you’ve', 'you have'),
 ('you’ll’ve', 'you shall have'),
 ('you’ll', 'you will'),
 ('you’d', 'you would'),
 ('you’d’ve', 'you would have')]

In [9]:
contractions.fix(s)

'You all cannot expand contractions I would think! You would not be able to. How did you do it?'

In [None]:
from contractions_list import CONTRACTION_MAP
import re

list(CONTRACTION_MAP.items())[:10]

In [None]:
contractions_pattern = re.compile('({})'.format('|'.join(CONTRACTION_MAP.keys())), 
                                  flags=re.IGNORECASE|re.DOTALL)
contractions_pattern

In [None]:
contractions_pattern.sub(lambda r: print(r.group(0), '->', r.group(0).lower(), 
                                         '->', CONTRACTION_MAP.get(r.group(0).lower())), s)

In [None]:
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    
    contractions_pattern = re.compile('({})'.format('|'.join(contraction_mapping.keys())), 
                                      flags=re.IGNORECASE|re.DOTALL)
    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        expanded_contraction = contraction_mapping.get(match)\
                                if contraction_mapping.get(match)\
                                else contraction_mapping.get(match.lower())                       
        expanded_contraction = first_char+expanded_contraction[1:]
        return expanded_contraction
        
    expanded_text = contractions_pattern.sub(expand_match, text)
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text

In [None]:
expand_contractions(s, contraction_mapping=CONTRACTION_MAP)

# Stemming

In [14]:
# Porter Stemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

ps.stem('jumping'), ps.stem('jumps'), ps.stem('jumped')

('jump', 'jump', 'jump')

In [15]:
ps.stem('lying')

'lie'

In [16]:
ps.stem('strange')

'strang'

In [17]:
ps.stem('flies')

'fli'

### Your Turn: Try using Lancaster stemmer on the same two words as depicted previously

In [18]:
# Lancaster Stemmer
from nltk.stem import LancasterStemmer
ls = LancasterStemmer()

ls.stem('jumping'), ls.stem('jumps'), ls.stem('jumped')

('jump', 'jump', 'jump')

In [19]:
ls.stem('lying')

'lying'

In [20]:
ls.stem('strange')

'strange'

In [21]:
import nltk
ps = nltk.porter.PorterStemmer()
ls = nltk.stem.LancasterStemmer()

def simple_stemming(text, stemmer=ps):
    text = ' '.join([stemmer.stem(word) for word in text.split()])
    return text

### Your Turn: Try calling the above defined function for both Lancaster and Porter stemmer separately

Do you notice any difference in the results?

In [27]:
s = "My system keeps crashing his crashed yesterday ours crashes daily and presumably we are not lying"
s

'My system keeps crashing his crashed yesterday ours crashes daily and presumably we are not lying'

In [29]:
simple_stemming(s)

'my system keep crash hi crash yesterday our crash daili and presum we are not lie'

In [30]:
simple_stemming(s)

'my system keep crash hi crash yesterday our crash daili and presum we are not lie'

# Lemmatization

In [22]:
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

In [32]:
help(wnl.lemmatize)

Help on method lemmatize in module nltk.stem.wordnet:

lemmatize(word:str, pos:str='n') -> str method of nltk.stem.wordnet.WordNetLemmatizer instance
    Lemmatize `word` using WordNet's built-in morphy function.
    Returns the input word unchanged if it cannot be found in WordNet.
    
    :param word: The input word to lemmatize.
    :type word: str
    :param pos: The Part Of Speech tag. Valid options are `"n"` for nouns,
        `"v"` for verbs, `"a"` for adjectives, `"r"` for adverbs and `"s"`
        for satellite adjectives.
    :param pos: str
    :return: The lemma of `word`, for the given `pos`.



In [39]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\jainneha\AppData\Roaming\nltk_data...


True

In [23]:
# lemmatize nouns
print(wnl.lemmatize('cars', 'n'))
print(wnl.lemmatize('boxes', 'n'))

car
box


In [24]:
wnl.lemmatize('flies')

'fly'

In [25]:
# lemmatize verbs
print(wnl.lemmatize('running', 'v'))
print(wnl.lemmatize('ate', 'v'))

run
eat


In [44]:
ps.stem('ate')

'ate'

In [26]:
# lemmatize adjectives
print(wnl.lemmatize('saddest', 'a'))
print(wnl.lemmatize('fancier', 'a'))

sad
fancy


In [27]:
ps.stem('saddest'), ps.stem('fancier')

('saddest', 'fancier')

In [29]:
wnl.lemmatize("flies")

'fly'

In [22]:
# ineffective lemmatization
print(wnl.lemmatize('ate', 'n'))
print(wnl.lemmatize('fancier', 'v'))
print(wnl.lemmatize('fancier'))

ate
fancier
fancier


In [24]:
s = 'The brown foxes are quick and they are jumping over the sleeping lazy dogs!'

In [25]:
tokens = nltk.word_tokenize(s)
print(tokens)

['The', 'brown', 'foxes', 'are', 'quick', 'and', 'they', 'are', 'jumping', 'over', 'the', 'sleeping', 'lazy', 'dogs', '!']


In [26]:
lemmatized_text = ' '.join(wnl.lemmatize(token) for token in tokens)
lemmatized_text

'The brown fox are quick and they are jumping over the sleeping lazy dog !'

In [None]:
tagged_tokens = nltk.pos_tag(tokens)
print(tagged_tokens)

In [None]:
lemmatized_text = ' '.join(wnl.lemmatize(word, tag) for word, tag in tagged_tokens)
lemmatized_text

### Your turn: Define a function such that you put all the above steps together so that it does the following

- Function name is __`wordnet_lemmatize_text(...)`__
- Input is a variable __`text`__ which should take in a document (bunch of words)
- Call the earlier defined functions and utilize them
- Return lemmatized text as the output (as a string)

In [None]:
wnl = WordNetLemmatizer()

def wordnet_lemmatize_text(text):
    ______
    return lemmatized_text

### Your Turn: Now call the function on the below sentence and test it

In [None]:
s

In [None]:
wordnet_lemmatize_text(s)

In [None]:
import spacy
nlp = spacy.load('en', parse=False, tag=False, entity=False)

def spacy_lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

In [None]:
s

In [None]:
spacy_lemmatize_text(s)

# Stopword Removal

In [None]:
def remove_stopwords(text, is_lower_case=False, stopwords=None):
    if not stopwords:
        stopwords = nltk.corpus.stopwords.words('english')
    tokens = nltk.word_tokenize(text)
    tokens = [token.strip() for token in tokens]
    
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopwords]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopwords]
    
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

In [None]:
stop_words = nltk.corpus.stopwords.words('english')
print(stop_words[:10])

In [None]:
s

In [None]:
remove_stopwords(s, is_lower_case=False)

### Your turn: Remove the words 'the' and 'brown' from the stop_words list and call the function with this new list

In [None]:
_______________
_______________

In [None]:
remove_stopwords(______________)