# Part of speech tagging

In [54]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [55]:
sentence = "the boy ate pancakes at the restaurant"

In [56]:
token = nltk.word_tokenize(sentence)
token

['the', 'boy', 'ate', 'pancakes', 'at', 'the', 'restaurant']

In [57]:
tags = nltk.pos_tag(token)
tags

[('the', 'DT'),
 ('boy', 'NN'),
 ('ate', 'NN'),
 ('pancakes', 'NNS'),
 ('at', 'IN'),
 ('the', 'DT'),
 ('restaurant', 'NN')]

# Expand Clitics/Contractions (eg: She's to She is)

In [58]:
!pip install contractions
import contractions

statement = "She's going for a walk. Jack didn't go because he was tired. They'll surely go for a walk together tomorrow."

expanded_statement = []

for word in statement.split():
  expanded_statement.append(contractions.fix(word))

fixed_statement = " ".join(expanded_statement)

print(fixed_statement)

she is going for a walk. Jack did not go because he was tired. they will surely go for a walk together tomorrow.


# Tokenization

In [59]:
from nltk.tokenize import word_tokenize, sent_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [60]:
sentence = "He's a German Shepherd. They are some of the smartest dogs!"
t = word_tokenize(sentence)
t

['He',
 "'s",
 'a',
 'German',
 'Shepherd',
 '.',
 'They',
 'are',
 'some',
 'of',
 'the',
 'smartest',
 'dogs',
 '!']

In [61]:
multiple = "Hi! My name is Sid. I live in Mumbai, India."
x = sent_tokenize(multiple)
x

['Hi!', 'My name is Sid.', 'I live in Mumbai, India.']

# Use contrations library to expand clitics

In [62]:
import contractions

a = contractions.fix("you're happy now")
b = contractions.fix("yall're happy now.", slang = False)
c = contractions.fix("yall're happy now.")

a, b, c

('you are happy now', "yall're happy now.", 'you all are happy now.')

# REGEX

In [63]:
sentence = "He's fast. The buy can run up the hill in a jiffy!"

result = sentence.split()

print(result)

["He's", 'fast.', 'The', 'buy', 'can', 'run', 'up', 'the', 'hill', 'in', 'a', 'jiffy!']


In [64]:
import re

text = "uno-dos+tres#quatro cinco"

x = re.split("\W+", text)

x

['uno', 'dos', 'tres', 'quatro', 'cinco']

In [65]:
# alternate approach

y = re.split("[-+#]", text)

y

['uno', 'dos', 'tres', 'quatro cinco']

In [66]:
# re.sub()

string = "Hi, I am currently in New York."

new_string = re.sub("New York", "Mumbai", string)

new_string

'Hi, I am currently in Mumbai.'

In [67]:
words = ["Sid", "Mumbai", "Bangalore"]

statement = "Hi! My name is Sid. I live in Mumbai, India."

for word in words:
  print("Looking for '{}' in '{}': \n".format(word, statement))

  if re.search(word, statement):
    print("--search succesful--\n")
  else:
    print("--word not found--\n")


Looking for 'Sid' in 'Hi! My name is Sid. I live in Mumbai, India.': 

--search succesful--

Looking for 'Mumbai' in 'Hi! My name is Sid. I live in Mumbai, India.': 

--search succesful--

Looking for 'Bangalore' in 'Hi! My name is Sid. I live in Mumbai, India.': 

--word not found--



In [68]:
# re.findall()


contact = "Sid Krishnan; sid@sidkrishnan.com; Mumbai, India;"

email = re.findall(r"[\w\.\-]+@[.\w\-]+", contact)

email

['sid@sidkrishnan.com']

In [69]:
# Context-Free Grammar Rules

# parse tree

import nltk
nltk.download('punkt') # punkt is a pre-trained tokenizer
nltk.download('averaged_preceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk import pos_tag, word_tokenize, RegexpParser

sentence = "I want a morning flight."

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Error loading averaged_preceptron_tagger: Package
[nltk_data]     'averaged_preceptron_tagger' not found in index


In [70]:
tags = pos_tag(word_tokenize(sentence))

tags

[('I', 'PRP'),
 ('want', 'VBP'),
 ('a', 'DT'),
 ('morning', 'NN'),
 ('flight', 'NN'),
 ('.', '.')]

In [71]:
# chunk extraction

chunker = RegexpParser("""
                       NP : {<DT>?<JJ>*<NN>}        # to extract noun phrase
                       P : {<IN>}                   # to extract prepositions
                       V : {<V.*>}                  # to extract verbs
                       PP : {<P> <NP>}              # to extract preposition phrase
                       VP : {<V> <NP|PP>*}          # to extract verb phrase
                       """)

In [72]:
result = chunker.parse(tags)

print(result)

(S I/PRP (VP (V want/VBP) (NP a/DT morning/NN) (NP flight/NN)) ./.)


In [73]:
statement = "The boy ate the pancakes from the restaurant"

tags = pos_tag(word_tokenize(statement))
result = chunker.parse(tags)
print(result)

(S
  (NP The/DT boy/NN)
  (VP (V ate/VB))
  the/DT
  pancakes/NNS
  (PP (P from/IN) (NP the/DT restaurant/NN)))


In [74]:
# result.draw() --- this doesnt work on colab but does work on jupyter

# STEMMING

In [75]:
# 3 main methods:

 # -- Porter 
 # -- Lancaster
 # -- Snowball


In [76]:
from nltk.stem import PorterStemmer

porter = PorterStemmer()

print(porter.stem("running"))
print(porter.stem("bundle"))
print(porter.stem("illustrator"))
print(porter.stem("slept"))
print(porter.stem("restaurant"))
print(porter.stem("organization"))

run
bundl
illustr
slept
restaur
organ


In [77]:
from nltk.stem import LancasterStemmer

lanc = LancasterStemmer()

print(lanc.stem("running"))
print(lanc.stem("bundle"))
print(lanc.stem("illustrator"))
print(lanc.stem("slept"))
print(lanc.stem("restaurant"))
print(lanc.stem("organization"))

run
bundl
illust
slept
resta
org


In [78]:
from nltk.stem import SnowballStemmer

snow = SnowballStemmer("english")

print(snow.stem("running"))
print(snow.stem("bundle"))
print(snow.stem("illustrator"))
print(snow.stem("slept"))
print(snow.stem("restaurant"))
print(snow.stem("organization"))

run
bundl
illustr
slept
restaur
organ


# LEMMATIZATION

In [79]:
import nltk 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [80]:
lem = WordNetLemmatizer()
token = "spies"
token2 = "festivities"

lemma = lem.lemmatize(token)
lemma2 = lem.lemmatize(token2)

print(token, " --> ", lemma)
print(token2, " --> ", lemma2)

spies  -->  spy
festivities  -->  festivity


In [81]:
string1 = "Mumbai is India's commercial capital"
string2 = "My favourite part of Mumbai is Marine Drive"

tokens = nltk.word_tokenize(string1)
print("Tokenized sentence : ", tokens)

# lemmatize the tokenized sentence:
lemmatized_tokens = " ".join(lem.lemmatize(x) for x in tokens)
print("Lemmatized tokens : ", lemmatized_tokens)

Tokenized sentence :  ['Mumbai', 'is', 'India', "'s", 'commercial', 'capital']
Lemmatized tokens :  Mumbai is India 's commercial capital


# PRACTICAL EXAMPLES

# Clean Tweets

In [82]:
def hashtags(tweet):
  hash = re.findall(r"#(\w+)", tweet)
  return hash

def remove_username(tweet):
  text = re.sub(r"@[A-Za-z]+[A-Za-z0-9-_]+", "", tweet)
  return text

def remove_links(tweet):
  text = re.sub(r"http\S+", "", tweet)
  text = text.strip("[link]")
  return text

def remove_non_ascii(x):    # only use if you do not plan to translate tweets to other languages
  return "".join(i for i in x if ord(x) < 128)

def lowercase_tweet(tweet):
  return tweet.lower()

In [83]:
sample_tweet = "this is #fun! let's #doThis more often @sid! https.google.com"

print(remove_username(sample_tweet))
print(hashtags(sample_tweet))
print(remove_links(sample_tweet))

this is #fun! let's #doThis more often ! https.google.com
['fun', 'doThis']
this is #fun! let's #doThis more often @sid! 


# Stopwords removal

In [91]:
from nltk.corpus import stopwords

def remove_stopwords(text):
  stops = set(stopwords.words('english'))
  # you can add custom stopwords as well using stops.update()
  stops.update(("brb", "yolo", "hodl", "it", "this", "mailto"))
  new_text = " ".join(word for word in text.split() if word not in stops)
  return new_text
  

In [94]:
sample_text = "bitcoin is goin down. But i am gonna hodl coz yolo"

remove_stopwords(sample_text)

'bitcoin goin down. But gonna coz'

# Named Entity Recognition (NER) ***

In [98]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()

In [104]:
text_sample = "The cryptocurrency market was back in the green on Tuesday as investors picked quality tokens at lower prices. Bitcoin hit the $50,000 mark as concerns over the Omicron variant eased across the globe and investors lapped up riskier assets."
doc = nlp(text_sample)
ner = [(x.text, x.label_) for x in doc.ents] 
ner

[('Tuesday', 'DATE'),
 ('Bitcoin', 'GPE'),
 ('$50,000 mark', 'MONEY'),
 ('Omicron', 'ORG')]