<a href="https://colab.research.google.com/github/sarathi-vs13/Natural-Language-Processing/blob/main/Text_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [58]:
import warnings
warnings.filterwarnings("ignore")

## Tokenization

In [59]:
import nltk
from nltk.tokenize import word_tokenize
#nltk.download('popular')
#nltk.download('punkt_tab')

In [60]:
tokens = word_tokenize("I'm learning NLP")
print(tokens)

['I', "'m", 'learning', 'NLP']


## Stemming

In [61]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ["running", "ran", "swimming", "playing", "plays", "happily", "lazy"]

for word in words:
  print(word, "→", stemmer.stem(word) )

running → run
ran → ran
swimming → swim
playing → play
plays → play
happily → happili
lazy → lazi


## Lemmatization

In [62]:
# Using NLTK
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

words = ["running", "dancing", "happily", "lazy", "written","goes"]

for word in words:
  print(word, "→", lemmatizer.lemmatize(word, pos="v"))

running → run
dancing → dance
happily → happily
lazy → lazy
written → write
goes → go


In [63]:
#Using spaCy
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("The weather in chennai is unusually romantic today.")

for token in doc:
  print(token.text, "→", token.lemma_)

The → the
weather → weather
in → in
chennai → chennai
is → be
unusually → unusually
romantic → romantic
today → today
. → .


## Stopword

In [64]:
doc = nlp("Let's see an example of a simple stopword removal code by using spacy")

filtered = []

for token in doc:
  if not token.is_stop:
    filtered.append(token)

print(filtered)


[Let, example, simple, stopword, removal, code, spacy]


## Part-of-Speech (POS) Tagging

In [65]:
# Using NLTK
nltk.download('averaged_perceptron_tagger_eng')
text = "The quick brown fox jumps over the lazy dog"
tokens = nltk.word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)

print(pos_tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN')]


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [66]:
# Using spaCy
doc = nlp("The quick brown fox jumps over the lazy dog")
for token in doc:
  print(f" {token.text} → {token.pos} ({token.tag_})")

 The → 90 (DT)
 quick → 84 (JJ)
 brown → 84 (JJ)
 fox → 92 (NN)
 jumps → 100 (VBZ)
 over → 85 (IN)
 the → 90 (DT)
 lazy → 84 (JJ)
 dog → 92 (NN)


## Named Entity Recognition (NER)

In [57]:
import spacy
nlp = spacy.load("en_core_web_sm")

text = "Tesla was founded by Elon Musk in California in 2003."
doc = nlp(text)

for ent in doc.ents:
    print(ent.text, "→", ent.label_)

Elon Musk → PERSON
California → GPE
2003 → DATE
