<a href="https://colab.research.google.com/github/sanjana030805/NLP-Lab/blob/main/text_preprocessing(6).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy nltk
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m96.5 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
from nltk.stem import PorterStemmer

# Shared resources: the small English spaCy pipeline and an NLTK Porter stemmer.
nlp = spacy.load("en_core_web_sm")
# NOTE(review): `stemmer` is created here but never used in this cell; it is
# kept in case a later cell relies on it -- confirm and remove or use it.
stemmer = PorterStemmer()

# Sample sentence; it is run through the pipeline once and every section
# below reads its annotations from the resulting `doc`.
text = "Apple is looking at buying a startup in New York"
doc = nlp(text)

# Each entry pairs a heading with the lines to print beneath it. Headings after
# the first begin with "\n" so that consecutive sections are separated by a
# blank line in the output.
sections = [
    # Raw token strings, in document order.
    ("---- Tokenization ----", [tok.text for tok in doc]),
    # Only the tokens that spaCy does not flag as stop words.
    ("\n---- Stop Word Removal ----", [tok.text for tok in doc if not tok.is_stop]),
    # Surface form alongside its lemma.
    ("\n---- Lemmatization ----", [f"{tok.text} -> {tok.lemma_}" for tok in doc]),
    # Surface form alongside its coarse part-of-speech tag.
    ("\n---- POS Tagging ----", [f"{tok.text} -> {tok.pos_}" for tok in doc]),
    # Entity spans alongside their entity labels.
    (
        "\n---- Named Entity Recognition (NER) ----",
        [f"{span.text} -> {span.label_}" for span in doc.ents],
    ),
]

for heading, rows in sections:
    print(heading)
    for row in rows:
        print(row)


---- Tokenization ----
Apple
is
looking
at
buying
a
startup
in
New
York

---- Stop Word Removal ----
Apple
looking
buying
startup
New
York

---- Lemmatization ----
Apple -> Apple
is -> be
looking -> look
at -> at
buying -> buy
a -> a
startup -> startup
in -> in
New -> New
York -> York

---- POS Tagging ----
Apple -> PROPN
is -> AUX
looking -> VERB
at -> ADP
buying -> VERB
a -> DET
startup -> NOUN
in -> ADP
New -> PROPN
York -> PROPN

---- Named Entity Recognition (NER) ----
Apple -> ORG
New York -> GPE


In [None]:
# Tokenization: run the sentence through the pipeline and list each token's text.
sentence = "Microsoft opened a new office in Bengaluru."
doc = nlp(sentence)
token_texts = [tok.text for tok in doc]
print(token_texts)


['Microsoft', 'opened', 'a', 'new', 'office', 'in', 'Bengaluru', '.']


In [None]:
# Lemmatization: list the lemma spaCy assigns to each token of the sentence.
sentence = "Microsoft opened a new office in Bengaluru."
doc = nlp(sentence)
lemmas = [tok.lemma_ for tok in doc]
print(lemmas)


['Microsoft', 'open', 'a', 'new', 'office', 'in', 'Bengaluru', '.']


In [None]:
# POS tagging: pair each token's text with its coarse part-of-speech tag.
sentence = "Microsoft opened a new office in Bengaluru."
doc = nlp(sentence)
tagged_tokens = [(tok.text, tok.pos_) for tok in doc]
print(tagged_tokens)


[('Microsoft', 'PROPN'), ('opened', 'VERB'), ('a', 'DET'), ('new', 'ADJ'), ('office', 'NOUN'), ('in', 'ADP'), ('Bengaluru', 'PROPN'), ('.', 'PUNCT')]


In [None]:
# Named entity recognition over two sentences: pair each detected entity span
# with its label.
passage = "Microsoft opened a new office in Bengaluru. Satya Nadella leads Microsoft."
doc = nlp(passage)
entities = [(span.text, span.label_) for span in doc.ents]
print(entities)


[('Microsoft', 'ORG'), ('Bengaluru', 'GPE'), ('Satya Nadella', 'PERSON'), ('Microsoft', 'ORG')]


In [None]:
# Stop-word removal: keep only tokens that are neither stop words nor punctuation.
sentence = "Microsoft opened a new office in Bengaluru."
doc = nlp(sentence)
content_words = [tok.text for tok in doc if not (tok.is_stop or tok.is_punct)]
print(content_words)


['Microsoft', 'opened', 'new', 'office', 'Bengaluru']
