In [None]:
# Install necessary libraries if not already present
!pip install nltk spacy
!pip install ipykernel
!python -m spacy download en_core_web_sm

# Import libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import spacy

# Download NLTK data (if not already downloaded):
#   punkt_tab -> tokenizer models, stopwords -> stop-word lists,
#   wordnet   -> lexical database used by the lemmatizer.
for nltk_resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Sample text
text = "My name is Dang Phuc, I'm 18 years old and studying Computer Science in HCMUT. This is the test for NLP in Google Colab"

# 1. Tokenization (Word and Sentence)
# The two tokenizers are independent: one yields sentences, the other
# word-level tokens (punctuation and clitics become separate tokens).
sentences = sent_tokenize(text)
words = word_tokenize(text)

# Only a preview of the tokens (first 10) is shown; sentences are shown in full.
word_preview = words[:10]
print(
    "--- Tokenization ---",
    f"Words: {word_preview}...",
    f"Sentences: {sentences}",
    sep="\n",
)

# 2. Stop Word Removal
# A set gives constant-time membership tests for the stop-word lookup.
stop_words = set(stopwords.words('english'))


def _is_content_token(token):
    """Return True for alphanumeric tokens that are not English stop words."""
    return token.isalnum() and token.lower() not in stop_words


# Keeps the original casing of each surviving token; only the stop-word
# comparison is done in lowercase.
filtered_words = list(filter(_is_content_token, words))

print("\n--- Stop Word Removal ---")
filtered_preview = filtered_words[:10]
print(f"Filtered Words: {filtered_preview}...")

# 3. Stemming
# Apply the Porter stemmer to every token that survived stop-word removal.
ps = PorterStemmer()
stemmed_words = list(map(ps.stem, filtered_words))

print("\n--- Stemming ---")
stemmed_preview = stemmed_words[:10]
print(f"Stemmed Words: {stemmed_preview}...")

# 4. Lemmatization
# WordNetLemmatizer.lemmatize() treats every word as a noun unless a part of
# speech is supplied, which leaves verbs such as "studying" unreduced.
# Tag the tokens first and hand the matching WordNet POS to the lemmatizer.
nltk.download('averaged_perceptron_tagger_eng')  # model required by nltk.pos_tag


def _wordnet_pos(treebank_tag):
    """Map a Penn Treebank tag to the WordNet POS letter used by lemmatize().

    Adjective (J*), verb (V*) and adverb (R*) tags map to 'a', 'v' and 'r';
    everything else falls back to 'n' (noun), the lemmatizer's own default.
    """
    return {'J': 'a', 'V': 'v', 'R': 'r'}.get(treebank_tag[:1], 'n')


lemmatizer = WordNetLemmatizer()

# Tag the complete token sequence (the tagger relies on neighbouring words,
# including stop words, for context), then keep the same tokens that the
# stop-word removal step kept so the result lines up with filtered_words.
tagged_words = nltk.pos_tag(words)
lemmatized_words = [
    lemmatizer.lemmatize(word, pos=_wordnet_pos(tag))
    for word, tag in tagged_words
    if word.lower() not in stop_words and word.isalnum()
]

print("\n--- Lemmatization ---")
print(f"Lemmatized Words: {lemmatized_words[:10]}...")

# 5. Named Entity Recognition (NER) with spaCy
# Load the small English pipeline and run it over the raw sample text
# (not the filtered tokens).
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

print("\n--- Named Entity Recognition (NER) ---")
# One line per detected entity span: its surface text and its label.
for entity_text, entity_label in ((span.text, span.label_) for span in doc.ents):
    print(f"Entity: {entity_text}, Label: {entity_label}")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m34.8 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


--- Tokenization ---
Words: ['My', 'name', 'is', 'Dang', 'Phuc', ',', 'I', "'m", '18', 'years']...
Sentences: ["My name is Dang Phuc, I'm 18 years old and studying Computer Science in HCMUT.", 'This is the test for NLP in Google Colab']

--- Stop Word Removal ---
Filtered Words: ['name', 'Dang', 'Phuc', '18', 'years', 'old', 'studying', 'Computer', 'Science', 'HCMUT']...

--- Stemming ---
Stemmed Words: ['name', 'dang', 'phuc', '18', 'year', 'old', 'studi', 'comput', 'scienc', 'hcmut']...

--- Lemmatization ---
Lemmatized Words: ['name', 'Dang', 'Phuc', '18', 'year', 'old', 'studying', 'Computer', 'Science', 'HCMUT']...

--- Named Entity Recognition (NER) ---
Entity: Dang Phuc, Label: PERSON
Entity: 18 years old, Label: DATE
Entity: Computer Science, Label: WORK_OF_ART
Entity: NLP, Label: ORG
