In [None]:
!pip install nltk



# Perform stemming and lemmatization

In [None]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this cell relies on:
#   punkt / punkt_tab -> tokenizer models, wordnet -> lemmatizer dictionary
for resource in ('punkt', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Sentence used to contrast the two normalization techniques
text = "The children are playing and running around the playground."

# Split the sentence into word and punctuation tokens
tokens = word_tokenize(text)

# One stemmer and one lemmatizer, reused for every token
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Stemming: run the Porter stemmer over every token
stemmed_words = list(map(stemmer.stem, tokens))
print("Stemmed Words:", stemmed_words, sep="\n")

# Lemmatization: WordNet lookup with the lemmatizer's default arguments
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print("\nLemmatized Words:", lemmatized_words, sep="\n")

[nltk_data] Downloading package punkt to /Users/sohan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sohan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/sohan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Stemmed Words:
['the', 'children', 'are', 'play', 'and', 'run', 'around', 'the', 'playground', '.']

Lemmatized Words:
['The', 'child', 'are', 'playing', 'and', 'running', 'around', 'the', 'playground', '.']


# Design a custom tokenizer and perform stemming and lemmatization

In [None]:
import re

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Resources needed by word_tokenize and WordNetLemmatizer in this cell.
# 'punkt_tab' is fetched as well, matching the first cell, so this cell does
# not depend on an earlier cell having downloaded the tokenizer tables.
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)
nltk.download('wordnet', quiet=True)

# Noise patterns stripped before tokenization, compiled once at definition time.
_URL_PATTERN = re.compile(r'http[s]?://\S+')   # http:// or https:// links
_MENTION_PATTERN = re.compile(r'@\w+')         # @username
_HASHTAG_PATTERN = re.compile(r'#\w+')         # #hashtag


def custom_tokenizer(text):
    """Tokenize social-media style text into lowercase alphabetic words.

    URLs, @mentions and #hashtags are removed (in that order), the remaining
    text is lowercased and split with NLTK's ``word_tokenize``, and any token
    that is not purely alphabetic (punctuation, numbers, mixed tokens) is
    dropped.

    Args:
        text: The raw input string.

    Returns:
        A list of lowercase, alphabetic-only token strings.
    """
    # Apply the removals sequentially; the order matches the original
    # URL -> mention -> hashtag clean-up.
    for pattern in (_URL_PATTERN, _MENTION_PATTERN, _HASHTAG_PATTERN):
        text = pattern.sub('', text)

    # Tokenize the cleaned, lowercased text and keep only alphabetic tokens
    return [token for token in word_tokenize(text.lower()) if token.isalpha()]

# Sample post containing a hashtag, a mention and a URL
text = "Check out my new blog post! #TechBlog @john_doe https://example.com"
tokens = custom_tokenizer(text)

# Build both normalizers up front
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# Stem every cleaned token and show the result
stemmed_words = list(map(stemmer.stem, tokens))
print("Stemmed Words:", stemmed_words, sep="\n")

# Lemmatize every cleaned token and show the result
lemmatized_words = list(map(lemmatizer.lemmatize, tokens))
print("\nLemmatized Words:", lemmatized_words, sep="\n")

Stemmed Words:
['check', 'out', 'my', 'new', 'blog', 'post']

Lemmatized Words:
['check', 'out', 'my', 'new', 'blog', 'post']


# Named Entity Recognition

In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.8/12.8 MB 39.8 MB/s eta 0:00:00
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')
⚠ Restart to reload dependencies
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [2]:
import spacy
import re
from datetime import datetime

# Load spaCy's small English pipeline once; it is reused by every NER call below.
# The model must already be installed (see the `spacy download en_core_web_sm` step).
nlp = spacy.load("en_core_web_sm")

def normalize_text(text):
    text = re.sub(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', 'DATE', text)
    text = re.sub(r'\b\d{4}-\d{2}-\d{2}\b', 'DATE', text)
    text = re.sub(r'\b\d{1,2} \w+ \d{4}\b', 'DATE', text)

    text = re.sub(r'\$\d+(?:,\d{3})*(?:\.\d{2})?', 'MONEY', text)  # Matches dollar values like $1,000.50
    text = re.sub(r'\b\d+(?:,\d{3})*(?:\.\d+)?\s?(usd|euro|gbp|inr)\b', 'MONEY', text, flags=re.IGNORECASE)  # Matches currency like 1000 USD

    text = re.sub(r'\b\d+\b', 'NUMBER', text)

    return text

def named_entity_recognition(text):
    """Run the spaCy pipeline on ``text`` and collect its named entities.

    Args:
        text: The string to analyse.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, one per entry in
        the processed document's ``ents``, in document order.
    """
    return [(span.text, span.label_) for span in nlp(text).ents]

# Sample paragraph mixing organisations, people, dates, amounts and an address
text = (
    "Apple Inc. was founded on April 1, 1976 by Steve Jobs. The price of the iPhone is $999. "
    "On 12/05/2023, the company announced a partnership with Microsoft. A person who earned 5000 USD "
    "on 10th May 2023. Contact John at john@example.com or visit our office at 123 Park Ave."
)

# Show the placeholder-normalized version of the paragraph
normalized_text = normalize_text(text)
print("Normalized Text:", normalized_text, sep="\n")

# NER runs on the original (un-normalized) paragraph
entities = named_entity_recognition(text)
print("\nNamed Entities Recognized:")
for pair in entities:
    # Each pair is (entity text, entity label)
    print(": ".join(pair))

Normalized Text:
Apple Inc. was founded on April NUMBER, NUMBER by Steve Jobs. The price of the iPhone is MONEY. On DATE, the company announced a partnership with Microsoft. A person who earned MONEY on 10th May NUMBER. Contact John at john@example.com or visit our office at NUMBER Park Ave.

Named Entities Recognized:
Apple Inc.: ORG
April 1, 1976: DATE
Steve Jobs: PERSON
iPhone: ORG
999: MONEY
12/05/2023: DATE
Microsoft: ORG
5000: CARDINAL
10th May 2023: DATE
John: PERSON
123: CARDINAL
