In [1]:
!pip install nltk



# Perform stemming and lemmatization

In [2]:
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('punkt')      # For tokenizing the text
nltk.download('wordnet')    # For lemmatization
nltk.download('punkt_tab')

# Sample text
text = "The children are playing and running around the playground."

# Tokenize the text
tokens = word_tokenize(text)

# Initialize the stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming: Apply PorterStemmer to each token
stemmed_words = [stemmer.stem(word) for word in tokens]
print("Stemmed Words:")
print(stemmed_words)

# Lemmatization: Apply WordNetLemmatizer to each token
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
print("\nLemmatized Words:")
print(lemmatized_words)

[nltk_data] Downloading package punkt to /Users/sohan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sohan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/sohan/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Stemmed Words:
['the', 'children', 'are', 'play', 'and', 'run', 'around', 'the', 'playground', '.']

Lemmatized Words:
['The', 'child', 'are', 'playing', 'and', 'running', 'around', 'the', 'playground', '.']


# Design a custom tokenizer and perform stemming and lemmatization

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('punkt', quiet=True)
nltk.download('wordnet', quiet=True)

def custom_tokenizer(text):
    # Remove URLs (http:// or https://)
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove mentions (@username)
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags (#hashtag)
    text = re.sub(r'#\w+', '', text)
    # Tokenize the cleaned text using NLTK's word_tokenize
    tokens = word_tokenize(text.lower())
    # Remove any remaining non-alphabetic tokens
    tokens = [token for token in tokens if token.isalpha()]
    return tokens

# Example usage
text = "Check out my new blog post! #TechBlog @john_doe https://example.com"
tokens = custom_tokenizer(text)

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Stemming: Apply PorterStemmer to each token
stemmed_words = [stemmer.stem(word) for word in tokens]
print("Stemmed Words:")
print(stemmed_words)

# Lemmatization: Apply WordNetLemmatizer to each token
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]
print("\nLemmatized Words:")
print(lemmatized_words)

Stemmed Words:
['check', 'out', 'my', 'new', 'blog', 'post']

Lemmatized Words:
['check', 'out', 'my', 'new', 'blog', 'post']


# Named Entity Recognition

In [9]:
!pip install spacy
!python -m spacy download en_core_web_sm

Traceback (most recent call last):
  File "<frozen runpy>", line 189, in _run_module_as_main
  File "<frozen runpy>", line 148, in _get_module_details
  File "<frozen runpy>", line 112, in _get_module_details
  File "/opt/anaconda3/lib/python3.12/site-packages/spacy/__init__.py", line 13, in <module>
    from . import pipeline  # noqa: F401
    ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/spacy/pipeline/__init__.py", line 1, in <module>
    from .attributeruler import AttributeRuler
  File "/opt/anaconda3/lib/python3.12/site-packages/spacy/pipeline/attributeruler.py", line 8, in <module>
    from ..language import Language
  File "/opt/anaconda3/lib/python3.12/site-packages/spacy/language.py", line 43, in <module>
    from .pipe_analysis import analyze_pipes, print_pipe_analysis, validate_attrs
  File "/opt/anaconda3/lib/python3.12/site-packages/spacy/pipe_analysis.py", line 6, in <module>
    from .tokens import Doc, Span, Token
  File "/opt/anaconda3/lib

In [3]:
import spacy
import re
from datetime import datetime

nlp = spacy.load("en_core_web_sm")

# Function to normalize text (remove or standardize dates, monetary values, and numbers)
def normalize_text(text):
    # Normalize dates (example: replace any date with 'DATE')
    text = re.sub(r'\b\d{1,2}[-/]\d{1,2}[-/]\d{2,4}\b', 'DATE', text)  # Matches formats like 12/05/2021
    text = re.sub(r'\b\d{4}-\d{2}-\d{2}\b', 'DATE', text)  # Matches formats like 2021-12-05
    text = re.sub(r'\b\d{1,2} \w+ \d{4}\b', 'DATE', text)  # Matches formats like 12 May 2021

    # Normalize monetary values (example: replace any monetary amount with 'MONEY')
    text = re.sub(r'\$\d+(?:,\d{3})*(?:\.\d{2})?', 'MONEY', text)  # Matches dollar values like $1,000.50
    text = re.sub(r'\b\d+(?:,\d{3})*(?:\.\d+)?\s?(usd|euro|gbp|inr)\b', 'MONEY', text, flags=re.IGNORECASE)  # Matches currency like 1000 USD

    # Normalize numbers (example: replace any number with 'NUMBER')
    text = re.sub(r'\b\d+\b', 'NUMBER', text)  # Matches any number (e.g., 123, 12345)

    return text

# Function for Named Entity Recognition
def named_entity_recognition(text):
    # Process the text with spaCy
    doc = nlp(text)

    # Extract entities
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))  # Store the entity and its label

    return entities

text = "Apple Inc. was founded on April 1, 1976 by Steve Jobs. The price of the iPhone is $999. " \
        "On 12/05/2023, the company announced a partnership with Microsoft. A person who earned 5000 USD " \
        "on 10th May 2023. Contact John at john@example.com or visit our office at 123 Park Ave."

normalized_text = normalize_text(text)
print("Normalized Text:")
print(normalized_text)

entities = named_entity_recognition(text)
print("\nNamed Entities Recognized:")
for entity, label in entities:
    print(f"{entity}: {label}")

ModuleNotFoundError: No module named 'spacy'

# 07/03

In [None]:
def normalize_text(text):
    """
    Normalizes text by replacing slangs, abbreviations, and emojis with formal counterparts,
    and normalizing irregular spaces and punctuation.

    Args:
        text (str): Input text to normalize

    Returns:
        str: Normalized text
    """
    import re

    # Dictionary for common slangs and abbreviations (limited to 5)
    slang_dict = {
        "lol": "laughing out loud",
        "idk": "I don't know",
        "btw": "by the way",
        "omg": "oh my goodness",
        "thx": "thanks"
    }

    # Dictionary for emojis (limited to 5)
    emoji_dict = {
        "😊": "smiling",
        "😂": "laughing with tears",
        "👍": "thumbs up",
        "❤️": "love",
        ":)": "smiling"
    }

    # Convert text to lowercase for easier matching
    normalized_text = text.lower()

    # Replace slangs and abbreviations
    # Add word boundaries to avoid replacing parts of words
    for slang, formal in slang_dict.items():
        normalized_text = re.sub(r'\b' + slang + r'\b', formal, normalized_text)

    # Replace emojis
    for emoji, description in emoji_dict.items():
        normalized_text = normalized_text.replace(emoji, f" {description} ")

    # Normalize spaces (replace multiple spaces with a single space)
    normalized_text = re.sub(r'\s+', ' ', normalized_text)

    # Normalize punctuation (replace multiple occurrences with a single one)
    for punct in '.!?,:;-':
        normalized_text = re.sub(r'[' + re.escape(punct) + r']+', punct, normalized_text)

    # Ensure proper spacing after punctuation
    for punct in '.!?,:;':
        normalized_text = re.sub(r'[' + re.escape(punct) + r']', punct + ' ', normalized_text)
    normalized_text = re.sub(r'\s+', ' ', normalized_text)  # Fix any double spaces created

    # Capitalize the first letter of each sentence
    normalized_text = re.sub(r'(^|[.!?]\s+)([a-z])', lambda m: m.group(1) + m.group(2).upper(), normalized_text)

    # Remove leading/trailing whitespace
    normalized_text = normalized_text.strip()

    return normalized_text

# Test the function with various inputs
def test_normalize_text():
    test_cases = [
        "OMG!!! This is funny lol 😂",
        "idk why you are so upset tbh... :)",
        "btw, thx for the help 👍",
        "lol that's a good one ❤️",
        "omg!!! multiple punctuation marks?!?!"
    ]

    for i, test in enumerate(test_cases):
        print(f"Test {i+1}:")
        print(f"Original: {test}")
        print(f"Normalized: {normalize_text(test)}")
        print()

# Run the tests
if __name__ == "__main__":
    test_normalize_text()

Test 1:
Original: OMG!!! This is funny lol 😂
Normalized: Oh my goodness! This is funny laughing out loud laughing with tears

Test 2:
Original: idk why you are so upset tbh... :)
Normalized: I don't know why you are so upset tbh. Smiling

Test 3:
Original: btw, thx for the help 👍
Normalized: By the way, thanks for the help thumbs up

Test 4:
Original: lol that's a good one ❤️
Normalized: Laughing out loud that's a good one love

Test 5:
Original: omg!!! multiple punctuation marks?!?!
Normalized: Oh my goodness! Multiple punctuation marks? ! ? !



In [None]:
def normalize_text(text):
    """
    Normalizes text by:
    1. Replacing slang words with their full forms
    2. Standardizing punctuation
    3. Converting the text to lowercase

    Args:
        text (str): Input text to normalize

    Returns:
        str: Normalized text
    """
    import re

    # Convert to lowercase first
    text = text.lower()

    # Dictionary of slang words and their full forms
    slang_dict = {
        "lol": "laughing out loud",
        "idk": "i don't know",
        "btw": "by the way",
        "omg": "oh my goodness",
        "thx": "thanks",
        "u": "you",
        "r": "are",
        "ur": "your",
        "y": "why",
        "k": "okay"
    }

    # Replace slang words
    for slang, full_form in slang_dict.items():
        # Use word boundaries to avoid replacing parts of words
        text = re.sub(r'\b' + slang + r'\b', full_form, text)

    # Standardize punctuation (replace multiple occurrences with a single one)
    for punct in '.!?,:;-':
        text = re.sub(r'[' + re.escape(punct) + r']+', punct, text)

    # Add proper spacing after punctuation
    for punct in '.!?,:;':
        text = re.sub(r'([' + re.escape(punct) + r'])([^\s])', r'\1 \2', text)

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove leading/trailing whitespace
    text = text.strip()

    return text

# Test the function
def main():
    test_sentences = [
        "OMG!!! Have u SEEN this???",
        "LOL that's SO funny btw, IDK y ppl r mad!!!!",
        "ThX for Ur help..... K???",
        "Y r U so LATE?? BTW, the movie starts at 8pm...",
        "This is a Normal Sentence, But with Weird Capitalization."
    ]

    print("Text Normalization Results:")
    print("--------------------------")

    for i, sentence in enumerate(test_sentences, 1):
        normalized = normalize_text(sentence)
        print(f"Example {i}:")
        print(f"Original: {sentence}")
        print(f"Normalized: {normalized}")
        print()

if __name__ == "__main__":
    main()

Text Normalization Results:
--------------------------
Example 1:
Original: OMG!!! Have u SEEN this???
Normalized: oh my goodness! have you seen this?

Example 2:
Original: LOL that's SO funny btw, IDK y ppl r mad!!!!
Normalized: laughing out loud that's so funny by the way, i don't know why ppl are mad!

Example 3:
Original: ThX for Ur help..... K???
Normalized: thanks for your help. okay?

Example 4:
Original: Y r U so LATE?? BTW, the movie starts at 8pm...
Normalized: why are you so late? by the way, the movie starts at 8pm.

Example 5:
Original: This is a Normal Sentence, But with Weird Capitalization.
Normalized: this is a normal sentence, but with weird capitalization.

