# Import libraries and Dataset

In [None]:
# Import required libraries
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import reuters
import spacy
import re

# Fetch the NLTK resources this notebook needs: the Reuters corpus and the
# 'punkt_tab' resource (presumably the tokenizer data used by
# word_tokenize/sent_tokenize -- confirm against the installed NLTK version).
for resource_name in ('reuters', 'punkt_tab'):
    nltk.download(resource_name)

# Sample text: the first 500 characters of the first Reuters file, which keeps
# the example moderately complex without flooding the output.
first_file_id = reuters.fileids()[0]
text = reuters.raw(first_file_id)[:500]

# Show the sample that every later cell works on
print("Sample Text:")
print(text)

# Tokenization

### Part 1: Tokenization with NLTK

In [None]:
print("\n=== Tokenization with NLTK ===")
# Split the sample text into sentences with NLTK's sentence tokenizer.
sentences_nltk = sent_tokenize(text)
print("Sentence Tokenization (NLTK):", sentences_nltk)

# Split the same sample text into word-level tokens with NLTK's word tokenizer.
# `words_nltk` is reused by the stop word cells further down.
words_nltk = word_tokenize(text)
print("\nWord Tokenization (NLTK):", words_nltk)

### Part 2: Tokenization with SpaCy



In [None]:
print("\n=== Tokenization with SpaCy ===")
# Load the SpaCy pipeline named "en_core_web_sm"; `nlp` is reused by later cells.
# NOTE(review): this presumably requires the model to be installed beforehand -- confirm.
nlp = spacy.load("en_core_web_sm")
# Run the pipeline over the sample text to obtain a document of tokens.
doc = nlp(text)
# Collect the raw text of every token in the document.
tokens_spacy = [token.text for token in doc]
print("Word Tokenization (SpaCy):", tokens_spacy)

### Part 3: Custom Tokenization Using Regular Expressions


In [None]:
print("\n=== Custom Tokenization with Regex ===")

# Compiled once at definition time so every call reuses the same pattern.
# Pieces of the pattern:
#   \b\w+                 a run of word characters starting at a word boundary
#   (?:-\w+)*             optional hyphen-joined continuations ("well-known")
#   (?:'(?:t|re|...))?    optional contraction/possessive suffix ("don't", "I'm")
# The former "n't" alternative was dropped: the greedy \w+ always consumes the
# "n" first, so "'t" already covers it ("don" + "'t"). IGNORECASE lets the
# suffix match upper-case text such as headlines ("JAPAN'S").
_TOKEN_PATTERN = re.compile(
    r"\b\w+(?:-\w+)*(?:'(?:t|re|ve|ll|d|s|m))?\b",
    re.IGNORECASE,
)


def custom_tokenizer_regex(text):
    """Split *text* into word tokens using a regular expression.

    A token is a run of word characters, optionally extended by
    hyphen-joined parts and by one trailing contraction or possessive
    suffix ('t, 're, 've, 'll, 'd, 's, 'm, in any letter case).
    Punctuation and whitespace are never returned as tokens, so
    abbreviations such as "U.S." are split into their letter runs.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings in order of appearance; an empty list when
        *text* contains no word characters.
    """
    return _TOKEN_PATTERN.findall(text)

# Run the regex tokenizer on the sample text and print every token it found,
# for comparison with the NLTK and SpaCy outputs above.
tokens_regex = custom_tokenizer_regex(text)
print("Word Tokenization (Regex):", tokens_regex)

# Reflection Questions
1. How do the tokenization outputs from NLTK, SpaCy, and Regex differ?
2. Which method better handles edge cases like contractions or special characters? Provide examples.

# Stop Words Removal
#### Learning Goals:
##### Understand the role of stop words in text preprocessing.
##### Explore stop word removal using NLTK and SpaCy.
##### Customize stop word lists for domain-specific tasks.
##### Reflect on how stop word removal affects text analysis.

### Part 1: Stop Word Removal with NLTK

In [None]:
# Import required libraries
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Make sure the NLTK stop word lists are available
nltk.download('stopwords')

# NLTK's default English stop words, held in a set for fast membership tests
english_stop_words = stopwords.words('english')
stop_words_nltk = set(english_stop_words)

# Word-level tokens of the Reuters sample text
words_nltk = word_tokenize(text)

# Keep only the tokens whose lowercase form is not a stop word
filtered_words_nltk = [
    token for token in words_nltk if token.lower() not in stop_words_nltk
]
print("Original Words (NLTK):", words_nltk[:20])
print("\nFiltered Words (No Stop Words, NLTK):", filtered_words_nltk[:20])

### Part 2: Stop Word Removal with SpaCy

In [None]:
# Parse the sample text with the SpaCy pipeline loaded earlier as `nlp`.
doc = nlp(text)
# Keep the text of every token that SpaCy does not flag as a stop word.
filtered_words_spacy = [token.text for token in doc if not token.is_stop]
# Preview only the first 20 surviving tokens.
print("\nFiltered Words (No Stop Words, SpaCy):", filtered_words_spacy[:20])

### Part 3: Custom Stop Words

In [None]:
# Add domain-specific stop words: extend the NLTK set with terms chosen for
# this news corpus. union() returns a new set, so stop_words_nltk is unchanged.
custom_stop_words = stop_words_nltk.union({"reuters", "said", "mr"})
# Filter the NLTK tokens again, comparing lowercase forms against the extended set.
filtered_words_custom = [word for word in words_nltk if word.lower() not in custom_stop_words]
# Preview only the first 20 surviving tokens.
print("\nFiltered Words (Custom Stop Words):", filtered_words_custom[:20])


Filtered Words (Custom Stop Words): ['ASIAN', 'EXPORTERS', 'FEAR', 'DAMAGE', 'U.S.-JAPAN', 'RIFT', 'Mounting', 'trade', 'friction', 'U.S.', 'Japan', 'raised', 'fears', 'among', 'many', 'Asia', "'s", 'exporting', 'nations', 'row']


# Reflection Questions
1. How do the filtered words from NLTK and SpaCy differ?
2. Why might you want to customize stop words for domain-specific tasks? Provide examples.

# Text Normalization
#### Learning Goals:
##### Focus on lowercasing, punctuation removal and normalization of text inconsistencies

### Part 1: Lowercasing Text

In [None]:
# Lowercase the whole sample; `lowercased_text` feeds the punctuation-removal step.
lowercased_text = text.lower()
# Show the first 200 characters before and after lowercasing for comparison.
print("Original Text (First 200 characters):")
print(text[:200])
print("\nLowercased Text (First 200 characters):")
print(lowercased_text[:200])

### Part 2: Removing Punctuation

In [None]:
import string

print("\n=== Removing Punctuation ===")
# Build a translation table that deletes every character listed in
# string.punctuation, then apply it to the lowercased text in one pass.
punctuation_remover = str.maketrans('', '', string.punctuation)
text_no_punctuation = lowercased_text.translate(punctuation_remover)
print("Text without Punctuation (First 200 characters):")
print(text_no_punctuation[:200])

### Part 3: Handling Numbers

In [None]:
print("\n=== Removing Numbers ===")
# Stream the characters that are not digits and stitch them back together
non_digit_chars = (ch for ch in text_no_punctuation if not ch.isdigit())
text_no_numbers = ''.join(non_digit_chars)
print("Text without Numbers (First 200 characters):")
print(text_no_numbers[:200])

### Part 4: Normalizing Text Inconsistencies

In [None]:
print("\n=== Normalizing Text Inconsistencies (British to American English) ===")

# British spelling -> American spelling. Built once instead of on every call.
_BRITISH_TO_AMERICAN = {
    "colour": "color",
    "favourite": "favorite",
    "organise": "organize",
    "realise": "realize",
}

# Matches any mapped British word as a whole word, in any letter case. Word
# boundaries let the match succeed next to punctuation ("colour.") without
# touching longer words that merely contain a mapped word ("colourful").
_BRITISH_WORD_PATTERN = re.compile(
    r"\b(?:" + "|".join(map(re.escape, _BRITISH_TO_AMERICAN)) + r")\b",
    re.IGNORECASE,
)


def _match_case(source, replacement):
    """Return *replacement* cased like *source* (upper, capitalized, or as-is)."""
    if source.isupper():
        return replacement.upper()
    if source[0].isupper():
        return replacement.capitalize()
    return replacement


# Define a function for normalizing British to American English
def normalize_british_to_american(text):
    """Replace known British spellings in *text* with American ones.

    Only whole words listed in the mapping are replaced. Matching ignores
    letter case and adjacent punctuation, and the replacement keeps the
    casing style of the original word ("Colour" -> "Color"). As in the
    original implementation, runs of whitespace are collapsed to single
    spaces and leading/trailing whitespace is removed.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; an empty string when *text* has no
        non-whitespace characters.
    """
    def _swap(match):
        word = match.group(0)
        return _match_case(word, _BRITISH_TO_AMERICAN[word.lower()])

    # Collapse whitespace first to keep the original output format.
    collapsed = ' '.join(text.split())
    return _BRITISH_WORD_PATTERN.sub(_swap, collapsed)

# Demonstrate the normalizer on a sentence that uses British spellings
british_text = "I organise my tasks and realise my favourite colour is blue."
normalized_text = normalize_british_to_american(british_text)
print("Original Text (British):", british_text)
print("Normalized Text (American):", normalized_text)

# Reflection Questions, printed one per line under a banner
reflection_questions = (
    "1. Why might lowercasing or punctuation removal be inappropriate for certain NLP tasks?",
    "2. How does normalizing text inconsistencies (e.g., British vs. American spellings) improve data quality?",
)
print("\n=== Reflection Questions ===")
for question in reflection_questions:
    print(question)

1. Why might lowercasing or punctuation removal be inappropriate for certain NLP tasks (e.g., sentiment analysis or named entity recognition)?

2. What are the benefits of normalizing British to American spellings in multilingual datasets?
3. Can you think of a domain where preserving numbers is critical? Provide examples.