In [None]:
# Lab Assignment 1: Text Preprocessing and Regular Expressions
# - Implement tokenization, stemming, and lemmatization using NLTK and spaCy.
# - Use regular expressions to extract email addresses, phone numbers, and hashtags from a text dataset of at least 5 pages.

In [None]:
# Step 1: Install Required Libraries
!pip install nltk spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m74.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Step 2: Import Libraries
import nltk
import re
import spacy
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# Download necessary NLTK data
# Each call fetches one NLTK resource into the local nltk_data directory;
# the value of the last call is what the cell displays as its output.
# 'punkt_tab' is a tokenizer resource (presumably the one word_tokenize needs -- confirm
# against the installed NLTK version).
nltk.download('punkt_tab')
# Corpus backing the `stopwords` import.
nltk.download('stopwords')
# Corpus backing the `WordNetLemmatizer` import.
nltk.download('wordnet')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Load SpaCy model
# "en_core_web_sm" is the model downloaded in Step 1; the resulting `nlp` object is
# called on each page in the lemmatization cell.
nlp = spacy.load("en_core_web_sm")

In [None]:
# Step 3: Simulate a small sample dataset (you can load your own .txt or .csv here)
# Each non-empty line below stands in for one "page" of a larger document.
text_data = """
Contact me at john.doe@example.com or jane_doe22@sample.org.
My phone number is +1-800-555-1234 or (212) 555-4567.
I love #MachineLearning and #AI!
Barack Obama was the 44th president of the United States.
SpaCy is great for NLP. NLTK is also useful.

Email me at test.email@gmail.com or hello@mydomain.org.
Call me at 987-654-3210 or 1234567890.
Follow #Python and #DataScience on Twitter.
The cat sat on the mat. The cats are sitting on the mats.
"""

# Preprocess into lines, treating each line as a separate page for simulation.
# Blank separator lines are dropped so that the later cells never process (and
# print results for) an empty page.
pages = [line.strip() for line in text_data.splitlines() if line.strip()]

In [None]:
# Tokenize every page with NLTK's word_tokenize and print the token list per page.
print("=== Tokenization ===")
# map() is lazy, so each page is tokenized right before its result is printed.
for i, tokens in enumerate(map(word_tokenize, pages)):
    print(f"\nPage {i+1} Tokens:\n", tokens)

=== Tokenization ===

Page 1 Tokens:
 ['Contact', 'me', 'at', 'john.doe', '@', 'example.com', 'or', 'jane_doe22', '@', 'sample.org', '.']

Page 2 Tokens:
 ['My', 'phone', 'number', 'is', '+1-800-555-1234', 'or', '(', '212', ')', '555-4567', '.']

Page 3 Tokens:
 ['I', 'love', '#', 'MachineLearning', 'and', '#', 'AI', '!']

Page 4 Tokens:
 ['Barack', 'Obama', 'was', 'the', '44th', 'president', 'of', 'the', 'United', 'States', '.']

Page 5 Tokens:
 ['SpaCy', 'is', 'great', 'for', 'NLP', '.', 'NLTK', 'is', 'also', 'useful', '.']

Page 6 Tokens:
 []

Page 7 Tokens:
 ['Email', 'me', 'at', 'test.email', '@', 'gmail.com', 'or', 'hello', '@', 'mydomain.org', '.']

Page 8 Tokens:
 ['Call', 'me', 'at', '987-654-3210', 'or', '1234567890', '.']

Page 9 Tokens:
 ['Follow', '#', 'Python', 'and', '#', 'DataScience', 'on', 'Twitter', '.']

Page 10 Tokens:
 ['The', 'cat', 'sat', 'on', 'the', 'mat', '.', 'The', 'cats', 'are', 'sitting', 'on', 'the', 'mats', '.']


In [None]:
# Stem every token of every page with the Porter stemmer and print the result per page.
stemmer = PorterStemmer()

print("\n=== Stemming ===")
# Pages are tokenized lazily, one at a time, then each token is passed to stemmer.stem.
for i, tokens in enumerate(map(word_tokenize, pages)):
    stemmed = list(map(stemmer.stem, tokens))
    print(f"\nPage {i+1} Stemmed:\n", stemmed)


=== Stemming ===

Page 1 Stemmed:
 ['contact', 'me', 'at', 'john.do', '@', 'example.com', 'or', 'jane_doe22', '@', 'sample.org', '.']

Page 2 Stemmed:
 ['my', 'phone', 'number', 'is', '+1-800-555-1234', 'or', '(', '212', ')', '555-4567', '.']

Page 3 Stemmed:
 ['i', 'love', '#', 'machinelearn', 'and', '#', 'ai', '!']

Page 4 Stemmed:
 ['barack', 'obama', 'wa', 'the', '44th', 'presid', 'of', 'the', 'unit', 'state', '.']

Page 5 Stemmed:
 ['spaci', 'is', 'great', 'for', 'nlp', '.', 'nltk', 'is', 'also', 'use', '.']

Page 6 Stemmed:
 []

Page 7 Stemmed:
 ['email', 'me', 'at', 'test.email', '@', 'gmail.com', 'or', 'hello', '@', 'mydomain.org', '.']

Page 8 Stemmed:
 ['call', 'me', 'at', '987-654-3210', 'or', '1234567890', '.']

Page 9 Stemmed:
 ['follow', '#', 'python', 'and', '#', 'datasci', 'on', 'twitter', '.']

Page 10 Stemmed:
 ['the', 'cat', 'sat', 'on', 'the', 'mat', '.', 'the', 'cat', 'are', 'sit', 'on', 'the', 'mat', '.']


In [None]:
# Run each page through the spaCy pipeline and print the lemma of every token.
print("\n=== Lemmatization (spaCy) ===")
# map() applies `nlp` to one page at a time, right before its lemmas are printed.
for i, doc in enumerate(map(nlp, pages)):
    lemmatized = [tok.lemma_ for tok in doc]
    print(f"\nPage {i+1} Lemmatized:\n", lemmatized)


=== Lemmatization (spaCy) ===

Page 1 Lemmatized:
 ['contact', 'I', 'at', 'john.doe@example.com', 'or', 'jane_doe22@sample.org', '.']

Page 2 Lemmatized:
 ['my', 'phone', 'number', 'be', '+1', '-', '800', '-', '555', '-', '1234', 'or', '(', '212', ')', '555', '-', '4567', '.']

Page 3 Lemmatized:
 ['I', 'love', '#', 'MachineLearning', 'and', '#', 'AI', '!']

Page 4 Lemmatized:
 ['Barack', 'Obama', 'be', 'the', '44th', 'president', 'of', 'the', 'United', 'States', '.']

Page 5 Lemmatized:
 ['SpaCy', 'be', 'great', 'for', 'NLP', '.', 'NLTK', 'be', 'also', 'useful', '.']

Page 6 Lemmatized:
 []

Page 7 Lemmatized:
 ['email', 'I', 'at', 'test.email@gmail.com', 'or', 'hello@mydomain.org', '.']

Page 8 Lemmatized:
 ['call', 'I', 'at', '987', '-', '654', '-', '3210', 'or', '1234567890', '.']

Page 9 Lemmatized:
 ['follow', '#', 'Python', 'and', '#', 'DataScience', 'on', 'Twitter', '.']

Page 10 Lemmatized:
 ['the', 'cat', 'sit', 'on', 'the', 'mat', '.', 'the', 'cat', 'be', 'sit', 'on', 'the'

In [None]:
# Regex patterns for the three kinds of entities extracted from each page.

# Email: local part, "@", then a domain written as label(.label)+ .
# Requiring label characters after every dot keeps sentence punctuation that directly
# follows an address (e.g. "sample.org.") out of the match.
email_pattern = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+'

# Phone: optional country code, optional parenthesised area code, then two digit groups,
# each optionally separated by whitespace, "." or "-".
# - The country-code group is non-capturing: re.findall returns only the group contents
#   when a pattern has a capturing group, which would yield '+1' / '' instead of the number.
# - The separator after the country code lives inside that optional group so a match
#   cannot begin with a stray space or dash.
# - The digit lookarounds stop a number from being carved out of a longer digit run.
phone_pattern = r'(?<!\d)(?:\+?\d{1,3}[\s.-]?)?\(?\d{2,4}\)?[\s.-]?\d{3,4}[\s.-]?\d{4}(?!\d)'

# Hashtag: "#" followed by one or more word characters.
hashtag_pattern = r'#\w+'

print("\n=== Regex Extraction ===")
for i, page in enumerate(pages):
    # findall returns every non-overlapping full match as a list of strings.
    emails = re.findall(email_pattern, page)
    phones = re.findall(phone_pattern, page)
    hashtags = re.findall(hashtag_pattern, page)

    print(f"\nPage {i+1} Results:")
    print("Emails:", emails)
    print("Phone Numbers:", phones)
    print("Hashtags:", hashtags)


=== Regex Extraction ===

Page 1 Results:
Emails: ['john.doe@example.com', 'jane_doe22@sample.org.']
Phone Numbers: []
Hashtags: []

Page 2 Results:
Emails: []
Phone Numbers: ['+1', '']
Hashtags: []

Page 3 Results:
Emails: []
Phone Numbers: []
Hashtags: ['#MachineLearning', '#AI']

Page 4 Results:
Emails: []
Phone Numbers: []
Hashtags: []

Page 5 Results:
Emails: []
Phone Numbers: []
Hashtags: []

Page 6 Results:
Emails: []
Phone Numbers: []
Hashtags: []

Page 7 Results:
Emails: ['test.email@gmail.com', 'hello@mydomain.org.']
Phone Numbers: []
Hashtags: []

Page 8 Results:
Emails: []
Phone Numbers: ['', '']
Hashtags: []

Page 9 Results:
Emails: []
Phone Numbers: []
Hashtags: ['#Python', '#DataScience']

Page 10 Results:
Emails: []
Phone Numbers: []
Hashtags: []
