In [2]:
# DATA 622 - Homework 2
# Custom implementation with structured modular workflow

import requests
import string
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

# Fetch every NLTK data package this script relies on, in a fixed order.
# 'punkt_tab' was added to address the LookupError.
for resource_name in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource_name)

def fetch_article_text(url, preview_chars=700):
    """Download a page and return a short raw preview plus the full body.

    Args:
        url: Address of the page to download.
        preview_chars: Number of leading characters to include in the
            preview (defaults to 700).

    Returns:
        A ``(preview, full_text)`` tuple, where ``preview`` is the first
        ``preview_chars`` characters of the decoded response body and
        ``full_text`` is the whole decoded body.

    Raises:
        requests.HTTPError: Propagated from ``raise_for_status()`` when the
            server answers with an error status.
    """
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    # Read the decoded body once and reuse it for both return values instead
    # of accessing ``response.text`` twice.
    html = response.text
    return html[:preview_chars], html

def strip_html(content):
    """Parse *content* as HTML and return only its text.

    The markup is parsed with the "html.parser" backend and the extracted
    text fragments are joined using a single space as the separator.
    """
    return BeautifulSoup(content, "html.parser").get_text(separator=" ")

def normalize_text(text):
    """Lowercase text and remove punctuation (ASCII and Unicode).

    Every character in ``string.punctuation`` is removed, as before. In
    addition, any character whose Unicode general category starts with "P"
    (curly quotes, em/en dashes, ellipses, ...) is removed, because web
    article text commonly uses these typographic marks and they are not part
    of ``string.punctuation``.

    Args:
        text: The string to normalize.

    Returns:
        The lowercased string with punctuation characters deleted. Characters
        are deleted, not replaced, so surrounding whitespace is preserved.
    """
    # Local import keeps this function self-contained.
    import unicodedata

    ascii_punctuation = frozenset(string.punctuation)
    return "".join(
        ch
        for ch in text.lower()
        if ch not in ascii_punctuation
        and not unicodedata.category(ch).startswith("P")
    )

def remove_stop_words(tokens):
    """Return the tokens that are purely alphabetic and not English stopwords.

    The stopword list comes from NLTK's 'english' stopwords corpus. Token
    order is preserved.
    """
    english_stops = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stops:
            continue
        # Drop numbers and any token containing non-letter characters.
        if token.isalpha():
            kept.append(token)
    return kept

def lemmatize_words(tokens):
    """Return a list with each token passed through the WordNet lemmatizer.

    No part-of-speech argument is supplied, so the lemmatizer's default
    part of speech is used for every token.
    """
    wordnet = WordNetLemmatizer()
    return list(map(wordnet.lemmatize, tokens))

def stem_words(tokens):
    """Return a list with each token reduced to its Porter stem."""
    porter = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(porter.stem(token))
    return stems

def main():
    """Run the text-preprocessing pipeline on one article and print results.

    Downloads the article, prints the first 700 raw characters, strips the
    HTML, lowercases and removes punctuation, tokenizes, drops stopwords, and
    then prints the first 50 lemmatized tokens and the first 50 stemmed
    tokens, followed by a short comparison of the two.
    """
    article_url = "https://www.cnn.com/2025/06/13/style/why-luxury-brands-are-so-expensive"

    # Step 1: Download the page and print the first 700 characters
    raw_preview, full_html = fetch_article_text(article_url)
    print("\n--- First 700 Characters (Raw HTML Content) ---\n")
    print(raw_preview)

    # Step 2: Remove HTML tags
    clean_text = strip_html(full_html)

    # Step 3: Lowercase + remove punctuation
    normalized_text = normalize_text(clean_text)

    # Tokenization
    tokens = word_tokenize(normalized_text)

    # Step 4: Remove stopwords
    filtered_tokens = remove_stop_words(tokens)

    # Step 5: Lemmatization
    lemmatized = lemmatize_words(filtered_tokens)
    print("\n--- First 50 Lemmatized Words ---\n")
    print(lemmatized[:50])

    # Comparison with stemming (both start from the same filtered tokens)
    stemmed = stem_words(filtered_tokens)
    print("\n--- First 50 Stemmed Words ---\n")
    print(stemmed[:50])

    print("\n--- Difference Observation ---")
    # The example must be one this pipeline can actually produce:
    # lemmatize_words() passes no part of speech, so an adjective mapping
    # such as 'better' -> 'good' does not occur here.
    print("Lemmatization keeps words meaningful (e.g., 'brands' -> 'brand'),")
    print("while stemming often truncates words mechanically (e.g., 'expensive' -> 'expens').")

# Run the pipeline only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.



--- First 700 Characters (Raw HTML Content) ---

  <!DOCTYPE html>
<html lang="en" data-uri="cms.cnn.com/_pages/cmboyzvxs00d626qmalw1heqy@published" data-layout-uri="cms.cnn.com/_layouts/layout-with-rail/instances/style-article-feature-v1@published" >
  <head>
<link rel="dns-prefetch" href="//tpc.googlesyndication.com">

<link rel="preconnect" href="//tpc.googlesyndication.com">

<link rel="dns-prefetch" href="//pagead2.googlesyndication.com">

<link rel="preconnect" href="//pagead2.googlesyndication.com">

<link rel="dns-prefetch" href="//www.googletagservices.com">

<link rel="preconnect" href="//www.googletagservices.com">

<link rel="dns-prefetch" href="//www.google.com">

<link rel="preconnect" href="//www.google.com">

<link rel="dns

--- First 50 Lemmatized Words ---

['luxury', 'brand', 'expensive', 'ever', 'telling', 'worth', 'cnn', 'cnn', 'value', 'feedback', 'relevant', 'ad', 'encounter', 'technical', 'issue', 'video', 'player', 'slow', 'load', 'content', 'video', 'content'