In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import re
from collections import Counter
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')
import text_mining_utils as tmu

# Register the project-local ./data folder with NLTK's search path, then
# download every resource this notebook uses into that same folder.
nltk.data.path.append("./data")
for resource in (
    'punkt',
    "punkt_tab",
    "averaged_perceptron_tagger",
    "averaged_perceptron_tagger_eng",
    'stopwords',
):
    nltk.download(resource, download_dir="./data")

In [None]:
# Load the scraped articles ('utf-8-sig' strips a leading BOM if the file has one).
df = pd.read_csv("./data/spunout_data.csv", encoding='utf-8-sig')

# Keep only rows with usable content:
# - the scraper marks failed extractions with the literal 'N/A'
# - genuinely missing values arrive as NaN
# NOTE: pandas.read_csv parses 'N/A' as NaN by default, so the string comparison
# is a safety net in case NA parsing is ever customised.
has_content = df['Content'].notna() & (df['Content'] != 'N/A')

# .copy() gives an independent frame, so the column assignments below never
# operate on a slice of the raw data (chained-assignment pitfall).
df = df.loc[has_content].copy()

# Ensure titles are strings; missing titles become empty strings.
df['Title'] = df['Title'].fillna('').astype(str)

# Prepend the title to the body. The separating space prevents the last word of
# the title from merging with the first word of the content. Casting Content to
# str guards against a non-string value breaking the concatenation.
df['Content'] = df['Title'] + " " + df['Content'].astype(str)

In [None]:
from nltk.stem import WordNetLemmatizer
from spacy.lang.en.stop_words import STOP_WORDS as SPACY_STOP_WORDS
nltk.download('wordnet', download_dir="./data")

# Stop-word setup for text preprocessing

# Custom stop words for the SpunOut.ie corpus: site-specific noise, generic web
# vocabulary, scraping leftovers and high-frequency filler words that a generic
# stop-word list does not include.
domain_stopwords = [
    # Site & web specific
    'spunout', 'spun', 'out', 'ie', 'ireland', 'irish',
    'www', 'http', 'https', 'com',
    'copyright', 'privacy', 'policy', 'terms', 'conditions',
    'login', 'sign', 'register',

    # Scraping / HTML / metadata artifacts
    'page', 'section', 'footer', 'header', 'sidebar', 'widget', 'nav',
    'advertisement', 'ad',
    'promo', 'cookie', 'script', 'javascript', 'css', 'html', 'body', 'main',
    'published', 'updated',
    'author', 'post', 'article', 'url', 'permalink',

    # Generic advice vocabulary / high-frequency verbs (filler in an advice corpus)
    'day', 'new', 'good', 'bad',
    'check', 'try', 'keep',
    'like', 'just', 'get', 'also', 'would', 'could', 'one', 'make', 'use',
    'way', 'well',
    'time', 'know', 'need', 'really', 'thing', 'think', 'much', 'even',
    'still', 'another',
    'every', 'go', 'want', 'take', 'find', 'look', 'come', 'year', 'old',
    'may', 'might',

    # Interaction / navigation
    'click', 'read', 'link', 'menu', 'comment', 'reply',

    # Text slang / filler
    'u', 'ur', 'im', 'dont', 'cant', 'wont', 'oh', 'ok', 'please', 'thanks',
    'thank', 'yes', 'no'
]

# Final stop-word set: spaCy's built-in English list (pronouns, determiners,
# basic function words) combined with the domain-specific list above.
stop_words = set(SPACY_STOP_WORDS) | set(domain_stopwords)

# Created once so clean_text() does not build a new lemmatizer for every
# document it is applied to.
_LEMMATIZER = WordNetLemmatizer()


def clean_text(text):
    """
    Normalise one document into a space-separated string of lemmas.

    Pipeline:
      1. Cast to str and lowercase.
      2. Strip URLs (anything starting with "http" or "www") and HTML tags.
      3. Replace every run of ONE or more underscores, dashes or dots, and
         every slash/backslash, with a space (so "self-esteem" becomes two
         tokens and separator artifacts like "___" or "---" disappear).
      4. Delete apostrophes so contractions collapse ("don't" -> "dont").
      5. Delete every remaining character that is not an ASCII letter or
         whitespace.
      6. Drop single letters surrounded by whitespace.
      7. Tokenise on whitespace, lemmatize, and filter stop words.

    A token is discarded when its surface form OR its lemma is in the
    module-level ``stop_words`` set, or when the lemma has fewer than three
    characters. Checking the surface form as well matters because the
    stop-word list contains inflected forms (e.g. 'terms', 'conditions')
    whose lemma ('term', 'condition') is not in the list.

    Args:
        text: Raw document; any object is accepted and converted with str().

    Returns:
        str: The surviving lemmas joined by single spaces (may be empty).
    """
    # 1. Ensure text is a lowercase string
    text = str(text).lower()

    # 2. Remove URLs and HTML tags
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'<.*?>', ' ', text)

    # 3. Separators -> space: runs of one or more of "_", "-", "." and any slash
    text = re.sub(r'[_\-.]+', ' ', text)
    text = re.sub(r'[/\\]', ' ', text)

    # 4. Remove apostrophes to unify contractions (e.g. "don't" -> "dont")
    text = text.replace("'", "")

    # 5. Remove everything that is not an ASCII letter or whitespace.
    #    Deleting (rather than replacing with a space) keeps contractions written
    #    with typographic apostrophes in one piece.
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # 6. Remove single letters that are surrounded by whitespace
    text = re.sub(r'\s+[a-z]\s+', ' ', text)

    # 7. Tokenise, lemmatize and filter
    kept = []
    for token in text.split():
        # Surface-form check first: catches listed inflections before the
        # lemmatizer turns them into an unlisted base form.
        if token in stop_words:
            continue
        lemma = _LEMMATIZER.lemmatize(token)
        if len(lemma) > 2 and lemma not in stop_words:
            kept.append(lemma)

    return " ".join(kept)

# Run the cleaning pipeline over every article and preview the result.
print("Cleaning text data (Refined: Separators Removed + Lemmatization + Extended Stopwords)")
df['Clean_Content'] = df['Content'].map(clean_text)
df.head()

In [None]:
# Basic statistics
# Number of whitespace-separated tokens left in each article after cleaning.
df['Word_Count'] = df['Clean_Content'].str.split().str.len()

total_words = df['Word_Count'].sum()
avg_words_per_article = df['Word_Count'].mean()
unique_categories = df['Category'].nunique()
unique_topics = df['Topic'].nunique()

print("DATASET STATISTICS")
print(f"Total articles: {len(df)}")
print(f"Total categories: {unique_categories}")
print(f"Total topics: {unique_topics}")
print(f"Total words (after cleaning): {total_words}")
print(f"Average words per article: {avg_words_per_article:.2f}")
print("\nTop 5 categories by article count:")
# value_counts() is sorted in descending order, so head(5) yields the five
# largest categories, matching the heading printed above.
print(df['Category'].value_counts().head(5))

In [None]:
# Define the save path
save_path = 'practical_assessment_adsah_6014_2_web_content_mining/images/'

# Create the directory (and any missing parents). exist_ok=True makes this a
# no-op when it already exists, without a separate existence check.
os.makedirs(save_path, exist_ok=True)

# Visualizations
# Set general style for plots
sns.set(style="whitegrid")

# Distribution of article lengths (cleaned word count per article)
plt.figure(figsize=(10, 6))
sns.histplot(df['Word_Count'], bins=30, kde=True, color='skyblue')
# plt.title('Distribution of Article Lengths (Word Count)')
plt.xlabel('Number of words')
plt.ylabel('Frequency')
plt.savefig(os.path.join(save_path, 'distribution_article_lengths.png'), dpi=300, bbox_inches='tight')
plt.show()

# Article count per category
plt.figure(figsize=(12, 6))
# Limit to the 10 largest categories to keep the chart readable; the same index
# is passed as `order` so bars appear from most to least frequent.
top_categories = df['Category'].value_counts().nlargest(10).index
sns.countplot(data=df[df['Category'].isin(top_categories)], x='Category', order=top_categories, palette='viridis')
# plt.title('Top 10 Categories by Article Volume')
plt.xlabel('Category')
plt.ylabel('Number of Articles')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(os.path.join(save_path, 'top_10_categories.png'), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Word Clouds per Category (for the top 3 categories)
print("Generating Word Clouds for top categories...")
top_3_cats = df['Category'].value_counts().nlargest(3).index

# One row of three panels; flatten so the axes can be iterated directly.
fig, axes = plt.subplots(1, 3, figsize=(18, 6))
axes = axes.flatten()

# Pair each category with its own panel.
for ax, cat in zip(axes, top_3_cats):
    # 1. All cleaned text belonging to this category, as one string
    cat_text = " ".join(df.loc[df['Category'] == cat, 'Clean_Content'])
    # 2. Generate the word cloud (width/height/background_color can be tuned here)
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(cat_text)
    # 3. Draw it on this category's panel
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.set_title(cat.replace("_", " ").title(), fontsize=14)
    ax.axis('off')  # Hide axis ticks/lines

# If the data has fewer than three categories, blank the panels that received
# no word cloud instead of leaving empty axes in the saved figure.
for ax in axes[len(top_3_cats):]:
    ax.axis('off')

# Adjust layout to prevent overlapping
plt.tight_layout()
save_path = 'practical_assessment_adsah_6014_2_web_content_mining/images'
# Make sure the output folder exists even when this cell is run on its own.
os.makedirs(save_path, exist_ok=True)
plt.savefig(os.path.join(save_path, 'top_3_wordclouds.png'), dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Corpus-wide term frequencies: collect every token of every cleaned article
all_words_list = [token for doc in df['Clean_Content'] for token in doc.split()]
word_counts = Counter(all_words_list)

# The 20 most frequent terms, unzipped into parallel tuples for plotting
most_common_words = word_counts.most_common(20)
words, counts = zip(*most_common_words)

plt.figure(figsize=(20, 6))
sns.barplot(x=list(words), y=list(counts), palette='rocket')
# plt.title('Top 20 Most Frequent Words in Dataset')
plt.xlabel('Words')
plt.ylabel('Frequency')
# Slant the x tick labels by 45 degrees; ha='right' anchors each label's end
# under its bar
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
top_words_png = os.path.join(save_path, 'top_20_frequent_words.png')
plt.savefig(top_words_png, dpi=300, bbox_inches='tight')
plt.show()