## Project Setup and Overview

This section outlines the project's environment, required dependencies, and the established folder structure.

### 1. Environment Details

This notebook is running in a Google Colaboratory environment. Here's a quick look at the Python version being used.

In [10]:
import sys
print(f"Python Version: {sys.version}")

Python Version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]


### 2. Dependencies Installation

The required Python libraries for this project are listed in `requirements.txt`. We will install them using `pip`.

In [63]:
import os

project_root = 'Seo_Content_Detector'
requirements_path = os.path.join(project_root, 'requirements.txt')

# Ensure the project root exists
os.makedirs(project_root, exist_ok=True)

# List only the essential dependencies directly used in the analysis
essential_dependencies = [
    'pandas',
    'numpy',
    'regex',
    'scikit-learn',
    'beautifulsoup4',
    'requests',
    'textstat',
    'sentence-transformers'
]

print(f"Creating streamlined requirements.txt at: {requirements_path}...")
with open(requirements_path, 'w') as f:
    for dep in essential_dependencies:
        f.write(f'{dep}\n')

print(f"Streamlined requirements.txt created. Please inspect it for accuracy.")
print(f"We can install these dependencies using: !pip install -r {requirements_path}")

Creating streamlined requirements.txt at: Seo_Content_Detector/requirements.txt...
Streamlined requirements.txt created. Please inspect it for accuracy.
You can install these dependencies using: !pip install -r Seo_Content_Detector/requirements.txt


In [65]:
!pip install -r Seo_Content_Detector/requirements.txt



### 3. Folder Structure



In [64]:
import os

project_root = 'Seo_Content_Detector'

print(f"Displaying folder structure for: {project_root}/")

for root, dirs, files in os.walk(project_root):
    level = root.replace(project_root, '').count(os.sep)
    indent = ' ' * 4 * (level)
    print(f"{indent}{os.path.basename(root)}/")
    subindent = ' ' * 4 * (level + 1)
    for f in files:
        print(f"{subindent}{f}")

Displaying folder structure for: Seo_Content_Detector/
Seo_Content_Detector/
    requirements.txt
    notebooks/
    models/
        quality_model.pkl
        tfidf_vectorizer.pkl
        tfidf_matrix.pkl
    .ipynb_checkpoints/
    Data/
        extracted_content.csv
        data.csv
        features.csv
        duplicate.csv
        .ipynb_checkpoints/


## Importing Necessary Libraries

In [None]:
import pandas as pd
import regex as re
import textstat
import numpy as np
import os
import pickle
import ast
import json
import requests
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util


## Step 1 : Data Overview and Parshing

In [66]:
# Load the dataset
df = pd.read_csv('/content/Seo_Content_Detector/Data/data.csv')

# Inspect the first few rows
print(df.head())

# Get information about the columns and data types
print(df.info())

                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://www.qnbtrust.bank/Resources/Learning-C...   

                                        html_content  
0  <!doctype html><!--[if lt IE 7]> <html class="...  
1  <!doctype html><html lang="en"><head>\n    <me...  
2  <!DOCTYPE html><html data-unhead-vue-server-re...  
3  \n\n<!DOCTYPE html>\n<html lang="en" dir="ltr"...  
4                                                NaN  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   url           81 non-null     object
 1   html_content  69 non-null     object
dtypes: object(2)
memory usage: 1.4+ KB
None


## Data Parshing

In [67]:
def parse_html_regex(html_content):
    """
    Parses HTML using regex to extract title and clean body text.
    Handles errors and missing content gracefully.
    """
    if pd.isna(html_content) or not html_content.strip():
        return "", "", 0

    try:
        # 1. Extract Title
        # re.DOTALL makes '.' match newlines
        title_match = re.search(r'<title>(.*?)</title>', html_content, re.IGNORECASE | re.DOTALL)
        title_text = title_match.group(1).strip() if title_match else ""

        # 2. Extract Body Text
        body_text = ""

        # Try to find <article> or <main> content first
        article_match = re.search(r'<article[^>]*>(.*?)</article>', html_content, re.IGNORECASE | re.DOTALL)
        main_match = re.search(r'<main[^>]*>(.*?)</main>', html_content, re.IGNORECASE | re.DOTALL)

        content_to_parse = ""
        if article_match:
            content_to_parse = article_match.group(1)
        elif main_match:
            content_to_parse = main_match.group(1)
        else:
            # Fallback to the entire body
            body_match = re.search(r'<body[^>]*>(.*?)</body>', html_content, re.IGNORECASE | re.DOTALL)
            if body_match:
                content_to_parse = body_match.group(1)
            else:
                # Last resort: use all HTML
                content_to_parse = html_content

        # Extract text from <p> tags within the selected content
        paragraphs = re.findall(r'<p[^>]*>(.*?)</p>', content_to_parse, re.IGNORECASE | re.DOTALL)
        if paragraphs:
            body_text = ' '.join(paragraphs)
        else:
            # If no <p> tags found, use the whole content_to_parse
            body_text = content_to_parse

        # Clean the extracted text:
        # 1. Remove script and style tags (including their content)
        body_text = re.sub(r'<(script|style).*?>.*?</\1>', ' ', body_text, re.IGNORECASE | re.DOTALL)
        # 2. Remove all other HTML tags
        body_text = re.sub(r'<[^>]+>', ' ', body_text)
        # 3. Replace common HTML entities
        body_text = re.sub(r'&nbsp;', ' ', body_text)
        body_text = re.sub(r'&amp;', '&', body_text)
        body_text = re.sub(r'&lt;', '<', body_text)
        body_text = re.sub(r'&gt;', '>', body_text)
        # 4. Remove extra whitespace
        body_text = re.sub(r'\s+', ' ', body_text).strip()

        # 3. Calculate Word Count
        word_count = len(body_text.split())

        return title_text, body_text, word_count

    except Exception as e:
        # print(f"Error parsing HTML: {e}") # Uncomment for debugging
        return "", "", 0

# Check if df is not empty before proceeding
if not df.empty:
    # Apply the parsing function to the DataFrame
    print("Starting HTML parsing...")
    parsed_data = df['html_content'].apply(parse_html_regex)
    df_parsed = pd.DataFrame(parsed_data.tolist(), index=df.index, columns=['title', 'body_text', 'word_count'])

    # Combine with the original 'url' column
    df_final = pd.concat([df['url'], df_parsed], axis=1)

    # Display the head of the new DataFrame
    print("\n--- Parsed Data Head (Regex) ---")
    print(df_final.head())

    # Display info to check for nulls and types
    print("\n--- Parsed Data Info (Regex) ---")
    df_final.info()

    # Save the extracted data to a new CSV file in the 'Seo_Content_Detector/Data' folder
    output_dir = 'Seo_Content_Detector/Data'
    os.makedirs(output_dir, exist_ok=True) # Ensure the directory exists
    output_filename = os.path.join(output_dir, 'extracted_content.csv')
    df_final.to_csv(output_filename, index=False)
    print(f"\nSuccessfully parsed data and saved to {output_filename}")
    print("You can now download 'extracted_content.csv' from the Colab file browser.")
else:
    print("DataFrame is empty. Please upload 'data.csv' and re-run the cell.")

Starting HTML parsing...

--- Parsed Data Head (Regex) ---
                                                 url  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1    https://www.varonis.com/blog/cybersecurity-tips   
2  https://www.cisecurity.org/insights/blog/11-cy...   
3  https://www.cisa.gov/topics/cybersecurity-best...   
4  https://www.qnbtrust.bank/Resources/Learning-C...   

                                               title  \
0                                Cyber Security Blog   
1  Top 10 Cybersecurity Awareness Tips: How to St...   
2  11 Cyber Defense Tips to Stay Secure at Work a...   
3  Cybersecurity Best Practices | Cybersecurity a...   
4                                                      

                                           body_text  word_count  
0  Back Training NCSC Assured Cyber Incident Plan...        2538  
1  Cybersecurity is gaining more importance globa...        1589  
2  Cybersecurity is inextricably tied to the tech...         946  

In [68]:
extracted_df = pd.read_csv('/content/Seo_Content_Detector/Data/extracted_content.csv')
extracted_df.head()

Unnamed: 0,url,title,body_text,word_count
0,https://www.cm-alliance.com/cybersecurity-blog,Cyber Security Blog,Back Training NCSC Assured Cyber Incident Plan...,2538
1,https://www.varonis.com/blog/cybersecurity-tips,Top 10 Cybersecurity Awareness Tips: How to St...,Cybersecurity is gaining more importance globa...,1589
2,https://www.cisecurity.org/insights/blog/11-cy...,11 Cyber Defense Tips to Stay Secure at Work a...,Cybersecurity is inextricably tied to the tech...,946
3,https://www.cisa.gov/topics/cybersecurity-best...,Cybersecurity Best Practices | Cybersecurity a...,Cybersecurity Best Practices CISA provides inf...,558
4,https://www.qnbtrust.bank/Resources/Learning-C...,,,0


## Step 2 : Feature Extraction and Text Preprocessing

In [69]:
# --- Configuration ---
INPUT_FILE = '/content/Seo_Content_Detector/Data/extracted_content.csv'
OUTPUT_FILE = '/content/Seo_Content_Detector/Data/features.csv'
TFIDF_MAX_FEATURES = 100 # To keep the embedding vector manageable

# Define paths for saving TF-IDF assets
PROJECT_ROOT = 'Seo_Content_Detector'
MODELS_DIR = os.path.join(PROJECT_ROOT, 'models')
VECTORIZER_FILE = os.path.join(MODELS_DIR, 'tfidf_vectorizer.pkl')
TFIDF_MATRIX_FILE = os.path.join(MODELS_DIR, 'tfidf_matrix.pkl')

# Ensure models directory exists
os.makedirs(MODELS_DIR, exist_ok=True)

# Check if the input file from Step 1 exists
if not os.path.exists(INPUT_FILE):
    print(f"Error: Input file '{INPUT_FILE}' not found.")
    print("Please make sure you have run the previous step and generated this file.")
else:
    print(f"Loaded '{INPUT_FILE}'. Starting feature engineering...")

    # 1. Load Data
    df = pd.read_csv(INPUT_FILE)

    # 2. Clean Text (Pre-processing)
    # Fill NaN values in 'body_text' with an empty string to prevent errors
    df['body_text'] = df['body_text'].fillna('').astype(str)
    # Apply lowercase and extra whitespace removal
    df['body_text'] = df['body_text'].apply(lambda x: re.sub(r'\s+', ' ', x.lower().strip()))

    # Create a 'safe' version for textstat to avoid errors on empty strings
    # textstat can fail on very short or empty text
    df['body_text_safe'] = df['body_text'].apply(lambda x: x if len(x) > 10 else "No content to analyze")

    # 3. Extract Core Features

    # Basic metrics: word count (re-calculated from clean text)
    df['word_count'] = df['body_text'].apply(lambda x: len(x.split()))

    # Handle potential errors during textstat calculations
    def safe_sentence_count(text):
        try:
            return textstat.sentence_count(text)
        except Exception:
            return 0

    def safe_flesch_reading_ease(text):
        try:
            return textstat.flesch_reading_ease(text)
        except Exception:
            return 0 # 0 is a bad score, appropriate for un-parsable text

    # Calculate sentence count
    df['sentence_count'] = df['body_text_safe'].apply(safe_sentence_count)

    # Readability: Flesch Reading Ease score
    df['flesch_reading_ease'] = df['body_text_safe'].apply(safe_flesch_reading_ease)

    # Keywords & Embeddings: Using TF-IDF
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=TFIDF_MAX_FEATURES)

    try:
        tfidf_matrix = tfidf_vectorizer.fit_transform(df['body_text'])

        # Save TF-IDF vectorizer and matrix
        with open(VECTORIZER_FILE, 'wb') as f:
            pickle.dump(tfidf_vectorizer, f)
        with open(TFIDF_MATRIX_FILE, 'wb') as f:
            pickle.dump(tfidf_matrix, f)
        print(f"Saved TF-IDF vectorizer to '{VECTORIZER_FILE}'")
        print(f"Saved TF-IDF matrix to '{TFIDF_MATRIX_FILE}'")

        feature_names = tfidf_vectorizer.get_feature_names_out()

        def get_top_keywords(doc_vector, top_n=5):
            # doc_vector is a sparse row matrix
            dense_vec = doc_vector.toarray().flatten()
            # Get indices of top N scores, in descending order
            top_indices = dense_vec.argsort()[-top_n:][::-1]
            # Filter out indices with a score of 0
            top_words = [feature_names[i] for i in top_indices if dense_vec[i] > 0]
            return "|".join(top_words)

        # Apply to get keywords
        df['top_keywords'] = [get_top_keywords(tfidf_matrix[i]) for i in range(tfidf_matrix.shape[0])]

        # Apply to get embeddings (as string-formatted list)
        dense_embeddings = tfidf_matrix.toarray()
        # Convert numpy floats to standard Python floats before string conversion
        df['embedding'] = [str([float(x) for x in np.round(vec, 5)]) for vec in dense_embeddings]

    except ValueError as e:
        # This happens if the vocabulary is empty (e.g., all docs are stop words)
        print(f"TF-IDF Error: {e}. Filling keyword/embedding with empty values.")
        df['top_keywords'] = ""
        df['embedding'] = str([0.0] * TFIDF_MAX_FEATURES)

    # 4. Store features in a DataFrame
    final_columns = ['url', 'word_count', 'sentence_count', 'flesch_reading_ease', 'top_keywords', 'embedding']
    df_features = df[final_columns]

    # 5. Save as CSV
    df_features.to_csv(OUTPUT_FILE, index=False)
    print(f"\nSuccessfully created '{OUTPUT_FILE}' with engineered features.")

    # --- This is the result you can use to cross-check ---
    print(f"\n--- Features Extracted: Head() of {OUTPUT_FILE} ---")
    print(df_features.head())

    print(f"\n--- Features Extracted: Info() of {OUTPUT_FILE} ---")
    df_features.info()

Loaded '/content/Seo_Content_Detector/Data/extracted_content.csv'. Starting feature engineering...
Saved TF-IDF vectorizer to 'Seo_Content_Detector/models/tfidf_vectorizer.pkl'
Saved TF-IDF matrix to 'Seo_Content_Detector/models/tfidf_matrix.pkl'

Successfully created '/content/Seo_Content_Detector/Data/features.csv' with engineered features.

--- Features Extracted: Head() of /content/Seo_Content_Detector/Data/features.csv ---
                                                 url  word_count  \
0     https://www.cm-alliance.com/cybersecurity-blog        2538   
1    https://www.varonis.com/blog/cybersecurity-tips        1589   
2  https://www.cisecurity.org/insights/blog/11-cy...         946   
3  https://www.cisa.gov/topics/cybersecurity-best...         558   
4  https://www.qnbtrust.bank/Resources/Learning-C...           0   

   sentence_count  flesch_reading_ease                           top_keywords  \
0             121            31.111092    2025|security|digital|data|business 

In [70]:
features_df = pd.read_csv('/content/Seo_Content_Detector/Data/features.csv')
features_df.head()

Unnamed: 0,url,word_count,sentence_count,flesch_reading_ease,top_keywords,embedding
0,https://www.cm-alliance.com/cybersecurity-blog,2538,121,31.111092,2025|security|digital|data|business,"[0.05075, 0.89235, 0.01558, 0.0, 0.0, 0.02537,..."
1,https://www.varonis.com/blog/cybersecurity-tips,1589,81,39.334899,access|data|security|information|ai,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.48303, 0.0, 0.0, 0..."
2,https://www.cisecurity.org/insights/blog/11-cy...,946,60,53.379191,use|data|email|security|work,"[0.0, 0.06436, 0.0, 0.0, 0.0, 0.24889, 0.0, 0...."
3,https://www.cisa.gov/topics/cybersecurity-best...,558,24,2.816895,best|security|help|online|information,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,https://www.qnbtrust.bank/Resources/Learning-C...,0,1,54.725,,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


## Step 3 : Duplicate Detection

In [34]:
# --- Configuration ---
INPUT_FILE = '/content/Seo_Content_Detector/Data/features.csv'

DUPLICATES_FILE = '/content/Seo_Content_Detector/Data/duplicate.csv'
SIMILARITY_THRESHOLD = 0.80
THIN_THRESHOLD = 500

# Check if the input file exists
if not os.path.exists(INPUT_FILE):
    print(f"Error: Input file '{INPUT_FILE}' not found.")
    print("Please make sure you have 'features.csv' in your Colab environment.")
else:
    print(f"Loaded '{INPUT_FILE}'. Starting duplicate and thin content detection...")

    # 1. Load Data
    df = pd.read_csv(INPUT_FILE)

    # 2. Process Embeddings
    # The 'embedding' column is a string representation of a list (e.g., '[0.1, 0.2, ...]')
    # We need to convert it back to a list of numbers.
    def safe_eval(s):
        try:
            # ast.literal_eval safely evaluates a string as a Python literal
            return ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError):
            # Handle empty or malformed strings
            return []

    print("Converting embedding strings to numerical vectors...")
    df['embedding_list'] = df['embedding'].apply(safe_eval)

    # Handle potential empty lists and ensure all vectors have the same length
    # Get the expected length (from the first valid embedding)
    try:
        expected_len = max(len(v) for v in df['embedding_list'])
        if expected_len == 0:
            raise ValueError("All embeddings are empty.")
    except ValueError as e:
        print(f"Warning: Could not determine embedding length ({e}). Using 100 as default.")
        expected_len = 100 # Default to TFIDF_MAX_FEATURES from last step

    # Pad or fix any vectors that are not the right length
    def fix_vector(v):
        if len(v) == expected_len:
            return v
        else:
            # Return a zero vector of the correct length
            return [0.0] * expected_len

    df['embedding_list_fixed'] = df['embedding_list'].apply(fix_vector)

    # Create the final matrix for similarity calculation
    embedding_matrix = np.array(df['embedding_list_fixed'].tolist())

    # 3. Compute Cosine Similarity
    print("Computing cosine similarity matrix...")
    sim_matrix = cosine_similarity(embedding_matrix)

    # 4. Identify and List Duplicate Pairs
    duplicate_pairs = []
    n = len(df)

    # Get indices for the upper triangle of the matrix (to avoid self-pairs and reverse-pairs)
    upper_tri_indices = np.triu_indices(n, k=1)

    for i, j in zip(*upper_tri_indices):
        similarity = sim_matrix[i, j]
        if similarity > SIMILARITY_THRESHOLD:
            pair = {
                'url1': df.iloc[i]['url'],
                'url2': df.iloc[j]['url'],
                'similarity': similarity
            }
            duplicate_pairs.append(pair)

    # 5. Save Duplicate Pairs to CSV
    df_duplicates = pd.DataFrame(duplicate_pairs)
    df_duplicates.to_csv(DUPLICATES_FILE, index=False)
    print(f"\nSaved duplicate pairs to '{DUPLICATES_FILE}'")

    # 6. Thin Content Detection
    # Add the 'is_thin' column to the original DataFrame
    df['is_thin'] = (df['word_count'] < THIN_THRESHOLD).astype(int)

    # 7. Save updated features file (for next step)
    # We drop the temporary list columns before saving
    df_to_save = df.drop(columns=['embedding_list', 'embedding_list_fixed'])
    df_to_save.to_csv(INPUT_FILE, index=False)
    print(f"Updated '{INPUT_FILE}' with 'is_thin' column.")

    # 8. Report Basic Statistics
    total_pages = len(df)
    duplicate_pairs_count = len(df_duplicates)
    thin_content_count = df['is_thin'].sum()
    thin_content_percent = (thin_content_count / total_pages) * 100

    print("\n--- Summary ---")
    print(f"Total pages analyzed: {total_pages}")
    print(f"Duplicate pairs found (>{SIMILARITY_THRESHOLD*100}% similarity): {duplicate_pairs_count}")
    print(f"Thin content pages (<{THIN_THRESHOLD} words): {thin_content_count} ({thin_content_percent:.1f}%)")

    print("\n--- Duplicate Pairs Preview (from duplicate.csv) ---")
    if duplicate_pairs_count > 0:
        print(df_duplicates.head())
    else:
        print("No duplicate pairs found.")

Loaded '/content/Seo_Content_Detector/Data/features.csv'. Starting duplicate and thin content detection...
Converting embedding strings to numerical vectors...
Computing cosine similarity matrix...

Saved duplicate pairs to '/content/Seo_Content_Detector/Data/duplicate.csv'
Updated '/content/Seo_Content_Detector/Data/features.csv' with 'is_thin' column.

--- Summary ---
Total pages analyzed: 81
Duplicate pairs found (>80.0% similarity): 38
Thin content pages (<500 words): 32 (39.5%)

--- Duplicate Pairs Preview (from duplicate.csv) ---
                                                url1  \
0     https://www.cm-alliance.com/cybersecurity-blog   
1     https://www.cm-alliance.com/cybersecurity-blog   
2  https://nordlayer.com/learn/network-security/b...   
3  https://nordlayer.com/learn/network-security/b...   
4  https://nordlayer.com/learn/network-security/b...   

                                                url2  similarity  
0  https://blog.hubspot.com/marketing/what-is-dig...  

## Step 4 : Content Quality Scoring

In [39]:
# --- Configuration ---
INPUT_FILE = '/content/Seo_Content_Detector/Data/features.csv'
MODEL_FILE = '/content/Seo_Content_Detector/models/quality_model.pkl'

# Check if the input file exists
if not os.path.exists(INPUT_FILE):
    print(f"Error: Input file '{INPUT_FILE}' not found.")
    print("Please make sure 'features.csv' (from the previous step) is in your Colab environment.")
else:
    print(f"Loaded '{INPUT_FILE}'. Starting Step 4: Content Quality Scoring...")

    # 1. Load Data
    df = pd.read_csv(INPUT_FILE)

    # 2. Create Synthetic Labels (as per the rules)
    def apply_quality_label(row):
        word_count = row['word_count']
        readability = row['flesch_reading_ease']

        # Rule 1: High Quality
        if word_count > 1500 and (readability >= 50 and readability <= 70):
            return "High"
        # Rule 2: Low Quality
        # (Uses 'is_thin' for word_count < 500)
        elif row['is_thin'] == 1 or readability < 30:
            return "Low"
        # Rule 3: Medium Quality (all other cases)
        else:
            return "Medium"

    df['quality_label'] = df.apply(apply_quality_label, axis=1)

    print("\nLabel Distribution:")
    # Show the distribution of the newly created labels
    print(df['quality_label'].value_counts(normalize=True))

    # 3. Define Features (X) and Target (y)
    # Using the features specified in the prompt + the one we engineered
    feature_cols = ['word_count', 'sentence_count', 'flesch_reading_ease', 'is_thin']
    X = df[feature_cols]
    y = df['quality_label']

    # 4. Train/Test Split (70/30)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=0.3,
        random_state=42,
        stratify=y  # Ensure labels are distributed proportionally
    )

    # 5. Train Classification Model (Random Forest)
    print("\nTraining Random Forest model...")
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # 6. Evaluate ML Model
    y_pred = model.predict(X_test)

    # Get class labels present in the test set for the report
    labels = sorted(y_test.unique())

    print("\n--- Model Performance (Random Forest) ---")
    print(classification_report(y_test, y_pred, labels=labels, zero_division=0))

    ml_accuracy = accuracy_score(y_test, y_pred)
    print(f"Overall Accuracy (Random Forest): {ml_accuracy:.4f}")

    print("\nConfusion Matrix (Random Forest):")
    # Print labels to help read the matrix
    print(f"Labels: {labels}")
    print(confusion_matrix(y_test, y_pred, labels=labels))

    # 7. Compare to Baseline (Rule-based classifier using word count only)
    def baseline_model(word_count):
        if word_count < 500:
            return "Low"
        elif word_count > 1500:
            return "High"
        else:
            return "Medium"

    # Apply baseline model to the 'word_count' column of the test set
    y_pred_baseline = X_test['word_count'].apply(baseline_model)
    baseline_accuracy = accuracy_score(y_test, y_pred_baseline)

    print("\n--- Baseline vs. Model ---")
    print(f"Baseline Accuracy (Word Count Rule): {baseline_accuracy:.4f}")
    print(f"Model Accuracy (Random Forest):    {ml_accuracy:.4f}")


    # 8. Report Top Features
    importances = model.feature_importances_
    feature_importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': importances
    }).sort_values(by='importance', ascending=False)

    # --- 9. SAVE THE MODEL ---
    print("\nSaving model to disk...")
    with open(MODEL_FILE, 'wb') as f:
        pickle.dump(model, f)

    print(f"Successfully saved model to '{MODEL_FILE}'")

    print("\n--- Top Features (Random Forest) ---")
    print(feature_importance_df.head(3).to_string(index=False))

    print("\n\nStep 4: Content Quality Scoring completed.")

Loaded '/content/Seo_Content_Detector/Data/features.csv'. Starting Step 4: Content Quality Scoring...

Label Distribution:
quality_label
Low       0.629630
Medium    0.296296
High      0.074074
Name: proportion, dtype: float64

Training Random Forest model...

--- Model Performance (Random Forest) ---
              precision    recall  f1-score   support

        High       0.67      1.00      0.80         2
         Low       1.00      1.00      1.00        16
      Medium       1.00      0.86      0.92         7

    accuracy                           0.96        25
   macro avg       0.89      0.95      0.91        25
weighted avg       0.97      0.96      0.96        25

Overall Accuracy (Random Forest): 0.9600

Confusion Matrix (Random Forest):
Labels: ['High', 'Low', 'Medium']
[[ 2  0  0]
 [ 0 16  0]
 [ 1  0  6]]

--- Baseline vs. Model ---
Baseline Accuracy (Word Count Rule): 0.6400
Model Accuracy (Random Forest):    0.9600

Saving model to disk...
Successfully saved model to '/

##  Step 5 : Real-Time Analysis Demo

In [71]:
# Suppress warnings
warnings.filterwarnings("ignore")

# --- 2. LOAD ASSETS & PRE-COMPUTE EMBEDDINGS ---

# Define file paths
MODEL_FILE = '/content/Seo_Content_Detector/models/quality_model.pkl'
TEXT_FILE = '/content/Seo_Content_Detector/Data/extracted_content.csv'
FEATURES_FILE = '/content/Seo_Content_Detector/Data/features.csv'
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
EMBEDDING_FILE = 'sentence_embeddings.npy' # This file was created by the last run

# Load the sentence transformer model
try:
    sbert_model = SentenceTransformer(EMBEDDING_MODEL_NAME)
    print(f"Loaded sentence embedding model: '{EMBEDDING_MODEL_NAME}'")
except Exception as e:
    print(f"Error loading SBERT model: {e}")

# Check for all required files
if not all([os.path.exists(f) for f in [MODEL_FILE, TEXT_FILE, FEATURES_FILE, EMBEDDING_FILE]]):
    print("Error: Missing required files from previous steps.")
    print(f"Make sure you have: {MODEL_FILE}, {TEXT_FILE}, {FEATURES_FILE}, and {EMBEDDING_FILE}")
else:
    print("Loading all required assets...")

    # Load the trained quality model
    with open(MODEL_FILE, 'rb') as f:
        quality_model = pickle.load(f)
    print(f"Loaded '{MODEL_FILE}'")

    # Load the list of known URLs
    df_features = pd.read_csv(FEATURES_FILE)
    known_urls = df_features['url'].tolist()

    # Load the clean text from Step 1
    df_text = pd.read_csv(TEXT_FILE)
    corpus_texts = df_text['body_text'].fillna('').astype(str).tolist()

    # --- Load Pre-Computed Corpus Embeddings ---
    print(f"Loading existing corpus embeddings from '{EMBEDDING_FILE}'...")
    corpus_embeddings = np.load(EMBEDDING_FILE)
    # Convert back to tensor for the 'util.cos_sim' function
    import torch
    corpus_embeddings = torch.tensor(corpus_embeddings)

    print(f"Corpus embeddings are ready (Shape: {corpus_embeddings.shape})")
    print("All assets loaded. Ready to analyze.")


# --- 3. DEFINE THE ANALYSIS FUNCTION ---

def analyze_url(url):
    """
    Scrapes a live URL, extracts features, predicts quality,
    and finds SEMANTICALLY similar matches from our existing dataset.
    """

    # --- Part A: Scrape the URL ---
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)

        if response.status_code != 200:
            return {"error": f"Failed to fetch URL. Status code: {response.status_code}"}

        html_content = response.text
    except Exception as e:
        return {"error": f"Scraping failed: {str(e)}"}

    # --- Part B: Parse and Engineer Features ---
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        title = soup.find('title').get_text(strip=True) if soup.find('title') else ""
        main_content = soup.find('article') or soup.find('main') or soup.body
        body_text = main_content.get_text(separator=' ', strip=True) if main_content else ""
        body_text = re.sub(r'\s+', ' ', body_text).lower()

        # Features for the quality model
        word_count = len(body_text.split())
        safe_text = body_text if word_count > 10 else "No content to analyze"
        sentence_count = textstat.sentence_count(safe_text)
        readability = textstat.flesch_reading_ease(safe_text)
        is_thin = bool(word_count < 500)

    except Exception as e:
        return {"error": f"HTML parsing failed: {str(e)}"}

    # --- Part C: Predict Quality ---
    feature_cols = ['word_count', 'sentence_count', 'flesch_reading_ease', 'is_thin']
    features_df = pd.DataFrame(
        [[word_count, sentence_count, readability, int(is_thin)]],
        columns=feature_cols
    )
    quality_label = quality_model.predict(features_df)[0]

    # --- Part D: Detect Duplicates ---
    new_embedding = sbert_model.encode([body_text], convert_to_tensor=True)
    similarities = util.cos_sim(new_embedding, corpus_embeddings)
    SEMANTIC_SIMILARITY_THRESHOLD = 0.60

    similar_to = []
    for i, sim_score in enumerate(similarities.flatten()):
        sim_score = float(sim_score)
        if sim_score > SEMANTIC_SIMILARITY_THRESHOLD and sim_score < 0.99:
            similar_to.append({
                "url": known_urls[i],
                "similarity": round(sim_score, 2)
            })

    similar_to.sort(key=lambda x: x['similarity'], reverse=True)

    # --- Part E: Format the Final Output ---
    result = {
        "url": url,
        "title": title,
        "word_count": word_count,
        "readability": round(readability, 2),
        "quality_label": quality_label,
        "is_thin": is_thin,
        "similar_to": similar_to[:5] # Show top 5 matches
    }

    return result

# --- 4. EXAMPLE USAGE ---
if 'quality_model' in locals():
    # Test a new, random URL that is *related* to our dataset

    test_url = "https://www.kaspersky.com/resource-center/definitions/what-is-cyber-security"

    print(f"\n--- Analyzing Topically-Related URL: {test_url} ---")

    result = analyze_url(test_url)

    # Print the final JSON output
    print(json.dumps(result, indent=2))

else:
    print("\nCould not run example analysis because assets are not loaded.")

Loaded sentence embedding model: 'all-MiniLM-L6-v2'
Loading all required assets...
Loaded '/content/Seo_Content_Detector/models/quality_model.pkl'
Loading existing corpus embeddings from 'sentence_embeddings.npy'...
Corpus embeddings are ready (Shape: torch.Size([81, 384]))
All assets loaded. Ready to analyze.

--- Analyzing Topically-Related URL: https://www.kaspersky.com/resource-center/definitions/what-is-cyber-security ---
{
  "url": "https://www.kaspersky.com/resource-center/definitions/what-is-cyber-security",
  "title": "What is Cybersecurity?",
  "word_count": 1780,
  "readability": 30.16,
  "quality_label": "Low",
  "is_thin": false,
  "similar_to": [
    {
      "url": "https://www.cisa.gov/topics/cybersecurity-best-practices",
      "similarity": 0.66
    },
    {
      "url": "https://nordlayer.com/learn/network-security/basics/",
      "similarity": 0.63
    },
    {
      "url": "https://www.cisco.com/site/us/en/learn/topics/security/what-is-network-security.html",
      

In [59]:
# --- 4.2 EXAMPLE USAGE ---
if 'quality_model' in locals():
    # Test a new, random URL that is *related* to our dataset

    test_url = "https://moz.com/beginners-guide-to-seo"

    print(f"\n--- Analyzing Topically-Related URL: {test_url} ---")

    result = analyze_url(test_url)

    # Print the final JSON output
    print(json.dumps(result, indent=2))

else:
    print("\nCould not run example analysis because assets are not loaded.")


--- Analyzing Topically-Related URL: https://moz.com/beginners-guide-to-seo ---
{
  "url": "https://moz.com/beginners-guide-to-seo",
  "title": "Beginner's Guide to SEO (Search Engine Optimization) - Moz",
  "word_count": 1057,
  "readability": 48.5,
  "quality_label": "Medium",
  "is_thin": false,
  "similar_to": [
    {
      "url": "https://simple.wikipedia.org/wiki/Search_engine_optimization",
      "similarity": 0.74
    },
    {
      "url": "https://www.shopify.com/blog/ecommerce-seo-beginners-guide",
      "similarity": 0.73
    },
    {
      "url": "https://en.wikipedia.org/wiki/Search_engine_optimization",
      "similarity": 0.72
    },
    {
      "url": "https://developers.google.com/search/docs/fundamentals/seo-starter-guide",
      "similarity": 0.7
    }
  ]
}


In [60]:
# --- 4.3 EXAMPLE USAGE ---
if 'quality_model' in locals():
    # Test a new, random URL that is *related* to our dataset

    test_url = "https://www.cisco.com/c/en/us/products/security/what-is-cybersecurity.html"

    print(f"\n--- Analyzing Topically-Related URL: {test_url} ---")

    result = analyze_url(test_url)

    # Print the final JSON output
    print(json.dumps(result, indent=2))

else:
    print("\nCould not run example analysis because assets are not loaded.")


--- Analyzing Topically-Related URL: https://www.cisco.com/c/en/us/products/security/what-is-cybersecurity.html ---
{
  "url": "https://www.cisco.com/c/en/us/products/security/what-is-cybersecurity.html",
  "title": "What is cybersecurity? - Cisco",
  "word_count": 1388,
  "readability": 17.48,
  "quality_label": "Low",
  "is_thin": false,
  "similar_to": [
    {
      "url": "https://www.cisa.gov/topics/cybersecurity-best-practices",
      "similarity": 0.71
    },
    {
      "url": "https://www.cisco.com/site/us/en/learn/topics/security/what-is-network-security.html",
      "similarity": 0.71
    },
    {
      "url": "https://www.fortinet.com/resources/cyberglossary/what-is-network-security",
      "similarity": 0.63
    },
    {
      "url": "https://www.cisecurity.org/insights/blog/11-cyber-defense-tips-to-stay-secure-at-work-and-home",
      "similarity": 0.6
    }
  ]
}
