# SEO Content Detector: End-to-End Pipeline
This notebook builds a full pipeline to parse HTML, engineer NLP features, detect duplicates, and train a content quality model. It also exposes a real-time `analyze_url(url)` function.

## 1. Setup & Imports

In [3]:
import os, re, json
import pandas as pd, numpy as np
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
import joblib

try:
    import textstat
except Exception as e:
    textstat = None
    
cwd = os.path.abspath(os.getcwd())
if os.path.basename(cwd) == "notebooks":
    PROJECT_ROOT = os.path.dirname(cwd)
else:
    PROJECT_ROOT = cwd
BASE = os.path.abspath('..') if os.path.basename(os.getcwd())=='notebooks' else os.path.abspath('.')
DATA_DIR = os.path.join(BASE, 'data')
MODELS_DIR = os.path.join(BASE, 'models')
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

## 2. Load Dataset
Expected columns:
- `url`
- optional `html_content` (raw HTML string)

In [4]:
data_path = os.path.join(DATA_DIR, 'data.csv')
df = pd.read_csv(data_path)
if 'html_content' not in df.columns:
    df['html_content'] = ''
df.head(2)

Unnamed: 0,url,html_content
0,https://www.cm-alliance.com/cybersecurity-blog,"<!doctype html><!--[if lt IE 7]> <html class=""..."
1,https://www.varonis.com/blog/cybersecurity-tips,"<!doctype html><html lang=""en""><head>\n <me..."


In [5]:
print('Rows:', len(df))

Rows: 81


## 3. HTML Parsing

In [6]:
def parse_html_get_title_body(html: str):
    if not isinstance(html, str) or not html.strip():
        return "", ""
    try:
        soup = BeautifulSoup(html, "html.parser")
        title = soup.title.get_text(strip=True) if soup.title else ""
        main = soup.find('main')
        article = soup.find('article')
        container = main if (main and main.get_text(strip=True)) else (article if (article and article.get_text(strip=True)) else None)

        if container is None:
            paragraphs = [p.get_text(' ', strip=True) for p in soup.find_all('p')]
            body = " ".join(paragraphs) if len(" ".join(paragraphs).strip()) >= 50 else soup.get_text(' ', strip=True)
        else:
            body = container.get_text(' ', strip=True)

        body = re.sub(r"\s+", " ", body).strip()
        return title, body
    except Exception:
        return "", ""

extracted_rows = []
for _, row in df.iterrows():
    url = row.get('url', '')
    html = row.get('html_content', '')
    title, body = parse_html_get_title_body(html)
    word_count = len(body.split()) if isinstance(body, str) else 0
    extracted_rows.append({'url': url, 'title': title, 'body_text': body, 'word_count': word_count})
extracted_df = pd.DataFrame(extracted_rows)
extracted_df.to_csv(os.path.join(DATA_DIR, 'extracted_content.csv'), index=False)
extracted_df.head(3)

Unnamed: 0,url,title,body_text,word_count
0,https://www.cm-alliance.com/cybersecurity-blog,Cyber Security Blog,Cyber Crisis Tabletop Exercise Cyber Security ...,326
1,https://www.varonis.com/blog/cybersecurity-tips,Top 10 Cybersecurity Awareness Tips: How to St...,Blog Privacy & Compliance Top 10 Cybersecurity...,1747
2,https://www.cisecurity.org/insights/blog/11-cy...,11 Cyber Defense Tips to Stay Secure at Work a...,Home Insights Blog Posts 11 Cyber Defense Tips...,1058


## 4. Feature Engineering

In [23]:
def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = re.sub(r"\s+", " ", s).strip()
    return s

def sentence_count(text: str) -> int:
    if not isinstance(text, str) or not text.strip():
        return 0
    parts = re.split(r"[.!?]+(?:\s|$)", text)
    parts = [p for p in parts if p.strip()]
    return len(parts)

extracted_df['clean_text'] = extracted_df['body_text'].apply(clean_text)
extracted_df['sentence_count'] = extracted_df['body_text'].apply(sentence_count)

def safe_flesch(text: str) -> float:
    if textstat:
        try:
            return float(textstat.flesch_reading_ease(text)) if isinstance(text, str) and text.strip() else 0.0
        except Exception:
            pass
    # Fallback proxy
    wc = len(text.split()) if isinstance(text, str) else 0
    sc = sentence_count(text)
    if wc == 0 or sc == 0:
        return 0.0
    avg_sentence_len = wc / max(sc, 1)
    score = max(0.0, 100.0 - min(90.0, avg_sentence_len * 3.0))
    return score

extracted_df['flesch_reading_ease'] = extracted_df['body_text'].apply(safe_flesch)

tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words='english')
X_tfidf = tfidf.fit_transform(extracted_df['clean_text'].fillna(''))
feature_names = np.array(tfidf.get_feature_names_out())

top_keywords = []
for i in range(X_tfidf.shape[0]):
    row = X_tfidf.getrow(i)
    if row.nnz == 0:
        top_keywords.append("")
        continue
    idx_sorted = np.argsort(row.data)[::-1][:5]
    terms = feature_names[row.indices[idx_sorted]]
    top_keywords.append("|".join(terms))

extracted_df['top_keywords'] = top_keywords

svd = TruncatedSVD(n_components=min(50, max(1, X_tfidf.shape[1]//2 or 1)), random_state=42)
X_svd = svd.fit_transform(X_tfidf)
extracted_df['embedding'] = [json.dumps(v.tolist()) for v in X_svd]

extracted_df['is_thin'] = extracted_df['word_count'] < 500

features_cols = ['url','word_count','sentence_count','flesch_reading_ease','top_keywords','embedding','is_thin','title']
features_df = extracted_df[features_cols].copy()
features_df.to_csv(os.path.join(DATA_DIR, 'features.csv'), index=False)
features_df.head(3)

Unnamed: 0,url,word_count,sentence_count,flesch_reading_ease,top_keywords,embedding,is_thin,title
0,https://www.cm-alliance.com/cybersecurity-blog,326,7,10.0,cyber|cybersecurity|training|events|clients,"[0.1457109878739277, 0.17160831109043065, -0.1...",True,Cyber Security Blog
1,https://www.varonis.com/blog/cybersecurity-tips,1747,94,44.244681,varonis|data|access|security|cybersecurity,"[0.28637120593334014, 0.40330652914180265, -0....",False,Top 10 Cybersecurity Awareness Tips: How to St...
2,https://www.cisecurity.org/insights/blog/11-cy...,1058,72,55.916667,password|cyber defense|don|authentication|cyber,"[0.23657454565026703, 0.3298751839786317, -0.1...",False,11 Cyber Defense Tips to Stay Secure at Work a...


## 5. Duplicate Detection

In [24]:
sim_matrix = cosine_similarity(X_svd)
threshold = 0.80
pairs = []
n = sim_matrix.shape[0]
for i in range(n):
    for j in range(i+1, n):
        sim = float(sim_matrix[i, j])
        if sim >= threshold:
            pairs.append({'url1': extracted_df.loc[i, 'url'],
                          'url2': extracted_df.loc[j, 'url'],
                          'similarity': round(sim, 4)})
dupes_df = pd.DataFrame(pairs)
dupes_df.to_csv(os.path.join(DATA_DIR, 'duplicates.csv'), index=False)
print('Total pages analyzed:', n)
print('Duplicate pairs:', len(dupes_df))
print('Thin content pages:', int(extracted_df['is_thin'].sum()))
dupes_df.head()

Total pages analyzed: 81
Duplicate pairs: 28
Thin content pages: 29


Unnamed: 0,url1,url2,similarity
0,https://nordlayer.com/learn/network-security/b...,https://www.fortinet.com/resources/cyberglossa...,0.9558
1,https://nordlayer.com/learn/network-security/b...,https://www.cisco.com/site/us/en/learn/topics/...,0.8279
2,https://www.fortinet.com/resources/cyberglossa...,https://www.trendmicro.com/en_us/what-is/netwo...,0.8757
3,https://guardiandigital.com/resources/blog/gui...,https://inspiredelearning.com/blog/phishing-pr...,0.9737
4,https://en.wikipedia.org/wiki/SD-WAN,https://www.cisco.com/site/us/en/learn/topics/...,0.9589


## 6. Quality Labels & Model Training

In [36]:
def label_quality(row):
    wc = row['word_count']
    re_ = row['flesch_reading_ease']
    if (wc > 1500) and (50 <= re_ <= 70):
        return 'High'
    if (wc < 500) or (re_ < 30):
        return 'Low'
    return 'Medium'

features_df['quality_label'] = features_df.apply(label_quality, axis=1)

num_cols = ['word_count','sentence_count','flesch_reading_ease']
X = extracted_df[num_cols].fillna(0.0)
label_map = {'Low':0, 'Medium':1, 'High':2}
y = features_df['quality_label'].map(label_map)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)

rf = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42))
])
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
report = classification_report(y_test, y_pred, target_names=['Low','Medium','High'])
cm = confusion_matrix(y_test, y_pred)

def baseline_predict(wc_series):
    preds = []
    for wc in wc_series:
        if wc < 500:
            preds.append(0)  # Low
        elif wc > 1500:
            preds.append(2)  # High
        else:
            preds.append(1)  # Medium
    return np.array(preds)

y_base_pred = baseline_predict(X_test['word_count'])
base_acc = accuracy_score(y_test, y_base_pred)
base_f1 = f1_score(y_test, y_base_pred, average='weighted')

print("Model Performance:")
print(report)
print("Overall Accuracy:", round(acc, 3))
print("Baseline Accuracy:", round(base_acc, 3))

joblib.dump(rf, os.path.join(MODELS_DIR, 'quality_model.pkl')) #to save model for later use

Model Performance:
              precision    recall  f1-score   support

         Low       0.86      1.00      0.92        12
      Medium       1.00      0.67      0.80         9
        High       0.80      1.00      0.89         4

    accuracy                           0.88        25
   macro avg       0.89      0.89      0.87        25
weighted avg       0.90      0.88      0.87        25

Overall Accuracy: 0.88
Baseline Accuracy: 0.8


['C:\\Users\\Gauth\\models\\quality_model.pkl']

## 7. Real-Time Analysis Function

In [37]:
import requests

def analyze_url(url: str, top_k_similar: int = 5, similarity_threshold: float = 0.5):
    # Scrape
    html = ''
    try:
        resp = requests.get(url, timeout=10, headers={'User-Agent':'Mozilla/5.0'})
        if resp.status_code == 200:
            html = resp.text
    except Exception:
        pass

    # Parse
    title, body = parse_html_get_title_body(html)
    wc = len(body.split())
    sc = sentence_count(body)
    re_score = safe_flesch(body)

    # Clean & vectorize using TF-IDF/SVD 
    text_clean = clean_text(body)
    vec = tfidf.transform([text_clean])
    vec_svd = svd.transform(vec)

    # Similarities to corpus
    sims = cosine_similarity(vec_svd, X_svd)[0]
    idx_sorted = np.argsort(sims)[::-1]
    similar_list = []
    for idx in idx_sorted[:top_k_similar]:
        sim = float(sims[idx])
        if sim >= similarity_threshold:
            similar_list.append({
                'url': extracted_df.loc[idx, 'url'],
                'similarity': round(sim, 4)
            })

    # Thin & label
    is_thin = wc < 500
    if (wc > 1500) and (50 <= re_score <= 70):
        label = 'High'
    elif (wc < 500) or (re_score < 30):
        label = 'Low'
    else:
        label = 'Medium'

    # Predict model quality class 
    model_pred = rf.predict([[wc, sc, re_score]])[0]
    inv_map = {0:'Low',1:'Medium',2:'High'}
    model_label = inv_map.get(int(model_pred), label)

    return {
        'url': url,
        'title': title,
        'word_count': wc,
        'sentence_count': sc,
        'readability': round(re_score, 2),
        'is_thin': bool(is_thin),
        'rule_label': label,
        'model_label': model_label,
        'similar_to': similar_list
    }

In [38]:
# Demo 
result = analyze_url('https://www.leadwalnut.com/')
print(json.dumps(result, indent=2))

{
  "url": "https://www.leadwalnut.com/",
  "title": "LeadWalnut | SEO & CRO Agency for B2B Tech Growth & Pipeline Impact",
  "word_count": 830,
  "sentence_count": 43,
  "readability": 42.09,
  "is_thin": false,
  "rule_label": "Medium",
  "model_label": "Medium",
  "similar_to": [
    {
      "url": "https://www.shopify.com/blog/ecommerce-seo-beginners-guide",
      "similarity": 0.6463
    },
    {
      "url": "https://apnews.com/hub/artificial-intelligence",
      "similarity": 0.5698
    },
    {
      "url": "https://www.twilio.com/en-us/blog/insights/content-marketing-best-practices",
      "similarity": 0.5576
    },
    {
      "url": "https://mailchimp.com/marketing-glossary/content-marketing/",
      "similarity": 0.5178
    },
    {
      "url": "https://blog.hubspot.com/marketing/what-is-digital-marketing",
      "similarity": 0.5177
    }
  ]
}




## 8. Save/Load Artifacts

In [39]:
# Save features and duplicates 
print('Extracted content ->', os.path.join(DATA_DIR, 'extracted_content.csv'))
print('Features ->', os.path.join(DATA_DIR, 'features.csv'))
print('Duplicates ->', os.path.join(DATA_DIR, 'duplicates.csv'))
print('Model ->', os.path.join(MODELS_DIR, 'quality_model.pkl'))

Extracted content -> C:\Users\Gauth\data\extracted_content.csv
Features -> C:\Users\Gauth\data\features.csv
Duplicates -> C:\Users\Gauth\data\duplicates.csv
Model -> C:\Users\Gauth\models\quality_model.pkl
