In [1]:
# train_seo_model.py
import os, re, joblib
import numpy as np, pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import textstat
import nltk
# Fetch the NLTK "punkt" resource (quietly) and make sure the output folders exist.
nltk.download('punkt', quiet=True)

for folder in ("data", "models"):
    os.makedirs(folder, exist_ok=True)



Rows: 81
Saved features_intermediate.csv
Saved TF-IDF vectorizer and SVD
Label counts before relaxation:
 quality_label
Low       40
Medium    33
High       8
Name: count, dtype: int64
Classes mapping: {'High': 0, 'Low': 1, 'Medium': 2}
Class distribution: Counter({1: 40, 2: 33, 0: 8})
Test Accuracy: 0.88
              precision    recall  f1-score   support

        High       0.00      0.00      0.00         3
         Low       1.00      1.00      1.00        12
      Medium       0.77      1.00      0.87        10

    accuracy                           0.88        25
   macro avg       0.59      0.67      0.62        25
weighted avg       0.79      0.88      0.83        25



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


CV accuracy (5-fold): 0.8022058823529411 [0.82352941 0.6875     0.9375     0.75       0.8125    ]
Saved classifier and encoders to models/
Top features: [('flesch', 0.10847617781788874), ('word_count', 0.06577506902266071), ('meta_count', 0.05200038688169177), ('svd_36', 0.04349649476763824), ('svd_14', 0.03837238983226825), ('svd_39', 0.03401888397953561), ('svd_22', 0.027561534936863426), ('svd_3', 0.026898548779812872), ('svd_6', 0.02545272675681532), ('svd_19', 0.025241207555555863), ('svd_5', 0.023695122732777362), ('svd_34', 0.023648850232819145), ('link_count', 0.021881050640847503), ('svd_47', 0.021399094420429258), ('svd_49', 0.019884796552567897)]


In [None]:
# --------- LOAD CSV ----------
# The input location can be overridden without editing the notebook by setting
# the SEO_INPUT_CSV environment variable (for example to "data/data.csv").
# When the variable is unset the original hard-coded path is used, so existing
# runs behave exactly as before.
INPUT_CSV = os.environ.get("SEO_INPUT_CSV", r"C:\Users\Hp\Downloads\data.csv")
df_raw = pd.read_csv(INPUT_CSV)

# Fail early with a clear message: the feature-extraction steps below read
# the raw markup from this column.
if 'html_content' not in df_raw.columns:
    raise KeyError(
        f"{INPUT_CSV} has no 'html_content' column; found columns: {list(df_raw.columns)}"
    )
print("Rows:", len(df_raw))

# --------- CLEAN HTML -> TEXT & BASIC META ----------
def clean_html(html):
    """Return the visible text of an HTML document as one whitespace-normalised line.

    Missing values (anything ``pd.isna`` reports as NA) yield an empty string.
    Script/style/noscript blocks, header/footer/nav sections and forms are
    removed from the parsed tree before the text is extracted.
    """
    if pd.isna(html):
        return ""

    boilerplate_tags = ["script","style","noscript","header","footer","nav","form"]
    document = BeautifulSoup(str(html), "html.parser")
    for node in document.find_all(boilerplate_tags):
        node.decompose()

    raw_text = document.get_text(separator=" ", strip=True)
    # Collapse every run of whitespace into a single space
    collapsed = re.sub(r'\s+', ' ', raw_text)
    return collapsed.strip()

# Visible text of every page, plus its whitespace-delimited word count
df_raw['clean_text'] = df_raw['html_content'].map(clean_html)
df_raw['word_count'] = df_raw['clean_text'].map(lambda text: len(text.split()))
def tag_counts(html):
    """Count ``<a>`` and ``<meta>`` tags in an HTML document.

    Returns a ``(link_count, meta_count)`` tuple; missing input gives ``(0, 0)``.
    """
    if pd.isna(html):
        return 0, 0
    parsed = BeautifulSoup(str(html), "html.parser")
    n_links = len(parsed.find_all('a'))
    n_meta = len(parsed.find_all('meta'))
    return n_links, n_meta
# Count <a> and <meta> tags per page. Building one DataFrame from the list of
# (links, metas) tuples avoids constructing a pd.Series for every row, and
# naming the columns up front keeps the assignment valid for an empty frame.
tag_table = pd.DataFrame(
    df_raw['html_content'].map(tag_counts).tolist(),
    columns=['link_count', 'meta_count'],
    index=df_raw.index,
)
df_raw[['link_count', 'meta_count']] = tag_table

# Readability is only computed for pages with more than 10 words; shorter
# pages get NaN and are imputed by the later steps.
df_raw['flesch'] = df_raw['clean_text'].apply(
    lambda t: textstat.flesch_reading_ease(t) if len(t.split()) > 10 else np.nan
)

# Work on a copy from here on so df_raw keeps the un-augmented feature table
df = df_raw.copy()
df.to_csv("data/features_intermediate.csv", index=False)
print("Saved features_intermediate.csv")



In [None]:
# --------- TF-IDF + SVD ----------
from scipy.sparse import save_npz

# NOTE(review): the vectorizer and the SVD are fitted on every row, i.e. before
# the train/test split done further down, so held-out rows influence these
# features. Acceptable for exploration; revisit before trusting test scores.
vectorizer = TfidfVectorizer(max_features=2000, stop_words='english')
tfidf = vectorizer.fit_transform(df['clean_text'].fillna(""))
svd = TruncatedSVD(n_components=50, random_state=42)
tfidf_svd = svd.fit_transform(tfidf)   # dense, one column per SVD component

# Append the SVD components as svd_0 .. svd_{k-1}. Columns with those names
# left over from a previous run of this cell are dropped first; without that,
# re-running the cell would append duplicate column names and df[svd_cols]
# would return every component twice.
svd_cols = [f"svd_{i}" for i in range(tfidf_svd.shape[1])]
df_svd = pd.DataFrame(tfidf_svd, columns=svd_cols, index=df.index)
df = pd.concat([df.drop(columns=svd_cols, errors='ignore'), df_svd], axis=1)

# Save tfidf matrix for duplicates search (optional)
save_npz("data/tfidf_matrix.npz", tfidf)

# Persist the fitted vectorizer and SVD so the same transform can be reloaded
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")
joblib.dump(svd, "models/tfidf_svd.pkl")
print("Saved TF-IDF vectorizer and SVD")



In [None]:
# --------- LABELING per assignment rules (Low/Medium/High) ----------
def assign_label(row):
    """Map one feature row to a quality tier: "High", "Low" or "Medium".

    Reads ``row['word_count']`` and ``row['flesch']``; a NaN readability score
    is treated as 50.0. The "High" rule is checked first, then "Low"; every
    remaining row is "Medium".
    """
    words = row['word_count']
    readability = 50.0 if np.isnan(row['flesch']) else row['flesch']

    is_long = words > 1500
    is_readable = 50 <= readability <= 70
    if is_long and is_readable:
        return "High"

    is_short = words < 500
    is_hard_to_read = readability < 30
    if is_short or is_hard_to_read:
        return "Low"

    return "Medium"

# Label every row with the rule-based tier, then report the class balance
quality_labels = df.apply(assign_label, axis=1)
df['quality_label'] = quality_labels
label_counts = df['quality_label'].value_counts()
print("Label counts before relaxation:\n", label_counts)



In [None]:
# If any expected class is missing or extremely underrepresented, relax thresholds.
# The counts are taken against the full set of expected labels: value_counts()
# omits labels that never occur, so a completely absent class would otherwise
# go unnoticed whenever the remaining classes each have at least 3 rows.
EXPECTED_LABELS = ["Low", "Medium", "High"]
MIN_ROWS_PER_CLASS = 3


def assign_relaxed(r):
    """Fallback labeller that uses word count only.

    Fewer than 800 words is "Low", more than 1200 is "High", anything in
    between (inclusive of both bounds) is "Medium".
    """
    wc = r['word_count']
    if wc < 800:
        return "Low"
    if wc > 1200:
        return "High"
    return "Medium"


expected_label_counts = (
    df['quality_label'].value_counts().reindex(EXPECTED_LABELS, fill_value=0)
)
if expected_label_counts.min() < MIN_ROWS_PER_CLASS:
    df['quality_label'] = df.apply(assign_relaxed, axis=1)
    print("Label counts after relaxation:\n", df['quality_label'].value_counts())



In [None]:
# --------- FEATURES and TARGET ----------
from collections import Counter

# Feature matrix: the four hand-crafted columns followed by the SVD components.
# Missing values (e.g. NaN flesch scores) become 0.
num_cols = ['word_count','link_count','meta_count','flesch'] + svd_cols
X = df[num_cols].fillna(0).values

# Integer-encode the string labels and show which code each class received
le = LabelEncoder()
y = le.fit_transform(df['quality_label'].values)
print("Classes mapping:", {name: code for code, name in enumerate(le.classes_)})

# Class balance of the encoded target
cnt = Counter(y)
print("Class distribution:", cnt)

# Optional balancing: when the rarest class has fewer than 5 rows, every class
# is down-sampled to at most 5 rows (seeded, hence deterministic) and X / y are
# rebuilt from that reduced sample. Classes smaller than the cap keep all rows.
rarest = min(cnt.values())
if rarest < 5:
    cap = max(5, rarest)
    per_class_samples = []
    for cls in df['quality_label'].unique():
        cls_rows = df[df['quality_label'] == cls]
        take = min(cap, len(cls_rows))
        per_class_samples.append(cls_rows.sample(n=take, random_state=42))
    df_bal = pd.concat(per_class_samples)
    X = df_bal[num_cols].fillna(0).values
    y = le.transform(df_bal['quality_label'].values)
    print("After undersample distribution:", Counter(y))



In [None]:
# --------- TRAIN / EVALUATE ----------
# Hold out 30% of the rows for evaluation, stratified by class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42, stratify=y)
clf = RandomForestClassifier(n_estimators=300, random_state=42, class_weight='balanced_subsample')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

# labels= pins the report rows to the encoder's classes, so the report cannot
# fail with a target_names size mismatch when a class is absent from both
# y_test and y_pred. zero_division=0 reports 0 for metrics of classes that are
# never predicted (the same numbers as the default) without emitting
# UndefinedMetricWarning.
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))

# Cross-validation on the full X / y. This is best-effort: if the folds cannot
# be built or scored, the reason is printed and the notebook keeps going.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
try:
    cv_scores = cross_val_score(clf, X, y, cv=cv, scoring='accuracy')
    print("CV accuracy (5-fold):", cv_scores.mean(), cv_scores)
except Exception as e:
    print("CV skipped:", e)



In [None]:
# --------- Save artifacts ----------
# Persist the fitted classifier, the label encoder and the ordered feature names
for artifact, target_path in (
    (clf, "models/quality_model.pkl"),
    (le, "models/label_encoder.pkl"),
    (num_cols, "models/feature_names.pkl"),
):
    joblib.dump(artifact, target_path)
print("Saved classifier and encoders to models/")

# Rank features by importance (descending), keep the 20 strongest and print
# the first 15 of them as (name, importance) pairs
importances = clf.feature_importances_
top_idx = np.argsort(importances)[::-1][:20]
top_features = []
for i in top_idx:
    if i < len(num_cols):  # skip indices that have no matching feature name
        top_features.append((num_cols[i], importances[i]))
print("Top features:", top_features[:15])


In [None]:
# MODEL EVALUATION & VISUALIZATION
# ===========================
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    classification_report, accuracy_score, f1_score
)

# Confusion Matrix
# Rows and columns are pinned to the encoder's class order so the matrix always
# has one entry per display label, even if a class is absent from both y_test
# and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(le.classes_)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)

# Draw onto an explicitly created Axes. disp.plot() opens its own figure when
# no ax is given, so a preceding plt.figure(figsize=...) only produces a blank
# extra figure and the requested size is not applied to the matrix.
fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, values_format="d", cmap="Blues")
ax.set_title("Confusion Matrix - SEO Quality Classifier")
plt.show()

In [None]:
# Accuracy & F1 Score
# zero_division=0 scores undefined per-class metrics (a class that is never
# predicted) as 0, which matches the default numbers but does not emit
# UndefinedMetricWarning.
test_acc = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average="macro", zero_division=0)
print("\n Final Model Performance")
print("Test Accuracy:", round(test_acc, 3))
print("Test F1 Score:", round(test_f1, 3))

# labels= keeps the report rows aligned with target_names even when a class is
# absent from both y_test and y_pred (otherwise the size mismatch raises).
print("\n Classification Report:")
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
))

In [None]:
# Feature Importance Bar Plot
# Horizontal bars for the 15 most important features, strongest first
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1][:15]  # top 15
top_names = np.array(num_cols)[indices]
top_scores = importances[indices]

fig, ax = plt.subplots(figsize=(10,6))
sns.barplot(x=top_scores, y=top_names, ax=ax)
ax.set_title("Top 15 Important Features")
ax.set_xlabel("Importance Score")
ax.set_ylabel("Feature Name")
fig.tight_layout()
plt.show()

In [None]:
# ROC CURVES (multiclass)
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc

# One-vs-rest ROC curve for each class, using the classifier's predicted
# probabilities on the held-out rows.
class_ids = list(range(len(le.classes_)))
y_test_bin = label_binarize(y_test, classes=class_ids)
# With exactly two classes label_binarize returns a single column (the
# indicator of the second class). Expand it to one column per class so the
# per-class indexing in the loop below does not go out of bounds.
if len(class_ids) == 2 and y_test_bin.shape[1] == 1:
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])
y_score = clf.predict_proba(X_test)

fig, ax = plt.subplots(figsize=(8, 6))
for i, class_name in enumerate(le.classes_):
    fpr, tpr, _thresholds = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc = auc(fpr, tpr)
    ax.plot(fpr, tpr, lw=2, label=f"{class_name} (AUC={roc_auc:.2f})")

# Diagonal reference line: the ROC of a random classifier
ax.plot([0,1],[0,1],'--')
ax.set_title("ROC Curves - Multi-Class")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid()
plt.show()