In [31]:
import pandas as pd
import numpy as np
import requests, json, time
from bs4 import BeautifulSoup
from tqdm import tqdm
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import textstat
import joblib
import os
from pathlib import Path


[nltk_data] Downloading package punkt to /Users/srinija/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [32]:
# Load the CSV of page URLs and their raw HTML into a DataFrame.
raw_csv_path = "/Users/srinija/Desktop/seo-content-detector_2448526/data/data.csv"
df = pd.read_csv(raw_csv_path)

In [33]:
# Preview a few random rows. The fixed seed makes the preview reproducible
# across kernel restarts, and capping n at len(df) avoids the ValueError that
# sample() raises when asked for more rows than the frame holds.
df.sample(min(4, len(df)), random_state=42)

Unnamed: 0,url,html_content
15,https://en.wikipedia.org/wiki/SD-WAN,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
73,https://en.wikipedia.org/wiki/Digital_marketing,"<!DOCTYPE html>\n<html class=""client-nojs vect..."
7,https://www.cisco.com/site/us/en/learn/topics/...,"\n<!DOCTYPE HTML>\n\n<html lang=""en-US"" dir=""l..."
43,https://developers.google.com/search/docs/fund...,\n\n\n\n\n\n\n\n\n<!doctype html>\n<html \n ...


In [34]:
# Show the column labels of the loaded frame.
df.columns

Index(['url', 'html_content'], dtype='object')

In [35]:
# Report the number of rows currently held in df.
print(f"Loaded dataset with {len(df)} rows.")

Loaded dataset with 81 rows.


In [36]:
# Count the missing values in each column.
df.isna().sum()

url              0
html_content    12
dtype: int64

# Identify the rows with null values and remove them from the dataset

In [37]:
# Boolean mask marking the rows whose html_content is missing.
html_is_missing = df['html_content'].isna()

# Show the affected rows before discarding them.
missing_html = df.loc[html_is_missing]
print("Rows with missing html_content:")
print(missing_html)

# Keep only the rows that have HTML and renumber the index from 0.
df = df.loc[~html_is_missing].reset_index(drop=True)


Rows with missing html_content:
                                                  url html_content
4   https://www.qnbtrust.bank/Resources/Learning-C...          NaN
12  https://www.connectwise.com/blog/phishing-prev...          NaN
18      https://www.hpe.com/us/en/what-is/sd-wan.html          NaN
20  https://support.microsoft.com/en-us/windows/ho...          NaN
23  https://www.cloudflare.com/learning/access-man...          NaN
40  https://towardsdatascience.com/machine-learnin...          NaN
41  https://www.analyticsvidhya.com/blog/2021/09/c...          NaN
51       https://www.investopedia.com/terms/s/seo.asp          NaN
55  https://www.dollardays.com/?srsltid=AfmBOopXjd...          NaN
75  https://www.reuters.com/technology/artificial-...          NaN
76      https://www.cnbc.com/artificial-intelligence/          NaN
77       https://www.bbc.com/news/topics/c404v061z99t          NaN


# Now check the length of the DataFrame after removing those rows

In [38]:
print(f"Loaded dataset with {len(df)} rows.") # row count drops from 81 to 69 once the 12 rows with missing html_content are removed

Loaded dataset with 69 rows.


In [39]:
# Run extract_content over every page's HTML and collect one record per URL.
# NOTE(review): extract_content is not defined in or above this cell; it must
# already exist in the kernel. Confirm its defining cell sits above this one so
# that Restart Kernel -> Run All works.
parsed_rows = []
for i, row in tqdm(df.iterrows(), total=len(df)):
    html = row.get("html_content", "")
    # The helper's result is unpacked into title, body text and word count.
    title, body, wc = extract_content(html)
    parsed_rows.append({
        "url": row["url"],
        "title": title,
        "body_text": body,
        "word_count": wc
    })

# Create dataframe. Naming the columns keeps them present even when df is
# empty, so the filter below cannot raise KeyError.
parsed_df = pd.DataFrame(
    parsed_rows, columns=["url", "title", "body_text", "word_count"]
)

# Drop rows where no body text was extracted. Missing values (None/NaN) are
# treated as empty: a bare `.str.strip() != ""` evaluates to True for NaN and
# would silently keep those rows.
has_body = parsed_df["body_text"].fillna("").astype(str).str.strip().ne("")
parsed_df = parsed_df[has_body]

# Save output
output_path = "/Users/srinija/Desktop/seo-content-detector_2448526/data/extracted_content.csv"
parsed_df.to_csv(output_path, index=False)

print(f"Parsed successfully! Saved {len(parsed_df)} rows to extracted_content.csv")
# Cap the preview at the rows available (sample(5) raises ValueError on fewer
# than 5 rows) and fix the seed so the preview is reproducible.
print(parsed_df.sample(min(5, len(parsed_df)), random_state=42))

100%|██████████| 69/69 [00:02<00:00, 23.03it/s]

Parsed successfully! Saved 65 rows to extracted_content.csv
                                                  url  \
27         https://copyblogger.com/content-marketing/   
53                https://americasstealsanddeals.com/   
15  https://www.fortinet.com/resources/cyberglossa...   
38  https://www.coursera.org/articles/content-stra...   
22                              https://www.efax.com/   

                                                title  \
27  The Complete Content Marketing Guide For 2025 ...   
53  Americas Steals & Deals | Online Steals & Deal...   
15  What is SD-WAN (Software-Defined WAN)? Benefit...   
38  How to Develop a Content Strategy: Step-by-Ste...   
22  Fax Online with eFax | Secure, Scalable, Enter...   

                                            body_text  word_count  
27  Content marketing is an excellent strategy to ...        5523  
53  Ships by Ships by Be the first to know when th...          77  
15  Watch this demo to see how FortiManager enable




In [41]:
# Read the saved extraction results back from disk.
e = pd.read_csv("/Users/srinija/Desktop/seo-content-detector_2448526/data/extracted_content.csv")
# Preview a few random rows. The seed keeps the preview reproducible, and
# capping n at len(e) avoids the ValueError sample() raises on short frames.
e.sample(min(5, len(e)), random_state=42)

Unnamed: 0,url,title,body_text,word_count
18,https://www.fortinet.com/solutions/enterprise-...,Zero Trust Network Access (ZTNA) to Control Ap...,Zero Trust is all about trusting users and dev...,48
53,https://www.wikihow.com/Make-Money-Online,4 Ways to Make Money Online - wikiHow,"Last Updated: June 5, 2025 Approved This artic...",273
31,https://jakevdp.github.io/PythonDataScienceHan...,Python Data Science Handbook | Python Data Sci...,Jake VanderPlas This website contains the full...,63
49,https://www.dealnews.com/,"DealNews: Best Daily Deals, Discounts & Sales",Bag nice savings on several handy household it...,1439
61,https://simple.wikipedia.org/wiki/Artificial_i...,Artificial intelligence - Simple English Wikip...,Artificial intelligence ( AI or A.I. ) is a c...,1187


In [48]:
# Inspect the title column of the reloaded extraction results.
e['title']

0                                   Cyber Security Blog
1     Top 10 Cybersecurity Awareness Tips: How to St...
2     11 Cyber Defense Tips to Stay Secure at Work a...
3     Cybersecurity Best Practices | Cybersecurity a...
4        Network Security 101: Understanding the Basics
                            ...                        
60                  Artificial intelligence - Wikipedia
61    Artificial intelligence - Simple English Wikip...
62                        Digital marketing - Wikipedia
63    Digital marketing - Simple English Wikipedia, ...
64                    Artificial intelligence | AP News
Name: title, Length: 65, dtype: object

In [49]:
# Inspect the body_text column of the reloaded extraction results.
e['body_text']

0     Cyber Crisis Tabletop Exercise Cyber Security ...
1     Cybersecurity is gaining more importance globa...
2      Cybersecurity is inextricably tied to the tec...
3     Cyberspace is particularly difficult to secure...
4     Every week, networks seem to grow in size and ...
                            ...                        
60     Artificial intelligence ( AI ) is the capabil...
61     Artificial intelligence ( AI or A.I. ) is a c...
62     Digital marketing is the component of marketi...
63    Digital marketing is advertising delivered thr...
64    Nvidia CEO Jensen Huang said his companys inve...
Name: body_text, Length: 65, dtype: object

In [23]:
from html.parser import HTMLParser

# using the HTMLPARSER