# Task 2 — SEO Blog Post Creation Tool (Colab)
Run cells in order. This notebook:
1. Scrapes sample product listings (books.toscrape.com) — safe demo site.
2. Extracts SEO candidate keywords using YAKE.
3. Generates 150–200 word SEO-friendly blog posts via OpenAI (or a fallback).
4. Optionally posts the blog to a WordPress site (requires WP credentials).
5. Saves blog markdown files and a Task2 report.

Before you run the OpenAI or WordPress cells, add your keys in the `.env` cell.

In [1]:
!pip install --quiet requests beautifulsoup4 openai python-dotenv yake wordcloud

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/80.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m80.7/80.7 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/360.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.5/360.5 kB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
%%writefile .env
OPENAI_API_KEY=
WP_URL=https://yourwordpresssite.com
WP_USER=shafiq2201069cs
WP_PASS=Amishkhan@123

# Load env
from dotenv import load_dotenv
import os
load_dotenv('.env')
print("Loaded .env. OPENAI_API_KEY present:", bool(os.getenv('OPENAI_API_KEY')))

Writing .env


In [3]:
import os, json, requests
from pathlib import Path
from dotenv import load_dotenv
load_dotenv('.env')

OUT_DIR = Path('task2_outputs')
OUT_DIR.mkdir(exist_ok=True)
(OUT_DIR/'blogs').mkdir(exist_ok=True)

print("Output folder:", OUT_DIR.resolve())



Output folder: /content/task2_outputs


In [4]:
from bs4 import BeautifulSoup

BASE = "http://books.toscrape.com/"

def scrape_books_toscrape(max_pages=1):
    products = []
    for page in range(1, max_pages+1):
        url = BASE if page==1 else BASE + f"catalogue/page-{page}.html"
        r = requests.get(url, timeout=15, headers={'User-Agent':'Mozilla/5.0'})
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        items = soup.select('article.product_pod')
        for it in items:
            title = it.h3.a['title']
            rel_link = it.h3.a['href']
            link = BASE + rel_link.replace('../../','catalogue/')
            price = it.select_one('.price_color').text.strip()
            pr = requests.get(link, timeout=15, headers={'User-Agent':'Mozilla/5.0'})
            psoup = BeautifulSoup(pr.text, 'html.parser')
            desc_tag = psoup.select_one('#product_description')
            desc = ""
            if desc_tag:
                desc_par = desc_tag.find_next_sibling('p')
                if desc_par:
                    desc = desc_par.text.strip()
            products.append({
                'title': title,
                'price': price,
                'desc': desc or "No description available.",
                'url': link
            })
        import time; time.sleep(1)
    return products
products = scrape_books_toscrape(max_pages=1)
print("Scraped products:", len(products))
for p in products[:5]:
    print("-", p['title'])

Scraped products: 20
- A Light in the Attic
- Tipping the Velvet
- Soumission
- Sharp Objects
- Sapiens: A Brief History of Humankind


In [5]:
import yake

def extract_keywords_yake(text, top=4):
    kw_extractor = yake.KeywordExtractor(lan='en', n=2, top=top)
    keywords = [kw for kw,score in kw_extractor.extract_keywords(text)]
    seen=set(); out=[]
    for k in keywords:
        kk=k.lower()
        if kk not in seen:
            seen.add(kk); out.append(k)
    return out

for p in products:
    txt = p['title'] + ". " + p['desc']
    kws = extract_keywords_yake(txt, top=4)
    p['keywords'] = kws
    print(p['title'], "->", kws)

A Light in the Attic -> ['Attic', 'Light', 'Shel Silverstein', 'Silverstein']
Tipping the Velvet -> ['Nan King', 'Kitty Butler', 'Book Review', 'York Times']
Soumission -> ['Dans', 'une France', 'France assez', 'une']
Sharp Objects -> ['Camille Preakerâ', 'Camille', 'heart Words', 'reporter Camille']
Sapiens: A Brief History of Humankind -> ['inhabited Earth', 'years ago', 'international bestsellerâ', 'History']
The Requiem Red -> ['Requiem Red', 'Soothing Hills', 'Hills Asylum', 'Red']
The Dirty Little Secrets of Getting Your Dream Job -> ['Dream Job', 'Don Raskinâ', 'dream job.Don', 'Dream']
The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull -> ['Victoria Woodhull', 'Coming Woman', 'Woman', 'Victoria']
The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics -> ['Berlin Olympics', 'Epic Quest', 'Olympic gold', 'Olympics Daniel']
The Black Maria -> ['Aracelis Girmay', 'Oprah Magazine', 'Black Maria', 'Girmay']


In [6]:
import os
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
use_openai = bool(OPENAI_API_KEY)

def generate_blog_openai(title, desc, keywords):
    import openai
    openai.api_key = OPENAI_API_KEY
    prompt = f"""
Write a single SEO-friendly blog paragraph of **about 150-200 words** that highlights this product.
Product title: {title}
Short description: {desc}
Primary keywords (use them naturally at least once): {', '.join(keywords)}
Tone: persuasive, informative, user-focused. Keep it useful and concise.
Return only the blog paragraph (no headings).
"""
    resp = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=[{"role":"user","content":prompt}],
        max_tokens=450,
        temperature=0.6
    )
    return resp['choices'][0]['message']['content'].strip()

def fallback_blog(title, desc, keywords):
    kw_text = ', '.join(keywords)
    base = (f"{title} is a standout choice for readers seeking quality and value. {desc} "
            f"Combining reliable craftsmanship with thoughtful features, this product is ideal for those looking for {keywords[0]} and performance. "
            f"With positive user feedback and great price-to-value, it serves as a trustworthy option. "
            f"Highlighted features include {kw_text}. For buyers wanting a dependable selection, {title} offers an excellent balance of function and affordability. "
            f"See the product page for more details: ")
    words = base.split()
    if len(words) < 150:
        base += " It's a great buy for many users."
    return base
for idx, p in enumerate(products[:6]):
    kws = p['keywords'] or [p['title'].split()[0]]
    if use_openai:
        try:
            blog_text = generate_blog_openai(p['title'], p['desc'], kws)
        except Exception as e:
            print("OpenAI call failed:", e)
            blog_text = fallback_blog(p['title'], p['desc'], kws)
    else:
        blog_text = fallback_blog(p['title'], p['desc'], kws)
    p['blog'] = blog_text
    md = f"# {p['title']}\n\n{blog_text}\n\n**Product URL:** {p['url']}\n"
    file_path = OUT_DIR / 'blogs' / f"blog_{idx+1}.md"
    file_path.write_text(md, encoding='utf-8')
    print("Saved", file_path)

Saved task2_outputs/blogs/blog_1.md
Saved task2_outputs/blogs/blog_2.md
Saved task2_outputs/blogs/blog_3.md
Saved task2_outputs/blogs/blog_4.md
Saved task2_outputs/blogs/blog_5.md
Saved task2_outputs/blogs/blog_6.md


In [7]:
WP_URL = os.getenv('WP_URL') or ""
WP_USER = os.getenv('WP_USER') or ""
WP_PASS = os.getenv('WP_PASS') or ""

def post_to_wordpress(title, content, status='publish'):
    if not (WP_URL and WP_USER and WP_PASS):
        raise ValueError("WP_URL / WP_USER / WP_PASS not configured in .env")
    endpoint = WP_URL.rstrip('/') + '/wp-json/wp/v2/posts'
    data = {"title": title, "content": content, "status": status}
    r = requests.post(endpoint, auth=(WP_USER, WP_PASS), json=data, timeout=20)
    r.raise_for_status()
    return r.json()

In [8]:
report_lines = []
report_lines.append("# Task 2 Report — SEO Blog Post Creation Tool\n")
report_lines.append("## Data Source\n- Demo scraping: books.toscrape.com (safe demo site)\n")
report_lines.append("## Steps followed\n1. Scraped product title, price, description and URL.\n2. Extracted candidate SEO keywords using YAKE.\n3. Generated a 150-200 word SEO-friendly blog for each product using OpenAI (fallback if no key).\n4. Saved blog markdown files in task2_outputs/blogs/.\n")
report_lines.append("## Outputs (saved files):\n")
for f in sorted((OUT_DIR/'blogs').glob('*.md')):
    report_lines.append(f"- {f.name}\n")
report_lines.append("\n## Notes & Limitations\n- Real search-volume for keywords not fetched (requires Google Keyword Planner/SerpAPI).\n- Amazon / other sites often disallow scraping; use APIs or explicit permission.\n- OpenAI usage may incur token costs.\n")
(report_path := OUT_DIR / 'Task2_Report.md').write_text("\n".join(report_lines), encoding='utf-8')
print("Saved report to", report_path)

Saved report to task2_outputs/Task2_Report.md


In [9]:
from google.colab import files
files_to_download = [str(report_path)]
for i, f in enumerate(sorted((OUT_DIR/'blogs').glob('*.md'))):
    if i >= 3: break
    files_to_download.append(str(f))

for fp in files_to_download:
    print("Downloading:", fp)
    files.download(fp)

Downloading: task2_outputs/Task2_Report.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: task2_outputs/blogs/blog_1.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: task2_outputs/blogs/blog_2.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloading: task2_outputs/blogs/blog_3.md


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>