In [None]:
# Fetch book descriptions from Open Library by ISBN
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time

# Work on a copy of the first 5000 rows so the source frame `df` is untouched.
# NOTE(review): `df` is not defined in this cell; it must already exist in the kernel.
df10 = df.iloc[:5000].copy()

# One description (or status label) per processed row, in row order.
descriptions = []

headers = {
    "User-Agent": "Mozilla/5.0 (compatible; Sahil-OpenLibrary-PTag/Debug)"
}

for i, row in df10.iterrows():
    # A missing ISBN is NaN in pandas; str(NaN) is the truthy string "nan",
    # which would trigger a pointless request, so map it to "" explicitly.
    raw_isbn = row.get("ISBN", "")
    isbn = "" if pd.isna(raw_isbn) else str(raw_isbn).strip()
    # `i + 1` assumes an integer index label.
    print(f"\nRow {i+1} | ISBN: {isbn}")

    # Default label, kept when the ISBN is missing, the page is not found,
    # or the request fails.
    desc = "ISBN Not Matched"

    if isbn:
        url = f"https://openlibrary.org/isbn/{isbn}"
        try:
            r = requests.get(url, headers=headers, timeout=10, allow_redirects=True)

            if r.status_code == 200:
                print("ISBN page found ✅")

                soup = BeautifulSoup(r.text, "html.parser")
                # First paragraph inside the book-description "read more" container.
                p_tag = soup.select_one(
                    "div.book-description div.read-more__content p"
                )

                if p_tag:
                    desc = p_tag.get_text(strip=True)
                    print("Description extracted ✅")
                else:
                    desc = "Description Not Available"
                    print("Description missing ❌")

            else:
                print("ISBN not found ❌")

        except Exception as e:
            # Best effort: log the failure and keep the default label for this row.
            print("Error:", e)

        # Throttle only after an actual request; rows without an ISBN need no pause.
        time.sleep(1)

    else:
        print("ISBN missing in CSV ❌")

    descriptions.append(desc)

df10["description"] = descriptions
df10.to_csv("OpenLibrary_5000_rows.csv", index=False)

print("\nSaved CSV with correct description labels ✅")


In [None]:
# Scrape descriptions from the Google Books HTML synopsis tag by ISBN (Colab)
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import re
 
# Work on a copy so that adding the "description" column later in this cell
# does not mutate the source frame `df`.
# Despite the name, `df100` holds every row of `df`, not just 100.
df100 = df.copy()

# One description (or "Not Found") per row, in row order.
descriptions = []

headers = {
    "User-Agent": "Mozilla/5.0"
}

def clean_isbn(isbn):
    """Return `isbn` reduced to the characters 0-9, X and x.

    Hyphens, spaces and any other characters are removed. A NaN value yields
    an empty string. A whole-number float (as produced when a numeric ISBN
    column is parsed as float) is converted to an int first, so the digit of
    the trailing ".0" is not appended to the ISBN.
    """
    if isinstance(isbn, float):
        if isbn != isbn:  # NaN is the only value not equal to itself
            return ""
        if isbn.is_integer():
            isbn = int(isbn)
    return re.sub(r"[^0-9Xx]", "", str(isbn))

# Look up every ISBN on Google Books and collect the text of its synopsis block.
for i, isbn in enumerate(df100["ISBN"]):
    isbn = clean_isbn(isbn)
    print(f"Processing Row {i+1} | ISBN: {isbn}")

    # Default label, kept when the ISBN is empty, the page has no synopsis
    # block, or the request fails.
    desc = "Not Found"

    if isbn:
        url = f"https://books.google.com/books?vid=ISBN{isbn}"

        try:
            r = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(r.text, "html.parser")

            # The description is read from <div id="synopsis">.
            desc_div = soup.find("div", id="synopsis")
            if desc_div:
                desc = desc_div.get_text(separator=" ", strip=True)

        except Exception as e:
            # Best effort: log the failure and keep "Not Found" for this row.
            print("Error:", e)

    descriptions.append(desc)
    time.sleep(0.2)

df100["description"] = descriptions
df100.to_csv("HTML_tag_through_All_36000.csv", index=False)

# Report the real row count: the loop covers every row of df100, not just 100.
print(f"\nSaved {len(df100)} rows with descriptions ✅")


In [None]:
# Merge Open Library descriptions into the HTML-tag (Google Books) results
import pandas as pd

html_df = pd.read_csv("HTML_tag_through_All_36000.csv")
ol_df = pd.read_csv("OpenLibrary_5000_rows.csv")

# Status labels written by the scraping cells in place of a real description.
missing_vals = ["Not Found", "ISBN Not Matched", "Description Not Available"]

# Align OL descriptions to HTML by index; rows that exist only in the HTML
# file become NaN after the reindex.
ol_desc_aligned = ol_df["description"].reindex(html_df.index)

# A description counts as missing when it is NaN or one of the status labels.
# NaN must be tested explicitly: `isin(missing_vals)` is False for NaN, so
# without `notna()` a NaN on the OL side would pass as a valid description
# and overwrite "Not Found" with NaN.
html_missing = html_df["description"].isna() | html_df["description"].isin(missing_vals)
ol_valid = ol_desc_aligned.notna() & ~ol_desc_aligned.isin(missing_vals)

# Replace where HTML missing and OL has valid description
mask = html_missing & ol_valid

html_df.loc[mask, "description"] = ol_desc_aligned[mask]

html_df.to_csv("Final_Merged_Descriptions.csv", index=False)

print("Merged successfully Sahil ✅")


In [None]:
# Find Open Library descriptions by searching on title + author
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import time
from urllib.parse import quote

# Load data
# The file name must match the merge cell's output exactly
# ("Final_Merged_Descriptions.csv"); a different capitalization fails on
# case-sensitive filesystems.
df = pd.read_csv("Final_Merged_Descriptions.csv")

# Rows where description is missing OR Not Found
original_nf = df[
    (df["description"].isna()) |
    (df["description"].astype(str).str.strip() == "") |
    (df["description"] == "Not Found")
].copy()

# Limit this run to the first 500 missing rows.
original_nf=original_nf[:500].copy()
 
def clean_text(text):
    """Normalize a title or author value for use as a search term.

    Returns "" for missing values (None / NaN). Otherwise the value is
    converted to a string, lower-cased, stripped of every character that is
    neither a word character nor whitespace, and has whitespace runs
    collapsed to single spaces.
    """
    if pd.isna(text):
        return ""
    # str() lets numeric values (e.g. a title parsed as a number) through
    # instead of raising AttributeError on .lower().
    text = str(text).lower()
    text = re.sub(r"[^\w\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    # Strip last: removing punctuation at either end can expose whitespace.
    return text.strip()

# Add normalized copies of the title and author columns for use as search terms
for source_col, cleaned_col in (
    ("Title", "clean_title"),
    ("Author_Editor", "clean_author"),
):
    df[cleaned_col] = df[source_col].apply(clean_text)

# Search OpenLibrary
def search_openlibrary(title, author):
    """Search openlibrary.org for "<title> <author>" and return the first hit's URL.

    Returns the URL built by prefixing "https://openlibrary.org" to the href
    of the first `a.results` link on the search page, or None when the query
    is empty, no such link is present, or the request/parsing fails.
    """
    search_terms = f"{title} {author}".strip()
    if not search_terms:
        # Nothing to search for; skip the network round trip.
        return None

    url = f"https://openlibrary.org/search?q={quote(search_terms)}"
    try:
        res = requests.get(url, timeout=10)
        soup = BeautifulSoup(res.text, "html.parser")
        link = soup.select_one("a.results")
        if link:
            # presumably the href is site-relative; verify against the live page
            return "https://openlibrary.org" + link["href"]
    except Exception as e:
        # Best effort: report the failure, but let KeyboardInterrupt and
        # SystemExit propagate (a bare `except:` would swallow them).
        print("Error:", e)
    return None
 
def get_description(book_url):
    """Fetch `book_url` and return the text of its description block.

    Returns the stripped text of the first element matching
    `div.read-more__content.markdown-content`, or None when `book_url` is
    empty, no such element is present, or the request/parsing fails.
    """
    if not book_url:
        # No page to fetch.
        return None

    try:
        res = requests.get(book_url, timeout=10)
        soup = BeautifulSoup(res.text, "html.parser")
        desc = soup.select_one("div.read-more__content.markdown-content")
        if desc:
            return desc.get_text(strip=True)
    except Exception as e:
        # Best effort: report the failure, but let KeyboardInterrupt and
        # SystemExit propagate (a bare `except:` would swallow them).
        print("Error:", e)
    return None

# Walk the rows that were missing a description and try to fill each one
# from Open Library, searching by cleaned title + author.
for idx, row in original_nf.iterrows():

    existing = df.at[idx, "description"]
    still_missing = pd.isna(existing) or str(existing).strip() in ("", "Not Found")

    # Leave rows alone once they hold a real description.
    if not still_missing:
        print("Skipping (already available):", row["Title"])
        continue

    print("Processing:", row["Title"])

    book_url = search_openlibrary(
        df.at[idx, "clean_title"],
        df.at[idx, "clean_author"]
    )

    if not book_url:
        # The search produced no result link.
        df.at[idx, "description"] = "Not Found"
    else:
        # A book page was found; fall back to a label when it has no description.
        desc = get_description(book_url)
        df.at[idx, "description"] = desc if desc else "Description not available"

    # Pause between rows to avoid hammering the server.
    time.sleep(1)

# Build the report for the originally missing rows only, pulling the
# (possibly updated) descriptions back out of df by index.
final_output = original_nf.loc[:, ["Title", "Author_Editor"]].copy()
final_output["description"] = df.loc[original_nf.index, "description"]

# Write the report to disk.
final_output.to_csv("Final_NotFound_With_Scraped.csv", index=False)

print("All done Sahil ✅ File ready")
