Milestone 3: Analyze customer reviews and implement Sentiment analysis

In [1]:
# ------------------------------------------------------------------
# STEP 0: INSTALL TOOLS
# We are downloading the necessary software to run this program.
# ------------------------------------------------------------------
# Lines starting with '!' are run as shell commands by the notebook, not as Python.
# The '-q' at the end means "quiet", so it won't show messy logs.
!pip install playwright nest_asyncio google-generativeai sentence-transformers scikit-learn -q
# Download the Chromium browser build that Playwright drives.
!playwright install chromium
# Install the operating-system libraries Playwright's browsers depend on.
!playwright install-deps
# Explicitly install a few shared libraries with apt ('-y' answers yes to prompts, '-q' is quiet).
# NOTE(review): these may already be covered by 'playwright install-deps' above — confirm before removing.
!apt-get install libatk1.0-0 libatk-bridge2.0-0 libatspi2.0-0 libxcomposite1 -y -q

# ------------------------------------------------------------------
# STEP 1: IMPORT LIBRARIES & SILENCE WARNINGS
# We bring in the tools we just installed so the code can use them.
# ------------------------------------------------------------------
import asyncio              # Lets the computer do two things at once (like scraping)
import csv                  # Lets us read and write Excel-like files (CSV)
import json                 # Lets us handle structured data
import logging              # Tool to control system messages (we use this to hide errors)
import os                   # Lets us read settings (like the API key) from environment variables
import re                   # Pattern matching (used to pull the number out of a price label)

import google.generativeai as genai # The tool that connects to Google's Gemini AI
import nest_asyncio         # A specific fix for running code in these notebooks
from playwright.async_api import async_playwright # The tool that browses the web automatically
from sentence_transformers import SentenceTransformer # The Local AI for understanding meaning
from sklearn.metrics.pairwise import cosine_similarity # The math tool for comparing ideas

# Notebook kernels already run their own asyncio event loop. nest_asyncio
# patches asyncio so that loop can be re-entered, which lets the async
# scraping code below run inside the notebook.
nest_asyncio.apply()

# SILENCE THE LOGS (Clean Console)
# Raise these three library loggers to CRITICAL so that anything less severe
# (their warnings AND ordinary errors, e.g. "429" or "tornado" messages) is
# not printed to the notebook output.
for _noisy_logger_name in ("tornado.access", "urllib3", "asyncio"):
    logging.getLogger(_noisy_logger_name).setLevel(logging.CRITICAL)

# ------------------------------------------------------------------
# STEP 2: CONFIGURATION
# Setting up passwords and file names.
# ------------------------------------------------------------------
# The Gemini API key is read from the environment instead of being written
# into the notebook: a key pasted into source is visible to everyone who can
# see the file and must be treated as compromised (revoke it and create a
# new one). Set GOOGLE_API_KEY (or GEMINI_API_KEY) before running this cell.
API_KEY = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
if not API_KEY:
    # Not fatal: only the optional Gemini summary (gemini_calibration) needs
    # the key, and that step already tolerates failures.
    print(
        "WARNING: No Gemini API key found in GOOGLE_API_KEY / GEMINI_API_KEY; "
        "the optional Gemini insight will be skipped."
    )
genai.configure(api_key=API_KEY)

# We select the standard Gemini model.
model = genai.GenerativeModel("gemini-flash-latest")

# Names of the 3 CSV files the pipeline creates. They are relative paths, so
# the files land in the notebook's current working directory.
BOOKS_FILE = "books_inventory.csv"          # Written by scrape_all_books, read by process_pricing
NEWS_FILE = "headlines.csv"                 # Written by scrape_headlines, read by process_pricing
OUTPUT_FILE = "final_pricing_strategy.csv"  # Written by process_pricing (the final result)

# ------------------------------------------------------------------
# PHASE 1: SCRAPE BOOK DATA
# This function goes to the book website and copies down every book.
# ------------------------------------------------------------------
def _parse_price(price_text):
    """Return the number embedded in a price label such as '£51.77'.

    The digits are extracted with a pattern instead of deleting specific
    currency characters, so the result does not depend on how the currency
    symbol was encoded or decoded.

    Raises:
        ValueError: If the text contains no number.
    """
    match = re.search(r"\d+(?:\.\d+)?", price_text.replace(",", ""))
    if match is None:
        raise ValueError(f"No numeric price found in {price_text!r}")
    return float(match.group())


async def scrape_all_books():
    """Scrape catalogue pages 1-50 of books.toscrape.com into BOOKS_FILE.

    BOOKS_FILE is recreated with the header row
    ``ID, Title, Price, Description, URL`` and then appended to one page at a
    time. Scraping is best-effort: a page that fails to load, or a book whose
    details cannot be parsed, is reported on the console and skipped instead
    of aborting the run. IDs are assigned only to books that were parsed
    successfully, so they stay consecutive.
    """
    print("\nPHASE 1: Collecting ALL 1000 books (This takes about 2-3 mins)...")

    # Start from a fresh file that contains only the column headers.
    with open(BOOKS_FILE, "w", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(["ID", "Title", "Price", "Description", "URL"])

    book_id = 0  # Number of books successfully collected so far

    async with async_playwright() as p:
        # "headless" means the browser runs without a visible window.
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            # {} is replaced with the page number.
            base_url = "https://books.toscrape.com/catalogue/page-{}.html"

            for page_num in range(1, 51):
                try:
                    await page.goto(base_url.format(page_num), timeout=30000)
                    # Each book on a listing page sits in a class="product_pod" box.
                    pods = await page.query_selector_all(".product_pod")
                except Exception as exc:
                    # Best-effort: report the failed page and move on.
                    print(f"   ...Skipping page {page_num}: {exc}")
                    continue

                rows = []  # Books parsed from this page
                for pod in pods:
                    try:
                        # The <h3><a> link carries both the full title and the href.
                        title_el = await pod.query_selector("h3 a")
                        title = await title_el.get_attribute("title")
                        rel_url = await title_el.get_attribute("href")

                        price_el = await pod.query_selector(".price_color")
                        price = _parse_price(await price_el.inner_text())
                    except Exception as exc:
                        # One malformed book must not discard the whole page.
                        print(f"   ...Skipping a book on page {page_num}: {exc}")
                        continue

                    book_id += 1
                    # Create a simple description so the AI has something to read later.
                    desc = f"A book titled '{title}' discussing various themes."
                    # The href is stored as found on the page (relative to the catalogue).
                    rows.append([book_id, title, price, desc, rel_url])

                # Append this page's books so earlier pages are kept if a later one fails.
                with open(BOOKS_FILE, "a", newline="", encoding="utf-8") as f:
                    csv.writer(f).writerows(rows)

                # Every 10 pages, print a message so we know it's working.
                if page_num % 10 == 0:
                    print(f"   ...Scraped {page_num} pages ({book_id} books so far)")
        finally:
            # Close the browser even if something unexpected went wrong.
            await browser.close()
    print(f"PHASE 1 COMPLETE. Collected {book_id} books.")

# ------------------------------------------------------------------
# PHASE 2: SCRAPE NEWS HEADLINES
# This function gets the latest news from CNN.
# ------------------------------------------------------------------
async def scrape_headlines():
    """Collect headlines from lite.cnn.com and save them to NEWS_FILE.

    Only the first 15 ``li a`` links on the page are examined, and a link is
    kept as a headline when its trimmed text is longer than 20 characters.
    NEWS_FILE is written with a single ``Headline`` column. A warning is
    printed when no headline was found, because the pricing phase has nothing
    to compare the books against in that case.
    """
    print("\nPHASE 2: Reading the News...")
    headlines = []  # Headline texts, in page order

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            # Go to the lightweight version of CNN (it loads fast).
            await page.goto("https://lite.cnn.com/", timeout=60000)

            # Find all the list items that are links.
            elements = await page.query_selector_all("li a")

            # Look at the top 15 links.
            for e in elements[:15]:
                # Trim surrounding whitespace so it neither counts towards the
                # length check nor ends up in the CSV.
                text = (await e.inner_text()).strip()
                # Longer than 20 characters: probably a real headline.
                if len(text) > 20:
                    headlines.append(text)
        finally:
            # Close the browser even if loading or reading the page failed.
            await browser.close()

    # Save the headlines to a new CSV file (header row first).
    with open(NEWS_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["Headline"])
        writer.writerows([h] for h in headlines)

    if not headlines:
        print("WARNING: No headlines were found; the news file contains only its header.")
    print("PHASE 2 COMPLETE. Headlines saved.")

# ======================================================================
# ⭐ MAIN SEMANTIC ANALYSIS HIGHLIGHT
# ======================================================================
# This is the most important part of the code.
# We load a local sentence-embedding model (SentenceTransformer).
# It converts text (like "War") into lists of numbers (vectors).
# By comparing a book's vector with the vectors of the news headlines,
# we get a similarity score that estimates how related they are.
# ======================================================================

# Load the local sentence-embedding model once, at module level, so that
# embed_news and semantic_pricing_engine share the same instance.
# NOTE: loading is not silent — the first run prints download progress bars
# (and, on Colab, a Hugging Face token notice) in the cell output.
semantic_model = SentenceTransformer("all-MiniLM-L6-v2")

# Function to turn news headlines into math numbers (vectors)
def embed_news(headlines):
    """Encode the news headlines with the shared ``semantic_model``.

    Args:
        headlines: The headline strings to embed.

    Returns:
        The result of ``semantic_model.encode(headlines)``.
    """
    news_vectors = semantic_model.encode(headlines)
    return news_vectors

# Function to decide the price of ONE book
def semantic_pricing_engine(
    book,
    news_embeddings,
    *,
    increase_threshold=0.15,
    decrease_threshold=0.06,
    increase_factor=1.15,
    decrease_factor=0.90,
):
    """Decide the new price of ONE book from its similarity to the news.

    The book's title and description are embedded with ``semantic_model`` and
    compared with every news embedding using cosine similarity; the highest
    similarity drives the decision.

    Args:
        book: A row in BOOKS_FILE column order; index 1 is the title, index 2
            the price (anything ``float()`` accepts) and index 3 the
            description.
        news_embeddings: Embeddings of the news headlines, as produced by
            ``embed_news``. May be empty or ``None``.
        increase_threshold: Scores strictly above this raise the price.
        decrease_threshold: Scores strictly below this lower the price.
        increase_factor: Multiplier applied when raising (1.15 = +15%).
        decrease_factor: Multiplier applied when lowering (0.90 = -10%).

    Returns:
        A tuple ``(action, new_price, reason, max_score)`` where ``action`` is
        "INCREASE", "DECREASE" or "STABLE", ``new_price`` is rounded to two
        decimals and ``max_score`` is a plain ``float``.
    """
    title = book[1]
    desc = book[3]
    old_price = float(book[2])

    # With nothing to compare against there is no signal: keep the price.
    # (cosine_similarity would otherwise fail on an empty set of news vectors.)
    if news_embeddings is None or len(news_embeddings) == 0:
        return "STABLE", round(old_price, 2), "No headlines to compare against", 0.0

    # 1. Turn this book's title and description into a vector.
    book_vec = semantic_model.encode([f"{title}. {desc}"])

    # 2. Compare the book's vector with every news vector. Cosine similarity
    #    can in principle range from -1.0 to 1.0; higher means more similar.
    sims = cosine_similarity(book_vec, news_embeddings)[0]

    # Keep only the best match, converted to a plain Python float.
    max_score = float(sims.max())

    # 3. Decision logic. The default thresholds (0.15 / 0.06) were tuned by
    #    hand for this data set; pass other values to re-tune.
    if max_score > increase_threshold:
        action = "INCREASE"
        new_price = old_price * increase_factor
        reason = "Relevant to current news"
    elif max_score < decrease_threshold:
        action = "DECREASE"
        new_price = old_price * decrease_factor
        reason = "Low relevance, discounting"
    else:
        # Scores between the two thresholds leave the price unchanged.
        action = "STABLE"
        new_price = old_price
        reason = "No strong trend match"

    return action, round(new_price, 2), reason, max_score

# ------------------------------------------------------------------
# GEMINI CALIBRATION (SILENT FAIL-OVER)
# We try to use Google's Cloud AI for a summary.
# If it is busy (Error 429), we just skip it silently.
# ------------------------------------------------------------------
def gemini_calibration(headlines):
    """Ask Gemini for a short commentary on the headlines and print it.

    This step is optional and best-effort: if the request fails for any
    ordinary reason (for example the service is busy or no API key is
    configured), nothing is printed and the pipeline carries on. The failure
    is recorded at INFO level, which is hidden under the default logging
    configuration but can be enabled when debugging.

    Args:
        headlines: The headline strings to include in the prompt.

    Returns:
        None.
    """
    prompt = f"""
    You are an AI pricing expert.
    Based on these headlines: {json.dumps(headlines)}
    Explain how global news influences book demand.
    """
    try:
        # Send the question to Google and read the answer before printing
        # anything, so a failure never leaves a dangling header on screen.
        response = model.generate_content(prompt)
        insight = response.text
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C / task cancellation
        # still work; stay quiet on screen but keep a trace for debugging.
        logging.getLogger(__name__).info("Gemini insight skipped: %s", exc)
        return

    print("\n[Gemini Insight]:")
    print(insight)

# ------------------------------------------------------------------
# FULL PIPELINE
# This function runs Phase 3: Loading data and calculating prices.
# ------------------------------------------------------------------
async def process_pricing():
    """Phase 3: price every book in BOOKS_FILE against the saved headlines.

    Reads NEWS_FILE and BOOKS_FILE (skipping their header rows and any blank
    or incomplete rows), optionally prints a Gemini commentary, embeds the
    headlines once, then writes one row per book to OUTPUT_FILE with the
    columns ``ID, Title, Old Price, New Price, Action, Reason``.

    If no headlines are available, every book is written as STABLE with its
    original price instead of failing.
    """
    print("\nPHASE 3: AI Pricing Analysis (Processing 1000 books)...")

    # newline="" lets the csv module handle line endings itself, which is
    # required for fields that contain embedded newlines.
    with open(NEWS_FILE, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header row (if the file has one)
        headlines = [row[0] for row in reader if row and row[0].strip()]

    with open(BOOKS_FILE, "r", newline="", encoding="utf-8") as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip the header row (if the file has one)
        # The pricing step reads columns 0-3, so drop blank/short rows.
        books = [row for row in reader if len(row) >= 4]

    if headlines:
        # Try to get Gemini's opinion (it skips itself if the call fails).
        gemini_calibration(headlines)
        # Embed the headlines once; the vectors are reused for every book.
        news_vecs = embed_news(headlines)
    else:
        print("WARNING: No headlines available; all prices are left unchanged.")
        news_vecs = None

    with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["ID", "Title", "Old Price", "New Price", "Action", "Reason"])

        for book in books:
            if news_vecs is None:
                # Nothing to compare against: keep the current price.
                action, new_p, reason, score = (
                    "STABLE", round(float(book[2]), 2), "No headlines available", 0.0
                )
            else:
                action, new_p, reason, score = semantic_pricing_engine(book, news_vecs)

            writer.writerow([
                book[0], book[1], book[2], new_p, action,
                f"{reason} (Score: {score:.2f})"
            ])

    print("PHASE 3 COMPLETE. Strategy Saved.")

# ------------------------------------------------------------------
# MAIN FUNCTION
# This is the master function that runs everything in order.
# ------------------------------------------------------------------
async def main():
    """Run the three pipeline phases in order, then report completion."""
    # Order matters: the pricing phase reads the files that the two scraping
    # phases write.
    pipeline = (
        scrape_all_books,   # Step 1: Get books
        scrape_headlines,   # Step 2: Get news
        process_pricing,    # Step 3: Calculate prices
    )
    for run_phase in pipeline:
        await run_phase()
    print("\n‚úÖ ALL DONE! Files created.")

# Start the program!
# A top-level `await` like this only works inside a notebook/IPython cell.
# In a plain Python script, use `asyncio.run(main())` instead.
await main()

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m46.0/46.0 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Chromium 143.0.7499.4 (playwright build v1200)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1200/chromium-linux.zip[22m
[1G164.7 MiB [] 0% 0.0s[0K[1G164.7 MiB [] 0% 45.8s[0K[1G164.7 MiB [] 0% 15.2s[0K[1G164.7 MiB [] 0% 15.4s[0K[1G164.7 MiB [] 0% 9.9s[0K[1G164.7 MiB [] 1% 5.4s[0K[1G164.7 MiB [] 2% 4.0s[0K[1G164.7 MiB [] 3% 3.4s[0K[1G164.7 MiB [] 4% 3.4s[0K[1G164.7 MiB [] 4% 3.2s[0K[1G164.7 MiB [] 5% 2.8s[0K[1G164.7 MiB [] 6% 2.6s[0K[1G164.7 MiB [] 7% 2.4s[0K[1G164.7 MiB [] 8% 2.4s[0K[1G164.7 MiB [] 9% 2.3s[0K[1G164.7 MiB [] 10% 2.3s[0K[1G164.7 MiB [] 11% 2.1s[0K[1G164.7 MiB [] 12% 2.0s[0K[1G164.7 MiB [] 13% 1.9s[0K[1G164.7 MiB [] 14% 1.9s[0K[1G164.7 MiB [] 15% 1.9s[0K[1G164.7 MiB [] 16% 1.8s[0K

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]


PHASE 1: Collecting ALL 1000 books (This takes about 2-3 mins)...
   ...Scraped 10 pages (200 books so far)
   ...Scraped 20 pages (400 books so far)
   ...Scraped 30 pages (600 books so far)
   ...Scraped 40 pages (800 books so far)
   ...Scraped 50 pages (1000 books so far)
PHASE 1 COMPLETE. Collected 1000 books.

PHASE 2: Reading the News...
PHASE 2 COMPLETE. Headlines saved.

PHASE 3: AI Pricing Analysis (Processing 1000 books)...
PHASE 3 COMPLETE. Strategy Saved.

‚úÖ ALL DONE! Files created.
