<a href="https://colab.research.google.com/github/sergekamanzi/Web-Scraping/blob/main/webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [50]:
%%capture
# Step 1: Install Required Libraries
!pip install requests beautifulsoup4 pandas matplotlib seaborn

In [51]:
# Step 2: Import Required Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [59]:
# Step 3: Define the Scraper Function
def scrape_alibaba_products():
    url = "https://www.alibaba.com/premium/shopping_sights.html?src=sem_ggl&field=UG&from=sem_ggl&cmpgn=20784571611&adgrp=155419523666&fditm=&tgt=kwd-300951543343&locintrst=&locphyscl=1012087&mtchtyp=b&ntwrk=g&device=c&dvcmdl=&creative=681578485863&plcmnt=&plcmntcat=&aceid=&position=&gad_source=1&gclid=Cj0KCQiA8fW9BhC8ARIsACwHqYpfLHC2r0KygiKdCbXHiH0-L3G4NW9M9L-MWtfpB3JpOn4IvZFfxL0aAjugEALw_wcB"

    # Set headers to mimic a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print("Failed to retrieve the website. Status Code:", response.status_code)
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all product containers (Modify selector based on Alibaba's structure)
    product_containers = soup.find_all("div", class_="item-main")

    product_list = []

    for product in product_containers:
        try:
            name = product.find("h2").text.strip()
            price = product.find("span", class_="price").text.strip()
            link = product.find("a")["href"]

            product_list.append({
                "Product Name": name,
                "Price": price,
                "Product Link": link
            })
        except AttributeError:
            continue  # Skip if any data is missing

        time.sleep(1)  # Respect Alibaba's servers

    return product_list


In [60]:
# Step 4: Scrape Data & Store in DataFrame
products = scrape_alibaba_products()
if products:
    df = pd.DataFrame(products)
    df.to_csv("alibaba_products.csv", index=False)
    print("✅ Data scraped and saved successfully!")
else:
    print("❌ No data scraped.")


❌ No data scraped.


In [61]:
# Step 5: Load Data and Clean Price Column
df = pd.read_csv("alibaba_products.csv")

# Convert Price column to numerical values
def clean_price(price):
    if isinstance(price, str):
        price = price.replace("$", "").replace(",", "").split("-")[0].strip()
        return float(price) if price.replace('.', '', 1).isdigit() else None
    return None

df["Cleaned Price"] = df["Price"].apply(clean_price)
df.dropna(subset=["Cleaned Price"], inplace=True)


FileNotFoundError: [Errno 2] No such file or directory: 'alibaba_products.csv'

In [None]:

# Step 6: Data Analysis & Visualization
plt.figure(figsize=(12,6))
sns.histplot(df["Cleaned Price"], bins=20, kde=True)
plt.xlabel("Price (in USD)")
plt.ylabel("Count of Products")
plt.title("Price Distribution of Products on Alibaba")
plt.show()


In [None]:
# Step 7: Top 10 Most Expensive Products
top_10_expensive = df.sort_values(by="Cleaned Price", ascending=False).head(10)
plt.figure(figsize=(12,6))
sns.barplot(y=top_10_expensive["Product Name"], x=top_10_expensive["Cleaned Price"], palette="coolwarm")
plt.xlabel("Price (in USD)")
plt.ylabel("Product Name")
plt.title("Top 10 Most Expensive Products")
plt.show()


In [None]:
# Step 8: Display Data Table
import ace_tools as tools
tools.display_dataframe_to_user(name="Alibaba Products Data", dataframe=df)


In [44]:
import requests
from bs4 import BeautifulSoup
import csv
import time
import pandas as pd

# Headers to mimic a browser visit
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9"
}

BASE_URL = "https://www.ebay.com/globaldeals"


def get_soup(url):
    """Fetch page content and return BeautifulSoup object"""
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        return BeautifulSoup(response.text, "html.parser")
    else:
        print(f"Failed to retrieve page: {response.status_code}")
        return None


def scrape_ebay():
    """Scrape product details from eBay search results"""
    products = []

    for page in range(1, 3):  # Scrape first 2 pages
        print(f"Scraping page {page}...")
        url = f"{BASE_URL}?_pgn={page}"
        soup = get_soup(url)

        if not soup:
            continue

        for item in soup.select(".s-item"):
            name_tag = item.select_one(".s-item__title")
            price_tag = item.select_one(".s-item__price")
            link_tag = item.select_one(".s-item__link")

            if name_tag and price_tag and link_tag:
                name = name_tag.text.strip()
                price = price_tag.text.strip()
                link = link_tag["href"]

                products.append([name, price, link])

        time.sleep(2)  # Pause to avoid getting blocked

    return products


def save_to_csv(data, filename="ebay_products.csv"):
    """Save scraped data to CSV"""
    with open(filename, "w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["Product Name", "Price", "Product Link"])
        writer.writerows(data)
    print(f"Data saved to {filename}")


def display_data(filename="ebay_products.csv"):
    """Load and display data from CSV"""
    df = pd.read_csv(filename)
    print(df.head(12))  # Display first 12 rows
    return df

# Run the scraper
product_data = scrape_ebay()
save_to_csv(product_data)
display_data()

Scraping page 1...
Scraping page 2...
Data saved to ebay_products.csv
Empty DataFrame
Columns: [Product Name, Price, Product Link]
Index: []


Unnamed: 0,Product Name,Price,Product Link


In [46]:
import pandas as pd
data = pd.read_csv("/content/ebay_products.csv")
data.head()

Unnamed: 0,Product Name,Price,Product Link
