<a href="https://colab.research.google.com/github/sergekamanzi/Web-Scraping/blob/main/webscraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
%%capture
# Step 1: Install Required Libraries
!pip install requests beautifulsoup4 pandas matplotlib seaborn

In [14]:
# Step 2: Import Required Libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [15]:
# Step 3: Define the Scraper Function
def scrape_alibaba_products(url="", timeout=10):
    """Scrape product name, price and link from a single listing page.

    Parameters
    ----------
    url : str, optional
        Address of the page to download. Defaults to an empty string, in
        which case nothing is requested and an empty list is returned.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10).

    Returns
    -------
    list of dict
        One dict per product with the keys "Product Name", "Price" and
        "Product Link". The list is empty when no URL is given, the request
        fails, the status code is not 200, or no product container matches.
    """
    # An empty URL cannot be requested; report it the same way as other failures.
    if not url:
        print("No URL provided. Nothing to scrape.")
        return []

    # Set headers to mimic a browser
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }

    # Without a timeout the request could block forever on an unresponsive host.
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        print("Failed to retrieve the website. Error:", exc)
        return []

    if response.status_code != 200:
        print("Failed to retrieve the website. Status Code:", response.status_code)
        return []

    soup = BeautifulSoup(response.text, "html.parser")

    # Find all product containers (Modify selector based on Alibaba's structure)
    product_containers = soup.find_all("div", class_="item-main")

    product_list = []

    # Everything below works on the page that is already downloaded, so no
    # delay between items is needed.
    for product in product_containers:
        try:
            name = product.find("h2").text.strip()
            price = product.find("span", class_="price").text.strip()
            link = product.find("a")["href"]
        except (AttributeError, TypeError, KeyError):
            # AttributeError: <h2> or price <span> missing (find() gave None).
            # TypeError: no <a> tag at all (None is not subscriptable).
            # KeyError: <a> tag present but without an href attribute.
            continue  # Skip if any data is missing

        product_list.append({
            "Product Name": name,
            "Price": price,
            "Product Link": link
        })

    return product_list


In [12]:
# Step 4: Run the scraper and, if it returned anything, persist the rows to CSV
products = scrape_alibaba_products()
if not products:
    # Nothing came back, so no DataFrame is built and no file is written.
    print("❌ No data scraped.")
else:
    df = pd.DataFrame(data=products)
    df.to_csv("alibaba_products.csv", index=False)
    print("✅ Data scraped and saved successfully!")


❌ No data scraped.


In [None]:
# Step 5: Load Data and Clean Price Column
# Read Price as text so that type inference does not turn purely numeric
# values (e.g. "12.50") into floats; every price then reaches the cleaning
# step in the same textual form it was saved in.
df = pd.read_csv("alibaba_products.csv", dtype={"Price": str})

# Convert Price column to numerical values
def clean_price(price):
    """Convert a raw price value into a float.

    Parameters
    ----------
    price : object
        A price as text (for example "$1,200.50 - $1,500") or as a number.

    Returns
    -------
    float or None
        For text: the part before the first "-" with "$" and "," removed,
        as a float, or None if that part is not a plain non-negative decimal.
        For int/float input: the value as a float, or None if it is NaN.
        None for booleans and every other type.
    """
    # bool is a subclass of int but never a meaningful price.
    if isinstance(price, bool):
        return None

    # Numeric input occurs when a CSV column was parsed as numbers.
    if isinstance(price, (int, float)):
        value = float(price)
        # NaN is the only float that is not equal to itself.
        return None if value != value else value

    if not isinstance(price, str):
        return None

    # Keep only the lower bound of a range such as "1.50 - 2.00".
    lower_bound = price.replace("$", "").replace(",", "").split("-")[0].strip()
    if not lower_bound.replace('.', '', 1).isdigit():
        return None

    # isdigit() also accepts characters such as superscript digits, which
    # float() rejects; treat those as unparseable instead of raising.
    try:
        return float(lower_bound)
    except ValueError:
        return None

df["Cleaned Price"] = df["Price"].apply(clean_price)
# Keep only rows whose price could be parsed. Rebinding the name instead of
# using inplace=True leaves the frame returned by dropna() as the single
# source of truth for the following cells.
df = df.dropna(subset=["Cleaned Price"])


In [None]:

# Step 6: Data Analysis & Visualization
# Histogram of the cleaned prices (20 bins) with a KDE curve on top.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(df["Cleaned Price"], bins=20, kde=True, ax=ax)
ax.set_xlabel("Price (in USD)")
ax.set_ylabel("Count of Products")
ax.set_title("Price Distribution of Products on Alibaba")
plt.show()


In [None]:
# Step 7: Top 10 Most Expensive Products
# Sort by cleaned price, highest first, and keep the first ten rows.
top_10_expensive = df.sort_values("Cleaned Price", ascending=False).iloc[:10]

# Horizontal bars: price on the x-axis, one bar per product name on the y-axis.
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    data=top_10_expensive,
    x="Cleaned Price",
    y="Product Name",
    palette="coolwarm",
    ax=ax,
)
ax.set_xlabel("Price (in USD)")
ax.set_ylabel("Product Name")
ax.set_title("Top 10 Most Expensive Products")
plt.show()


In [None]:
# Step 8: Display Data Table
# ace_tools is not one of the packages installed in Step 1, so it may be
# missing from the environment; use it when present and otherwise fall back
# to display(), which the IPython kernel provides without an import.
try:
    import ace_tools as tools
except ImportError:
    tools = None

if tools is not None:
    tools.display_dataframe_to_user(name="Alibaba Products Data", dataframe=df)
else:
    display(df)
