## Task 2 Webscraping
### By Romina Goodarzi

Please open this notebook on Google Colab.

In [1]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_list_page(url, site_name, timeout=30):
    """Scrape up to the first ten products from a category (list) page.

    Args:
        url: Address of the list page to download.
        site_name: One of ``'missetam'``, ``'gap'`` or
            ``'your_look_for_less'``; selects which CSS classes are searched.
            Any other value yields an empty list (the page is still fetched).
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled server would block the call forever.

    Returns:
        A list of dicts, one per product container found (at most ten), each
        with the keys ``url``, ``page_type`` (always ``'list'``), ``position``
        (1-based index on the page), ``name``, ``price`` and
        ``discounted_price``. A field whose tag is not found is ``None``.

    Note:
        The HTTP status code is not checked; an error page is parsed like any
        other page and simply produces no products.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    def stripped(tag):
        # Text of a tag without surrounding whitespace, or None if absent.
        return tag.text.strip() if tag is not None else None

    def euro(tag):
        # Price text with the non-breaking space after the euro sign
        # replaced by a regular space, or None if the tag is absent.
        return tag.text.replace('€\xa0', '€ ') if tag is not None else None

    # Sites whose list pages share one layout: a <div> container holding an
    # <a> for the name and two <span>s for the prices.
    # (container class, name class, price class, old-price class)
    simple_layouts = {
        'missetam': ('product-item', 'product-item-link', 'price', 'old-price'),
        'gap': ('product-card', 'product-card__link',
                'product-card__price', 'product-card__old-price'),
    }

    products = []
    if site_name in simple_layouts:
        container_cls, name_cls, price_cls, old_price_cls = simple_layouts[site_name]
        containers = soup.find_all('div', class_=container_cls)[:10]
        for position, container in enumerate(containers, start=1):
            products.append({
                'url': url,
                'page_type': 'list',
                'position': position,
                'name': stripped(container.find('a', class_=name_cls)),
                'price': stripped(container.find('span', class_=price_cls)),
                # NOTE(review): this column is filled from the "old price"
                # element - confirm that is the intended meaning.
                'discounted_price': stripped(
                    container.find('span', class_=old_price_cls)),
            })
    elif site_name == 'your_look_for_less':
        # NOTE(review): these look like build-generated class names and may
        # change when the site is redeployed - verify before relying on them.
        containers = soup.find_all('div', class_='sc-b18a510e-3 fBMEnw')[:10]
        for position, container in enumerate(containers, start=1):
            name_tag = container.find(
                'strong', class_='sc-fed992a6-0 eJZrEW sc-b18a510e-5 cbyClg')
            price_box = container.find('div', class_='sc-7f1e497e-1 kPIifX')

            # Default to "not found" so both price keys are always present,
            # matching the other sites, even when the price box is missing.
            sale_tag = regular_tag = None
            if price_box is not None:
                sale_tag = price_box.find('span', class_='sc-49115527-0 bkKyDO')
                regular_tag = price_box.find('span', class_='sc-49115527-0 bVjZVj')

            products.append({
                'url': url,
                'page_type': 'list',
                'position': position,
                'name': name_tag.text if name_tag is not None else None,
                'price': euro(sale_tag),
                'discounted_price': euro(regular_tag),
            })
    return products

def scrape_product_page(url, site_name, timeout=30):
    """Scrape the details of a single product page.

    Args:
        url: Address of the product page to download.
        site_name: One of ``'missetam'``, ``'gap'`` or
            ``'your_look_for_less'``; selects which tags and CSS classes are
            searched.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without a timeout a stalled server would block the call forever.

    Returns:
        A dict that always contains ``url`` and ``page_type`` (``'product'``).
        For a known ``site_name`` it also contains ``name``, ``price``,
        ``discounted_price``, ``brand``, ``description`` (each the stripped
        text of the first matching tag, or ``None`` if no tag matches),
        ``number_of_photos`` (count of matching ``<img>`` tags) and
        ``number_of_colors`` (count of item tags anywhere inside the colour
        container, or ``0`` if the container is missing).

    Note:
        The HTTP status code is not checked; an error page is parsed like any
        other page and yields ``None``/``0`` fields.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Per-site (tag, css class) pairs. 'color_item' is the tag counted inside
    # the colour container.
    # NOTE(review): the 'sc-...' names look build-generated and may change
    # when that site is redeployed - verify before relying on them.
    site_selectors = {
        'missetam': {
            'name': ('h1', 'product-name'),
            'price': ('span', 'price'),
            'discounted_price': ('span', 'old-price'),
            'brand': ('div', 'product-brand'),
            'photo': ('img', 'gallery-image'),
            'color_container': ('div', 'product-colors'),
            'color_item': 'div',
            'description': ('div', 'product-description'),
        },
        'gap': {
            'name': ('h1', 'pdp-title'),
            'price': ('span', 'pdp-price'),
            'discounted_price': ('span', 'pdp-price-old'),
            'brand': ('a', 'pdp-brand'),
            'photo': ('img', 'pdp-thumbnail'),
            'color_container': ('ul', 'pdp-color-options'),
            'color_item': 'li',
            'description': ('div', 'pdp-description'),
        },
        'your_look_for_less': {
            'name': ('h1', 'sc-b18a510e-0'),
            'price': ('span', 'sc-7f1e497e-2'),
            'discounted_price': ('span', 'sc-49115527-0'),
            'brand': ('div', 'sc-298c6716-0'),
            'photo': ('img', 'sc-b9c7e565-0'),
            'color_container': ('div', 'sc-375b6dc6-0'),
            'color_item': 'div',
            'description': ('div', 'sc-7f1e497e-4'),
        },
    }

    product = {'url': url, 'page_type': 'product'}
    selectors = site_selectors.get(site_name)
    if selectors is None:
        # Unknown site: only the identifying fields are returned.
        return product

    def first_text(field):
        # Stripped text of the first tag matching the field's selector,
        # or None when the page has no such tag.
        tag_name, css_class = selectors[field]
        tag = soup.find(tag_name, class_=css_class)
        return tag.text.strip() if tag is not None else None

    photo_tag, photo_class = selectors['photo']
    container_tag, container_class = selectors['color_container']
    color_container = soup.find(container_tag, class_=container_class)

    product['name'] = first_text('name')
    product['price'] = first_text('price')
    product['discounted_price'] = first_text('discounted_price')
    product['brand'] = first_text('brand')
    product['number_of_photos'] = len(soup.find_all(photo_tag, class_=photo_class))
    product['number_of_colors'] = (
        len(color_container.find_all(selectors['color_item']))
        if color_container is not None else 0
    )
    product['description'] = first_text('description')

    return product

def main():
    """Scrape one list page and one product page per site into products.csv.

    Each site's list page is scraped with ``scrape_list_page`` and each
    product page with ``scrape_product_page``; all resulting rows are written
    to ``products.csv`` in the current working directory, overwriting any
    existing file. Columns a row does not provide are left empty.
    """
    # NOTE(review): both gap.com URLs end mid-word ('...Categor', '...dynami')
    # and look truncated - confirm them against the intended pages.
    list_pages = {
        'missetam': "https://www.missetam.nl/nl/collectie/jurken/",
        'gap': "https://www.gap.com/browse/category.do?cid=5664&nav=meganav%3AWomen%3ACategor",
        'your_look_for_less': "https://www.your-look-for-less.nl/goedkope-blouses"
    }

    product_pages = {
        'missetam': "https://www.missetam.nl/nl/3844173/top-print-zwart/",
        'gap': "https://www.gap.com/browse/product.do?pid=794603002&rrec=true&mlink=5001,1,dynami",
        'your_look_for_less': "https://www.your-look-for-less.nl/p/99055"
    }

    fieldnames = ['url', 'page_type', 'name', 'brand', 'price', 'discounted_price', 'position', 'number_of_photos', 'number_of_colors', 'description']

    all_products = []

    # List pages contribute several rows each, product pages exactly one.
    for site_name, url in list_pages.items():
        all_products.extend(scrape_list_page(url, site_name))

    for site_name, url in product_pages.items():
        all_products.append(scrape_product_page(url, site_name))

    # Scraped prices contain non-ASCII characters such as the euro sign, so
    # the encoding is fixed to UTF-8 instead of depending on the platform's
    # locale. newline='' is what the csv module requires for its own
    # line-ending handling.
    with open('products.csv', 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(all_products)

# Run the scraper only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()


In [2]:
pip install requests beautifulsoup4 pandas



## Thank You!
I welcome feedback - if you find something interesting, spot an error, or have any comments, please let me know.