In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import re
import pandas as pd
import numpy as np
import time
import asyncio
from requests_html import HTMLSession, AsyncHTMLSession

In [62]:
# URL to scrape
base_url = "https://www.vogue.com/fashion-shows/seasons"

In [106]:
# Function to get all the fashion show and slideshow URLs under /seasons
def get_fashion_season_urls(base_url):
    # Request the main seasons page
    response = requests.get(base_url)
    if response.status_code != 200:
        print(f"Failed to retrieve the page: {response.status_code}")
        return []

    # Parse the page content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all 'a' tags that contain fashion show URLs
    show_urls = []
    for a_tag in soup.find_all('a', href=True):
        href = a_tag['href']
        # Filter URLs that contain "/fashion-shows/" and are not general category pages
        if "/fashion-shows/" in href and not href.endswith(("seasons", "latest-shows", "designers", "featured", "image-archive")):
            # Create full URL by joining base URL with relative paths
            full_url = urljoin(base_url, href)
            # Append the full URL to the list
            show_urls.append(full_url)

    # Remove duplicates
    show_urls = list(set(show_urls))

    return show_urls

    
# Function to get all slideshow URLs from a single fashion show page
def get_show_urls(show_url, verbose=False):
    slideshow_links = []
    show_name = re.sub("^.*fashion-shows/", '', show_url)
    if verbose: print(show_name)
    
    # Find slideshow links (ending with #1, #2, etc.)
    pattern = re.compile(r'.*SummaryItem.*')

    for _ in range(50):
        response = requests.get(show_url)
        if response.status_code != 200:
            print(f"Failed to retrieve the show page: {response.status_code}")
            return []

        soup = BeautifulSoup(response.content, "html.parser")

        for a_tag in soup.find_all('a', {'class': pattern}):
            href = a_tag.get('href')
            if show_name in href:
                full_url = urljoin(show_url, href)
                full_url += "/slideshow/collection#1"
                slideshow_links.append(full_url)
        
        if len(slideshow_links) > 0: break
    
    # Remove duplicates and return slideshow URLs
    if len(slideshow_links) > 0: return list(set(slideshow_links))
    else: print("WARNING:", show_name, "NOT CONNECTED")


In [108]:
# Call the function and print the result
fashion_season_urls = get_fashion_season_urls(base_url)

shows_urls = list(map(lambda item: get_show_urls(item, verbose=True), fashion_season_urls))
# print(shows_urls)

bridal-fall-2018
fall-2012-ready-to-wear
resort-2025
lagos-spring-2024
copenhagen-spring-2022
australia-spring-2015
spring-2015-menswear
fall-1996-couture
sao-paulo-resort-2017
kiev-spring-2019
resort-2019
resort-2007
spring-2009-couture
madrid-fall-2023
spring-2021-ready-to-wear
fall-2004-ready-to-wear
fall-2021-ready-to-wear
spring-2009-menswear
seoul-spring-2018
berlin-fall-2017
fall-2024-ready-to-wear
kiev-spring-2016
tbilisi-fall-2018
fall-2007-couture
fall-1989-ready-to-wear
fall-2002-menswear
fall-1994-ready-to-wear
copenhagen-spring-2019
pre-fall-2024-menswear
resort-2012
pre-fall-2025
shanghai-fall-2024
resort-2016
spring-1992-ready-to-wear
fall-1988-ready-to-wear
fall-2000-couture
spring-2022-ready-to-wear
stockholm-fall-2019
fall-2011-menswear
spring-1999-menswear
shanghai-spring-2023
kiev-spring-2018
berlin-spring-2025
spring-2008-couture
fall-2013-menswear
tbilisi-fall-2017
tokyo-spring-2024
fall-2010-ready-to-wear
resort-2021
ukraine-fall-2016
bridal-fall-2019
spring-1995

In [117]:
for i in range(len(shows_urls)):
    if len(shows_urls[i]) == 0:
        print(i, fashion_season_urls[i], shows_urls[i])
        print("RERUN")
        shows_urls[i] = get_show_urls(fashion_season_urls[i])
        print(shows_urls[i])

In [125]:
data_dict = {'season': [re.sub("^.*fashion-shows/", '', url) for url in fashion_season_urls],
        'season_url': fashion_season_urls,
        'show_url': shows_urls}

df_shows = pd.DataFrame(data_dict)
df_shows.to_csv('data/shows_season.csv', index_label='season_ID')

In [131]:
df_long = df_shows.explode('show_url')

# Reset the index if necessary
df_long = df_long.reset_index(drop=True)
df_long['year'] = df_long['season'].str.extract(r'(\d{4})')
df_long.to_csv('data/shows_season_long.csv', index_label='show_ID')

In [132]:
df_long

Unnamed: 0,season,season_url,show_url,year
0,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018
1,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018
2,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018
3,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018
4,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018
...,...,...,...,...
4305,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015
4306,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015
4307,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015
4308,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015


In [2]:
# from selenium import webdriver
# from selenium.webdriver.chrome.service import Service
# from selenium.webdriver.common.by import By
# from webdriver_manager.chrome import ChromeDriverManager

# # Setup Chrome options
# options = webdriver.ChromeOptions()
# options.add_argument("--start-maximized")  # Start maximized
# options.add_argument("--no-sandbox")       # Bypass OS security model
# options.add_argument("--disable-dev-shm-usage")  # Overcome limited resource problems

In [2]:
div_pattern = re.compile(r'RunwayGalleryLookNumberText.*')

asession = AsyncHTMLSession()

df_long = pd.read_csv('data/shows_season_long.csv')

In [3]:
async def get_npic(url, max_retries=50, delay=1):
    attempt = 0
    while attempt < max_retries:
        try:
            response = await asession.get(url)
    
            # Render the JavaScript on the page (if needed)
            await response.html.arender()

            
            soup = BeautifulSoup(response.html.html, "html.parser")

            div = soup.find('div', {'class': div_pattern})

            if div:
                npics = int(div.text.strip().split()[-1])
                return (url, npics)  # return the found <div> with indentation
            else:
                attempt += 1
                continue
        except Exception as e:
            attempt += 1
            print(f"Error scraping {url}")

    return None


async def run_tasks(s):
    tasks = [get_npic(arg) for arg in s]
    results = await asyncio.gather(*tasks)
    return results

In [4]:
df_long['total_pics'] = pd.Series([None] * len(df_long))
df_long

Unnamed: 0,show_ID,season,season_url,show_url,year,total_pics
0,0,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,
1,1,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,
2,2,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,
3,3,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,
4,4,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,
...,...,...,...,...,...,...
4305,4305,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015,
4306,4306,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015,
4307,4307,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015,
4308,4308,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015,


In [None]:
npics_list = await run_tasks(df_long['show_url'])
npics_list

In [6]:
# Set up the Chrome driver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
driver.set_page_load_timeout(15)

df_long['total_pics'][:100] = df_long['show_url'][:100].apply(get_npic)

# Close the Selenium WebDriver after scraping is complete
driver.quit()

Error scraping https://www.vogue.com/fashion-shows/bridal-fall-2018/marchesa/slideshow/collection#1
Error scraping https://www.vogue.com/fashion-shows/bridal-fall-2018/elie-saab/slideshow/collection#1
Error scraping https://www.vogue.com/fashion-shows/resort-2025/alexander-mcqueen/slideshow/collection#1
Error scraping https://www.vogue.com/fashion-shows/lagos-spring-2024/orange-culture/slideshow/collection#1
Error scraping https://www.vogue.com/fashion-shows/sao-paulo-resort-2017/patbo/slideshow/collection#1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_long['total_pics'][:100] = df_long['show_url'][:100].apply(get_npic)


In [7]:
df_long

Unnamed: 0,show_ID,season,season_url,show_url,year,total_pics
0,0,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,12
1,1,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,15
2,2,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,19
3,3,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,24
4,4,bridal-fall-2018,https://www.vogue.com/fashion-shows/bridal-fal...,https://www.vogue.com/fashion-shows/bridal-fal...,2018,23
...,...,...,...,...,...,...
4305,4305,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015,
4306,4306,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015,
4307,4307,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015,
4308,4308,copenhagen-fall-2015,https://www.vogue.com/fashion-shows/copenhagen...,https://www.vogue.com/fashion-shows/copenhagen...,2015,
