In [1]:
# Dataset Source:
# This dataset was collected from publicly available knitting and crochet pattern pages on Ravelry.
# Official website: https://www.ravelry.com
# The collected metadata includes pattern titles, designer names, descriptions, fiber types, skill levels, and tags.
# The data is used exclusively for research purposes to enhance pattern search functionality using NLP.

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Scrapes raw data from Ravelry.com pattern pages and stores it in an organized CSV file. No data preprocessing is performed in this script.
# Placeholder stored for any field that is absent from a pattern page.
_MISSING = "N/A"

# Output column order; must match the row layout built by _extract_pattern.
_COLUMNS = [
    "name",
    "designer",
    "craft",
    "category",
    "yarn_weight",
    "gauge",
    "needle_size",
    "yardage",
    "sizes_available",
    "languages",
    "tags",
    "description",
    "url",
]

# Seconds to wait on a single request; requests never times out by default.
_REQUEST_TIMEOUT = 30


def _text_or_missing(element):
    """Return the stripped text of *element*, or "N/A" when it is None."""
    return element.text.strip() if element is not None else _MISSING


def _labeled_value(soup, label_text):
    """Return the text of the first div.value following the given <label>.

    The label is matched by its exact string. "N/A" is returned when either
    the label or a following ``div.value`` is not present.
    """
    label = soup.find("label", string=label_text)
    if label is None:
        return _MISSING
    return _text_or_missing(label.find_next("div", class_="value"))


def _yarn_weight(soup):
    """Return the value of the first field whose label contains "Yarn weight"."""
    for field in soup.find_all("div", class_="field core_item_content__field"):
        label = field.find("label", class_="core_item_content__label")
        if label and "Yarn weight" in label.get_text():
            value = field.find("div", class_="value")
            return value.get_text(strip=True) if value is not None else _MISSING
    return _MISSING


def _extract_pattern(soup, url):
    """Build one output row (ordered like ``_COLUMNS``) from a parsed page."""
    name = _text_or_missing(soup.find("h2", class_="rsp_hidden"))
    designer = _text_or_missing(soup.select_one(".pattern_author a"))
    craft = _labeled_value(soup, "Craft")

    category_spans = soup.select("div.category span")
    category = (
        " → ".join(span.text.strip() for span in category_spans)
        if category_spans
        else _MISSING
    )

    yarn_weight = _yarn_weight(soup)
    gauge = _labeled_value(soup, "Gauge")

    needle_divs = soup.select("label:-soup-contains('Needle size') + div.value")
    needle_size = (
        ", ".join(div.text.strip() for div in needle_divs)
        if needle_divs
        else _MISSING
    )

    yardage = _labeled_value(soup, "Yardage")
    sizes_available = _labeled_value(soup, "Sizes available")

    tag_links = soup.select("li.tag a")
    tags = (
        ", ".join(tag.text.strip() for tag in tag_links) if tag_links else _MISSING
    )

    description = _text_or_missing(
        soup.select_one(".notes.markdown.core_item_content__text_block")
    )

    language_spans = soup.select(
        "label:-soup-contains('Languages') + div.value span"
    )
    languages = (
        ", ".join(lang.get_text(strip=True) for lang in language_spans)
        if language_spans
        else _MISSING
    )

    return [
        name,
        designer,
        craft,
        category,
        yarn_weight,
        gauge,
        needle_size,
        yardage,
        sizes_available,
        languages,
        tags,
        description,
        url,
    ]


def scrape_data():
    """Scrape the pattern pages listed in a CSV and save the raw fields to CSV.

    Reads the ``URL`` column of ``ravelry_pattern_urls.csv``, downloads each
    page, extracts the pattern fields and writes one row per URL to
    ``patterns_raw.csv``. Fields missing from a page are stored as "N/A".
    No preprocessing is applied to the extracted text.

    A page that cannot be fetched (network error, timeout, or HTTP error
    status) is reported on stdout and recorded as a row whose fields are all
    "N/A" except ``url``, so one bad URL no longer discards the whole run.

    Returns:
        None. The result is written to ``patterns_raw.csv``.
    """
    df_urls = pd.read_csv("ravelry_pattern_urls.csv")
    pattern_urls = df_urls["URL"].tolist()

    headers = {"User-Agent": "Mozilla/5.0"}

    patterns = []
    for url in pattern_urls:
        try:
            response = requests.get(
                url, headers=headers, timeout=_REQUEST_TIMEOUT
            )
            response.raise_for_status()
        except requests.RequestException as exc:
            print(f"Failed to fetch {url}: {exc}")
            # Keep one row per input URL even when the page is unavailable.
            row = [_MISSING] * (len(_COLUMNS) - 1) + [url]
        else:
            soup = BeautifulSoup(response.text, "html.parser")
            row = _extract_pattern(soup, url)

        patterns.append(row)
        # Pause between requests so the site is not hit in rapid succession.
        time.sleep(1)

    df = pd.DataFrame(patterns, columns=_COLUMNS)
    df.to_csv("patterns_raw.csv", index=False)


Scraped data saved to patterns_raw.csv
