# Manage Source and Raw Data

General overview

- Download content and data to S3 (source data)
- Process source data to produce raw data (nnn_kits and nnn_kits_details)

## Globals

In [None]:
import os
import re
import csv
import json
import boto3
import cloudscraper
import pandas as pd
from io import StringIO
from bs4 import BeautifulSoup
from bs4 import Comment
from dotenv import load_dotenv
from urllib.parse import urlparse

# Load settings from the project's local env file into os.environ.
load_dotenv('../.env.local')
# Drop any AWS_PROFILE setting before the client is created.
# NOTE(review): presumably so boto3 uses the explicit keys below rather than
# a named profile - confirm.
if 'AWS_PROFILE' in os.environ:
    del os.environ['AWS_PROFILE']

# AWS S3 Configuration
# Each value is None when the corresponding environment variable is unset.
aws_bucket_name = os.getenv('AWS_BUCKET_NAME')
aws_region = os.getenv('AWS_REGION')
aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')

# Set up S3 client
# Shared module-level client used by get_html_content for cache reads/writes.
s3_client = boto3.client(
    's3',
    region_name=aws_region,
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key
)

### Get HTML Content

Retrieves the HTML content of a given URL using S3 as a cache.

In [None]:
def get_html_content(base_url, url, refresh=False):
    """Return the HTML of ``base_url + url``, using S3 as a cache.

    The S3 object key is ``url`` (the path part only). On a cache miss, or
    when ``refresh`` is true, the page is downloaded and stored in the bucket
    before being returned.

    Args:
        base_url (str): Scheme and host, e.g. ``'https://example.com'``.
        url (str): Path appended to ``base_url``; also used as the S3 key.
        refresh (bool): When true, skip the cache and download again.

    Returns:
        str | None: The HTML text, or None when the download returned an
        HTTP error status or the S3 read/write failed (a message is printed).
    """
    # Use the URL as the S3 key
    s3_key = url
    full_url = base_url + url

    # A forced refresh always goes to the web, so the existence probe is only
    # needed when the cached copy might actually be used.
    object_exists = False
    if not refresh:
        try:
            s3_client.head_object(Bucket=aws_bucket_name, Key=s3_key)
            object_exists = True
        except s3_client.exceptions.ClientError:
            # Object does not exist or other error
            object_exists = False

    if not object_exists:
        print(f"Going to Web for {base_url}{url}")
        scraper = cloudscraper.create_scraper(delay=10, browser='chrome')
        response = scraper.get(full_url)

        # Never cache an error page: once stored it would be served as
        # valid content on every later run.
        if not response.ok:
            print(f"Error retrieving {full_url}: HTTP {response.status_code}")
            return None
        html_content = response.text

        # Store the HTML content in S3
        try:
            s3_client.put_object(
                Bucket=aws_bucket_name, Key=s3_key, Body=html_content,
                ContentType='text/html')
        except Exception as e:
            print(f"Error saving to S3: {e}")
            return None

    else:
        # Retrieve the HTML content from S3
        print(f"Cache from S3 for {base_url}{url}")
        try:
            response = s3_client.get_object(Bucket=aws_bucket_name, Key=s3_key)
            html_content = response['Body'].read().decode('utf-8')
        except Exception as e:
            print(f"Error reading from S3: {e}")
            return None

    print(f"Retrieved HTML content for {base_url}{url}")
    return html_content

## Estes Functions


### Estes Product Pages
Iterates through the paginated product listing pages, extracting the link to each product's detail page.

In [None]:
def get_all_products_estes(url):
    """Collect ``(href, link text)`` pairs from an Estes listing and its later pages.

    Starts at ``url`` and keeps following the 'pagination-next-link' anchor
    until a page has none (or the anchor has no href).
    """
    collected = []
    page_url = url

    while True:
        page_html = get_html_content(
            'https://help.estesrockets.com', page_url)
        page = BeautifulSoup(page_html, "html.parser")

        # One entry per article link on this page, in document order.
        for anchor in page.find_all('a', class_='article-list-link'):
            collected.append((anchor['href'], anchor.get_text()))

        next_anchor = page.find('a', class_='pagination-next-link')
        if not next_anchor or 'href' not in next_anchor.attrs:
            break
        page_url = next_anchor['href']

    return collected

### Estes Product Details

In [None]:
def extract_product_estes(url):
    """Extract the details of one Estes kit from its help-center article.

    Args:
        url (str): Path of the article on https://help.estesrockets.com.

    Returns:
        dict: Keys ``description``, ``image_src``, ``features`` (dict of
        attribute name -> value), ``instructions`` and ``product_url``.
        Any element that is missing from the page yields None (or an empty
        dict for ``features``) instead of raising.
    """
    html_content = get_html_content('https://help.estesrockets.com', url)
    # get_html_content returns None on failure; parse an empty document then
    # so the caller still gets a well-formed (empty) result.
    soup = BeautifulSoup(html_content or "", "html.parser")

    image_src, description, product_url = None, None, None

    # The article body holds the image, the description and the purchase link.
    article_body = soup.find('div', class_='article-body')
    if article_body is not None:
        image_tag = article_body.find('img')
        if image_tag is not None:
            image_src = image_tag.get('src')

        for p_tag in article_body.find_all('p', recursive=False):
            if 'Purchase Link' in p_tag.text:
                anchor = p_tag.find('a')
                product_url = anchor.get('href') if anchor else None
            elif not p_tag.find():
                # A paragraph with no child tags is plain text; the last
                # such paragraph wins.
                description = p_tag.get_text().strip()

    # Extract key features from the product attributes table
    features = {}
    table = soup.find(
        'table', class_='woocommerce-product-attributes shop_attributes')
    if table:
        for row in table.find_all('tr'):
            header_cell = row.find('th')
            value_cell = row.find('td')
            if header_cell is None or value_cell is None:
                # Skip rows that are not a name/value pair.
                continue
            feature_name = header_cell.get_text().strip()
            if feature_name:
                features[feature_name] = value_cell.get_text().strip()

    # Extract the instructions PDF link
    instructions_pdf = None
    attachments = soup.find('div', class_='article-attachments')
    if attachments is not None:
        attachment_link = attachments.find('a')
        if attachment_link is not None:
            instructions_pdf = attachment_link.get('href')

    return {
        "description": description,
        "image_src": image_src,
        "features": features,
        "instructions": instructions_pdf,
        "product_url": product_url
    }

### Write Estes Raw kit list

In [None]:
def write_estes_kits(filename, links, fields):
    """Write the Estes kit list to a CSV file.

    Each link text is split into a model number (1-4 leading digits) and a
    kit name, e.g. ``"1234 - Alpha"`` -> model ``"1234"``, name ``"Alpha"``.
    Text without a leading number is written with model ``'Unknown'``.

    Args:
        filename (str): Path of the CSV file to create or overwrite.
        links (iterable): ``(href, text)`` pairs.
        fields (list): CSV header; must contain 'URL', 'Model' and 'Name',
            otherwise csv.DictWriter raises ValueError.
    """
    # Model number, an optional separator (hyphen, en dash or em dash), name.
    title_pattern = re.compile(r"(\d{1,4})\s*[-\u2013\u2014]?\s*(.*)")

    with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fields)

        writer.writeheader()  # Write the fieldnames as a header
        for href, text in links:
            # Scraped link text usually carries surrounding whitespace or
            # newlines, which would otherwise defeat the anchored match.
            title = text.strip()
            match = title_pattern.match(title)
            if match:
                model = match.group(1)
                # Remove leading dash and space, if any
                name = match.group(2).lstrip("– ").strip()
            else:
                model = 'Unknown'
                name = title
            writer.writerow({'URL': href, 'Model': model, 'Name': name})

In [None]:
def write_estes_kit_details(file_path, output_file_path):
    """Fetch the details of every kit in a kit-list CSV and write them out.

    Reads the CSV at ``file_path`` (header row, URL in the first column),
    calls extract_product_estes for each URL and writes one row per kit to
    ``output_file_path``, single-quoted with every field quoted.

    Args:
        file_path (str): Kit-list CSV to read.
        output_file_path (str): Details CSV to create or overwrite.

    Raises:
        ValueError: From csv.DictWriter when a kit has a feature whose name
            is not in the column list below.
    """
    fieldnames = [
        "url",
        "description",
        "image_src",
        "Recommended Engines",
        "Projected Max Altitude",
        "Recovery System",
        "Length",
        "Diameter",
        "Estimated Weight",
        "Estimated Assembly Time",
        "Fin Materials",
        "Decal Type",
        "Launch System",
        "Launch Rod Size",
        "instructions",
        "Construction",
        "Wingspan",
        "Age Recommendation",
        "Launch Rod System",
        "Recovery",
        "Fin Material",
        "Estimated Assembly Weight"
    ]
    with open(file_path, newline='', encoding='utf-8') as csvfile, \
            open(output_file_path, 'w', newline='', encoding='utf-8') as outfile:
        reader = csv.reader(csvfile, quotechar='"')
        writer = csv.DictWriter(
            outfile, fieldnames=fieldnames, quotechar="'", quoting=csv.QUOTE_ALL)
        writer.writeheader()

        # Skip the header row; the default avoids StopIteration on an
        # empty input file.
        next(reader, None)

        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines.
                continue
            url = row[0]
            kit_info = extract_product_estes(url)

            # Merge all data into a single dictionary; the feature names
            # become columns alongside the fixed fields.
            row_data = {
                'url': kit_info['product_url'],
                **kit_info['features'],
                'description': kit_info['description'],
                'image_src': kit_info['image_src'],
                'instructions': kit_info['instructions'],
            }
            writer.writerow(row_data)

## Loc Precision Functions


### Loc Product Item

Extracts product information from HTML content.

Args:

- base_url (str): The base URL of the website.
- html_content (str): The HTML content to extract product information from.

Returns:

- list: A list of dictionaries containing product information.

Each dictionary contains the following keys:

- handle (str): The product handle.
- id (str): The product ID.
- detail_url (str): The URL of the product detail page.
- image_url (str): The URL of the product image.


In [None]:
def extract_products_loc(base_url, html_content):
    """Return one info dict per 'grid-product' tile found in ``html_content``.

    Each dict has the keys 'handle', 'id', 'detail_url' (``base_url`` plus
    the tile link's href) and 'image_url' (the image's data-src with the
    '{width}' placeholder replaced by 540, or None when the tile has no image).
    """

    def describe(tile):
        # Values are evaluated in the order they appear in the dict literal.
        picture = tile.find('img', class_='grid__image-contain')
        return_image = None
        info = {
            'handle': tile.get('data-product-handle'),
            'id': tile.get('data-product-id'),
            'detail_url': '{}{}'.format(
                base_url, tile.find('a', class_='grid-product__link')['href']),
            'image_url': return_image,
        }
        if picture:
            info['image_url'] = picture.get('data-src').replace('{width}', '540')
        return info

    document = BeautifulSoup(html_content, 'html.parser')
    return [describe(tile)
            for tile in document.find_all('div', class_='grid-product')]

### Loc Product Pages

Retrieves all products from a given URL and its subsequent pages.

Args:

- url (str): The URL of the page to start extracting products from.

Returns:

- list: A list of product details extracted from the given URL and its subsequent pages.


In [None]:
def get_all_products_loc(url):
    """Gather the products of a Loc Precision collection across all its pages.

    Starts at ``url`` and follows the 'Next' anchor inside the ``span.next``
    element until a page has no such link.
    """
    site = 'https://locprecision.com'
    gathered = []
    page_path = url

    while True:
        page_html = get_html_content(site, page_path)
        page = BeautifulSoup(page_html, 'html.parser')

        # Products listed on the page currently being visited.
        gathered += extract_products_loc(site, page_html)

        # Locate the pagination link, if the page has one.
        next_wrapper = page.find('span', class_='next')
        next_anchor = (
            next_wrapper.find('a', title='Next') if next_wrapper else None)
        if not next_anchor or 'href' not in next_anchor.attrs:
            return gathered
        page_path = next_anchor['href']

### Write Loc Kit List

Write a list of products to a CSV file.

Args:

- products (list): A list of dictionaries representing products.
- filename (str, optional): The path of the CSV file to write to. Defaults to './data_raw/loc_kits.csv'.


In [None]:
def write_loc_kits(products, filename='./data_raw/loc_kits.csv'):
    """Save the product dicts to ``filename`` as CSV, header row first.

    Columns are handle, id, detail_url and image_url, in that order.
    """
    columns = ['handle', 'id', 'detail_url', 'image_url']
    with open(filename, 'w', newline='', encoding='utf-8') as out:
        sheet = csv.DictWriter(out, fieldnames=columns)
        sheet.writeheader()
        sheet.writerows(products)

In [None]:
def extract_json_ld_loc(html_content):
    """Return the parsed JSON-LD object embedded in an HTML comment, or None.

    Scans the HTML comments of ``html_content``; the first comment that
    mentions 'application/ld+json' and contains a matching <script> tag is
    parsed with json.loads and returned.
    """

    def is_comment(node):
        return isinstance(node, Comment)

    document = BeautifulSoup(html_content, 'html.parser')
    for remark in document.find_all(string=is_comment):
        if 'application/ld+json' not in remark:
            continue
        # The comment body is itself markup, so parse it separately.
        inner = BeautifulSoup(remark, 'html.parser')
        script = inner.find('script', type='application/ld+json')
        if not script:
            continue
        return json.loads(script.string)
    return None

In [None]:
def _json_ld_image_url(image):
    """Return the URL from a JSON-LD ``image`` value (str, dict or list)."""
    if isinstance(image, list):
        image = image[0] if image else None
    if isinstance(image, dict):
        return image.get('url')
    if isinstance(image, str):
        return image
    return None


def _json_ld_first_offer(offers):
    """Return the first offer dict from a JSON-LD ``offers`` value, or {}."""
    if isinstance(offers, dict):
        return offers
    if isinstance(offers, list) and offers and isinstance(offers[0], dict):
        return offers[0]
    return {}


def extract_product_details(url):
    """Extract the details of one Loc Precision product page.

    Combines the page's JSON-LD block (name, image, SKU, price, stock,
    URL) with the ``Label: value`` lines and links found in the product
    description.

    Args:
        url (str): Absolute URL of the product detail page.

    Returns:
        dict: The keys initialised below, plus one extra key for every
        ``Label: value`` line found in the description. All values stay
        None (and 'Links' empty) when the page could not be retrieved.
    """
    parsed_url = urlparse(url)

    # Initialize a dictionary to hold the extracted details
    details = {
        'Complexity': None,
        'Diameter': None,
        'Height': None,
        'Weight': None,
        'Motor Mount': None,
        'Parachute Size': None,
        'Shock Cord Type': None,
        'Shock Cord Mount': None,
        'Fin Thickness': None,
        'Ring Thickness': None,
        'Instructions': None,
        'Decal': None,
        'Name': None,
        'Image URL': None,
        'SKU': None,
        'Price': None,
        'Currency': None,
        'Stock Status': None,
        'Product URL': None,
        'Description': None,
        'Links': []
    }

    html_content = get_html_content(
        f"{parsed_url.scheme}://{parsed_url.netloc}", parsed_url.path)
    if html_content is None:
        # Fetch failed (already reported by get_html_content).
        return details
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract JSON-LD data. 'image' may be a string, an object or a list,
    # and 'offers' a single object or a list, so both are normalised first.
    json_ld_data = extract_json_ld_loc(html_content)
    if isinstance(json_ld_data, dict):
        offer = _json_ld_first_offer(json_ld_data.get('offers'))
        details.update({
            'Name': json_ld_data.get('name'),
            'Image URL': _json_ld_image_url(json_ld_data.get('image')),
            'SKU': json_ld_data.get('sku'),
            'Price': offer.get('price'),
            'Currency': offer.get('priceCurrency'),
            'Stock Status': offer.get('availability'),
            'Product URL': json_ld_data.get('url')
        })

    # Extract other details from the description
    description_div = soup.find(
        'div', class_='product-single__description rte')
    if description_div:
        p_tags = description_div.find_all('p')
        # "Label: value" where the label is one to three words.
        pattern = re.compile(r'^(\w+(?:\s\w+){0,2}):\s*(.+)')
        for p_tag in p_tags:
            if not p_tag.find():
                # Plain-text paragraph (no child tags): the description.
                details['Description'] = p_tag.get_text().strip()
            elif not pattern.search(p_tag.get_text()):
                # Extract <a> tags
                for a in p_tag.find_all('a'):
                    link_info = {'text': a.get_text(), 'href': a.get('href')}
                    details['Links'].append(link_info)
            else:
                lines = p_tag.get_text(separator='\n').split('\n')
                for line in lines:
                    # Strip first: the pattern is anchored, so an indented
                    # line would otherwise be skipped.
                    match = pattern.search(line.strip())
                    if match:
                        name, value = match.groups()
                        details[name] = value

    return details

## Process Data


### Process Loc Data


In [None]:
# Crawl the Loc Precision kit collection and save the product list.
all_products = get_all_products_loc('/collections/rocket-kits')
write_loc_kits(all_products)

# Re-read the list just written and fetch the details of every product.
# Both files are opened as UTF-8 to match write_loc_kits, which writes the
# list as UTF-8; the platform default encoding may differ.
with open('./data_raw/loc_kits.csv', newline='', encoding='utf-8') as csvfile, \
        open('./data_raw/loc_kits_details.csv', 'w', newline='',
             encoding='utf-8') as outputfile:
    reader = csv.DictReader(csvfile, quotechar='"')
    # Every key extract_product_details can return must be listed here:
    # csv.DictWriter raises ValueError for a key that is not a field name.
    fieldnames = [
        'Name', 'Image URL', 'Complexity', 'Diameter', 'Height', 'Weight',
        'Motor Mount', 'Parachute Size', 'Shock Cord Type', 'Shock Cord Mount',
        'Fin Thickness', 'Ring Thickness', 'Instructions', 'Decal', 'Launch Pad', 'Electronics Bay',
        'Price', 'Product URL', 'Currency', 'SKU', 'Stock Status', 'Description', 'Rail Buttons',
        'Links', 'Vinyl Decals', 'Tec features', 'Decals', 'Fire Blanket', 'Vinyl Decal', 'Parachute', 'Fin Array', 'Rocksim', 'Parachutes', 'Additional Decals'
    ]
    writer = csv.DictWriter(outputfile, fieldnames=fieldnames,
                            quotechar="'", quoting=csv.QUOTE_ALL)
    writer.writeheader()

    for row in reader:
        detail_url = row['detail_url']
        details = extract_product_details(detail_url)
        writer.writerow(details)

### Process Estes Data


In [None]:
# Crawl the "Currently Manufactured Rockets" help-center section, then save
# one URL/Model/Name row per link found.
all_links = get_all_products_estes(
    '/hc/en-us/sections/8356411218829-Currently-Manufactured-Rockets'
)
fieldnames = ['URL', 'Model', 'Name']  # CSV header columns
write_estes_kits(
    filename='./data_raw/estes_kits.csv',
    links=all_links,
    fields=fieldnames,
)

In [None]:
# Read the kit list written above and produce the per-kit details file.
write_estes_kit_details(
    file_path='./data_raw/estes_kits.csv',
    output_file_path='./data_raw/estes_kits_details.csv',
)