Source website: https://cwspirits.com/

In [None]:
from parser import parse_alcohol_section
from models import Item, Money, Volume, Brand, BeverageType
from service_funcs import generate_sku

from constant_data.brands import brands as data_brands
from constant_data.types import types as data_types
from constant_data.countries import countries as data_countries

import pymongo
from config import MONGO_DB
from bson import Decimal128

from models import Brand, BeverageType, Country
from random import choice, randint
import re
import json

from constant_data.brands import brands as data_brands
from constant_data.types import types as data_types


# Connect to MongoDB and bind the 'pourpal' collections used by the cells below.
client = pymongo.MongoClient(MONGO_DB)
db = client['pourpal']

countries_col, brands_col, types_col, items_col = (
    db[collection_name]
    for collection_name in ('countries', 'beverage_brands', 'beverage_types', 'items')
)

# Cursor over every document in the countries collection.
countries = countries_col.find()


In [None]:
# Candidate origin countries per beverage type.  The origin-country processing
# step draws a random entry from these lists when a product's scraped origin
# country is missing or is not found in the countries collection.
whiskey_countries = [
    'Ireland',
    'United Kingdom',
    'United States',
    'Canada',
    'Australia',
]

# Defined once; the original file repeated this identical list a second time.
wine_countries = [
    'France',
    'Italy',
    'Spain',
    'Portugal',
    'Germany',
    'United States',
]

vodka_countries = [
    'Russia',
    'Poland',
    'Lithuania',
    'Latvia',
    'Estonia',
    'Finland',
    'Sweden',
]

rum_countries = [
    'Jamaica',
    'Barbados',
    'Guatemala',
    'Mexico',
    'Dominican Republic',
    'Haiti',
    'Cuba',
]

gin_countries = [
    'United Kingdom',
    'United States',
    'Canada',
    'Australia',
    'New Zealand',
    'South Africa',
    'India',
]

# Not referenced by `alcohol_countries` (the Beer entry is disabled below);
# kept defined so the entry can be re-enabled without other changes.
beer_countries = [
    'United States',
    'Canada',
    'United Kingdom',
    'Germany',
    'Belgium',
    'Netherlands',
    'France',
]

tequila_countries = [
    'Mexico',
    'United States',
    'Canada',
    'Australia',
    'New Zealand',
    'South Africa',
]

brandy_countries = [
    'France',
    'United States',
    'Canada',
    'Australia',
    'New Zealand',
    'South Africa',
]

liqueur_countries = [
    'France',
    'Italy',
    'Netherlands',
    'Germany',
    'Ireland',
    'United States',
    'Mexico',
    'United Kingdom',
    'Denmark',
    'Switzerland',
    'Austria',
    'Spain',
    'Greece',
    'Czechia',
    'Poland',
    'Jamaica',
]

champagne_countries = [
    'France',
]

# Maps a beverage type name to its list of candidate origin countries.
alcohol_countries = {
    'Whiskey': whiskey_countries,
    'Vodka': vodka_countries,
    'Rum': rum_countries,
    'Gin': gin_countries,
    # 'Beer': beer_countries,  # disabled, as in the original mapping
    'Tequila': tequila_countries,
    'Brandy': brandy_countries,
    'Liqueur': liqueur_countries,
    'Wine': wine_countries,
    'Champagne': champagne_countries,
}


In [None]:
# Collection pages to scrape, each paired with the beverage type assigned to
# the products found on it.  The variable name keeps its original spelling
# because the scraping cell refers to it by this name.
bevarage_sections_urls = [
    {'url': 'https://cwspirits.com/collections/whiskey', 'type_name': 'Whiskey'},
    {'url': 'https://cwspirits.com/collections/vodka', 'type_name': 'Vodka'},
    {'url': 'https://cwspirits.com/collections/rum', 'type_name': 'Rum'},
    {'url': 'https://cwspirits.com/collections/gin', 'type_name': 'Gin'},
    {'url': 'https://cwspirits.com/collections/tequila', 'type_name': 'Tequila'},
    {'url': 'https://cwspirits.com/collections/brandy', 'type_name': 'Brandy'},
    {'url': 'https://cwspirits.com/collections/liqueur', 'type_name': 'Liqueur'},
    # Three separate wine collections all map to the single 'Wine' type.
    {'url': 'https://cwspirits.com/collections/red-wines', 'type_name': 'Wine'},
    {'url': 'https://cwspirits.com/collections/white-wine', 'type_name': 'Wine'},
    {'url': 'https://cwspirits.com/collections/rose', 'type_name': 'Wine'},
    {'url': 'https://cwspirits.com/collections/champagne-and-sparkling-wine', 'type_name': 'Champagne'},
]

In [None]:
# Scrape every configured collection page and accumulate the parsed products.
products_info = []
scraped_urls_num = 0
total_urls = len(bevarage_sections_urls)

scraping_counter = 0  # remains 0 only when there is nothing to scrape
for scraping_counter, url_dict in enumerate(bevarage_sections_urls, start=1):
    try:
        print(f'>>> Parsing [{scraping_counter}/{total_urls}]: {url_dict["url"]}')
        section_products = parse_alcohol_section(
            url=url_dict['url'],
            type_name=url_dict['type_name'],
            max_pages=None,
            max_products=None,
        )
        products_info.extend(section_products)
        scraped_urls_num += 1
    except Exception as e:
        # Best effort: report the failing section and carry on with the next.
        print('>>> URL ERROR:', e)

print('\n>>> Parsing completed')
print(f'Total urls: {total_urls}')
print(f'Scraped {scraped_urls_num} urls')


### Save products info to file


In [None]:
# with open('products_info.json', 'w', encoding='utf-8') as file:
#     json.dump(products_info, file, ensure_ascii=False)

### Read products info from file

In [None]:
# Reload previously scraped products from disk, replacing `products_info`.
with open('products_info.json', 'r', encoding='utf-8') as products_file:
    products_info = json.loads(products_file.read())

### Process origin countries

In [None]:
# Make sure every product carries an origin country that exists in the
# countries collection.  The scraped value is reduced to its last
# ', '-separated component; if that name is missing or not found in the
# collection, a random country from the product type's candidate list is used.
#
# Lookups are cached per distinct name so each country is queried only once
# instead of once per product.  Errors propagate unchanged (the original
# wrapped the body in a handler that only re-raised).
country_found_cache = {}  # country name -> True if present in countries_col

for product in products_info:
    origin_country_name = product.get('origin_country_name')
    if origin_country_name:
        origin_country_name = origin_country_name.split(', ')[-1]
        product['origin_country_name'] = origin_country_name

    if origin_country_name and origin_country_name not in country_found_cache:
        country_found_cache[origin_country_name] = bool(
            countries_col.find_one({'name': origin_country_name})
        )

    if not country_found_cache.get(origin_country_name, False):
        # Missing or unknown origin: substitute a random candidate country.
        # A type_name absent from alcohol_countries raises KeyError here.
        product['origin_country_name'] = choice(alcohol_countries[product.get('type_name')])


### Create items

In [None]:
# Build an Item model for every scraped product.
#
# A product that cannot be converted (for example an unparsable volume, a
# missing price / alcohol volume, or an origin country not found in the
# countries collection) is skipped, as before, but the skip is now reported
# and counted instead of being silently discarded.

# Leading number (optionally with a decimal part) followed by a unit made of
# letters; compiled once rather than on every iteration.
volume_pattern = re.compile(r'(\d+(?:\.\d+)?)\s*([a-zA-Z]+)')

items = []
skipped_products_num = 0
for product in products_info:
    try:
        volume = product.get('volume')
        volume, volume_unit = volume_pattern.match(volume).groups()
        volume_unit = volume_unit.lower()

        origin_country = countries_col.find_one({'name': product.get('origin_country_name')})

        items.append(
            Item(
                sku=generate_sku(type_name=product.get('type_name')),
                title=product.get('title'),
                image_url=product.get('image_url'),
                description=product.get('description'),

                # Filled in later, once the type documents exist.
                type_id='',
                type_name=product.get('type_name'),

                price=Money(amount=Decimal128(product.get('price'))),
                volume=Volume(amount=Decimal128(volume), unit=volume_unit),
                alcohol_volume=Volume(amount=Decimal128(product.get('alcohol_volume').replace('%', '')), unit='%'),
                quantity=randint(1, 230),
                origin_country_code=origin_country['code'],
                origin_country_name=origin_country['name'],

                # Filled in later, once the brand documents exist.
                brand_id='',
                brand_name=product.get('brand_name'),
            )
        )
    except Exception as e:
        # Best effort: keep going, but make the dropped product visible.
        skipped_products_num += 1
        print('>>> ITEM SKIPPED:', product.get('title'), '-', repr(e))
        continue

print(f'Created {len(items)} items, skipped {skipped_products_num} products')


### Create brands

In [None]:
# get all brands from items
# Unique brand names: those found on the items plus the predefined ones.
item_brand_names = [item.brand_name for item in items]
unique_brand_names = set(item_brand_names + data_brands)

# One dumped Brand model per unique name.
brands = [Brand(brand=name).model_dump() for name in unique_brand_names]

# Replace the whole brands collection with the freshly built documents.
brands_col.drop()
brands_col.insert_many(brands)



### Create types

In [None]:
# get all type from items
# Unique type names: those found on the items plus the predefined ones.
item_type_names = [item.type_name for item in items]
unique_type_names = set(item_type_names + data_types)

# One dumped BeverageType model per unique name.
types = [BeverageType(type=type_name).model_dump() for type_name in unique_type_names]

# Replace the whole types collection with the freshly built documents.
types_col.drop()
types_col.insert_many(types)


### Process items

In [None]:
# Create dictionaries for faster lookup
# Link every item to its brand and type documents, then replace the items
# collection with the result.

# Name -> id lookup tables built from the brand / type documents.
brand_dict = {brand_doc['brand']: brand_doc['brand_id'] for brand_doc in brands}
type_dict = {type_doc['type']: type_doc['type_id'] for type_doc in types}

for item in items:
    # Unknown names fall back to an empty id.
    item.brand_id = brand_dict.get(item.brand_name, '')
    item.type_id = type_dict.get(item.type_name, '')

# Build the documents before touching the collection: insert_many rejects an
# empty list, and the original dropped the collection first, so an empty
# `items` wiped the existing data and then failed.
item_documents = [item.model_dump() for item in items]

if item_documents:
    items_col.drop()
    items_col.insert_many(item_documents)
else:
    print('>>> No items to insert; the existing items collection was left untouched')


In [None]:
# Close the MongoDB client created at the top of the notebook.
client.close()