In [221]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import requests
from bs4 import BeautifulSoup
import re
import time
from tqdm.notebook import tqdm
import json
import pandas as pd

In [233]:
def get_listing_details(url):

    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    
    seller_text = soup.find('p', class_="wt-text-body-01 wt-mr-xs-1").find('a', class_="wt-text-link-no-underline").text.strip()
    seller_href = soup.find('p', class_="wt-text-body-01 wt-mr-xs-1").find('a', class_="wt-text-link-no-underline")['href']
    
    listing_title = soup.find('h1', class_="wt-text-body-03 wt-line-height-tight wt-break-word wt-mb-xs-1").text.strip()
    
    listing_price = soup.find('p', class_="wt-text-title-03 wt-mr-xs-2").text

    listing_highlights = [
        _.text.strip() for _ 
        in soup.find_all('li', class_="wt-list-unstyled wt-display-flex-xs wt-align-items-flex-start")
    ]

    if soup\
        .find('div', class_="wt-display-inline-flex-xs wt-align-items-center wt-mb-xs-2 wt-flex-wrap")\
        .find_all('span', class_='wt-text-caption'):
        store_sales = soup\
            .find('div', class_="wt-display-inline-flex-xs wt-align-items-center wt-mb-xs-2 wt-flex-wrap")\
            .find('span', class_='wt-text-caption').text
    else:
        store_sales = None
    
    if soup\
        .find('div', class_="wt-display-inline-flex-xs wt-align-items-center wt-mb-xs-2 wt-flex-wrap")\
        .find_all('input', attrs={'name': 'initial-rating'}):
        store_rating = soup\
            .find('div', class_="wt-display-inline-flex-xs wt-align-items-center wt-mb-xs-2 wt-flex-wrap")\
            .find('input', attrs={'name': 'initial-rating'})['value']
    else:
        store_rating = None

    if soup.find_all('span', class_="wt-badge wt-badge--status-03 wt-mt-xs-1 wt-mr-xs-1 search-half-unit-mb"):
        listing_bestseller = soup\
            .find('span', class_="wt-badge wt-badge--status-03 wt-mt-xs-1 wt-mr-xs-1 search-half-unit-mb").text.strip()
    else:
        listing_bestseller = '-'

    # get description
    if soup.find_all('p', class_="wt-text-body-01 wt-break-word"):
        listing_desc = soup.find('p', class_="wt-text-body-01 wt-break-word").text
    else:
        listing_desc = None

    # GET REVIEWS FOR ITEM
    if soup.find_all('button', attrs={'aria-controls': 'same-listing-reviews-panel'}):
        listing_reviews = soup.find('button', attrs={'aria-controls': 'same-listing-reviews-panel'}).find('span').text.strip()
    else:
        listing_reviews = None
        
    # get shop reviews
    if soup.find_all('h3', class_="wt-mr-xs-2 wt-text-body-03"):
        store_reviews = soup.find('h3', class_="wt-mr-xs-2 wt-text-body-03").text.strip()
    else:
        store_reviews = None
        
    details = {
        'seller_text': seller_text,
        'seller_href': seller_href,
        'store_sales': store_sales,
        'store_rating': store_rating,
        'listing_title': listing_title,
        'listing_bestseller': listing_bestseller,
        'listing_price': listing_price,
        'listing_highlights': listing_highlights,
        'listing_desc': listing_desc,
        'listing_reviews': listing_reviews,
        'store_reviews': store_reviews
    }

    return details


def get_search_listings(cat_url):

    # get listing urls for cat or store url
    r = requests.get(cat_urls[0])

    soup = BeautifulSoup(r.content)

    search_listings = [
        x.find('a')['href'] for x in 
        soup.find(
            'ul', 
            class_="responsive-listing-grid wt-grid wt-grid--block wt-justify-content-flex-start wt-list-unstyled wt-pl-xs-0 tab-reorder-container"
        )\
        .find_all('li')
    ]

    next_page = soup.find('a', attrs={'data-page': 2})['href']
    
    return search_listings, next_page


def get_two_search(cat_url):
    
    search_listings, next_page = get_search_listings(cat_urls[0])
    search_listing_results = search_listings

    search_listings, next_page = get_search_listings(next_page)
    search_listing_results.extend(search_listings)

    return list(set(search_listing_results))


def iterate_cat_urls(cat_urls):
    
    data = []
    for _ in tqdm(cat_urls):
        results = get_two_search(_)
        data.extend(results)

    urls_clean = [
        re.search(r'(?:(?!\?).)*', _).group(0) for _
        in list(set(data))
    ]
    
    return urls_clean


def iterate_listings(urls_clean):
    
    data = []
    for _ in tqdm(urls_clean):
        
        try:
            url_result = get_listing_details(_)
            url_result.update({'url': _})
            time.sleep(1)
            data.append(url_result)
            json.dump(data, open('temp_data.json', 'w')) # in case of interruption
            
        except KeyboardInterrupt:
            return data
    
    return data


def recover():
    data = json.load(open('temp_data.json', 'r'))
    return data

In [248]:
cat_urls = [
    'https://www.etsy.com/c/paper-and-party-supplies/paper/calendars-and-planners',
    'https://www.etsy.com/c/paper-and-party-supplies/paper/invitations-and-announcements/templates',
    'https://www.etsy.com/c/paper-and-party-supplies/paper/stationery/design-and-templates',
    'https://www.etsy.com/shop/DIYPaperBoutique'
]

# cat_urls = [
#     'https://www.etsy.com/search?q=template'
# ]

In [249]:
urls_clean = iterate_cat_urls(cat_urls)

In [225]:
data = iterate_listings(urls_clean)

HBox(children=(FloatProgress(value=0.0, max=134.0), HTML(value='')))




In [193]:
data = recover()

In [226]:
len(data)

134

In [227]:
for _ in data:
    listing_price_clean = float(re.search(r'\$[0-9]{1,}\.[0-9]{1,}', _['listing_price']).group(0).replace('$', ''))
    store_sales_clean = int(re.search(r'[0-9,]{1,}', _['store_sales']).group(0).replace(',', ''))
    store_reviews_clean = int(re.search(r'[0-9,]{1,}', _['store_reviews']).group(0).replace(',', ''))
    listing_reviews_clean = int(_['listing_reviews']) if _['listing_reviews'] is not None else 0
    
    _['listing_price_clean'] = listing_price_clean
    _['store_sales_clean'] = store_sales_clean
    _['store_reviews_clean'] = store_reviews_clean
    _['listing_reviews_clean'] = listing_reviews_clean

In [243]:
df = pd.DataFrame(data)[[
    'seller_text', 'store_sales_clean', 'store_rating', 'store_reviews_clean', 'listing_reviews_clean',
    'listing_price_clean', 'listing_bestseller', 'listing_title', 'listing_highlights',
     'url'
]]

In [244]:
df.drop_duplicates(subset=['url'], inplace=True)

In [245]:
df.sort_values('listing_reviews_clean', ascending=False).to_csv('listings.csv', index=False)