In [2]:
# Widen the notebook container so long outputs are easier to read.
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

import requests
from bs4 import BeautifulSoup
import re
import time
from tqdm.notebook import tqdm
import json
import pandas as pd

In [123]:
def get_listing_details(url):
    """Download one listing page and extract seller, store and listing fields.

    Parameters
    ----------
    url : str
        Address of the listing page to fetch.

    Returns
    -------
    dict
        Keys: 'seller_text', 'seller_href', 'store_sales', 'store_rating',
        'listing_title', 'listing_bestseller', 'listing_price',
        'listing_highlights', 'listing_desc', 'listing_reviews' and
        'store_reviews'. A field whose element (or attribute) is not present
        on the page is ``None``, except 'listing_bestseller', which falls
        back to ``'-'``, and 'listing_highlights', which is an empty list.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content)

    def text_of(tag, strip=True):
        # None-safe access to a tag's text; `strip` mirrors which fields are
        # stored trimmed and which are stored verbatim.
        if tag is None:
            return None
        return tag.text.strip() if strip else tag.text

    def child_of(parent, *args, **kwargs):
        # None-safe nested lookup: a missing parent simply yields None.
        return parent.find(*args, **kwargs) if parent is not None else None

    def attr_of(tag, attribute):
        # None-safe attribute read; a missing attribute yields None instead
        # of raising KeyError.
        return tag.get(attribute) if tag is not None else None

    # Seller name and link live in the same anchor, so look it up once.
    seller_block = soup.find('p', class_="wt-text-body-01 wt-mr-xs-1")
    seller_link = child_of(seller_block, 'a', class_="wt-text-link-no-underline")
    seller_text = text_of(seller_link)
    seller_href = attr_of(seller_link, 'href')

    listing_title = text_of(
        soup.find('h1', class_="wt-text-body-03 wt-line-height-tight wt-break-word wt-mb-xs-1")
    )

    # Price text is kept verbatim (not stripped).
    listing_price = text_of(soup.find('p', class_="wt-text-title-03 wt-mr-xs-2"), strip=False)

    listing_highlights = [
        item.text.strip()
        for item in soup.find_all(
            'li', class_="wt-list-unstyled wt-display-flex-xs wt-align-items-flex-start"
        )
    ]

    # Store sales count and store rating share one container element.
    store_block = soup.find(
        'div', class_="wt-display-inline-flex-xs wt-align-items-center wt-mb-xs-2 wt-flex-wrap"
    )
    store_sales = text_of(child_of(store_block, 'span', class_='wt-text-caption'), strip=False)
    store_rating = attr_of(child_of(store_block, 'input', attrs={'name': 'initial-rating'}), 'value')

    bestseller_badge = soup.find(
        'span', class_="wt-badge wt-badge--status-03 wt-mt-xs-1 wt-mr-xs-1 search-half-unit-mb"
    )
    # '-' (not None) marks a listing without the badge.
    listing_bestseller = text_of(bestseller_badge) if bestseller_badge is not None else '-'

    # Description text is kept verbatim (not stripped).
    listing_desc = text_of(soup.find('p', class_="wt-text-body-01 wt-break-word"), strip=False)

    # Review count for this item: the button may exist without an inner
    # <span>, so both lookups are guarded.
    reviews_button = soup.find('button', attrs={'aria-controls': 'same-listing-reviews-panel'})
    listing_reviews = text_of(child_of(reviews_button, 'span'))

    # Review count for the whole shop.
    store_reviews = text_of(soup.find('h3', class_="wt-mr-xs-2 wt-text-body-03"))

    details = {
        'seller_text': seller_text,
        'seller_href': seller_href,
        'store_sales': store_sales,
        'store_rating': store_rating,
        'listing_title': listing_title,
        'listing_bestseller': listing_bestseller,
        'listing_price': listing_price,
        'listing_highlights': listing_highlights,
        'listing_desc': listing_desc,
        'listing_reviews': listing_reviews,
        'store_reviews': store_reviews
    }

    return details


def get_search_listings(cat_url):
    """Fetch a category/shop page and collect its listing links.

    Parameters
    ----------
    cat_url : str
        Address of the category, shop or results page to fetch.

    Returns
    -------
    tuple
        ``(search_listings, next_page)`` where ``search_listings`` is the
        list of hrefs taken from the first anchor of every ``<li>`` in the
        listing grid (empty if the grid is not on the page) and ``next_page``
        is the href of the link marked ``data-page=2``, or ``None`` when the
        page has no such link.
    """
    # Fetch the page that was actually requested (previously the global
    # `cat_urls[0]` was used, so the argument was silently ignored).
    r = requests.get(cat_url)

    soup = BeautifulSoup(r.content)

    grid = soup.find(
        'ul',
        class_="responsive-listing-grid wt-grid wt-grid--block wt-justify-content-flex-start wt-list-unstyled wt-pl-xs-0 tab-reorder-container"
    )

    search_listings = []
    if grid is not None:
        for item in grid.find_all('li'):
            link = item.find('a')
            # Skip grid cells that carry no usable link instead of crashing.
            if link is not None and link.has_attr('href'):
                search_listings.append(link['href'])

    next_link = soup.find('a', attrs={'data-page': 2})
    next_page = next_link.get('href') if next_link is not None else None

    return search_listings, next_page


def get_two_search(cat_url):
    """Collect the listing links from the first two pages of ``cat_url``.

    Parameters
    ----------
    cat_url : str
        Address of the first page to fetch.

    Returns
    -------
    list
        De-duplicated listing hrefs from the first page and, when a
        next-page link was found, the second page. Order is not defined
        because duplicates are removed through a ``set``.
    """
    # Start from the page passed in (previously the global `cat_urls[0]`
    # was used, so every call returned the first category's listings).
    search_listing_results, next_page = get_search_listings(cat_url)

    # Only follow pagination when the first page advertised a second page.
    if next_page:
        second_page_listings, _ = get_search_listings(next_page)
        search_listing_results.extend(second_page_listings)

    return list(set(search_listing_results))


def iterate_cat_urls(cat_urls):
    """Gather listing URLs for several category/shop pages.

    Parameters
    ----------
    cat_urls : iterable of str
        Pages to pass to ``get_two_search`` one by one.

    Returns
    -------
    list
        Listing URLs with everything from the first ``?`` onwards removed,
        de-duplicated, in first-seen order.
    """
    data = []
    for cat_url in tqdm(cat_urls):
        data.extend(get_two_search(cat_url))

    # Strip the query string first, THEN de-duplicate: two hrefs that differ
    # only in their query string point at the same listing.
    stripped = (
        re.search(r'(?:(?!\?).)*', listing_url).group(0)
        for listing_url in data
    )
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    urls_clean = list(dict.fromkeys(stripped))

    return urls_clean


def iterate_listings(urls_clean):
    """Scrape every listing URL, checkpointing progress after each one.

    Parameters
    ----------
    urls_clean : iterable of str
        Listing URLs to pass to ``get_listing_details``.

    Returns
    -------
    list of dict
        One details dict per scraped listing, each with an added 'url' key.
        On ``KeyboardInterrupt`` the records collected so far are returned
        instead of propagating the interrupt.

    Side effects
    ------------
    Rewrites 'temp_data.json' with all records collected so far after every
    listing, and sleeps one second between listings.
    """
    data = []
    for listing_url in tqdm(urls_clean):

        try:
            url_result = get_listing_details(listing_url)
            url_result.update({'url': listing_url})
            # Store and checkpoint before pausing, so an interrupt during the
            # sleep cannot drop a record that was already fetched.
            data.append(url_result)
            with open('temp_data.json', 'w') as checkpoint:  # in case of interruption
                json.dump(data, checkpoint)
            time.sleep(1)

        except KeyboardInterrupt:
            return data

    return data


def recover():
    """Load the records checkpointed in 'temp_data.json'.

    Returns
    -------
    object
        Whatever JSON value the file holds.

    Raises
    ------
    FileNotFoundError
        If 'temp_data.json' does not exist in the working directory.
    """
    # The context manager closes the file handle deterministically.
    with open('temp_data.json', 'r') as checkpoint:
        data = json.load(checkpoint)
    return data

In [71]:
def get_url_from_search(search_url, pages):
    """Collect listing links from up to ``pages`` pages of search results.

    Parameters
    ----------
    search_url : str
        Address of the first results page.
    pages : int
        Maximum number of result pages to read (the first page counts as 1).

    Returns
    -------
    list of str
        Every href containing the substring 'listing' found on the pages
        read, in page order; duplicates and query strings are NOT removed.
        Fewer than ``pages`` pages are read when a page has no link to the
        next page number.
    """

    def request_urls(page_url):
        # Fetch one results page; return its listing hrefs together with the
        # parsed page so the caller can locate the next-page link.
        r = requests.get(page_url)

        page_soup = BeautifulSoup(r.content)

        listing_urls = [
            anchor['href'] for anchor in page_soup.find_all('a')
            if anchor.has_attr('href') and 'listing' in anchor['href']
        ]

        return listing_urls, page_soup

    # Keep the parsed page in this scope: the pagination lookup below used to
    # read a `soup` name that only existed inside the helper, so it raised
    # NameError or silently used an unrelated global left in the kernel.
    clean_urls, soup = request_urls(search_url)

    for page in range(2, pages+1):
        # NOTE(review): substring match, so 'page=2' also matches 'page=20';
        # the first matching link is used.
        page_links = soup.select('a[href*="page={}"]'.format(page))
        if not page_links:
            # No link to the next page: stop instead of raising IndexError.
            break
        page_urls, soup = request_urls(page_links[0]['href'])
        clean_urls.extend(page_urls)
        time.sleep(1)

    return clean_urls

In [72]:
# Search phrases to scrape; each entry is one free-text query.
to_search = [
    'printable business card',
    'snapchat filter',
    'banner',
    'wedding invitation',
    'printable wall art',
    'printable stickers',
    'printable planner',
    'digital planner',
    'svg',
    'printable quotes',
    'photo booth props',
    'printable selfie frames',
    'facebook template',
    'instagram template',
    'printable calendar',
    'fonts',
    'pinterest template',
    'youtube channel art',
    'logo',
    'coloring pages',
    'party decorations'
]

In [79]:
# For every search phrase, read the first 3 result pages and record each
# distinct listing URL (query string removed) together with the phrase that
# found it. Progress is checkpointed to disk after every phrase.
search_results = []
for search in tqdm(to_search):
    url = 'https://www.etsy.com/search?q=' + search.replace(' ', '+')
    result_urls = get_url_from_search(url, 3)
    # Strip everything from the first '?' onwards, then de-duplicate.
    unique_urls = {
        re.search(r'(?:(?!\?).)*', result_url).group(0)
        for result_url in result_urls
    }
    temp_search_results = [
        {'url': listing_url, 'search': search}
        for listing_url in unique_urls
    ]
    search_results.extend(temp_search_results)
    # Context manager so the checkpoint file is flushed and closed each time.
    with open('temp_search_results.json', 'w') as checkpoint:  # save temp
        json.dump(search_results, checkpoint)

HBox(children=(FloatProgress(value=0.0, max=21.0), HTML(value='')))




In [80]:
# Number of {'url', 'search'} records collected across all search phrases.
len(search_results)

2777

In [124]:
# Scrape every collected listing and write the results to disk in batches,
# so a crash loses at most one partial batch. After the loop, `data` holds
# the records of the last, not-yet-written batch and `save_counter` is the
# index the next batch file should use.
BATCH_SIZE = 50  # records per 'data_save<N>.json' file

data = []
counter = 0
save_counter = 0
for search_result in tqdm(search_results):
    details = get_listing_details(search_result['url'])
    details.update({'search': search_result['search'], 'url': search_result['url']})
    data.append(details)
    counter += 1
    if counter >= BATCH_SIZE:
        # Context manager so each batch file is flushed and closed.
        with open('data_save{}.json'.format(save_counter), 'w') as batch_file:
            json.dump(data, batch_file)
        counter = 0
        save_counter += 1
        data = []
    time.sleep(.1)

HBox(children=(FloatProgress(value=0.0, max=1227.0), HTML(value='')))




In [125]:
# Write the final partial batch (fewer than 50 records) left over by the
# scraping loop. Nothing is written when no records are left, which avoids
# creating an empty batch file.
if data:
    with open('data_save{}.json'.format(save_counter), 'w') as batch_file:
        json.dump(data, batch_file)

In [127]:
import os

In [133]:
# Reload every batch file from the working directory into one list.
# Only names of the form 'data_save*.json' are read, in sorted
# (lexicographic) order so repeated runs produce the same record order.
# NOTE(review): batch files left over from earlier runs are picked up too.
data = []
batch_files = sorted(
    name for name in os.listdir()
    if name.startswith('data_save') and name.endswith('.json')
)
for batch_name in tqdm(batch_files):
    with open(batch_name, 'r') as batch_file:
        data_temp = json.load(batch_file)
    data.extend(data_temp)

HBox(children=(FloatProgress(value=0.0, max=57.0), HTML(value='')))




In [137]:
# Inspect the raw 'store_sales' strings before cleaning them.
[record['store_sales'] for record in data]

['21,406 sales',
 '336 sales',
 '6,933 sales',
 '9,294 sales',
 '2,567 sales',
 '51 sales',
 '44,849 sales',
 '21,700 sales',
 '140 sales',
 '9,294 sales',
 '1,786 sales',
 '10,621 sales',
 '3,763 sales',
 '34,687 sales',
 '348 sales',
 '4,651 sales',
 '2,567 sales',
 '50,852 sales',
 '64,796 sales',
 '21,700 sales',
 '21,406 sales',
 '336 sales',
 '6,933 sales',
 '9,294 sales',
 '2,567 sales',
 '51 sales',
 '44,849 sales',
 '21,700 sales',
 '140 sales',
 '9,294 sales',
 '788 sales',
 '1,943 sales',
 '25,448 sales',
 '171,639 sales',
 '3,763 sales',
 '108,063 sales',
 '70,784 sales',
 '9,294 sales',
 '6,933 sales',
 '17,026 sales',
 '17,177 sales',
 '70,784 sales',
 '9,430 sales',
 '237,218 sales',
 '2,567 sales',
 '2,994 sales',
 '237,218 sales',
 '25,448 sales',
 '17,026 sales',
 '3,763 sales',
 '171,641 sales',
 '171,641 sales',
 '237,217 sales',
 '25,448 sales',
 '6,933 sales',
 '34,975 sales',
 '197 sales',
 '237,224 sales',
 '981 sales',
 '353 sales',
 '21,700 sales',
 '3,824 sal

In [138]:
def _parse_price(text):
    """Return the first dollar amount in `text` as a float, or 0 if none is found."""
    if text is None:
        return 0
    # Accept thousands separators and a missing decimal part ("$1,234.00", "$5").
    match = re.search(r'\$([0-9][0-9,]*(?:\.[0-9]+)?)', text)
    return float(match.group(1).replace(',', '')) if match else 0


def _parse_count(text):
    """Return the first integer in `text` (commas ignored), or 0 if none is found."""
    if text is None:
        return 0
    # Require a leading digit so a lone comma can never be matched.
    match = re.search(r'[0-9][0-9,]*', text)
    return int(match.group(0).replace(',', '')) if match else 0


# Add numeric '*_clean' fields to every record; missing or unparseable
# values become 0 instead of raising.
for record in data:
    record['listing_price_clean'] = _parse_price(record['listing_price'])
    record['store_sales_clean'] = _parse_count(record['store_sales'])
    record['store_reviews_clean'] = _parse_count(record['store_reviews'])
    record['listing_reviews_clean'] = _parse_count(record['listing_reviews'])

# Keep the analysis columns and one row per listing URL (first occurrence).
# Chained instead of `inplace=True`, which would operate on a column-sliced
# frame.
df = (
    pd.DataFrame(data)[[
        'seller_text', 'store_sales_clean', 'store_rating', 'store_reviews_clean', 'listing_reviews_clean',
        'listing_price_clean', 'listing_bestseller', 'listing_title', 'listing_highlights',
        'url'
    ]]
    .drop_duplicates(subset=['url'])
)

In [140]:
# Export the listings, most-reviewed first (ties broken by store reviews).
(
    df
    .sort_values(by=['listing_reviews_clean', 'store_reviews_clean'], ascending=False)
    .to_csv('search_listings.csv', index=False)
)

In [90]:
# Alternative workflow (disabled): scrape listings by browsing category/shop pages instead of search

# cat_urls = [
#     'https://www.etsy.com/c/paper-and-party-supplies/paper/calendars-and-planners',
#     'https://www.etsy.com/c/paper-and-party-supplies/paper/invitations-and-announcements/templates',
#     'https://www.etsy.com/c/paper-and-party-supplies/paper/stationery/design-and-templates',
#     'https://www.etsy.com/shop/DIYPaperBoutique'
# ]

# urls_clean = iterate_cat_urls(cat_urls)

# data = iterate_listings(urls_clean)

# for _ in data:
#     listing_price_clean = float(re.search(r'\$[0-9]{1,}\.[0-9]{1,}', _['listing_price']).group(0).replace('$', ''))
#     store_sales_clean = int(re.search(r'[0-9,]{1,}', _['store_sales']).group(0).replace(',', ''))
#     store_reviews_clean = int(re.search(r'[0-9,]{1,}', _['store_reviews']).group(0).replace(',', ''))
#     listing_reviews_clean = int(_['listing_reviews']) if _['listing_reviews'] is not None else 0
    
#     _['listing_price_clean'] = listing_price_clean
#     _['store_sales_clean'] = store_sales_clean
#     _['store_reviews_clean'] = store_reviews_clean
#     _['listing_reviews_clean'] = listing_reviews_clean

# df = pd.DataFrame(data)[[
#     'seller_text', 'store_sales_clean', 'store_rating', 'store_reviews_clean', 'listing_reviews_clean',
#     'listing_price_clean', 'listing_bestseller', 'listing_title', 'listing_highlights',
#      'url'
# ]]

# df.sort_values('listing_reviews_clean', ascending=False).to_csv('listings.csv', index=False)