In [None]:
# default_exp crawler

# Crawler
> This repository explores the catalog available at wine.com.br, runs some exploratory analysis on it and,
as a first step, builds a toy recommendation engine / wine classifier and pricing tool.

In [1]:
import scrapy
from fire import Fire
from nbdev.showdoc import *
from scrapy.crawler import CrawlerProcess
from functools import partial
from bs4 import BeautifulSoup as soup

In [2]:
# XPath matching every product card on a catalog listing page.
CATALOG = "//article"
# Absolute XPath, presumably of the pagination control; not referenced by the
# spider code visible in this notebook. TODO confirm before relying on it.
NEXT = "/html/body/div[6]/div/div[2]/div[2]/div/div[4]/div"
# Prepended to the product links scraped from a listing card.
prefix = "https://wine.com.br"
# Highest listing page number the spider requests.
COUNT = 434
# Listing URL template; `{page}` is replaced by the listing page number.
url_next = "https://www.wine.com.br/browse.ep?cID=100851&exibirEsgotados=true&pn={page}&listagem=horizontal&sorter=featuredProducts-desc&filters=cVINHOS"
# Entry point of the crawl: the same template filled in with page 1.
url_short = url_next.format(page=1)

In [None]:
# Wine styles; a `dt` label found in this set is stored under the 'tipo' field.
TIPOS = {'Branco', 'Espumante', 'Frisante', 'Licoroso', 'Rosé', 'Tinto'}

# Countries; a `dt` label found in this set is stored under the 'origem' field.
# Every entry carries its own comma: adjacent string literals are concatenated
# by Python, which would silently merge two countries into one bogus entry.
PAISES = {
    'África do Sul', 'Alemanha', 'Argentina', 'Austrália', 'Brasil', 'Chile',
    'China', 'Espanha', 'Estados Unidos', 'França', 'Hungria', 'Itália', 'Líbano',
    'Nova Zelândia', 'Portugal', 'Uruguai', 'Grécia', 'Marrocos',
}

# Technical-sheet field names (not referenced by the code visible in this notebook).
KEYS = {
    'Vinícola', 'Teor_Alcoólico', 'Amadurecimento', 'Safra', 'Classificação', 'Visual',
    'Olfativo', 'Gustativo', 'Temperatura', 'Potencial_Guarda', 'Decantação', 'Harmonização',
}

In [3]:
class CatalogClassic(scrapy.Spider):
    """Crawl the catalog listing pages and every product page linked from them.

    A listing card supplies the summary fields of a product (link, name, the
    two prices, score and number of ratings). The product page then adds the
    sommelier description and the `dt`/`dd` pairs of its technical sheet. One
    dict per product is yielded after its product page has been parsed.
    """

    name = "catalog_classic"
    # Number of the first listing page; used as the default `count` of `parse_page`.
    i = 1

    def start_requests(self):
        """Request the first listing page and hand the response to `parse_page`."""
        yield scrapy.Request(url=url_short, callback=self.parse_page)

    @staticmethod
    def _two_lowest_prices(price_tags):
        """Return `(lowest, second_lowest)` of the prices held by `price_tags`.

        Fewer than two tags gives `(None, None)`. Otherwise the distinct prices
        are sorted ascending. When every tag carries the same value, that value
        is returned for both positions, so exactly two values always come back
        and the price fields can never shift the fields that follow them.
        """
        if len(price_tags) < 2:
            return None, None
        distinct = sorted({float(p.string) for p in price_tags})
        lowest = distinct[0]
        second = distinct[1] if len(distinct) > 1 else lowest
        return lowest, second

    def parse_page(self, response, count=i):
        """Parse one listing page, follow its products, then request the next page.

        Args:
            response: Scrapy response of a catalog listing page.
            count: Number of the listing page being parsed.

        Yields:
            One request per product card (handled by `ficha_tecnica`) and, as
            long as the next page number does not exceed `COUNT`, a request
            for the next listing page.
        """
        for block in response.xpath(CATALOG):
            href = block.css('div > a::attr("href")').get()
            if href is None:
                # No link means no product page to follow. Skipping the card
                # keeps the rest of the page, and the pagination below, alive.
                self.logger.warning("Product card without link on %s", response.url)
                continue
            link = prefix + href

            tag = soup(block.get(), 'lxml')

            # Every field is always written (None when absent) so names and
            # values stay paired when they are handed over as two lists.
            item = {"link": link}

            title = tag.find('h2')
            item["Nome"] = title.string if title is not None else None

            item['Preço_Sócio'], item['Preço_Normal'] = self._two_lowest_prices(
                tag.find_all(class_='Price-raw')
            )

            avaliação = tag.find("evaluation-tag")
            item["Pontuação"] = (
                float(avaliação[':evaluation']) if avaliação is not None else None
            )

            rating = tag.find('a', class_='Rating-count', string=True)
            if rating is not None:
                # The count is rendered inside parentheses; keep only the text.
                item["Avaliações"] = rating.string.replace("(", "").replace(")", "")
            else:
                item["Avaliações"] = None

            parse_vinho = partial(
                self.ficha_tecnica, key=list(item), val=list(item.values())
            )
            yield response.follow(link, parse_vinho)

        next_count = count + 1
        if next_count <= COUNT:
            parse_next = partial(self.parse_page, count=next_count)
            yield response.follow(url_next.format(page=next_count), parse_next)

    # Second parsing method
    def ficha_tecnica(self, response, key, val):
        """Merge a product page's details into its listing fields and yield the item.

        Args:
            response: Scrapy response of the product page.
            key: Field names collected from the listing card.
            val: Values matching `key` position by position.

        Yields:
            A single dict holding the listing fields, the 'somelier'
            description ('' when the page has none) and one entry per
            `dt`/`dd` pair found on the page.
        """
        tag = soup(response.xpath('/html').get(), 'lxml')

        # Work on a fresh dict so the lists bound into the callback stay untouched.
        item = dict(zip(key, val))

        description = tag.find(class_="somelier__description")
        if description is not None:
            # get_text also handles nested markup, where `.string` would be None.
            item['somelier'] = description.get_text(" ", strip=True)
        else:
            item['somelier'] = ''

        terms = [t.string for t in tag.find_all('dt')]
        details = [t.string for t in tag.find_all('dd')]

        for k, v in zip(terms, details):
            if k in TIPOS:
                # The label itself is the wine style.
                item['tipo'] = k
            elif k in PAISES:
                # The label is the country; the paired `dd` text is appended to it.
                item['origem'] = f'{k}-{v}'
            else:
                item[k] = v

        yield item

In [None]:
#export
# class CatalogFaster(scrapy.Spider):
#     name = "catalog"
#     start_urls = [url_short] + [url_next.format(page=i) for i in range(2, 85)]
    

#     def parse(self, response):
#         wine_list = response.xpath(CATALOG)
#         for block in wine_list:
#             tag = soup(block.get(), 'lxml')
#             key = []
#             val = []
            
#             key += ["link"]
#             val += [prefix + block.css('div > a::attr("href")').get()]
            
#             title = tag.find('h2')
            
#             if title:
#                 key.append("Nome")
#                 val.append(title.string)

#             precos = tag.find_all(class_='Price-raw')        

#             if len(precos) >= 2:
#                 precos = sorted(list(set([float(p.string) for p in precos])))
#                 key.append('Preço_Sócio')
#                 val.append(precos[0])
#                 key.append('Preço_Normal')
#                 val.append(precos[1])
            
                
#             avaliação = tag.find("evaluation-tag")
            
#            # print(f"Avaliação: {avaliação.attrs}")
#             if avaliação:
#                 key.append("Pontuação")
#                 val.append(float(avaliação[':evaluation']))
                

#             rating = tag.find('a', class_='Rating-count', string=True)
            
#             if rating:
#                 key.append("Avaliações")
#                 rating = rating.string.replace("(", "")
#                 rating = rating.replace(")", "")
#                 val.append(rating)
        
#             yield dict(zip(key, val))
            
#     # Second parsing method
#     def ficha_tecnica(self, key, val, tag):
#         tag = soup(tag, 'lxml')
#         key = []
#         val = []
        
#         v = tag.find(class_="somelier__description")
#         key.append('somelier')
#         val.append(v.string.strip() if v else '')
        
#         keys = [t.string for t in tag.find_all('dt')]
#         vals = [t.string for t in tag.find_all('dd')]
        
               
#         for k,v in zip(keys, vals):
#             if k in TIPOS:
#                 key.append('tipo')
#                 val.append(k)
#             elif k in PAISES:
#                 key.append('origem')
#                 val.append(f'{k}-{v}')
#             else:
#                 key.append(k)
#                 val.append(v)
                
        
#         return dict(zip(key, val))

In [None]:
def crawl():
    """Create a `CrawlerProcess`, schedule `CatalogClassic` on it and start it."""
    runner = CrawlerProcess()
    runner.crawl(CatalogClassic)
    runner.start()


# When executed as the main module, hand `crawl` to Fire.
if __name__ == "__main__":
    Fire(crawl)

In [8]:
?list.extend

Signature: list.extend(self, iterable, /)
Docstring: Extend list by appending elements from the iterable.
Type:      method_descriptor


In [11]:
# Scratch check: `extend` adds each element of the iterable to the existing list.
t = list()
t.extend((1, 2))

In [12]:
t

[1, 2]