# Demonstration of Spider App on All Alibaba Categories

This notebook presents the results of running the spider application on every product category listed on Alibaba's website. An example of a scraped page is shown below.

<img src='https://i.imgur.com/W9szA4w.png' />

In [1]:
import scrapy
from scrapy.selector import Selector
from scrapy.http import HtmlResponse, Request, Response
import requests
from scrapy.http import TextResponse
from scrapy.crawler import CrawlerProcess
import json
import logging
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
%matplotlib inline

In [2]:
# Download the Alibaba product directory page once and wrap it in a Scrapy
# TextResponse so the selector API can be used on it outside of a crawl.
main_url = 'https://www.alibaba.com/Products'
# requests waits forever by default; bound the wait so the kernel cannot hang.
r = requests.get(main_url, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
r.raise_for_status()
response = TextResponse(r.url, body=r.text, encoding='utf-8')

In [3]:
# Sanity check: display the text of the page's <title> node(s)
response.xpath('//title/text()').extract()

[u'Manufacturers, Suppliers and Exporters Directory on Alibaba.com']

In [4]:
# hrefs of the links found under div.cg-main > div.sub-item-cont-wrapper > li;
# this list is used as the spider's start_urls.
url_to_scan = (
    response.css('div.cg-main')
    .css('div.sub-item-cont-wrapper')
    .css('li a::attr(href)')
    .extract()
)

# Output locations: JSON-lines file (item pipeline) and JSON file (feed export).
open_file, open_file2 = 'results/result.jl', 'results/result.json'
class JsonWriterPipeline(object):
    """Item pipeline that writes every scraped item to ``open_file``.

    Each item is serialised as one JSON object per line (JSON-lines).  The
    file is opened in write mode when the spider starts, so any previous
    content is replaced, and it is closed when the spider finishes.
    """

    def open_spider(self, spider):
        """Open the output file, creating its parent directory if needed."""
        import os  # local import keeps this cell self-contained

        # Without this, a fresh checkout lacking the output folder makes
        # open() fail before a single item is scraped.
        out_dir = os.path.dirname(open_file)
        if out_dir and not os.path.isdir(out_dir):
            os.makedirs(out_dir)
        self.file = open(open_file, 'w')

    def close_spider(self, spider):
        """Close the output file once the spider is done."""
        self.file.close()

    def process_item(self, item, spider):
        """Write ``item`` as a single JSON line and pass it on unchanged."""
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item
    
class Ali_Spider(scrapy.Spider):
    """Spider that scrapes the product listing of each URL in ``url_to_scan``.

    ``parse`` yields one dict per product with the keys ``item_title``,
    ``seller_title``, ``price``, ``min_order`` and ``item_type``.  Items are
    written twice: by ``JsonWriterPipeline`` (item pipeline) and by Scrapy's
    feed export to ``open_file2``.
    """
    name = "alibaba"
    start_urls = url_to_scan
    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},  # output 1: item pipeline
        'FEED_FORMAT': 'json',                                 # output 2: feed export
        'FEED_URI': open_file2,                                # output 2: feed export
        'DOWNLOAD_DELAY': 1  # delay between requests
    }

    def parse(self, response):
        """Yield one dict per product found on ``response``.

        The four per-product fields are extracted as page-wide lists and
        paired by position.  The page title is used as ``item_type`` for every
        product on the page (``None`` if the page has no title text).
        """
        # Take the title once: repeating the whole list of matches would
        # cycle through several values if more than one <title> node matched.
        item_type = response.xpath('//title/text()').extract_first()
        item_titles = response.xpath('//h2[@class="title"]/a/text()').extract()
        seller_titles = response.xpath('//div[@class="stitle"]/a/text()').extract()
        prices = response.xpath('//div[@class="price"]/b/text()').extract()
        min_orders = response.xpath('//div[@class="min-order"]/b/text()').extract()

        # zip() stops at the shortest list; if a product lacks a field the
        # remaining rows shift or are dropped.  Surface that instead of
        # hiding it.
        counts = (len(item_titles), len(seller_titles), len(prices), len(min_orders))
        if len(set(counts)) > 1:
            self.logger.warning(
                'Field counts differ on %s (titles=%d, sellers=%d, prices=%d, '
                'min_orders=%d); rows are truncated to the shortest list and '
                'may be misaligned',
                response.url, counts[0], counts[1], counts[2], counts[3])

        for item_title, seller_title, price, min_order in zip(
                item_titles, seller_titles, prices, min_orders):
            yield {
                'item_title': item_title,
                'seller_title': seller_title,
                'price': price,
                'min_order': min_order,
                'item_type': item_type,
            }

In [5]:
# Run the spider in-process with a browser-like user agent.
process = CrawlerProcess(
    settings={'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)'}
)
process.crawl(Ali_Spider)

# Blocks until the crawl has finished.
process.start(stop_after_crawl=True)

2018-04-26 16:36:15 [scrapy.utils.log] INFO: Scrapy 1.5.0 started (bot: scrapybot)
2018-04-26 16:36:15 [scrapy.utils.log] INFO: Versions: lxml 4.1.1.0, libxml2 2.9.4, cssselect 1.0.3, parsel 1.4.0, w3lib 1.19.0, Twisted 17.5.0, Python 2.7.14 |Anaconda custom (64-bit)| (default, Dec  7 2017, 11:07:58) - [GCC 4.2.1 Compatible Clang 4.0.1 (tags/RELEASE_401/final)], pyOpenSSL 17.5.0 (OpenSSL 1.0.2o  27 Mar 2018), cryptography 2.1.4, Platform Darwin-17.4.0-x86_64-i386-64bit
2018-04-26 16:36:15 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'results/result.json', 'LOG_LEVEL': 30, 'USER_AGENT': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)', 'DOWNLOAD_DELAY': 1}


In [6]:
# Show the last two lines of results/result.jl, the JSON-lines file that JsonWriterPipeline opens
!tail -n 2 results/result.jl

{"item_title": "Alloy frame outdoor pop up tent,folding tent canopy for exhibition/trade show Customized quick roof top folding tent", "min_order": "50 Sets", "seller_title": "Heshan Feng Yushun Outdoor Products Co., Ltd.", "price": " US $18.0-19.01 ", "item_type": "Trade Show Tent, Trade Show Tent Products, Trade Show Tent Manufacturers, Trade Show Tent Suppliers and Exporters Directory "}
{"item_title": "Waterproof PU trade show pop up canopy outdoor tent 10x10", "min_order": "1 Piece", "seller_title": "Shenyang Morning Banner Display Co., Ltd.", "price": " US $50-200 ", "item_type": "Trade Show Tent, Trade Show Tent Products, Trade Show Tent Manufacturers, Trade Show Tent Suppliers and Exporters Directory "}


In [7]:
# Load the JSON feed export into a DataFrame and preview the first rows.
dfjson = pd.read_json(path_or_buf='results/result.json')
dfjson.head(n=5)
# Optional CSV dump: dfjson.to_csv('df_output.csv', encoding='utf-8', index=False)

Unnamed: 0,item_title,item_type,min_order,price,seller_title
0,High quality perlite for Horticulture,"Agricultural Growing Media, Agricultural Growi...",1 Cubic Meter,US $25-60,"Lingshou County Zhongrun Minerals Co., Ltd."
1,Hot Sale Perlite Filter Aid Expanded Perlite P...,"Agricultural Growing Media, Agricultural Growi...",1 Ton,US $290-356,"Xinyang Xuri Filter Aid Co., Ltd."
2,Horticultural Perlite,"Agricultural Growing Media, Agricultural Growi...",72 Cubic Meters,US $30-40,"Shijiazhuang Kedahua Imp. & Exp. Trade Co., Ltd."
3,Casting Slag Remover for Cast Iron,"Agricultural Growing Media, Agricultural Growi...",1 Ton,US $650-750,"Qingdao FSK Foundry Materials Co., Ltd."
4,expanded construction grade insulation perlite,"Agricultural Growing Media, Agricultural Growi...",1 Metric Ton,US $120-180,"Shijiazhuang Mining Imp & Exp Trade Co., Ltd."


In [11]:
# Preview the last rows of the frame.
# NOTE(review): execution counts jump from 7 to 11 here, so the displayed
# output may reflect cells that are no longer in the notebook - re-run from a
# fresh kernel to confirm.
dfjson.tail(n=5)

Unnamed: 0,item_title,item_type,min_order,price,seller_title
35117,Free design canopy printing cheap custom print...,"Trade Show Tent, Trade Show Tent Products, Tra...",1,150-399,"Shanghai Tongjie Image Production Co., Ltd."
35118,3m*6m Folding Pop Up Custom Gazebo Canopy fold...,"Trade Show Tent, Trade Show Tent Products, Tra...",1,45-299,"Wuyi Hotter Outdoor Products Co., Ltd."
35119,2016 New Products Cheap Aluminium Frame Custom...,"Trade Show Tent, Trade Show Tent Products, Tra...",1,299-849.9,"Jiangmen Eastern Signs Manufacturing Co., Ltd...."
35120,"Alloy frame outdoor pop up tent,folding tent c...","Trade Show Tent, Trade Show Tent Products, Tra...",50,18.0-19.01,"Heshan Feng Yushun Outdoor Products Co., Ltd."
35121,Waterproof PU trade show pop up canopy outdoor...,"Trade Show Tent, Trade Show Tent Products, Tra...",1,50-200,"Shenyang Morning Banner Display Co., Ltd."


In [12]:
# (number of rows, number of columns) of the loaded frame
dfjson.shape

(35122, 5)