## Scrapy spider run with CrawlerProcess -> fast

##### Reference: https://www.jitsejan.com/using-scrapy-in-jupyter-notebook.html

In [1]:
# Notebook-wide display configuration.
import platform

from IPython.core.interactiveshell import InteractiveShell

# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Report the Python version of the running kernel.
platform.python_version()

'3.7.0'

In [2]:
import scrapy
from scrapy.crawler import CrawlerProcess

In [3]:
import json

class JsonWriterPipeline(object):
    """Scrapy item pipeline that writes every item as one JSON object per line.

    The output goes to ``output_path`` (default ``productName.csv``). Despite
    the ``.csv`` extension, the content written is JSON Lines, not CSV.
    """

    # Destination file; override on a subclass or instance to write elsewhere.
    output_path = 'productName.csv'

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        # Explicit encoding so the result does not depend on the platform default.
        self.file = open(self.output_path, 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file; safe to call even if it was never opened."""
        file = getattr(self, 'file', None)
        if file is not None:
            file.close()

    def process_item(self, item, spider):
        """Serialize ``item`` to one JSON line, write it, and return the item unchanged."""
        line = json.dumps(dict(item)) + "\n"
        self.file.write(line)
        return item

In [5]:
import os
import re
import logging
import pandas as pd

class productNameSpider(scrapy.Spider):
    """Spider that maps each ASIN in the review dataset to a product-name slug.

    One product-reviews URL is built per unique ASIN. For each response the
    product name is taken from the first path segment of the page's canonical
    link.

    Note: the dataset is read while the class body executes, so
    ``reviewNotDuplication.json`` must exist in the current working directory
    when this cell is run.
    """
    BASE_DIR = os.getcwd()
    DATASET_PATH = os.path.join(BASE_DIR, u"")
    file1 = os.path.join(DATASET_PATH, "reviewNotDuplication.json")
    df = pd.read_json(file1)

    name = "productName"
    # set() collapses repeated ASINs so each product page is requested once.
    start_urls = ["https://www.amazon.com/product-reviews/{}".format(asin) for asin in set(df['asin'])]

    custom_settings = {
        'LOG_LEVEL': logging.WARNING,
        # Output 1: custom item pipeline -> JSON lines in productName.csv
        'ITEM_PIPELINES': {'__main__.JsonWriterPipeline': 1},
        # Output 2: built-in feed export -> JSON array in productName.json
        # (newer Scrapy releases replace FEED_FORMAT/FEED_URI with FEEDS).
        'FEED_FORMAT': 'json',
        'FEED_URI': 'productName.json'
    }

    def parse(self, response):
        """Yield ``{'asin', 'productName'}`` for one product-reviews page.

        Pages without a canonical link (e.g. a block/captcha page) are logged
        and skipped instead of raising inside ``re.sub``.
        """
        # Strip any query string / trailing slash so the last path segment is the ASIN.
        asin = response.url.split("?", 1)[0].rstrip("/").split("/")[-1]
        link = response.css('body link[rel="canonical"]::attr(href)').get()
        if link is None:
            self.logger.warning("No canonical link for ASIN %s (%s); skipping", asin, response.url)
            return
        # Remove scheme+host, the "product-reviews/" segment and any query string;
        # the first remaining path segment is the product-name slug.
        productName = re.sub(r"(https:\/\/.*\.com\/)|(product-reviews\/)|(\?.*)", "", link).split("/")[0]
        yield {
            'asin': asin,
            'productName': productName
        }

In [7]:
process = CrawlerProcess({
    # Scrapy setting names are upper-case: the key must be USER_AGENT,
    # otherwise the custom user agent is silently ignored.
    'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Whale/1.5.75.9 Safari/537.36',
})

process.crawl(productNameSpider)
# start() runs the Twisted reactor and blocks until the crawl finishes.
# The reactor cannot be restarted, so re-running this cell needs a kernel restart.
process.start()

2019-08-08 00:41:08 [scrapy.utils.log] INFO: Scrapy 1.6.0 started (bot: scrapybot)
2019-08-08 00:41:08 [scrapy.utils.log] INFO: Versions: lxml 4.3.4.0, libxml2 2.9.9, cssselect 1.0.3, parsel 1.5.1, w3lib 1.20.0, Twisted 19.2.1, Python 3.7.0 (v3.7.0:1bf9cc5093, Jun 26 2018, 23:26:24) - [Clang 6.0 (clang-600.0.57)], pyOpenSSL 19.0.0 (OpenSSL 1.1.1c  28 May 2019), cryptography 2.7, Platform Darwin-18.7.0-x86_64-i386-64bit
2019-08-08 00:41:08 [scrapy.crawler] INFO: Overridden settings: {'FEED_FORMAT': 'json', 'FEED_URI': 'productName.json', 'LOG_LEVEL': 30}


<Deferred at 0x118759898>

In [8]:
# List the output files written by the crawl (item pipeline and feed export).
ll productName.*

-rw-r--r--  1 Jay  staff  3833 Aug  8 00:41 productName.csv
-rw-r--r--  1 Jay  staff  3881 Aug  8 00:41 productName.json


In [9]:
# Show the last two lines written by JsonWriterPipeline (one JSON object per line).
!tail -n 2 productName.csv

{"asin": "B00MHPAFAG", "productName": "Sony-16-50mm-Mirrorless-Digital-Camera"}
{"asin": "B007VGGHW6", "productName": "Nikon-Digital-18-55mm-3-5-5-6-NIKKOR"}


In [10]:
# Show the last two lines of the feed export configured via FEED_URI.
!tail -n 2 productName.json

{"asin": "B007VGGHW6", "productName": "Nikon-Digital-18-55mm-3-5-5-6-NIKKOR"}
]

In [12]:
import pandas as pd

# Read the feed export back into a DataFrame and preview the first five rows.
dfjson = pd.read_json('productName.json')
dfjson.head()

Unnamed: 0,asin,productName
0,B00IZDLD32,Sony-Mirrorless-Digital-Camera-24-3MP
1,B004J41T7Q,Canon-PowerShot-Digital-Camera-Black
2,B00B5HE2UG,Canon-PowerShot-Digital-Stabilized-2-7-Inch
3,B00I8BICB2,Sony-Mirrorless-Digital-3-0-Inch-16-50mm
4,B0075SUK14,Canon-PowerShot-Digital-Stabilized-Wide-Angle


In [1]:
# Number of scraped rows. `dfjson` must already exist: run the cell that loads
# productName.json first (on a fresh kernel this cell raises NameError).
len(dfjson['asin'])

NameError: name 'dfjson' is not defined

## Without the Scrapy crawler: sequential `requests` calls parsed with `TextResponse` -> slow

In [None]:
import re
import time
import requests
from scrapy.http import TextResponse

def getProductName(asinList, timeout=10):
    """Fetch the product-name slug for each ASIN, one HTTP request at a time.

    Args:
        asinList: iterable of ASIN strings.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        List of ``(asin, productName)`` tuples for the ASINs that resolved.
        ASINs whose request failed or whose page had no canonical link are
        collected and printed at the end, not returned.
    """
    returnList = []
    errorList = []
    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-US,en;q=0.9,ko;q=0.8',
        'user-agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Whale/1.5.75.9 Safari/537.36',
    }
    # Compiled once: strips scheme+host, the "product-reviews/" segment and any query string.
    canonical_noise = re.compile(r"(https:\/\/.*\.com\/)|(product-reviews\/)|(\?.*)")

    start = time.time()
    for asin in asinList:
        URL = "https://www.amazon.com/product-reviews/{}".format(asin)
        try:
            # A timeout keeps one stalled connection from hanging the whole batch.
            resp = requests.get(URL, headers=headers, timeout=timeout)
        except requests.RequestException:
            # Network failure: record the ASIN and continue with the next one.
            errorList.append(asin)
            continue

        response = TextResponse(resp.url, body=resp.text, encoding="utf-8")
        link = response.css('body link[rel="canonical"]::attr(href)').get()
        if link is None:
            # No canonical link on the page (e.g. blocked or unknown product).
            errorList.append(asin)
            continue

        productName = canonical_noise.sub("", link).split("/")[0]
        returnList.append((asin, productName))
        # Progress message every 5 resolved products (the string means "<n> items").
        if len(returnList) % 5 == 0:
            print("{}개".format(len(returnList)))
    print("time: ", time.time() - start)

    # Report the ASINs that could not be resolved (the label means "errors").
    print("에러: {}".format(errorList))
    return returnList


In [None]:
# resultList = getProductName([ "inputList" ])