In [5]:
import json
import unicodedata
import requests
from bs4 import BeautifulSoup, Tag
import csv
import headers

In [6]:
# reading the CSV file and generating product links
def reading_csv(csv_path="Amazon Scraping - Sheet1.csv"):
    """Build Amazon product URLs from the rows of a CSV file.

    The first row is treated as a header and skipped. For every remaining
    row, column index 3 is used as the Amazon domain suffix and column
    index 2 as the ASIN in ``https://www.amazon.{country}/dp/{asin}``.

    Args:
        csv_path: Path of the CSV file to read. Defaults to the sheet the
            notebook was originally written against.

    Returns:
        list[str]: One product URL per non-blank data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank row has fewer than four columns.
    """
    link_format = "https://www.amazon.{country}/dp/{asin}"
    all_links = []
    with open(csv_path, "r", newline="") as file:
        content = csv.reader(file)
        # A default of None keeps an empty file from raising StopIteration.
        next(content, None)
        for row in content:
            # csv.reader yields [] for blank lines (e.g. a trailing newline);
            # there is nothing to build a link from, so skip them.
            if not row:
                continue
            all_links.append(link_format.format(country=row[3], asin=row[2]))
    return all_links

In [7]:

# get price from string
def get_price(text):
    """Return only the digits, commas and periods contained in ``text``.

    Every other character (currency symbols, letters, whitespace) is
    dropped; the kept characters stay in their original order.
    """
    allowed = "1234567890,."
    return "".join(piece for piece in text if str(piece) in allowed)


In [8]:

# removing control characters from string
def remove_control_characters(s):
    """Return ``s`` without characters whose Unicode category starts with "C".

    That covers the control and format categories, among others, so
    newlines, tabs and invisible direction marks are all removed.
    """
    kept = []
    for char in s:
        if not unicodedata.category(char).startswith("C"):
            kept.append(char)
    return "".join(kept)

In [9]:

# get all details from link of product
def get_product_details(product_link, header, timeout=30):
    """Fetch one Amazon product page and extract its details.

    Args:
        product_link: URL of the product page.
        header: Mapping passed to ``requests.get`` as the request headers.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape forever.

    Returns:
        One of the status strings ``"Not found"`` (HTTP 404),
        ``"Server Error"`` (HTTP 5xx) or ``"Captcha"`` (captcha form present
        in the page), otherwise a dict with the keys ``product_title``,
        ``product_image``, ``product_price`` and ``product_details``.
        A field whose element is missing from the page is returned empty
        (``""``, or ``[]`` for ``product_details``) instead of raising.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            timeout.
    """
    response = requests.get(product_link, headers=header, timeout=timeout)

    # Check the status first so error pages are not parsed needlessly.
    status = response.status_code
    if status == 404:
        print("Status: Not found")
        return "Not found"
    if 500 <= status < 600:
        print("Status: Server Error")
        return "Server Error"

    soup = BeautifulSoup(response.content, "html.parser")

    # checking if captcha verification found
    if soup.find("form", {"action": "/errors/validateCaptcha"}):
        print("Status: Captcha")
        return "Captcha"

    product = {
        "product_title": _extract_product_title(soup),
        "product_image": _extract_product_image(soup),
        "product_price": _extract_product_price(soup),
        "product_details": _extract_product_details(soup),
    }

    print("Status: Success")

    return product


def _extract_product_title(soup):
    """Return the cleaned text of ``span#productTitle``, or "" when absent."""
    title_tag = soup.find("span", {"id": "productTitle"})
    if title_tag is None:
        return ""
    return remove_control_characters(title_tag.text).replace("\n", "").strip()


def _extract_product_image(soup):
    """Return the ``src`` of the main product image, or "" when not found."""
    # Primary location is 'img#imgBlkFront'; otherwise use the first <img>
    # inside 'div#imgTagWrapperId'.
    image_tag = soup.find("img", {"id": "imgBlkFront"})
    if image_tag is None:
        wrapper = soup.find("div", {"id": "imgTagWrapperId"})
        image_tag = wrapper.select_one("img") if wrapper is not None else None
    if image_tag is None:
        return ""
    return image_tag.get("src", "")


def _extract_product_price(soup):
    """Return the price characters from the page, or "" when not found."""
    # Primary location is 'div#tmmSwatches'; otherwise use
    # 'div#corePriceDisplay_desktop_feature_div'.
    swatches = soup.find("div", {"id": "tmmSwatches"})
    if swatches is not None:
        price_tag = swatches.select_one("a.a-button-text span.a-color-base")
    else:
        core_price = soup.find(
            "div", {"id": "corePriceDisplay_desktop_feature_div"})
        price_tag = (core_price.select_one("span.a-offscreen")
                     if core_price is not None else None)
    if price_tag is None:
        return ""
    # Strip currency symbols and any other extra characters.
    return get_price(price_tag.text)


def _extract_product_details(soup):
    """Return the detail rows as a list of ``{heading: text}`` dicts."""
    product_details = []
    bullets = soup.find("div", {"id": "detailBullets_feature_div"})

    if bullets is None:
        # Alternate layout: a tech-spec table inside 'div#prodDetails'.
        tech_section = soup.find("div", {"id": "prodDetails"})
        if tech_section is None:
            return product_details
        rows = tech_section.select("table#productDetails_techSpec_section_1 tr")
        for row in rows:
            heading_tag = row.select_one("th")
            value_tag = row.select_one("td")
            if heading_tag is None or value_tag is None:
                continue
            heading = remove_control_characters(
                heading_tag.text).replace("\n", "").strip()
            value = remove_control_characters(
                value_tag.text).replace("\n", "").strip()
            product_details.append({heading: value})
        return product_details

    bullet_list = bullets.select_one("ul")
    if bullet_list is None:
        return product_details
    # Iterating a Tag yields its direct children; only Tag children are
    # detail items, the rest are text nodes between them.
    for item in bullet_list:
        if not isinstance(item, Tag):
            continue
        title_tag = item.select_one("span.a-text-bold")
        if title_tag is None:
            continue
        # The detail text is read from the second <span> following the
        # item's first <span>.
        value_tag = item.find("span")
        for _ in range(2):
            if value_tag is None:
                break
            value_tag = value_tag.find_next("span")
        if value_tag is None:
            continue
        clean_title = remove_control_characters(
            title_tag.text).replace(":", "").strip()
        clean_detail = remove_control_characters(
            value_tag.text).replace(":", "").strip()
        product_details.append({clean_title: clean_detail})
    return product_details

In [10]:

# get the details of every product link, rotating user agents between requests to bypass captcha
def get_all_products_detail():
    """Scrape every product link produced by ``reading_csv``.

    Each request uses the next entry of ``headers.user_agents`` as its
    request headers, cycling back to the first entry once all have been
    used.

    Returns:
        list: One entry per link, in link order, holding whatever
        ``get_product_details`` returned for that link.
    """
    products_list = []
    links = reading_csv()
    # Cycle over however many user agents are configured instead of assuming
    # exactly five, so a shorter list cannot raise IndexError and a longer
    # one is fully used.
    agent_count = len(headers.user_agents)
    for idx, link in enumerate(links):
        current_header = headers.user_agents[idx % agent_count]
        print("Started #", idx, ": ", link)
        products_list.append(get_product_details(link, current_header))
    return products_list

In [11]:

# export data to JSON file
def export_to_json(json_data, file_path="details_of_product.json"):
    """Serialize ``json_data`` to JSON and write it to ``file_path``.

    Args:
        json_data: Any JSON-serializable object.
        file_path: Destination file; it is created or overwritten.

    Raises:
        TypeError: If ``json_data`` is not JSON-serializable. Serialization
            happens before the file is opened, so an existing file is left
            untouched in that case.
    """
    json_object = json.dumps(json_data)
    # Explicit encoding so the result does not depend on the platform default.
    with open(file_path, "w", encoding="utf-8") as outfile:
        outfile.write(json_object)

In [12]:
# Scrape every product link from the CSV, then display the collected results.
all_products_details = get_all_products_detail()
all_products_details

Started # 0 :  https://www.amazon.de/dp/1015
Status: Not found
Started # 1 :  https://www.amazon.fr/dp/1015
Status: Not found
Started # 2 :  https://www.amazon.de/dp/000004458X
Status: Success
Started # 3 :  https://www.amazon.fr/dp/000004458X
Status: Not found
Started # 4 :  https://www.amazon.de/dp/1002198
Status: Not found
Started # 5 :  https://www.amazon.fr/dp/1002198
Status: Not found
Started # 6 :  https://www.amazon.fr/dp/1002791
Status: Not found
Started # 7 :  https://www.amazon.it/dp/1002791
Status: Not found
Started # 8 :  https://www.amazon.de/dp/1002864
Status: Not found
Started # 9 :  https://www.amazon.fr/dp/1002864
Status: Not found
Started # 10 :  https://www.amazon.de/dp/1003704
Status: Not found
Started # 11 :  https://www.amazon.fr/dp/1003704
Status: Not found
Started # 12 :  https://www.amazon.de/dp/1003763
Status: Not found
Started # 13 :  https://www.amazon.fr/dp/1003763
Status: Not found
Started # 14 :  https://www.amazon.fr/dp/1004271
Status: Not found
Started

['Not found',
 'Not found',
 {'product_title': 'Old Spice Rasur Creme - 70 G (original) - Packung Von 2',
  'product_image': 'https://images-eu.ssl-images-amazon.com/images/I/41xHbjdSXJL._SY300_SX300_QL70_ML2_.jpg',
  'product_price': '8,00',
  'product_details': [{'Item-Paket Dimensionen L x b x H': '18.3 x 6.7 x 3.9 cm'},
   {'Paketgewicht': '0.18 Kilogramm'},
   {'Nettogewicht des Einzelartikels': '0.15 Pfund'},
   {'Hersteller': 'Old Spice'},
   {'Marke': 'Old Spice'},
   {'Modellnummer': 'SG_000004458X_US'},
   {'Verpackungsabmessungen': '18.29 x 6.71 x 3.91 cm; 68.04 Gramm'},
   {'ASIN': '000004458X'}]},
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 {'product_title': 'Short Story: Violoncello und Klavier. (Cello-Bibliothek)',
  'product_image': 'https://images-eu.ssl-images-amazon.com/images/I/41zw0783tpL._SX198_BO1,204,203,200_QL40_ML2_.jpg',
 

In [None]:
# Display the scraped results again.
all_products_details


In [9]:
# Write the scraped results to the JSON output file.
export_to_json(all_products_details)

In [39]:
import mysql.connector

# Saving data in MySQL database
# To run this in local:
#   - connect to your database
#   - create table using query:
#
#   CREATE TABLE products_details(
# 	    id INT NOT NULL PRIMARY KEY AUTO_INCREMENT,
#   	product_title TEXT NOT NULL,
#   	product_image TEXT NOT NULL,
#   	product_price VARCHAR(255) NOT NULL,
#   	product_detail TEXT NOT NULL
#   )
def export_to_mysql_database(list_of_details):
    """Insert every scraped product dict into the ``products_details`` table.

    Entries of ``list_of_details`` that are not dicts (the status strings
    returned for failed links) are skipped. All inserts are committed
    together after the loop; the cursor and the connection are closed even
    when an insert fails, in which case nothing is committed.

    Connection settings can be overridden with the environment variables
    ``MYSQL_HOST``, ``MYSQL_USER``, ``MYSQL_PASSWORD`` and
    ``MYSQL_DATABASE``; the previous hard-coded values remain the fallbacks.

    Args:
        list_of_details: Iterable of product dicts and/or status strings.

    Returns:
        str: ``"done"`` once the inserts have been committed.
    """
    # Local import keeps this cell self-contained.
    import os

    # NOTE(review): the fallback credentials are kept only for backward
    # compatibility; prefer supplying real credentials via the environment.
    my_db = mysql.connector.connect(
        host=os.environ.get("MYSQL_HOST", "localhost"),
        user=os.environ.get("MYSQL_USER", "sagar"),
        password=os.environ.get("MYSQL_PASSWORD", "password"),
        database=os.environ.get("MYSQL_DATABASE", "Credicxo"),
    )
    insert_query = (
        "INSERT INTO products_details(product_title, product_image, product_price, product_detail) "
        "values(%s, %s, %s, %s)"
    )
    try:
        cur = my_db.cursor()
        try:
            for row in list_of_details:
                if not isinstance(row, dict):
                    print("no data available in row")
                    continue
                cur.execute(insert_query, (
                    str(row.get("product_title")),
                    str(row.get("product_image")),
                    str(row.get("product_price")),
                    str(row.get("product_details")),
                ))
                # Report success only after the insert actually ran.
                print("this row is inserted")
            my_db.commit()
        finally:
            cur.close()
    finally:
        # The connection was previously never closed.
        my_db.close()
    return "done"

In [40]:
# Insert the scraped product dicts into MySQL; non-dict entries are skipped.
export_to_mysql_database(all_products_details)

no data available in row
no data available in row
this row is inserted
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
this row is inserted
this row is inserted
no data available in row
no data available in row
this row is inserted
this row is inserted
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in row
no data available in

'done'

In [38]:
# Inspect the scraped results: status strings for failed links, dicts for successes.
all_products_details

['Not found',
 'Not found',
 {'product_title': 'Old Spice Rasur Creme - 70 G (original) - Packung Von 2',
  'product_image': 'https://images-eu.ssl-images-amazon.com/images/I/41xHbjdSXJL._SY300_SX300_QL70_ML2_.jpg',
  'product_price': '8,00',
  'product_details': [{'Item-Paket Dimensionen L x b x H': '18.3 x 6.7 x 3.9 cm'},
   {'Paketgewicht': '0.18 Kilogramm'},
   {'Nettogewicht des Einzelartikels': '0.15 Pfund'},
   {'Hersteller': 'Old Spice'},
   {'Marke': 'Old Spice'},
   {'Modellnummer': 'SG_000004458X_US'},
   {'Verpackungsabmessungen': '18.29 x 6.71 x 3.91 cm; 68.04 Gramm'},
   {'ASIN': '000004458X'}]},
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 'Not found',
 {'product_title': 'Short Story: Violoncello und Klavier. (Cello-Bibliothek)',
  'product_image': 'https://images-eu.ssl-images-amazon.com/images/I/41zw0783tpL._SX198_BO1,204,203,200_QL40_ML2_.jpg',
 