### Crawler for scraping all the product listings from 'https://www.partsgeek.com/'. This notebook contains the scraper for the product pages only. The links to these products were previously scraped using a different crawler.

In [None]:
from bs4 import BeautifulSoup
from random import randint
#import datetime
import time
import pandas as pd
import requests
import os
import numpy as np
import json
from sqlalchemy import create_engine
import psycopg2
import pandas.io.sql as psql

In [None]:
def get_brand(bb_pad):
    """Return the part manufacturer/brand: the text of the first <span> in the product cell."""
    spans = bb_pad.find_all("span")
    first_span = spans[0]
    return first_span.text

In [None]:
def get_title(bb_pad):
    """Return the listing title: the text of the first ``div.bb_title`` in the product cell."""
    title_divs = bb_pad.find_all("div", class_="bb_title")
    first_title = title_divs[0]
    return first_title.text

In [None]:
def get_features(bb_pad):
    """Collect the feature/notes entries of a product cell as a JSON object string.

    Every ``div.bb_notes`` inside ``bb_pad`` is scanned. Within a notes block
    the i-th ``span.bb_bb`` (the feature header) is paired with the i-th
    ``div`` (the feature line). The JSON key is the header text with
    surrounding colons stripped and spaces replaced by underscores; the value
    is the div text with the header text removed.

    Args:
        bb_pad: parsed product cell exposing ``find_all`` (a BeautifulSoup tag).

    Returns:
        A JSON-encoded object of features; ``"{}"`` when the cell has no notes.
    """
    features = {}

    # Walk every notes block instead of keeping only the last one, and do the
    # pairing inside the loop so a cell without notes yields an empty result
    # rather than failing on unbound variables.
    for notes in bb_pad.find_all("div", class_="bb_notes"):
        headers = notes.find_all("span", class_="bb_bb")
        lines = notes.find_all("div")
        # zip() stops at the shorter sequence, so a header without a matching
        # div is dropped instead of raising IndexError.
        for header, line in zip(headers, lines):
            key = header.text.strip(":").replace(" ", "_")
            features[key] = line.text.replace(header.text, '')

    return json.dumps(features)

In [None]:
def get_images(bb_pad):
    """Return the product image URLs found in the cell's ``div.bb_image`` containers.

    Only ``<img>`` sources ending in "jpg" are kept, and every occurrence of
    'thumb' in the URL is replaced with 'full' to point at the full-size image.
    """
    containers = bb_pad.find_all("div", {"class" : "bb_image"})
    return [
        img['src'].replace("thumb", "full")
        for container in containers
        for img in container.find_all("img")
        if img['src'].endswith("jpg")
    ]

In [None]:
def get_table(bb_b):
    """Extract the car fitment table of a product cell as a JSON string.

    The table is read from the last ``div.bb_pf`` inside ``bb_b`` (when several
    are present only the last one is used). The ``<th>`` texts become the
    column names, and every ``<tr>`` after the first two is turned into one
    record by pairing its ``<td>`` texts with the column names in order.
    Duplicate records are removed.

    Args:
        bb_b: parsed product cell exposing ``find_all`` (a BeautifulSoup tag).

    Returns:
        The records serialised with ``DataFrame.to_json(orient="index")``,
        i.e. ``{"0": {...}, "1": {...}}``; ``"{}"`` when there is no fitment
        container, no header cell or no usable data row.
    """
    containers = bb_b.find_all("div", {"class" : "bb_pf"})
    if not containers:
        # No fitment section at all: report "no fitment" instead of failing on
        # unbound variables.
        return "{}"

    table = containers[-1]
    column_names = [th.get_text() for th in table.find_all("th")]
    if not column_names:
        return "{}"

    records = []
    # The first two rows are skipped - presumably they are heading rows;
    # NOTE(review): confirm against the page markup.
    for row in table.find_all("tr")[2:]:
        cells = row.find_all("td")
        if len(cells) < len(column_names):
            # Row does not carry a value for every column (e.g. a spanning
            # row); skip it rather than abort the whole product page.
            continue
        records.append(
            {name: cell.get_text() for name, cell in zip(column_names, cells)}
        )

    if not records:
        return "{}"

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0, and appending row by row was quadratic anyway.
    fitment = pd.DataFrame(records).drop_duplicates().reset_index(drop=True)
    return fitment.to_json(orient = "index")

In [None]:
def parts_data(soup, link):
    """Scrape every product listed on a page, together with its fitment.

    Each ``td.bb_pad`` cell in ``soup`` becomes one row with the columns
    ``brand``, ``title``, ``price``, ``features``, ``image``, ``category``,
    ``fitment`` and ``link``. ``category`` is the text of the page's first
    ``div#bcrumbs`` and is shared by all rows; ``price`` is the text of the
    first ``span[itemprop=price]`` inside the cell.

    Args:
        soup: parsed page exposing ``find_all`` (a BeautifulSoup document).
        link: URL the page was fetched from; stored verbatim in every row.

    Returns:
        A DataFrame with one row per product cell (empty when the page has no
        product cells).

    Raises:
        IndexError: when the page has no ``div#bcrumbs`` or a product cell has
            no brand, title or price element. This is intentional: such a page
            is not a regular product page and the caller reports it as failed.
    """
    category = soup.find_all("div", {"id" : "bcrumbs"})[0].text

    records = []
    for cell in soup.find_all("td", class_ = "bb_pad"):
        records.append({
            "brand" : get_brand(cell),
            "title" : get_title(cell),
            "price" : cell.find_all("span", {"itemprop" : "price"})[0].text,
            "features" : get_features(cell),
            "image" : get_images(cell),
            "category" : category,
            "fitment" : get_table(cell),
            "link" : link,
        })

    # Build the frame once from all records: DataFrame.append was removed in
    # pandas 2.0.
    return pd.DataFrame(records)

In [None]:
# Read the CSV holding the previously scraped links to all product pages.
# The crawl loop reads the URLs from its `part_links` column.

file = pd.read_csv("part_links.csv")

In [None]:
# Crawl every product page listed in `file`, scrape its parts and append them
# to the SQL table. The connection string, table name and schema name are
# placeholders that must be filled in before running.
engine = create_engine('postgresql://{username}:{password}@{host}:{port}/{database}')
total = len(file)
for i, link in enumerate(file['part_links']):
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(link, timeout=30)
        # Treat HTTP error responses as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        time.sleep(0.1)  # short pause between requests

        all_parts_data_df = parts_data(soup, link)
        all_parts_data_df.to_sql('{inser table name}', con=engine, index=False, if_exists='append', schema = '{insert schema name}') #insert into sql database

        print(i, total)

    except Exception as exc:
        # Best effort: report the failed page and carry on with the next one.
        # Catching Exception (not a bare except) lets Ctrl-C / SystemExit stop
        # the crawl.
        print(link) # print link of failed page
        print("    failed:", repr(exc))