## Web Scraping with Selenium
_Author: Rachel Koenig_
_____

You may need to install selenium if you haven't already.  Just uncomment the below cell to do so. 

In [1]:
#pip install selenium

Imports 

In [2]:
import json
import pandas as pd
import gzip

from bs4 import BeautifulSoup
from requests import get
import requests

from selenium import webdriver
import time

from random import randint
from time import sleep

The below code is needed to unzip and read in a json zip file.  
The dataset is not shared on GitHub because it is larger than the allowed max size, but it can be downloaded here by clicking '5-core' next to Clothing, Shoes and Jewelry.  Then change the getDF file path to wherever you save the download.   
http://jmcauley.ucsd.edu/data/amazon/index.html  

In [3]:
def parse(path):
    """Yield one record per non-blank line of a gzip-compressed file.

    Each line is expected to hold a single Python literal (the Amazon review
    dump stores one dict per line, written with Python quoting rather than
    strict JSON).

    Parameters
    ----------
    path : str
        Location of the ``.gz`` file to read.

    Yields
    ------
    object
        The literal parsed from each line (a dict for the review dump).

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a plain Python literal.
    """
    # Local import keeps this cell self-contained.
    import ast

    # The context manager closes the file once the generator is exhausted or closed.
    with gzip.open(path, 'rb') as g:
        for raw_line in g:
            line = raw_line.decode('utf-8').strip()
            if not line:
                continue
            # literal_eval only accepts literals, so a crafted line in the
            # downloaded file cannot execute code the way eval() would.
            yield ast.literal_eval(line)

def getDF(path):
    """Load every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file, and that
    position becomes the row index of the returned frame.
    """
    records_by_position = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records_by_position, orient='index')

df = getDF('data/reviews_Clothing_Shoes_and_Jewelry_5.json.gz')

In [4]:
# Check the first 60 rows 
df.head(60)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A1KLRMWW2FWPL4,0000031887,"Amazon Customer ""cameramom""","[0, 0]",This is a great tutu and at a really great pri...,5.0,Great tutu- not cheaply made,1297468800,"02 12, 2011"
1,A2G5TCU2WDFZ65,0000031887,Amazon Customer,"[0, 0]",I bought this for my 4 yr old daughter for dan...,5.0,Very Cute!!,1358553600,"01 19, 2013"
2,A1RLQXYNCMWRWN,0000031887,Carola,"[0, 0]",What can I say... my daughters have it in oran...,5.0,I have buy more than one,1357257600,"01 4, 2013"
3,A8U3FAMSJVHS5,0000031887,Caromcg,"[0, 0]","We bought several tutus at once, and they are ...",5.0,"Adorable, Sturdy",1398556800,"04 27, 2014"
4,A3GEOILWLK86XM,0000031887,CJ,"[0, 0]",Thank you Halo Heaven great product for Little...,5.0,Grammy's Angels Love it,1394841600,"03 15, 2014"
5,A27UF1MSF3DB2,0000031887,"C-Lo ""Cynthia""","[0, 0]",I received this today and I'm not a fan of it ...,4.0,It's ok,1396224000,"03 31, 2014"
6,A16GFPNVF4Y816,0000031887,design maven,"[0, 0]",Bought this as a backup to the regular ballet ...,5.0,Great for dress-up and for ballet practice,1399075200,"05 3, 2014"
7,A2M2APVYIB2U6K,0000031887,Jamie P.,"[0, 0]",Great tutu for a great price. It isn't a &#34;...,5.0,Great value,1356220800,"12 23, 2012"
8,A1NJ71X3YPQNQ9,0000031887,JBerger,"[0, 0]","My daughter liked this, and it with her costum...",4.0,Good,1384041600,"11 10, 2013"
9,A3EERSWHAI6SO,0000031887,"Jeffrey Hollingshead ""Jillian hollingshead""","[7, 8]",For what I paid for two tutus is unbeatable an...,5.0,WOW !! ..is all I have to say!,1349568000,"10 7, 2012"


### EDA

In [5]:
#Check size of dataset: 278,677 rows & 9 columns. 
df.shape

(278677, 9)

In [6]:
# Check columns names 
df.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [7]:
# Check out an individual product ID
df[df['asin'] == 'B00971NLBS']

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
224886,A1J4GHI1VRLCX9,B00971NLBS,Alya,"[1, 1]",I will be purchasing more of this brand of shi...,5.0,Classic,1374624000,"07 24, 2013"
224887,A21UXFFQGQS4YJ,B00971NLBS,"Anne Harris ""Lady H""","[0, 0]",I found it to be a little roomy but it will be...,5.0,nice dress shirt,1393545600,"02 28, 2014"
224888,A22Z5LSEIPN8DU,B00971NLBS,Ashleyy,"[0, 0]","I ordered this shirt in a light blue, size two...",2.0,"Wrong color, wrong size.",1372291200,"06 27, 2013"
224889,A3SP43WB89B2UE,B00971NLBS,"cecilafayette ""ceci""","[0, 0]","This shirt fits well, looks nice, and is a gre...",4.0,Great Little Basic Shirt,1388102400,"12 27, 2013"
224890,A1Q3MDPIS017JE,B00971NLBS,Kay Z,"[0, 0]","The material is a little thin, but that was ex...",4.0,"Good fit, little thin",1400284800,"05 17, 2014"
224891,A1BCJXOINBPB38,B00971NLBS,"lascrucesgirl ""Growing boy's momma""","[3, 4]",I did not use the tie for my son. It was dress...,5.0,Nicely made and looks sharp.,1360627200,"02 12, 2013"
224892,A1AFORE8L2LGGW,B00971NLBS,mandamommyof3,"[5, 5]",This is adorable! I would buy more if the tie...,4.0,Tie is too short,1356652800,"12 28, 2012"
224893,A22E4B018G6NX7,B00971NLBS,RDeering,"[0, 0]",Nice little shirt I couldn't find elsewhere th...,5.0,can't complain,1388966400,"01 6, 2014"


In [8]:
#Check how many unique items are in the data 
len(df['asin'].unique())

23033

In [9]:
#Created a non-repeating list of product ID #s with the intent of looping through them in the url to scrape. 
asin_set = set(df['asin'])
asin_set = list(asin_set)

In [10]:
# Sort the list in alphabetical order so that it is the same every time
asin_set.sort()

In [11]:
# Look at the first 10 objects in the list 
asin_set[:10]

['0000031887',
 '0123456479',
 '1608299953',
 '1617160377',
 'B00001W0KA',
 'B00001WRHJ',
 'B00004SR8W',
 'B00004SR8Z',
 'B00004SR9P',
 'B00004U1J2']

In [12]:
#Check how many unique users are in the data. 
len(df['reviewerID'].unique())

39387

### Scrape additional data using Selenium

In [13]:
# Instantiate a Chrome webdriver using the chromedriver binary at a path relative to this notebook.
# NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of a `Service` object —
# confirm the installed Selenium version still accepts this argument.
driver = webdriver.Chrome(executable_path="../chromedriver/macos/chromedriver") 

NOTE: Below scraper is set to only loop through the first 20 items in the asin list, but can be changed to any amount.

In [14]:
# Set start time and print it on the first row
t1 = time.time()
print(t1)

# One dict per scraped product; turned into a DataFrame after the loop.
list_of_dicts = []

# Loop through the product IDs (first 20 only; widen the slice to scrape more).
for i in asin_set[:20]:

    # Build the product URL once, print it for progress, then load it.
    url = f'https://www.amazon.com/gp/product/{i}'
    print(url)
    driver.get(url)

    # Parse the rendered page.
    soup = BeautifulSoup(driver.page_source, 'lxml')

    # The product ID identifies which product the scrape belongs to.
    product_info = {'asin': i}

    try:
        # Title, description and category are required: a missing tag makes
        # soup.find() return None, and `.text` then raises, which sends the
        # row to the fallback below.
        product_info['name'] = soup.find('span', {'id': 'productTitle'}).text.strip()
        product_info['description'] = soup.find('ul', {'class': 'a-unordered-list a-vertical a-spacing-none'}).text.strip()
        product_info['category'] = soup.find('ul', {'class': 'a-unordered-list a-horizontal a-size-small'}).text.strip()

        # Size and the catch-all details are optional and stored as raw tags
        # (None / empty list when absent); they are cleaned later.
        product_info['size'] = soup.find('div', {'id': 'variation_size_name'})
        product_info['details'] = soup.find_all('td', {'class': 'a-span7 a-size-base'})

        # There are 3 possible tags for colour; take the first one that
        # matches, otherwise label the colour as unavailable. `or` evaluates
        # lazily, so each lookup runs at most once.
        product_info['color'] = (
            soup.find('div', {'id': 'variation_color_name'})
            or soup.find_all('span', {'class': 'selection'})
            or soup.find_all('div', {'class': 'a-row a-spacing-micro'})
            or 'unavailable'
        )

    # A page that could not be scraped (error page, robot check, changed
    # layout) is recorded as unknown instead of stopping the run.
    # `Exception` rather than a bare `except` so Ctrl-C / kernel interrupt
    # still stops a long scrape.
    except Exception:
        product_info = {
            'asin': i,
            'name': 'unknown product name',
            'description': 'unknown details',
            'category': 'unknown category',
            'color': 'unknown color',
        }
        list_of_dicts.append(product_info)
        # On a failed page, print how many scrapes have been collected so far.
        print(len(list_of_dicts))
    else:
        list_of_dicts.append(product_info)

    # Sleep 1 or 2 seconds in between scrapes so Amazon doesn't think we're a robot.
    sleep(randint(1, 2))


# Print final time minus start time to see how long the scraper took to run.
print(time.time() - t1)

1566771396.386451
https://www.amazon.com/gp/product/0000031887
https://www.amazon.com/gp/product/0123456479
https://www.amazon.com/gp/product/1608299953
https://www.amazon.com/gp/product/1617160377
https://www.amazon.com/gp/product/B00001W0KA
https://www.amazon.com/gp/product/B00001WRHJ
https://www.amazon.com/gp/product/B00004SR8W
https://www.amazon.com/gp/product/B00004SR8Z
https://www.amazon.com/gp/product/B00004SR9P
https://www.amazon.com/gp/product/B00004U1J2
https://www.amazon.com/gp/product/B000051SEN
https://www.amazon.com/gp/product/B000051SEP
https://www.amazon.com/gp/product/B00005JHKE
https://www.amazon.com/gp/product/B00005JSBK
https://www.amazon.com/gp/product/B00005KJXN
https://www.amazon.com/gp/product/B00005TQI7
16
https://www.amazon.com/gp/product/B0000643Q8
https://www.amazon.com/gp/product/B000067R84
https://www.amazon.com/gp/product/B00006I551
https://www.amazon.com/gp/product/B00006XXGO
79.44853019714355


Check the length of the scrape 

In [15]:
len(list_of_dicts)

20

Check the number of "unknown" rows to make sure it's low.

In [16]:
pd.DataFrame(list_of_dicts)['name'].value_counts().head()

SUUNTO X-Lander Wrist-Top Computer Watch with Altimeter, Barometer, Compass, and Chronograph    2
Spider-Man Child Standard Costume                                                               1
Woody Deluxe Child - Size: Child S(4-6)                                                         1
Disney Buzz Lightyear Toy Story 4 Boys' Inflatable Jet Pack                                     1
Learn Italian: Rosetta Stone Italian - Level 1                                                  1
Name: name, dtype: int64

Save first 8000 scrapes to a DataFrame and then to a csv

In [17]:
# product8000 = pd.DataFrame(list_of_dicts)

# product8000.to_csv('product8000.csv')

Saved next 534 scrapes to a DataFrame and then to a csv

In [18]:
# product8534 = pd.DataFrame(list_of_dicts)

# product8534.to_csv('data/product8534')

Saved next 6000 scrapes to a DataFrame and then to a csv

In [19]:
# product14000 = pd.DataFrame(list_of_dicts)

# product14000.to_csv('product14000.csv')

Saved another 6000 scrapes to a DataFrame and then to a csv

In [20]:
# product20000 = pd.DataFrame(list_of_dicts)

# product20000.to_csv('product20000.csv')

Saved next 1180 scrapes to a DataFrame and then to a csv

In [21]:
# product21180 = pd.DataFrame(list_of_dicts)

# product21180.to_csv('product21180.csv')

Saved last section of scrapes to a DataFrame and then to a csv


In [22]:
# product23033 = pd.DataFrame(list_of_dicts)

# product23033.to_csv('product23033.csv')

Many of the above scrapes were blocked by an anti-robot page, so I'll need to go through and check for unavailable rows and make a new list of asins to rescrape.

In [23]:
# Function to remove html
def remove_html(text):
    """Return the visible text of an HTML fragment with the markup removed.

    Parameters
    ----------
    text : str
        HTML markup to strip.

    Returns
    -------
    str
        The whitespace-stripped text content, or the placeholder "missing"
        when ``text`` cannot be parsed (for example a NaN cell).
    """
    try:
        return BeautifulSoup(text, 'lxml').get_text(strip=True)
    # `Exception` rather than a bare `except`, so a keyboard interrupt during
    # a long `.apply` is not silently converted into "missing".
    except Exception:
        return "missing"

In [24]:
# Function to clean the DataFrame
def EDA(df):
    """Clean the scraped product columns of ``df`` in place and return a preview.

    Expects the columns 'color', 'category', 'description', 'details' and
    'size'. The breadcrumb in 'category' is split on '›' into 'department',
    'demographic', 'division', 'category', 'subcategory', 'type' and
    'detail_type'; note that 'category' is overwritten with the fourth level.

    Parameters
    ----------
    df : pandas.DataFrame
        Scraped product data. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        ``df.head()`` after cleaning.
    """
    def _drop(series, *tokens):
        """Remove each literal token, in order, from every string in ``series``."""
        for token in tokens:
            # regex=False: '[' and ']' must be treated as plain characters
            # regardless of the pandas version's default for `regex`.
            series = series.str.replace(token, '', regex=False)
        return series

    # Remove html and strip off brackets and 'Color:' from color column
    df['color'] = _drop(df['color'].apply(remove_html), ']', '[', 'Color:')

    # Remove newlines and double spaces from the category breadcrumb
    df['category'] = _drop(df['category'], '\n', '  ')

    # Split the breadcrumb on '›' into at most 7 levels. reindex guarantees
    # columns 0..6 exist (filled with NaN) even when no row is that deep,
    # so the lookups below cannot raise KeyError.
    category = df['category'].str.split("›", n=6, expand=True).reindex(columns=range(7))

    # Name each breadcrumb level and add it onto the original df
    df['department'] = category[0]
    df['demographic'] = category[1]
    df['division'] = category[2]
    df['category'] = category[3]
    df['subcategory'] = category[4]
    df['type'] = category[5]
    df['detail_type'] = category[6]

    # Remove tabs and newlines from the description column
    df['description'] = _drop(df['description'], '\t', '\n')

    # Remove html and strip off brackets from details column
    df['details'] = _drop(df['details'].apply(remove_html), ']', '[')

    # Remove html and cut off 'Size:' from the size column
    df['size'] = _drop(df['size'].apply(remove_html), 'Size:')

    return df.head()

In [25]:
product8000 = pd.read_csv('data/product8000.csv')

In [26]:
product20000 = pd.read_csv('data/product20000.csv')

In [27]:
product21180 = pd.read_csv('data/product21180.csv')

In [28]:
product23033 = pd.read_csv('data/product23033.csv')

In [29]:
# A list of DataFrames to be concatenated.
# NOTE(review): the product8534 and product14000 saves described earlier in this notebook
# are not loaded or included here — confirm that is intentional.
dataframes = [product8000, product20000, product21180, product23033]

# Concat all 4 DataFrames into one, along the row axis.
# No ignore_index is passed, so each frame keeps its own row labels and labels repeat across frames.
total_product = pd.concat(dataframes, axis=0)

In [30]:
total_product.shape

(17033, 8)

In [31]:
EDA(total_product)

Unnamed: 0.1,Unnamed: 0,asin,category,color,description,details,name,size,department,demographic,division,subcategory,type,detail_type
0,0,0000031887,Dance,unavailable,This fits your . Make sure this fitsby ent...,,Mystiqueshapes Girls Ballet Tutu Neon Lime Green,missing,Sports & Outdoors,Sports & Fitness,Other Sports,Clothing,Girls,Skirts
1,1,0123456479,Jewelry Boxes & Organizers,unavailable,This fits your . Make sure this fitsby ent...,,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,missing,"Clothing, Shoes & Jewelry","Shoe, Jewelry & Watch Accessories",Jewelry Accessories,Jewelry Boxes,,
2,2,1608299953,,unavailable,Access for up to 5 family members Download act...,,Learn French: Rosetta Stone French - Level 1,missing,Software,Education & Reference,Languages,,,
3,3,1617160377,,unavailable,Access for up to 5 family members Download act...,,Learn Italian: Rosetta Stone Italian - Level 1,missing,Software,Education & Reference,Languages,,,
4,4,B00001W0KA,,Buzz Lightyear,Lead Free Child (4-6 & 7-8) Includes: Bodysuit...,,Buzz Lightyear Boy's Deluxe Toy Story Costume,SelectM(3T-4T)3T-4TSmall (4-6)Medium (7-8)Medi...,Toys & Games,Dress Up & Pretend Play,Costumes,,,


Create a dataframe for only rows that came back unknown.

In [32]:
redo_df = total_product[total_product['department'] == 'unknown category']

In [33]:
# Check the shape 
redo_df.shape

(12009, 14)

In [34]:
# Check for nulls 
redo_df.isnull().sum()

Unnamed: 0         0
asin               0
category       12009
color              0
description        0
details            0
name               0
size               0
department         0
demographic    12009
division       12009
subcategory    12009
type           12009
detail_type    12009
dtype: int64

In [35]:
redo_list = list(redo_df['asin'])
redo_list[:10]

['B00005TQI7',
 'B0000722HK',
 'B00007GD9I',
 'B0000864CZ',
 'B0000866JI',
 'B0000867AN',
 'B0000867GG',
 'B0000867ON',
 'B00008695M',
 'B0000869QI']

In [36]:
len(redo_list)

12009

Use `redo_list` to run another round of scrapes in case some were blocked by anti-robot pages on the first try.

Save redo scrapes to a DataFrame and then all rows that are not unknown to a csv

In [37]:
# redo2000 = pd.DataFrame(list_of_dicts)

#(redo2000[redo2000['department'] != 'unknown category']).to_csv('data/redo1.csv')

In [38]:
#redo6000 = pd.DataFrame(list_of_dicts)

# (redo6000[redo6000['department'] != 'unknown category']).to_csv('data/redo2.csv')

In [39]:
#redo6455 = pd.DataFrame(list_of_dicts)

# (redo6455[redo6455['department'] != 'unknown category']).to_csv('data/redo3.csv')

In [40]:
# Save rescrape to a DataFrame 
# redo8528 = pd.DataFrame(list_of_dicts)

# Save the cleaned data to a csv
# redo8528.to_csv('redo_list1.csv')

# Read in the cleaned data & check shape
redo8528 = pd.read_csv('scrapes/redo_list1.csv', index_col=[0])
redo8528.shape

(2073, 13)

In [41]:
# Use function to clean rescrape 
# EDA(redo8528)

In [42]:
# Check value_counts of category column to see if there are any noticeable trends.
redo8528['category'].value_counts(dropna=False).head(10)

NaN                           1146
 Athletic                       98
 Sandals                        70
 Lingerie, Sleep & Lounge       55
Wrist Watches                   39
 Boots                          35
Pumps                           30
Flats                           30
 Earrings                       29
 Tops, Tees & Blouses           26
Name: category, dtype: int64

In [43]:
# Check how many product names were collected  
(redo8528[redo8528['name'] != 'unknown product name']).shape

(950, 13)

In [44]:
# Save only rows that have department values to a new csv to be joined on to the clean data in a different notebook.
# (redo8528[redo8528['department'] != 'unknown category']).to_csv('data/redo4.csv')

In [45]:
# Check if number of categories collected matches names
(redo8528[redo8528['department'] != 'unknown category']).shape

(950, 13)

In [46]:
# Saved rescrape 
#redo10000 = pd.DataFrame(list_of_dicts)

In [47]:
# Used function to clean rescraped data
# EDA(redo10000)

In [48]:
# Save known column names to a csv
# (redo10000[redo10000['name'] != 'unknown product name']).to_csv('data/redo5.csv')

In [49]:
# Save clean rescraped data to a csv
#redo10000.to_csv('redo_list2.csv')

redo10000 = pd.read_csv('scrapes/redo_list2.csv', index_col=[0])
redo10000.head()

Unnamed: 0,asin,category,color,description,details,name,size,department,demographic,division,subcategory,type,detail_type
0,B009L4OGDS,Earrings,unavailable,6mm Crystal Ball Stud Earrings Plain Post and ...,missing,Sterling Silver Pink Crystal Ball Stud Earrings,missing,"Clothing, Shoes & Jewelry",Women,Jewelry,Stud,,
1,B009L5081Q,Novelty,unavailable,Size: One size only (small) Total Length: Abou...,missing,Ninimour- Fashion Trendy Women's Stretchy Legg...,missing,"Clothing, Shoes & Jewelry",Novelty & More,Clothing,Women,Leggings,
2,B009L53QXI,Boots,missing,Synthetic Made in USA or Imported Synthetic so...,missing,Dark Tan Faux Leather Bold Red Back Zipper Buc...,missing,"Clothing, Shoes & Jewelry",Women,Shoes,Mid-Calf,,
3,B009L5YU7O,Shoulder Bags,missing,Leathette Imported fabric lining Snap closure ...,missing,MG Collection Yelena Top Handle Soft Hobo Shou...,missing,"Clothing, Shoes & Jewelry",Women,Handbags & Wallets,,,
4,B009LB8CX6,Active,missing,Made in USA or Imported Merino wool blend Fibe...,missing,Kirkland Signature Outdoor Trail Sock Merino W...,missing,"Clothing, Shoes & Jewelry",Women,Clothing,Athletic Socks,,


In [50]:
(redo10000[redo10000['name'] == 'unknown product name']).head()

Unnamed: 0,asin,category,color,description,details,name,size,department,demographic,division,subcategory,type,detail_type
8,B009LEFPAQ,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
10,B009LEFR4A,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
11,B009LEFS26,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
12,B009LEG2EO,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
13,B009LEG3XO,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
16,B009LM7BWS,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
17,B009LNATJY,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
22,B009LZTK10,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
24,B009M8UF38,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,
28,B009MAW0A2,,unknown color,unknown details,missing,unknown product name,missing,unknown category,,,,,


In [51]:
# ASINs whose department is still 'unknown category' in either rescrape,
# redo8528 rows first, then redo10000 rows.
redo_list2 = [
    asin
    for frame in (redo8528, redo10000)
    for asin in frame.loc[frame['department'] == 'unknown category', 'asin']
]

len(redo_list2)


2099

In [52]:
redo_list2[:25]

['B007WA2TCS',
 'B007WA2U7W',
 'B007WA2VB2',
 'B007WA2ZTA',
 'B007WA3558',
 'B007WA38IM',
 'B007WA38OG',
 'B007WA397M',
 'B007WA3B5C',
 'B007WA3CUQ',
 'B007WA3GMK',
 'B007WA3HLK',
 'B007WA3JE0',
 'B007WA3PD0',
 'B007WA48E0',
 'B007WA49DA',
 'B007WA4CQ4',
 'B007WACT6Y',
 'B007WACX3I',
 'B007WACY5K',
 'B007WAD2QU',
 'B007WADBUC',
 'B007WADJ7M',
 'B007WADN4G',
 'B007WADRBK']

In [53]:
#redo12009 = pd.DataFrame(list_of_dicts)

In [54]:
#EDA(redo12009)

In [55]:
#(redo12009[redo12009['name'] != 'unknown product name']).to_csv('data/redo6.csv')

In [56]:
#redo12009.to_csv('redo_list3.csv')

In [57]:
redo12009 = pd.read_csv('scrapes/redo_list3.csv', index_col=[0])
redo12009.head()

Unnamed: 0,asin,description,name
0,B007WA2TCS,Currently unavailable.\n \n \n ...,Allegra K Women V Neck Sleeveless Striped Pane...
1,B007WA2U7W,,unknown product name
2,B007WA2VB2,Currently unavailable.\n \n \n ...,Allegra K Women Scoop Neck Drawstring Mid-Calf...
3,B007WA2ZTA,Currently unavailable.\n \n \n ...,Allegra K Women Peter Pan 3/4 Sleeve Summer To...
4,B007WA3558,Currently unavailable.\n \n \n ...,Allegra K Lady Ruffle Tie Neck Short Sleeves B...


In [58]:
redo12009['name'].value_counts().head()

unknown product name                                                                                              1864
uiphgjwexzv                                                                                                          5
HONB Money Clip Credit Card Holder                                                                                   2
Metal Factory Sterling Silver Crystal Ball Bead Charm Fits Pandora Chamilia Biagi Trollbeads European Bracelet       2
Sterling Silver Green Crystal Ball Bead Charm                                                                        2
Name: name, dtype: int64

In [59]:
# Append the ASINs that still have 'unknown product name' in the redo12009 scrape.
# NOTE(review): this mutates redo_list2 in place, so re-running the cell appends the same ASINs again.
# The first rows of redo12009 match the head of redo_list2, so these ASINs are probably already
# in the list — confirm that the duplicates are intended.
redo_list2.extend(list((redo12009[redo12009['name'] == 'unknown product name'])['asin']))


In [60]:
len(redo_list2)

3963

After scraping all 23000+ items, I noticed some pages that seemed valid were not getting picked up by the scraper. I picked out a few individually to inspect their html and tried rescraping with the below tags.

In [61]:
# Set start time and print it on the first row
t1 = time.time()
print(t1)

# One dict per re-scraped product.
list_of_dicts = []

# Retry the first 10 still-unknown products using alternative title/availability tags.
for i in redo_list2[:10]:
    # Build the product URL once, print it for progress, then load it.
    url = f'https://www.amazon.com/gp/product/{i}'
    print(url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'lxml')

    try:
        # A missing tag makes soup.find() return None, and `.text` then
        # raises, which sends the row to the fallback below.
        product_info = {
            'asin': i,
            'name': soup.find('h1', {'id': 'title'}).text.strip(),
            'description': soup.find('div', {'id': 'availability_feature_div'}).text.strip(),
        }
    # `Exception` rather than a bare `except` so Ctrl-C / kernel interrupt
    # still stops the scrape.
    except Exception:
        product_info = {'asin': i, 'name': 'unknown product name'}
        list_of_dicts.append(product_info)
        # On a failed page, print how many scrapes have been collected so far.
        print(len(list_of_dicts))
    else:
        list_of_dicts.append(product_info)

    # Sleep 1 or 2 seconds between requests.
    sleep(randint(1, 2))


# Elapsed time in seconds.
print(time.time() - t1)

1566771501.184383
https://www.amazon.com/gp/product/B007WA2TCS
https://www.amazon.com/gp/product/B007WA2U7W
2
https://www.amazon.com/gp/product/B007WA2VB2
https://www.amazon.com/gp/product/B007WA2ZTA
https://www.amazon.com/gp/product/B007WA3558
https://www.amazon.com/gp/product/B007WA38IM
6
https://www.amazon.com/gp/product/B007WA38OG
7
https://www.amazon.com/gp/product/B007WA397M
8
https://www.amazon.com/gp/product/B007WA3B5C
9
https://www.amazon.com/gp/product/B007WA3CUQ
10
24.65636110305786


Check length of scrape.

In [62]:
#redo_new = pd.DataFrame(list_of_dicts)

In [63]:
pd.DataFrame(list_of_dicts)['name'].value_counts()

unknown product name                                                                               6
Allegra K Women Scoop Neck Drawstring Mid-Calf Dress Heather Grey S                                1
Allegra K Women Peter Pan 3/4 Sleeve Summer Top Chiffon Blouse,X-Small / US 2,Pink                 1
Allegra K Lady Ruffle Tie Neck Short Sleeves Button Down Shirt Light Yellow XS                     1
Allegra K Women V Neck Sleeveless Striped Panel Flared Short Skater Dresses, Dark Blue, X-Small    1
Name: name, dtype: int64

In [64]:
#redo_new.to_csv('redo_list3.csv')

In [65]:
redo_new = pd.read_csv('scrapes/redo_list3.csv', index_col=[0]).drop(columns='description')
redo_new.head()

Unnamed: 0,asin,name
0,B007WA2TCS,Allegra K Women V Neck Sleeveless Striped Pane...
1,B007WA2U7W,unknown product name
2,B007WA2VB2,Allegra K Women Scoop Neck Drawstring Mid-Calf...
3,B007WA2ZTA,Allegra K Women Peter Pan 3/4 Sleeve Summer To...
4,B007WA3558,Allegra K Lady Ruffle Tie Neck Short Sleeves B...


In [66]:
# redo_new[redo_new['name'] != 'unknown product name'].to_csv('data/redo7.csv')