In [1]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pandas as pd
from splinter import Browser
import os
import numpy as np
from sqlalchemy import create_engine
from config import username, password

In [2]:
# URL of page to be scraped
#url = 'https://www.walgreens.com/store/store/category/productlist.jsp?N=520931&Eon=520931'


In [4]:
# Read HTML from file
filepath = os.path.join("..", "Resources", "walgreens.html")
with open(filepath) as file:
    wal_html = file.read()

In [5]:
executable_path = {'executable_path': "chromedriver.exe"}
browser = Browser('chrome', **executable_path, headless=False)

In [6]:
# Create beautifulsoup object
html = browser.html
soup = BeautifulSoup(wal_html, 'html.parser')

In [7]:
# <div class="brand" id="prodBrandNamecompare_sku6253945" data-reactid="468">PayDay</div>
# "brand"
#<input type="checkbox" id="filter_353669" tabindex="0" value="353669" name="353669" title="Sour Punch(2)">
results = soup.find_all("div", class_="item card card__product")
print(results)

t store</span></div><div class="avail-cta" id="not-available-for-shippingcompare_sku6151899" name="not-available-for-shipping compare_sku6151899"><!-- react-text: 4398 -->Not available for shipping<!-- /react-text --><span class="sr-only">Not available for shipping</span></div><button class="btn btn__blue" id="pickup-ship-btncompare_sku6151899" name="pickup-ship-btn"><!-- react-text: 4401 -->Pick up<!-- /react-text --><span class="sr-only">will open overlay</span></button></div></div><span aria-live="polite" class="sr-only" id="ownbrandAnnounce"></span></div>, <div class="item card card__product" id="productcard300400854"><div aria-hidden="true" id="productcardheight300400854" style="text-align: center;"><a href="/store/c/haribo-sweet-%26-scary-mix-assorted,-laydown-bag/ID=300400854-product" id="productOmniSelectcompare_400622247" tabindex="-1"><figure class="product__img"></figure><span class="sr-only"><!-- react-text: 4410 -->Haribo Sweet &amp; Scary Mix Assorted, Laydown Bag<!-- /re

In [8]:
len(results)

72

In [9]:
ratings_list = []
# loop through each result and scrape brand name, description and rating
for result in results:
    candy_ratings = {}
    candy_ratings['brand'] = result.find('div', class_='brand').text
    candy_ratings['description'] = result.find('strong', class_='description').text
    # We are using a try and except block to convert results that did not have a rating or review to 'NaN'
    try:
        candy_ratings['rating'] = result.find("span", class_="product__rating").img["alt"]
        
    except AttributeError:
        candy_ratings['rating'] = np.nan
    ratings_list.append(candy_ratings)
ratings_list

[{'brand': 'PayDay',
  'description': 'Snack Size Peanut Caramel Bars Peanut Caramel',
  'rating': '4.6 out of 5 total 12 reviews'},
 {'brand': 'Snickers',
  'description': 'Peanut Butter Squared Fun Size Candy',
  'rating': '5.0 out of 5 total 1 reviews'},
 {'brand': 'Tootsie Roll',
  'description': 'Caramel Apple Pops Green Apple',
  'rating': '5.0 out of 5 total 6 reviews'},
 {'brand': 'Milky Way',
  'description': 'Chocolate Candy Bar Fun Size',
  'rating': '4.9 out of 5 total 44 reviews'},
 {'brand': "Reese's",
  'description': 'Snack Size Peanut Butter Cups',
  'rating': '5.0 out of 5 total 3 reviews'},
 {'brand': 'Mounds',
  'description': 'Snack Size Candy Bars Dark Chocolate Coconut Filled',
  'rating': '4.8 out of 5 total 16 reviews'},
 {'brand': "Hershey's",
  'description': 'All Time Greats Snack Size Assortment, 30 Pieces',
  'rating': '3.9 out of 5 total 37 reviews'},
 {'brand': 'Kisses',
  'description': 'Gold with Pretzel Bits Candy',
  'rating': '5.0 out of 5 total 1 r

In [10]:
# Using our list of dictionaries, create a dataframe
df = pd.DataFrame(ratings_list)
# drop any results that did not have a rating
df = df.dropna(how='any')
df.head()

Unnamed: 0,brand,description,rating
0,PayDay,Snack Size Peanut Caramel Bars Peanut Caramel,4.6 out of 5 total 12 reviews
1,Snickers,Peanut Butter Squared Fun Size Candy,5.0 out of 5 total 1 reviews
2,Tootsie Roll,Caramel Apple Pops Green Apple,5.0 out of 5 total 6 reviews
3,Milky Way,Chocolate Candy Bar Fun Size,4.9 out of 5 total 44 reviews
4,Reese's,Snack Size Peanut Butter Cups,5.0 out of 5 total 3 reviews


In [11]:

for index, row in df.iterrows():
    # extract the individual rating
    rating = row['rating'][0:3]
    df.loc[index, 'individual_rating'] = float(rating)
    # extract number of reviews    
    count = [int(i) for i in row['rating'].split() if i.isnumeric()]
    df.loc[index, 'review'] = count[1]
df.head()

Unnamed: 0,brand,description,rating,individual_rating,review
0,PayDay,Snack Size Peanut Caramel Bars Peanut Caramel,4.6 out of 5 total 12 reviews,4.6,12.0
1,Snickers,Peanut Butter Squared Fun Size Candy,5.0 out of 5 total 1 reviews,5.0,1.0
2,Tootsie Roll,Caramel Apple Pops Green Apple,5.0 out of 5 total 6 reviews,5.0,6.0
3,Milky Way,Chocolate Candy Bar Fun Size,4.9 out of 5 total 44 reviews,4.9,44.0
4,Reese's,Snack Size Peanut Butter Cups,5.0 out of 5 total 3 reviews,5.0,3.0


In [12]:
# group by the brand to see the average brand rating and review
brands_df = df[['brand', 'individual_rating', 'review']].groupby('brand').mean()
brands_df = brands_df.rename(columns={'individual_rating': "Avg_Rating", "review": "Avg_Num_Reviews"})
brands_df

Unnamed: 0_level_0,Avg_Rating,Avg_Num_Reviews
brand,Unnamed: 1_level_1,Unnamed: 2_level_1
3 Musketeers,4.6,16.0
Airheads,4.0,2.0
Almond Joy,4.1,24.0
Blow Pop,5.0,2.0
Dubble Bubble,3.8,4.0
Dum Dums,5.0,2.0
Hershey's,4.68,35.4
Kathy Kaye,3.0,2.0
Kisses,5.0,1.0
Kit Kat,3.6,6.5


In [15]:
# Create connection to postgresql
rds_connection_string = f"{username}:{password}@localhost:5432/Web_Scraping_Challenge"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [16]:
# Use pandas to load csv converted DataFrame into database
df.to_sql(name='walgreens_candy', con=engine, if_exists='append', index=False)

In [22]:
# verify table was loaded successfully
pd.read_sql_query('select * from walgreens_candy', con=engine).head()

Unnamed: 0,brand,description,rating,individual_rating,review
0,PayDay,Snack Size Peanut Caramel Bars Peanut Caramel,4.6 out of 5 total 12 reviews,4.6,12.0
1,Snickers,Peanut Butter Squared Fun Size Candy,5.0 out of 5 total 1 reviews,5.0,1.0
2,Tootsie Roll,Caramel Apple Pops Green Apple,5.0 out of 5 total 6 reviews,5.0,6.0
3,Milky Way,Chocolate Candy Bar Fun Size,4.9 out of 5 total 44 reviews,4.9,44.0
4,Reese's,Snack Size Peanut Butter Cups,5.0 out of 5 total 3 reviews,5.0,3.0


In [28]:
# Use pandas to brands DataFrame into database
brands_df.to_sql(name='walgreens_brand_avg', con=engine, if_exists='append', index=True)

In [29]:
# verify table was loaded successfully
pd.read_sql_query('select * from walgreens_brand_avg', con=engine).head()

Unnamed: 0,brand,Avg_Rating,Avg_Num_Reviews
0,3 Musketeers,4.6,16.0
1,Airheads,4.0,2.0
2,Almond Joy,4.1,24.0
3,Blow Pop,5.0,2.0
4,Dubble Bubble,3.8,4.0
