In [311]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup


In [312]:
# Download the Starbucks store-count page. The timeout stops a stalled
# connection from hanging the kernel, and raise_for_status() surfaces HTTP
# errors here instead of letting an error page reach the HTML parser.
starbucks_response = requests.get(
    "https://worldpopulationreview.com/state-rankings/starbucks-stores-by-state",
    timeout=30,
)
starbucks_response.raise_for_status()

In [313]:
# Parse the raw response bytes with Python's built-in HTML parser.
starbucks_soup = BeautifulSoup(
    markup=starbucks_response.content,
    features="html.parser",
)

In [314]:
# The store counts are read from the first <table> element on the page.
starbucks_tables = starbucks_soup.find_all("table")
table = starbucks_tables[0]
table

<table class="wpr-table"><thead class="table-head bg-wpr-table_header_bg relative z-40 px-0 py-0 text-xs uppercase tracking-wider"><tr><th class="datatable-th bg-wpr-table_header_bg text-wpr-table_header border-wpr-table_border sticky top-0 border-b bg-clip-padding px-3 py-2 text-left align-bottom leading-4 md:px-4 pin left-0 z-50 border-l-0"><div class="flex flex-col items-stretch justify-start"><div class="flex flex-row items-center cursor-pointer select-none">State</div></div></th><th class="datatable-th bg-wpr-table_header_bg text-wpr-table_header border-wpr-table_border sticky top-0 border-b bg-clip-padding px-3 py-2 text-left align-bottom leading-4 md:px-4 z-40"><div class="flex flex-col items-stretch justify-start"><div class="flex flex-row items-center cursor-pointer select-none">Starbucks Stores 2023<svg class="icon icon-tabler icon-tabler-chevron-down" fill="none" height="14" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="2" viewbox="0 0 24

In [315]:
# Column labels for the <td> cells of each row, in the order they are read.
count_labels = (
    "Starbucks Stores 2023",
    "Starbucks Stores 2021",
    "Starbucks Stores 2024",
)

rows = []

# Walk every row of the Starbucks table except the first (header) row.
for tr in table.find_all("tr")[1:]:
    # The state name is taken from the row's <th> cell.
    record = {"State": tr.find("th").text}

    # The store counts are taken from the row's first three <td> cells.
    cells = tr.find_all("td")
    for position, label in enumerate(count_labels):
        record[label] = cells[position].get_text(strip=True)

    rows.append(record)


In [316]:
# One record per scraped row -> one DataFrame row; preview the first five.
starbucks = pd.DataFrame(data=rows)
starbucks.head(n=5)

Unnamed: 0,State,Starbucks Stores 2023,Starbucks Stores 2021,Starbucks Stores 2024
0,California,3080,2959,3117
1,Texas,1346,1215,1409
2,Florida,844,786,892
3,Washington,741,739,736
4,New York,692,643,715


In [317]:
# Download the Dunkin' store-count page. The timeout stops a stalled
# connection from hanging the kernel, and raise_for_status() surfaces HTTP
# errors here instead of letting an error page reach the HTML parser.
dunkin_response = requests.get(
    "https://worldpopulationreview.com/state-rankings/dunkin-donuts-by-state",
    timeout=30,
)
dunkin_response.raise_for_status()
dunkin_soup = BeautifulSoup(dunkin_response.content, "html.parser")

In [318]:
# The store counts are read from the first <table> element on the page.
dunkin_tables = dunkin_soup.find_all("table")
table = dunkin_tables[0]

In [319]:
# Column labels for the <td> cells of each row, in the order they are read.
count_labels = ("Dunkin Stores 2024", "Dunkin Stores 2023")

rows = []

# Walk every row of the Dunkin table except the first (header) row.
for tr in table.find_all("tr")[1:]:
    # The state name is taken from the row's <th> cell.
    record = {"State": tr.find("th").text}

    # The store counts are taken from the row's first two <td> cells.
    cells = tr.find_all("td")
    for position, label in enumerate(count_labels):
        record[label] = cells[position].get_text(strip=True)

    rows.append(record)

In [320]:
# One record per scraped row -> one DataFrame row; preview the first five.
dunkin = pd.DataFrame(data=rows)
dunkin.head(n=5)

Unnamed: 0,State,Dunkin Stores 2024,Dunkin Stores 2023
0,New York,1431,1414
1,Massachusetts,1042,1068
2,Florida,909,883
3,New Jersey,872,866
4,Illinois,711,692


In [321]:
# Join the two chains on the state name. The default inner join keeps only
# states whose "State" value appears in both frames.
merged_data = starbucks.merge(dunkin, on="State")
merged_data.head()

Unnamed: 0,State,Starbucks Stores 2023,Starbucks Stores 2021,Starbucks Stores 2024,Dunkin Stores 2024,Dunkin Stores 2023
0,California,3080,2959,3117,143,134
1,Texas,1346,1215,1409,226,196
2,Florida,844,786,892,909,883
3,Washington,741,739,736,0,19
4,New York,692,643,715,1431,1414


In [322]:
# Download the state-population page. The timeout stops a stalled connection
# from hanging the kernel, and raise_for_status() surfaces HTTP errors here
# instead of letting an error page reach the HTML parser.
population_response = requests.get(
    "https://simple.wikipedia.org/wiki/List_of_U.S._states_by_population",
    timeout=30,
)
population_response.raise_for_status()
population_soup = BeautifulSoup(population_response.content, "html.parser")

In [323]:
# The population figures are read from the first <table> element on the page.
population_tables = population_soup.find_all("table")
table = population_tables[0]

In [324]:
rows = []

# Walk every row of the population table except the first (header) row.
for tr in table.find_all("tr")[1:]:
    cells = tr.find_all("td")

    rows.append({
        # The state name is read from the third <td> cell of the row.
        "State": cells[2].get_text(strip=True),
        # The population figure is read from the fourth <td> cell of the row.
        "Population": cells[3].get_text(strip=True),
    })

In [325]:
# One record per scraped row -> one DataFrame row; preview the first five.
population = pd.DataFrame(data=rows)
population.head(n=5)

Unnamed: 0,State,Population
0,California,39538223
1,Texas,30145505
2,Florida,21538187
3,New York,20201249
4,Pennsylvania,13002700


In [326]:
# Attach each state's population to the store counts (inner join on "State").
merged_data2 = merged_data.merge(population, on="State")
merged_data2.head()

Unnamed: 0,State,Starbucks Stores 2023,Starbucks Stores 2021,Starbucks Stores 2024,Dunkin Stores 2024,Dunkin Stores 2023,Population
0,California,3080,2959,3117,143,134,39538223
1,Texas,1346,1215,1409,226,196,30145505
2,Florida,844,786,892,909,883,21538187
3,Washington,741,739,736,0,19,7705281
4,New York,692,643,715,1431,1414,20201249


In [327]:
# Broadcast one constant per chain to every row of the merged frame.
# NOTE(review): 96 and 106 look like share prices; the source and date are not
# recorded anywhere in this notebook. TODO confirm and document.
stock_constants = {"Starbucks_Stock": 96, "Dunkin_Stock": 106}
for stock_column, stock_value in stock_constants.items():
    merged_data2[stock_column] = stock_value

In [328]:
# Lookup of the 50 states to one of four regions (South, West, Midwest,
# Northeast), written as state -> region pairs so each assignment can be read
# and checked on a single line. Rows stay in alphabetical state order.
region_by_state = {
    'Alabama': 'South',
    'Alaska': 'West',
    'Arizona': 'West',
    'Arkansas': 'South',
    'California': 'West',
    'Colorado': 'West',
    'Connecticut': 'Northeast',
    'Delaware': 'South',
    'Florida': 'South',
    'Georgia': 'South',
    'Hawaii': 'West',
    'Idaho': 'West',
    'Illinois': 'Midwest',
    'Indiana': 'Midwest',
    'Iowa': 'Midwest',
    'Kansas': 'Midwest',
    'Kentucky': 'South',
    'Louisiana': 'South',
    'Maine': 'Northeast',
    'Maryland': 'South',
    'Massachusetts': 'Northeast',
    'Michigan': 'Midwest',
    'Minnesota': 'Midwest',
    'Mississippi': 'South',
    'Missouri': 'Midwest',
    'Montana': 'West',
    'Nebraska': 'Midwest',
    'Nevada': 'West',
    'New Hampshire': 'Northeast',
    'New Jersey': 'Northeast',
    'New Mexico': 'West',
    'New York': 'Northeast',
    'North Carolina': 'South',
    'North Dakota': 'Midwest',
    'Ohio': 'Midwest',
    'Oklahoma': 'South',
    'Oregon': 'West',
    'Pennsylvania': 'Northeast',
    'Rhode Island': 'Northeast',
    'South Carolina': 'South',
    'South Dakota': 'Midwest',
    'Tennessee': 'South',
    'Texas': 'South',
    'Utah': 'West',
    'Vermont': 'Northeast',
    'Virginia': 'South',
    'Washington': 'West',
    'West Virginia': 'South',
    'Wisconsin': 'Midwest',
    'Wyoming': 'West',
}

regions = pd.DataFrame({
    'State': list(region_by_state.keys()),
    'Region': list(region_by_state.values()),
})

In [329]:
# Tag every state with its region (inner join on 'State').
merged_data3 = merged_data2.merge(regions, on='State')


In [330]:
# The scraped figures are strings that may contain thousands separators
# (e.g. "3,080"); strip the commas and convert them to numbers so they can be
# summed and divided later on.
strict_int_columns = [
    'Starbucks Stores 2023',
    'Starbucks Stores 2021',
    'Dunkin Stores 2024',
    'Dunkin Stores 2023',
    'Population',
]
for column in strict_int_columns:
    # astype(str) keeps this cell re-runnable: after the first run the column
    # is already numeric and would no longer expose the .str accessor.
    digits = merged_data3[column].astype(str).str.replace(',', '', regex=False)
    merged_data3[column] = digits.astype(int)

# 'Starbucks Stores 2024' was previously left as text. The rendered frame
# shows it as float (3117.0), which suggests some entries are blank, so
# unparseable values are coerced to NaN here rather than raising.
# NOTE(review): confirm which states lack a 2024 figure.
digits_2024 = merged_data3['Starbucks Stores 2024'].astype(str).str.replace(',', '', regex=False)
merged_data3['Starbucks Stores 2024'] = pd.to_numeric(digits_2024, errors='coerce')

In [331]:
# Display the merged frame after the numeric conversion above.
merged_data3

Unnamed: 0,State,Starbucks Stores 2023,Starbucks Stores 2021,Starbucks Stores 2024,Dunkin Stores 2024,Dunkin Stores 2023,Population,Starbucks_Stock,Dunkin_Stock,Region
0,California,3080,2959,3117.0,143,134,39538223,96,106,West
1,Texas,1346,1215,1409.0,226,196,30145505,96,106,South
2,Florida,844,786,892.0,909,883,21538187,96,106,South
3,Washington,741,739,736.0,0,19,7705281,96,106,West
4,New York,692,643,715.0,1431,1414,20201249,96,106,Northeast
5,Illinois,677,621,685.0,711,692,12812508,96,106,Midwest
6,Arizona,548,535,579.0,110,102,7151502,96,106,West
7,Colorado,495,497,506.0,48,43,5773714,96,106,West
8,Ohio,491,446,499.0,261,245,11799448,96,106,Midwest
9,Virginia,489,456,510.0,244,230,8631393,96,106,South


In [336]:
# Per-state totals of the 2023 store counts and population.
state_total_columns = ["Starbucks Stores 2023", "Dunkin Stores 2023", "Population"]
merged_data3.groupby("State")[state_total_columns].sum()

Unnamed: 0_level_0,Starbucks Stores 2023,Dunkin Stores 2023,Population
State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,85,59,5024279
Alaska,49,0,733391
Arizona,548,102,7151502
Arkansas,55,9,3011524
California,3080,134,39538223
Colorado,495,43,5773714
Connecticut,123,480,3605944
Delaware,25,66,989948
Florida,844,883,21538187
Georgia,326,265,10711908


In [None]:
# Regional totals and people-per-store ratios.
# merged_data3 only has year-suffixed count columns, so selecting
# "Starbucks Stores" / "Dunkin Stores" directly raises KeyError. The 2023
# counts (the year used by the other analysis cells) are renamed to the
# unsuffixed labels before aggregating.
grouped_data = (
    merged_data3
    .rename(columns={
        "Starbucks Stores 2023": "Starbucks Stores",
        "Dunkin Stores 2023": "Dunkin Stores",
    })
    [["Region", "Starbucks Stores", "Dunkin Stores", "Population"]]
    .groupby("Region")
    .sum()
)
grouped_data["People per Starbucks Store"] = (grouped_data["Population"] / grouped_data["Starbucks Stores"]).round(2)
grouped_data["People per Dunkin Store"] = (grouped_data["Population"] / grouped_data["Dunkin Stores"]).round(2)

grouped_data

Looking at this table, I can conclude that the West region has the most Starbucks locations and also the fewest people per Starbucks store. The Northeast region has the most Dunkin' Donuts locations and the fewest people per Dunkin' store. Starbucks dominates the West, Midwest, and South regions, whereas Dunkin' Donuts is king in the Northeast region.

In [363]:
# Work on an independent copy: plain assignment would only alias merged_data3,
# so the NaN replacement and the new ratio columns below would silently change
# merged_data3 for every other cell as well.
merged_data4 = merged_data3.copy()
# Replace 0s with NaN for states that don't have any Dunkin stores, so the
# divisions below give NaN (ignored by the regional mean) instead of inf.
merged_data4["Dunkin Stores 2023"] = merged_data4["Dunkin Stores 2023"].replace(0, np.nan)
merged_data4["People per Starbucks Store"] = merged_data4["Population"] / merged_data4["Starbucks Stores 2023"]
merged_data4["People per Dunkin Store"] = merged_data4["Population"] / merged_data4["Dunkin Stores 2023"]
# The stock columns hold one constant per chain, so these two ratios are that
# constant divided by each state's 2023 store count.
merged_data4["Starbucks Stock per Store"] = merged_data4["Starbucks_Stock"] / merged_data4["Starbucks Stores 2023"]
merged_data4["Dunkin Stock per Store"] = merged_data4["Dunkin_Stock"] / merged_data4["Dunkin Stores 2023"]

# Average the per-state ratios within each region.
merged_data4[["Region", "Starbucks Stock per Store", "Dunkin Stock per Store", "People per Starbucks Store", "People per Dunkin Store"]].groupby("Region").mean().round(2)


Unnamed: 0_level_0,Starbucks Stock per Store,Dunkin Stock per Store,People per Starbucks Store,People per Dunkin Store
Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Midwest,1.48,2.06,34165.44,87388.27
Northeast,2.66,0.52,41134.8,10516.61
South,1.19,2.96,42579.66,112627.38
West,1.15,27.55,19104.39,576572.08


Looking at the table, Starbucks has the highest stock value per store in the Northeast region ($2.66) and the lowest in the West ($1.15). Dunkin' stock per store is highest in the West ($27.55) due to the lower prevalence of stores in that region, and lowest in the Northeast ($0.52), where it's most common. People per store also shows significant differences, with the West having the fewest people per Starbucks store (19,104) and the most per Dunkin' store (576,572), highlighting the stark difference in the distribution of Dunkin' stores across regions.
Starbucks has a more evenly distributed number of stores, with fewer people per store, especially in the West. Dunkin' is much more concentrated in the Northeast, where the people-per-store ratio is much lower than in other regions.


In [368]:
def webscrape(link):
    """Scrape the first HTML table at ``link`` into a State/Stores DataFrame.

    The first ``<tr>`` of the table is skipped as the header row. For every
    remaining row, the text of its ``<th>`` cell becomes ``"State"`` and the
    text of its first ``<td>`` cell becomes ``"Stores"``. Cell text is
    stripped of surrounding whitespace and otherwise returned as-is (strings,
    not numbers). Rows that lack a ``<th>`` or a ``<td>`` cell are skipped.

    Relies on ``requests``, ``BeautifulSoup`` and ``pd`` imported at the top
    of the notebook.

    Parameters
    ----------
    link : str
        URL of the page to scrape.

    Returns
    -------
    pandas.DataFrame
        Columns ``"State"`` and ``"Stores"``, one row per usable table row.
        The columns are present even when no usable row is found.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    IndexError
        If the page contains no ``<table>`` element.
    """
    # The timeout stops a stalled connection from blocking forever;
    # raise_for_status() keeps an error page from being parsed as data.
    response = requests.get(link, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Find the first table on the page
    table = soup.find_all("table")[0]
    rows = []

    # Iterate over all rows in the table, skipping the header row
    for row in table.find_all("tr")[1:]:
        state_cell = row.find("th")
        stores_cell = row.find("td")

        # Skip rows that do not have both cells instead of failing on None
        if state_cell is None or stores_cell is None:
            continue

        rows.append({
            "State": state_cell.get_text(strip=True),
            "Stores": stores_cell.get_text(strip=True),
        })

    # Naming the columns keeps the schema stable for an empty result
    return pd.DataFrame(rows, columns=["State", "Stores"])

In [370]:
# Example call: run the scraper against the Walmart stores-by-state page.
webscrape("https://worldpopulationreview.com/state-rankings/walmart-stores-by-state")

Unnamed: 0,State,Stores
0,Texas,517
1,Florida,341
2,California,280
3,North Carolina,192
4,Georgia,189
5,Illinois,161
6,Ohio,146
7,Missouri,137
8,Tennessee,137
9,Pennsylvania,134
