In [1]:
import pandas as pd
import re
import requests 
from bs4 import BeautifulSoup 
import time 
import random
import lxml
import winsound

In [2]:
# read in data from github
path = "https://raw.githubusercontent.com/ppp-ds4a/ppp_cleaning_eda/main/data/yelp_precovid.csv"
df = pd.read_csv(path, index_col = "Unnamed: 0")

In [3]:
# remove rows with missing "combined" column
df = df[df["combined"].isnull() == False]

# rename the "open" column
df.rename(columns = {"is_open" : "open_precovid"}, inplace = True)

# generate the "rank" column
rank_d = dict(df["postal_code"].value_counts().rank(ascending=False))
df["rank"] = df["postal_code"].map(rank_d)
df = df.sort_values("rank")

print(df.shape)
df.head(5)

(149766, 9)


Unnamed: 0,business_id,name,address,city,state,postal_code,open_precovid,combined,rank
130720,XmO_f4I8Srd1oGF1EGbRhA,Starbucks,3752 Las Vegas Blvd S,Las Vegas,NV,89109,1,Starbucks Las Vegas,1.0
45936,eVvnRqb4GSJUfN5k2-9oOQ,Bird Bar,3555 Las Vegas Blvd S,Las Vegas,NV,89109,1,Bird Bar Las Vegas,1.0
102255,YzlgswAwmpkMQifS3-YQTw,Conservatory & Botanical Garden,3600 S Las Vegas Blvd,Las Vegas,NV,89109,1,Conservatory & Botanical Garden Las Vegas,1.0
70489,zVfSwYFqDnWiKsMA4_jsAg,The NoMad Hotel Las Vegas,3772 S Las Vegas Blvd,Las Vegas,NV,89109,1,The NoMad Hotel Las Vegas Las Vegas,1.0
102269,9gxS7SYLl8EuhVNHWU1omQ,Windows At Bally's Las Vegas,3655 Las Vegas Blvd S,Las Vegas,NV,89109,1,Windows At Bally's Las Vegas Las Vegas,1.0


In [4]:
# subset to only odd ranks
#df = df[df["rank"] % 2 == 1]

# subset to only even ranks
df = df[df["rank"] % 2 == 0]

In [5]:
def regex_clean(x):
    
    x = x.title()

    # character groups
    char1 = "['`]"              # chars to remove
    char2 = "[. / - , _ : -]"   # chars to replace with whitespace

    # regex substitution
    x = re.sub("[&]", "and", x)   # replace & with and
    x = re.sub(char1, "", x)      # remove
    x = re.sub(char2, " ", x)     # replace with whitespace
    x = re.sub(" +", "-", x)      # replace all sequences of one or more whitespace with one dash

    return "https://www.yelp.com/biz/" + x.lower()

In [65]:
# Change to adjust sample size of test
sample_start = 5000
sample_end = 6000

# completed runs
# 0 to 5000
# 20000 to 27000

In [66]:
# Return a list of random samples from "combined" column
sample = df[["combined", "business_id"]].iloc[sample_start:sample_end]

In [67]:
# generate a url column using the regex_clean function
sample["url"] = sample["combined"].apply(regex_clean)

In [68]:
print("Sample Length: ", len(sample))
sample.head()

Sample Length:  1000


Unnamed: 0,combined,business_id,url
115648,PT's Gold Las Vegas,8ovqtunN8zGdCbYi-MCTAQ,https://www.yelp.com/biz/pts-gold-las-vegas
24921,Bob's Autodynamics Las Vegas,MPGrHMnDFzXCr-EtNSDh2g,https://www.yelp.com/biz/bobs-autodynamics-las...
28950,Dotty's #7 Las Vegas,9WeJvvyT-j3btKwvTXyzjw,https://www.yelp.com/biz/dottys-#7-las-vegas
11641,Robert Sidell Injury Attorney Las Vegas,xM85pGuYSvLTUEfm3Nm5uA,https://www.yelp.com/biz/robert-sidell-injury-...
155194,History For Sale Las Vegas,C5RiblUAGreWQd2vLmkKaQ,https://www.yelp.com/biz/history-for-sale-las-...


In [69]:
agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246"

head = {}
head["User-Agent"] = agent
head["referrer"] = "https://www.google.com"
head["Dnt"] = "1" 
head["Connection"] = "keep-alive"
head["Upgrade-Insecure-Requests"] = "1"

In [None]:
# Collect results for statistic estimation
response_dict = {}
url_404_ls = []
url_404_index_ls = []
closed_open_ls = []

# loop through samples and clean and check for reponse codes
for x in sample.index:
    # Delay next execution by random integer between 1-25
    time.sleep(random.randint(5, 25))

    r = requests.get(sample.url[x], headers = head)

    if str(r) == '<Response [503]>':
        print(f"503 found at iteration: {x}")
        break
    
    # If 404 then continue
    if str(r) == '<Response [404]>':
        #url_404_ls.append(regex_clean(sample[x]))
        #url_404_index_ls.append(df[df["combined"].str.contains(sample[x])].index[0])
        closed_open_ls.append(-1)
        continue

    
    if str(r) not in response_dict:
        response_dict[str(r)] = 1
    elif str(r) in response_dict:
        response_dict[str(r)] += 1
    
    # Try to find alert saying business is closed, if not then return "no", continue if no list returned (bad request) 
    try: 
        e = "no"
        closed = BeautifulSoup(r.text, "lxml").find_all("span", {"class": "raw__373c0__3rcx7"})[0].text

        if len(closed) == 0:
            closed_open_ls.append(-1)
            continue
    except:
        closed = e

    # Save to list whether business is open or closed
    if closed == "Yelpers report this location has closed.":
        closed_open_ls.append(0)
    else:
        closed_open_ls.append(1)

In [None]:
len(closed_open_ls)

In [None]:
sample["open_postcovid"] = closed_open_ls
sample.head()

In [None]:
# merges the scrape output with original data and saves to csv
sample[["business_id", "url", "open_postcovid"]].merge(df[["business_id", "name", "address", "city", "state", "postal_code", "open_precovid"]], how = "inner").to_csv("data/output" + str(sample_start) + "to" + str(sample_end) + ".csv")

In [None]:
winsound.Beep(1500, 1000)