In [6]:
import pandas as pd
import re
import requests 
from bs4 import BeautifulSoup 
import time 
import random

In [7]:
# Read in data from github
PATH = "https://raw.githubusercontent.com/ppp-ds4a/ppp_cleaning_eda/main/data/web_scrape_names.csv"
df = pd.read_csv(PATH, index_col="Unnamed: 0")
print(df.shape)
df.head(5)

(209393, 3)


Unnamed: 0,name,city,combined
0,The Range At Lake Norman,Cornelius,The Range At Lake Norman Cornelius
1,"Carlos Santo, NMD",Scottsdale,"Carlos Santo, NMD Scottsdale"
2,Felinus,Montreal,Felinus Montreal
3,Nevada House of Hose,North Las Vegas,Nevada House of Hose North Las Vegas
4,USE MY GUY SERVICES LLC,Mesa,USE MY GUY SERVICES LLC Mesa


### Using Regex to clean strings

- Traditional pandas methods and built-in python functions cannot clean the text appropriately
- Using multiple regular expressions can help filter text based on text patterns
- 200 status codes mean OK

##### [Regular Expression Docs](https://docs.python.org/3/library/re.html#module-re)

##### [List of HTTPS Status Codes](https://en.wikipedia.org/wiki/List_of_HTTP_status_codes)

In [8]:
# Pandas apply function for cleaning text

def regex_clean(x):
    """
    Text cleaner for removing unwanted chars and replacing with appropriate chars
    """

    x = x.title()

    # Patterns
    pat1 = '[&]' # replace '&' with 'And'
    pat2 = "[' . / - ` , _]" #chars to remove

    # Regex substitution
    x = re.sub(pat1, "And", x)
    x = re.sub(pat2, "", x)

    # Split by uppercase letter then join with hyphen
    x = "-".join(re.findall('[A-Z][^A-Z]*', x))

    return "https://www.yelp.com/biz/" + x

In [22]:
%%time

# FUNCTION TEST

# Change to adjust sample size of test
SAMPLE_SIZE = 385

# Return a list of random samples from "combined" column
sample = [x for x in df.sample(SAMPLE_SIZE)["combined"]]

# Collect results for statistic estimation
response_dict = {}

# loop through samples and clean
for x in range(len(sample)):

    r = requests.get(regex_clean(sample[x]))

    # print(r)
    # print(regex_clean(sample[x]), "\n")

    if str(r) not in response_dict:
        response_dict[str(r)] = 1
    elif str(r) in response_dict:
        response_dict[str(r)] += 1
    time.sleep(random.randint(4,7)) # pause execution for random interval between 3-7 seconds

 # Response 200 is good 

CPU times: user 5.78 s, sys: 446 ms, total: 6.22 s
Wall time: 43min 10s



Reponse 503 means overloaded service and the halt is temporary.  

In [23]:
stat_val = response_dict['<Response [200]>'] / sum(response_dict.values())

print(f"Reponse Dictionary: {response_dict}")
print(f"Average 'Reponse 200' over 385 samples: {stat_val}")

Reponse Dictionary: {'<Response [200]>': 168, '<Response [404]>': 38, '<Response [503]>': 179}
Average 'Reponse 200' over 385 samples: 0.43636363636363634


### How many unique URL's

- Lower number of URL than number of original shape could be due too multiple places have the same web address like Franchises, Multiple locations, same owners, etc...

In [None]:
# How many unique addresses
df["combined"].apply(regex_clean).nunique()

183044

### Identify Closed Banners on yelp pages

- Create functionality that looks for a closed banner determined by users and return a 0 or 1 based on whether the business is closed or open.

<p align="center">
  <img width="500" height="400" src="https://imgur.com/P1hZDTY.png">
</p>

In [None]:
# closed example URL: https://www.yelp.com/biz/skin-factory-tattoo-and-body-piercing-las-vegas

# possible xpath for closed places
# //*[@id="wrap"]/div[3]/yelp-react-root/div/div[4]/div/div/div[1]/section[2]/div

### Notes

In [None]:
# Read in data
yelp_business_path = "../data/yelp_dataset/yelp_academic_dataset_business.json"
yelp_b_df = pd.read_json(yelp_business_path, lines=True)

In [None]:
# Concat name and city together
df = yelp_b_df[["name", "city"]].copy()
df["combined"] = yelp_b_df["name"] + " " + yelp_b_df["city"]

# This is the function for stripping values that we are working on
df["combined"] = df["combined"].str.strip(".-,")
df.head()

Unnamed: 0,name,city,combined
0,The Range At Lake Norman,Cornelius,The Range At Lake Norman Cornelius
1,"Carlos Santo, NMD",Scottsdale,"Carlos Santo, NMD Scottsdale"
2,Felinus,Montreal,Felinus Montreal
3,Nevada House of Hose,North Las Vegas,Nevada House of Hose North Las Vegas
4,USE MY GUY SERVICES LLC,Mesa,USE MY GUY SERVICES LLC Mesa


In [None]:
# Save the data
# df.to_csv("web_scrape_names.csv")

In [None]:
# https://www.yelp.com/biz/big-chickie-seattle

# The pandas apply function to append to the list
ls = []

# Emily thoughts
# chars to take care of : 
# & --> and
# + --> - / omit  
# ' --> omit
# "-" --> stays
# . --> omit / -
# , --> omit 
# / --> hyphen 
#       need to check for the presence of / in each word and replace with -
#       need to omit if it's the first char in url
# - --> stay
# array to hold characters we need to omit 
#    delete = [""]

def ls_maker(x): 
    web_string = f"https://www.yelp.com/biz/{str(x).replace(' ', '-')}"
    ls.append(web_string)
    return

df["biz_city"].apply(ls_maker)