# Web Scraping: BeautifulSoup, RegEx and Selenium 




In [1]:
from io import BytesIO
from string import ascii_uppercase as alc

import pandas as pd
import requests
from bs4 import BeautifulSoup

## 1.) Web Crawling Tables

### 1.A.) Create a list of links for all the Wikipedia pages for NYSE-traded companies A-Z and 0-9

In [2]:
# Example listing page for the letter "A"; the cell that builds URL_list below applies the same pattern to every letter.
URL = "https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(A)"

In [3]:
# Importing module to iterate through the alphabet
from string import ascii_uppercase as alc

In [4]:
# The NYSE listing is split into one Wikipedia page per initial: A-Z plus a combined "0–9" page.
NYSE_PAGE_TEMPLATE = "https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_({})"

# For A-Z (a comprehension keeps the loop variable from overwriting notebook-level names such as URL)
URL_list = [NYSE_PAGE_TEMPLATE.format(letter) for letter in alc]

# For 0-9 (the page title is written with an en dash, not a hyphen)
URL_list.append(NYSE_PAGE_TEMPLATE.format("0–9"))

# Print final list of links
URL_list

['https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(A)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(B)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(C)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(D)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(E)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(F)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(G)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(H)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(I)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(J)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_the_New_York_Stock_Exchange_(K)',
 'https://en.wikipedia.org/wiki/Companies_listed_on_th

### 1.B.) Crawl through all the URLs and build one DataFrame with all the NYSE publicly traded companies

In [5]:
# Collect one table per listing page and concatenate once at the end, instead of
# re-copying a growing dataframe on every iteration.
page_tables = []

# Looping through URL list to crawl tables
for page_url in URL_list:
    # The timeout keeps one stalled page from hanging the whole crawl
    response = requests.get(page_url, timeout=30)
    # Stop with a clear HTTP error rather than trying to parse an error page
    response.raise_for_status()
    # File-like input: newer pandas deprecates passing raw HTML literals to read_html
    df_list = pd.read_html(BytesIO(response.content))
    # Index 1 (the second table parsed from the page) holds the company listing
    # NOTE(review): re-check this index if the page layout changes
    page_tables.append(df_list[1])

# Row labels are kept per page (no ignore_index), so the index restarts at 0 for each page.
# An empty URL list still yields an empty dataframe, as before.
company_table = pd.concat(page_tables) if page_tables else pd.DataFrame()

In [6]:
# Dataframe with all publicly traded companies
company_table

Unnamed: 0,Stock name,Symbol,Country of origin
0,A. O. Smith Corporation,AOS,US
1,"A10 Networks, Inc.",ATEN,US
2,AAC Holdings Inc.,AAC,US
3,AAR Corporation,AIR,US
4,Aaron's Inc.,AAN,US
...,...,...,...
0,3D Systems Corporation,DDD,US
1,3M Company,MMM,US
2,500.com,WBAI,China
3,58.com Inc.,WUBA,China


### 1.C.) Calculate the percentage of companies whose ticker contains 1, 2, 3, 4 or 5 letters

In [7]:
# Creating a column for ticker length.
# The vectorized .str.len() yields NaN for a missing symbol, where apply(len) would raise a TypeError.
company_table['ticker_length'] = company_table['Symbol'].str.len()

In [8]:
# Number of companies per ticker length: count() tallies the non-null "Stock name" entries in each group
by_ticker_length = company_table.groupby("ticker_length")
group = by_ticker_length["Stock name"].count()

In [9]:
group

ticker_length
1       24
2      190
3     1775
4      404
5       22
6       45
7      223
8       35
9        2
11       2
13       1
Name: Stock name, dtype: int64

In [10]:
# Calculating and Printing percentage of companies for ticker lengths 1 through 5.
# Select by label (.loc[1:5], both ends inclusive) rather than by position, so that a
# missing length cannot shift a longer ticker length into the result.
group.loc[1:5] / len(company_table) * 100

ticker_length
1     0.881381
2     6.977598
3    65.185457
4    14.836577
5     0.807932
Name: Stock name, dtype: float64

## 2.) Web Scraping Using Beautiful Soup

### 2.A.) Using Beautiful soup .findAll method, webscrape the front page of Reddit. Get a list of all of the "timestamps"

In [11]:
# Reddit front page (the same assignment is repeated in the request cell below).
URL = "https://www.reddit.com"

In [12]:
# Candidate browser User-Agent strings; one of them is sent with the request headers below.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36 Edge/12.246",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.111 Safari/537.36 ",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2) AppleWebKit/601.3.9 (KHTML, like Gecko) Version/9.0.2 Safari/601.3.9 ",
    "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1",
    "Mozilla/5.0 (X11; CrOS x86_64 8172.45.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.64 Safari/537.36",
]

# Use the third entry (index 2, the Safari-on-macOS string) as the User-Agent header
headers = {"User-Agent": user_agent_list[2]}

In [13]:
URL = "https://www.reddit.com"
# The timeout keeps the cell from hanging on a stalled connection; raise_for_status surfaces a
# blocked or failed request here instead of letting later cells parse an error page.
page = requests.get(URL, headers=headers, timeout=30)
page.raise_for_status()

In [14]:
# Parse the downloaded page bytes with the "html.parser" backend
soup = BeautifulSoup(markup=page.content, features="html.parser")

In [15]:
# Timestamps are the <span> elements whose data-click-id attribute equals "timestamp"
timestamp_attrs = {"data-click-id": "timestamp"}
timestamps = soup.findAll("span", attrs=timestamp_attrs)

In [16]:
# Keep only the text of each timestamp element
clean_t = [stamp.text for stamp in timestamps]

In [17]:
clean_t

['17 hours ago',
 '5 hours ago',
 '5 hours ago',
 '5 hours ago',
 '12 hours ago',
 '9 hours ago',
 '5 hours ago']

### 2.B.) Using the functions findChild, descendants, etc. locate the post title and store in a list.

In [19]:
# Using parent tag "div" with specified class
tags = soup.findAll("div", class_="_2SdHzo12ISmrC8H86TgSCp _3wqmjmv3tb_k-PROt7qFZe")
post_titles = []

for tag in tags:
    # Finding child "h3" to locate titles
    post_title_tag = tag.findChild("h3")
    # findChild returns None when the container has no <h3>; skip that container
    # instead of failing on None.text
    if post_title_tag is None:
        continue
    post_titles.append(post_title_tag.text)

In [21]:
#List of titles
post_titles

['My dad does this to avoid cutting pepperoni',
 'As Hollywood strike looms, five writers share their stories. Subscribe to the L.A. Times.',
 "which actor is an immediate turn off whenever they're announced in a movie?",
 'HOW MANY SUPREME COURT SEATS HAVE BEEN PUT UP FOR SALE?',
 'I found a grandfather clock at a thrift store and painted it',
 'The incels are out in force today.',
 '13-year-old Ukrainian singer refuses to share stage with Russian performer at Sanremo Junior Festival',
 'Mind blown.']

### 2.C.) Create a dataframe that has the associated title and post time for each post.

The two lists differ in length: one title belongs to a promoted advertisement that has no associated timestamp. We therefore drop that title from the list of titles before creating the dataframe.

In [22]:
# Promoted post: it has a title but no timestamp, so it must not be paired with a post time.
PROMOTED_TITLE = "As Hollywood strike looms, five writers share their stories. Subscribe to the L.A. Times."
# Guarding the removal keeps the cell re-runnable: list.remove raises ValueError once the title is gone.
if PROMOTED_TITLE in post_titles:
    post_titles.remove(PROMOTED_TITLE)

In [23]:
# One row per post: the title and the matching timestamp text, paired by position
reddit_columns = {"Title": post_titles, "Post time": clean_t}
reddit_df = pd.DataFrame(reddit_columns)

In [24]:
reddit_df

Unnamed: 0,Title,Post time
0,My dad does this to avoid cutting pepperoni,17 hours ago
1,which actor is an immediate turn off whenever ...,5 hours ago
2,HOW MANY SUPREME COURT SEATS HAVE BEEN PUT UP ...,5 hours ago
3,I found a grandfather clock at a thrift store ...,5 hours ago
4,The incels are out in force today.,12 hours ago
5,13-year-old Ukrainian singer refuses to share ...,9 hours ago
6,Mind blown.,5 hours ago


## 3.) RegEx

### 3.A.) Using RegEx, get all the urls of ladder faculty profiles for UCLA Economics

In [29]:
import re

In [30]:
# Ladder-faculty listing page. NOTE(review): the request cell below defines its own lowercase `url` and does not read this constant.
URL = "https://economics.ucla.edu/faculty/ladder"

In [33]:
url = 'https://economics.ucla.edu/faculty/ladder'
response = requests.get(url, timeout=30)
# Fail here on an HTTP error rather than running the regex over an error page
response.raise_for_status()
html_content = response.text

# Regex pattern for URLs: capture the href of every link to a /person/<slug>/ profile page.
# The slug class also admits digits, so a slug such as "jane-doe-2" would not be skipped.
pattern = r'<a href="(https://economics\.ucla\.edu/person/[a-z0-9\-]+/)">'
matches = re.findall(pattern, html_content)
# findall already returns the captured URLs as strings; list() makes the independent copy
profile_urls = list(matches)
profile_urls

['https://economics.ucla.edu/person/john-asker/',
 'https://economics.ucla.edu/person/andrew-atkeson/',
 'https://economics.ucla.edu/person/martha-bailey/',
 'https://economics.ucla.edu/person/david-baqaee/',
 'https://economics.ucla.edu/person/natalie-bau/',
 'https://economics.ucla.edu/person/saki-bigio/',
 'https://economics.ucla.edu/person/alexander-bloedel/',
 'https://economics.ucla.edu/person/simon-board/',
 'https://economics.ucla.edu/person/ariel-burstein/',
 'https://economics.ucla.edu/person/denis-chetverikov/',
 'https://economics.ucla.edu/person/dora-costa/',
 'https://economics.ucla.edu/person/pablo-fajgelbaum/',
 'https://economics.ucla.edu/person/francois-geerolf/',
 'https://economics.ucla.edu/person/michela-giorcelli/',
 'https://economics.ucla.edu/person/felipe-goncalves/',
 'https://economics.ucla.edu/person/daniel-haanwinckel/',
 'https://economics.ucla.edu/person/martin-b-hackmann/',
 'https://economics.ucla.edu/person/jinyong-hahn/',
 'https://economics.ucla.edu/

### 3.B.) Webcrawl the links from A and use RegEx to get all the emails and phone numbers of ladder faculty profiles

In [None]:
# Compile both patterns once, outside the crawl loop.
# Email: the address is captured from a title="..." attribute, which discards extraneous
# emails with a similar format elsewhere in the page.
email_pattern = re.compile(r'title="([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})"')
# Phone: 3-3-4 digit groups, e.g. "(310) 825-9397" or "310-869-0742".
phone_pattern = re.compile(r"\(?\d{3}[-\.\)] *\d{3}[-\.] *\d{4}")

emails_list = []
phone_list = []

#Looping through all profile URLs
for profile_url in profile_urls:
    # The timeout stops one stalled profile page from hanging the whole crawl
    html_content = requests.get(profile_url, timeout=30).text

    # With a single capture group, findall returns just the captured addresses
    emails_list.append(email_pattern.findall(html_content))
    # With no capture group, findall returns each full phone-number match
    phone_list.append(phone_pattern.findall(html_content))

# Creating a dataframe with all the info: the first match per profile, or None when nothing matched.
# Built with plain comprehensions because DataFrame.applymap is deprecated in recent pandas.
faculty_df = pd.DataFrame({
    "Email ID": [found[0] if found else None for found in emails_list],
    "Phone Numbers": [found[0] if found else None for found in phone_list],
})
faculty_df

Unnamed: 0,Email ID,Phone Numbers
0,johnasker@econ.ucla.edu,
1,andy@atkeson.net,310-869-0742
2,marthabailey@ucla.edu,
3,baqaee@econ.ucla.edu,
4,nbau@g.ucla.edu,
5,sbigio@econ.ucla.edu,(310) 825-9397
6,abloedel@econ.ucla.edu,
7,sboard@econ.ucla.edu,(310) 825-5304
8,arielb@econ.ucla.edu,(310) 206-6732
9,chetverikov@econ.ucla.edu,(310) 825-4521


## 4.) Selenium

### 4.A.) Pick a website that has data useful for answering a business or economic question.

In [34]:
# Flight-search site that the Selenium functions in section 4.B open.
URL = "https://kayak.com/"

### 4.B.) Use Selenium to scrape valuable information from the website and store in a dataframe.

In [35]:
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time as t

In [39]:
def kayak_setup(driver_path="/Users/mallikachandra/Downloads/chromedriver"):
    """Launch Chrome through chromedriver and open the Kayak home page.

    Args:
        driver_path: Filesystem path of the chromedriver executable. The default is
            the location this notebook was written against; pass a different path
            on another machine.

    Returns:
        The Selenium Chrome WebDriver, already navigated to https://kayak.com/.
        The caller is responsible for closing it with ``driver.quit()``.
    """
    # Local import: the notebook's Selenium import cell does not bring Service into scope.
    from selenium.webdriver.chrome.service import Service

    # Selenium 4 deprecates passing the driver path as Chrome()'s first positional
    # argument; the path is supplied through a Service object instead.
    driver = webdriver.Chrome(service=Service(executable_path=driver_path))
    try:
        driver.get("https://kayak.com/")
    except Exception:
        # Do not leave an orphaned browser behind if the page cannot be opened
        driver.quit()
        raise
    return driver

In [None]:
# Local chromedriver location. NOTE(review): nothing in the visible cells reads PATH; kayak_setup spells out the same path itself.
PATH = "/Users/mallikachandra/Downloads/chromedriver"

In [40]:
def _enter_location(box, location):
    """Send two BACKSPACEs to the input, type the location, wait 3 s, then press RETURN."""
    # NOTE(review): the BACKSPACEs presumably clear a pre-filled value — confirm against the live page
    box.send_keys(Keys.BACKSPACE)
    box.send_keys(Keys.BACKSPACE)
    box.send_keys(location)
    t.sleep(3)
    box.send_keys(Keys.RETURN)


def _element_texts(elements):
    """Return the .text of each Selenium element, in order."""
    return [element.text for element in elements]


def kayak_info_date(depart_location, arrival_location, departure_date, arrival_date):
    """Search Kayak for round-trip flights and display the scraped results as a dataframe.

    Args:
        depart_location: Text typed into the first location input (the origin).
        arrival_location: Text typed into the second location input (the destination).
        departure_date: Outbound date string inserted into the results URL
            (the notebook's example call uses "YYYY-MM-DD").
        arrival_date: Return date string inserted into the results URL after the
            departure date.

    Returns:
        The Selenium WebDriver, still open on the results page. The caller should
        close it with ``driver.quit()`` when finished.

    Raises:
        AssertionError: If more than one browser window is open before the search.
        Exception: Any Selenium or pandas error (for example, scraped lists whose
            lengths do not match the dataframe) is re-raised after the browser
            has been closed.
    """
    #Set up kayak driver
    driver = kayak_setup()

    try:
        #Find the search bar and enter the locations
        loc_search = driver.find_elements("css selector", 'input.k_my-input')
        #Departure location (first input)
        _enter_location(loc_search[0], depart_location)
        #Arrival location (second input)
        _enter_location(loc_search[1], arrival_location)

        #Find the search button and click it
        original_window = driver.current_window_handle
        assert len(driver.window_handles) == 1
        search = driver.find_element("css selector","button.Iqt3.Iqt3-mod-bold.Button-No-Standard-Style.Iqt3-mod-variant-solid.Iqt3-mod-theme-progress.Iqt3-mod-shape-rounded-medium.Iqt3-mod-shape-mod-default.Iqt3-mod-spacing-default.Iqt3-mod-size-large.Iqt3-mod-animation-search")
        search.send_keys(Keys.RETURN)
        t.sleep(3)

        #Switch to the new window
        for window_handle in driver.window_handles:
            if window_handle != original_window:
                driver.switch_to.window(window_handle)
                break

        #Enter the dates of travel
        t.sleep(10)
        url = driver.current_url
        # Keep the first 38 characters of the results URL and append the dates.
        # NOTE(review): presumably that prefix ends right after the origin-destination
        # segment of the URL — confirm, since it depends on the URL layout.
        url_new = url[:38] + departure_date + "/" + arrival_date
        t.sleep(5)
        driver.get(url_new)
        t.sleep(5)

        #Find the cheap flights button and click it
        cheap = driver.find_element("css selector","div.Hv20-option")
        cheap.send_keys(Keys.RETURN)

        #Find the times of the flights and create a dataframe
        t.sleep(5)
        time_elements = driver.find_elements("css selector","div.vmXl.vmXl-mod-variant-large")
        t.sleep(5)
        times_full = _element_texts(time_elements)
        # Entries alternate: outgoing leg, then incoming leg
        outgoing_time = times_full[::2]
        incoming_time = times_full[1::2]
        df_full = pd.DataFrame(list(zip(outgoing_time, incoming_time)), columns =['Outgoing Time', 'Incoming Time'])

        #Find the airlines of the flights and add it to the dataframe
        airlines_full = _element_texts(driver.find_elements("css selector","div.c_cgF.c_cgF-mod-variant-default"))
        # Every 10th entry from offset 0 is used as the outgoing airline, from offset 5 as the incoming one
        df_full["Outgoing Airline"] = airlines_full[::10]
        df_full["Incoming Airline"] = airlines_full[5::10]

        #Find the stops of the flights and add it to the dataframe
        stops_full = _element_texts(driver.find_elements("css selector","span.JWEO-stops-text"))
        df_full["Outgoing Stops"] = stops_full[::2]
        df_full["Incoming Stops"] = stops_full[1::2]

        #Find the duration of the flights and add it to the dataframe
        lengths_full = _element_texts(driver.find_elements("css selector","div.vmXl.vmXl-mod-variant-default"))
        # Only every second entry is kept; those then alternate outgoing / incoming
        lengths_full = lengths_full[1::2]
        df_full["Outgoing Length"] = lengths_full[::2]
        df_full["Incoming Length"] = lengths_full[1::2]

        #Find the prices of the flights and add it to the dataframe
        df_full["price"] = _element_texts(driver.find_elements("css selector","div.f8F1-price-text"))

        #Display the dataframe
        display(df_full)
    except Exception:
        # Close the browser before propagating, so a failed scrape does not leak a Chrome session
        driver.quit()
        raise

    return driver

In [41]:
# Calling function to get information
# NOTE(review): the returned driver is neither stored nor closed in this cell, so the browser session stays open; call .quit() on it when finished.
kayak_info_date("Washington, D.C.", "Las Vegas, NV", "2023-10-24", "2023-11-05")

  driver = webdriver.Chrome("/Users/mallikachandra/Downloads/chromedriver")


Unnamed: 0,Outgoing Time,Incoming Time,Outgoing Airline,Incoming Airline,Outgoing Stops,Incoming Stops,Outgoing Length,Incoming Length,price
0,6:00 am–11:31 am,6:10 am–1:41 pm,Spirit Airlines,Spirit Airlines,1 stop,nonstop,8h 31m,4h 31m,$286
1,6:00 am–11:31 am,11:49 am–7:21 pm,Spirit Airlines,Spirit Airlines,1 stop,nonstop,8h 31m,4h 32m,$286
2,9:14 pm–7:01 am+1,6:00 am–8:21 pm,Frontier,Frontier,1 stop,1 stop,12h 47m,11h 21m,$288
3,7:00 am–12:06 pm,11:00 am–9:54 pm,Frontier,Frontier,1 stop,1 stop,8h 06m,7h 54m,$299
4,7:00 am–12:06 pm,6:00 am–5:02 pm,Frontier,Frontier,1 stop,1 stop,8h 06m,8h 02m,$299
5,1:41 pm–7:03 pm,11:00 am–9:54 pm,Frontier,Frontier,1 stop,1 stop,8h 22m,7h 54m,$299
6,1:41 pm–7:03 pm,6:00 am–5:02 pm,Frontier,Frontier,1 stop,1 stop,8h 22m,8h 02m,$299
7,6:01 pm–12:07 am+1,11:00 am–9:54 pm,Frontier,Frontier,1 stop,1 stop,9h 06m,7h 54m,$299
8,6:01 pm–12:07 am+1,6:00 am–5:02 pm,Frontier,Frontier,1 stop,1 stop,9h 06m,8h 02m,$299
9,7:00 am–12:06 pm,9:36 pm–12:48 pm+1,Frontier,Frontier,1 stop,1 stop,8h 06m,12h 12m,$299


<selenium.webdriver.chrome.webdriver.WebDriver (session="2f59a1428d1d6f42169f26d92648a794")>

### 4.C.) Business value of the above function and the data it scrapes

The above is a function that, if provided departure and arrival locations as well as departure and return dates, will return information on round trip flights to and from these locations at the specified dates. For schedulers and planners looking to improve efficiency, this is an extremely valuable tool - rather than having to go on websites and manually enter in the same information over and over again, this function allows you to automate the process and get all the information you need in one place, in a simple table that is easily readable. This function could be used to plan business trips, vacations, or any other travel that requires flights.