In [8]:
#load in all neccesary libraries 
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd 
from pprint import pprint
import urllib.parse
import time 
from pathlib import Path 
import numpy as np

DATA SCRAPPING 

In [4]:
#function to create urls for list of cities and given dates 
def generate_hotels_url(destination, start_date, end_date, adults=2, rooms=1):
    base_url = "https://www.hotels.com/Hotel-Search"
    params = {
        "destination": destination,
        "d1": start_date,
        "startDate": start_date,
        "d2": end_date,
        "endDate": end_date,
        "adults": str(adults),
        "rooms": str(rooms),
        "theme": "",
        "userIntent": "",
        "semdtl": "",
        "useRewards": "false",
        "sort": "RECOMMENDED",
        "pwaDialog": ""
    }
    
    # Add these parameters even if they're empty to match the original URLs
    params["mapBounds"] = ""
    
    return f"{base_url}?{urllib.parse.urlencode(params)}"

In [5]:
#Read in csv that contains cities and dates for the concert 
locations_df = pd.read_csv('lodging_info_final.csv')

#clean data to drop NA 
locations_df = locations_df.dropna()

#clean data to merge city, state and country to creat url 
locations_df["destination"] = locations_df["City"] + ', ' + locations_df["State"] + ', ' + locations_df["Country"]

#pull only columns of interest 
locations_all = locations_df.loc[:, ['destination', 'start_date', 'end_date']]
locations_all

Unnamed: 0,destination,start_date,end_date
0,"Palm Desert, California, United States Of America",2024-11-01,2024-11-03
1,"Palm Desert, California, United States Of America",2024-11-02,2024-11-04
2,"Phoenix, Arizona, United States Of America",2024-11-06,2024-11-08
3,"Inglewood, California, United States Of America",2024-11-08,2024-11-10
4,"San Antonio, Texas, United States Of America",2024-11-15,2024-11-17
5,"Dallas, Texas, United States Of America",2024-11-16,2024-11-18
6,"Miami, Florida, United States Of America",2024-11-19,2024-11-21
7,"Miami, Florida, United States Of America",2024-11-20,2024-11-22
8,"Charlotte, North Carolina, United States Of Am...",2024-11-22,2024-11-24
9,"Washington, District of Columbia, United State...",2024-11-24,2024-11-26


In [6]:
#Create empty list to place urls 
generated_urls = []

#Loop thru the list of cities to create urls 
for index, location in locations_all.iterrows():
    url = generate_hotels_url(
        destination=location["destination"],
        start_date=location["start_date"],
        end_date=location["end_date"]
    )
    generated_urls.append(url)

#print urls 
generated_urls

['https://www.hotels.com/Hotel-Search?destination=Palm+Desert%2C+California%2C+United+States+Of+America&d1=2024-11-01&startDate=2024-11-01&d2=2024-11-03&endDate=2024-11-03&adults=2&rooms=1&theme=&userIntent=&semdtl=&useRewards=false&sort=RECOMMENDED&pwaDialog=&mapBounds=',
 'https://www.hotels.com/Hotel-Search?destination=Palm+Desert%2C+California%2C+United+States+Of+America&d1=2024-11-02&startDate=2024-11-02&d2=2024-11-04&endDate=2024-11-04&adults=2&rooms=1&theme=&userIntent=&semdtl=&useRewards=false&sort=RECOMMENDED&pwaDialog=&mapBounds=',
 'https://www.hotels.com/Hotel-Search?destination=Phoenix%2C+Arizona%2C+United+States+Of+America&d1=2024-11-06&startDate=2024-11-06&d2=2024-11-08&endDate=2024-11-08&adults=2&rooms=1&theme=&userIntent=&semdtl=&useRewards=false&sort=RECOMMENDED&pwaDialog=&mapBounds=',
 'https://www.hotels.com/Hotel-Search?destination=Inglewood%2C+California%2C+United+States+Of+America&d1=2024-11-08&startDate=2024-11-08&d2=2024-11-10&endDate=2024-11-10&adults=2&rooms=

In [7]:
#open up browser
browser = Browser('chrome')
counter = 0 
cities = []
for url in generated_urls: 
    counter = counter + 1
    #create a beutiful soup object
    url= url
    browser.visit(url)
    time.sleep(10)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    #scrape city name 
    input_field = soup.find('div', attrs ={'class', 'uitk-field has-floatedLabel-label has-icon has-placeholder'})
    city = input_field.find('button', attrs={'type':'button'}).text

    #zoom into area of interest 
    data_results = soup.find('div', attrs={'data-stid': 'property-listing-results'})  
    
    #get list of all properties on page
    list_of_all_properties = data_results.find_all('div', attrs={'class':'uitk-spacing uitk-spacing-margin-blockstart-three'})
    
    # add city name to list of cities 
    cities.append(city)
    
    list_of_dicts = []
    for single_property_card in list_of_all_properties:
        title_element = single_property_card.find('h3', attrs={'class': 'uitk-heading uitk-heading-5 overflow-wrap uitk-layout-grid-item uitk-layout-grid-item-has-row-start'})
        price_element = single_property_card.find('div', attrs={'class': 'uitk-text uitk-type-500 uitk-type-medium uitk-text-emphasis-theme'})
        
        title = title_element.text if title_element else "N/A"
        price = price_element.text if price_element else "N/A"

        if price == 'N/A':     
            price_element = single_property_card.find('div', attrs={'class': 'uitk-text uitk-type-end uitk-type-300 uitk-text-default-theme'})
            price = price_element.text if price_element else "N/A"

        my_dict = {"Location": city, "Hotel": title, "Price": price}
        list_of_dicts.append(my_dict)

        data_df = pd.DataFrame(list_of_dicts)
        output_file = Path(f"Resources/output_file_{counter}.csv")
        data_df.to_csv(output_file, index=False)
    
       

In [9]:
#get count of cities you looped thru 
num_locations = len(generated_urls)
num_locations


16

DATA CLEANING 

In [13]:
#LOOP THRU AND READ IN ALL OUTPUT FILES 

df_final = pd.DataFrame()

for ii in np.arange(1,num_locations+1): 
    file_path = Path(f"Resources/output_file_{ii}.csv")
    df= pd.read_csv(file_path)  
    df_final= pd.concat([df, df_final])

df_final


Unnamed: 0,Location,Hotel,Price
0,"Chicago, Illinois, United States of America",Palmer House a Hilton Hotel,$149
1,"Chicago, Illinois, United States of America","The LaSalle Chicago, Autograph Collection",$224
2,"Chicago, Illinois, United States of America",citizenM Chicago Downtown,$178
3,"Chicago, Illinois, United States of America",Hyatt Centric The Loop Chicago,$139
4,"Chicago, Illinois, United States of America",,
...,...,...,...
99,"Palm Desert, California, United States of America",Parker Palm Springs,$980 per night
100,"Palm Desert, California, United States of America",Moonshine by Avantstay Modern & Stylish Estate...,"$1,377 per night"
101,"Palm Desert, California, United States of America",Days Inn by Wyndham Indio,$110 per night
102,"Palm Desert, California, United States of America",The Oasis Resort,$319 per night


In [14]:
# REMOVE ALL NaN fields 
df_cleaned = df_final.dropna()


#REMOVE "$", "," AND LETTERS FROM PRICE COLUMN
#df_cleaned["Price"]=df_cleaned["Price"].replace({'\$': ''}, regex=True)
#df_cleaned["Price"]=df_cleaned["Price"].replace({'\,': ''}, regex=True)
#df_cleaned["Price"]=df_cleaned["Price"].str.replace('[a-zA-Z]', '', regex=True)
df_cleaned.loc[:, "Price"] = df_cleaned["Price"].replace({'\$': ''}, regex=True)
df_cleaned.loc[:, "Price"] = df_cleaned["Price"].replace({'\,': ''}, regex=True)
df_cleaned.loc[:, "Price"] = df_cleaned["Price"].str.replace('[a-zA-Z]', '', regex=True)



# FIX LOCATION TO HAVE CITY, STATE AND COUNTRY SEPERATE 
#df_cleaned["City"]=df_cleaned["City"].str.replace(r'\(.*?\)', '', regex=True)

df_cleaned[["City", "State", "Country"]]= df_cleaned["Location"].str.split(',', expand=True)
df_cleaned.loc[:, "City"] = df_cleaned["City"].str.replace(r'\(.*?\)', '', regex=True)
df_cleaned = df_cleaned.drop(columns =['Location'])
df_cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[["City", "State", "Country"]]= df_cleaned["Location"].str.split(',', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[["City", "State", "Country"]]= df_cleaned["Location"].str.split(',', expand=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned[["City",

Unnamed: 0,Hotel,Price,City,State,Country
0,Palmer House a Hilton Hotel,149,Chicago,Illinois,United States of America
1,"The LaSalle Chicago, Autograph Collection",224,Chicago,Illinois,United States of America
2,citizenM Chicago Downtown,178,Chicago,Illinois,United States of America
3,Hyatt Centric The Loop Chicago,139,Chicago,Illinois,United States of America
5,Hotel Riu Plaza Chicago,150,Chicago,Illinois,United States of America
...,...,...,...,...,...
99,Parker Palm Springs,980,Palm Desert,California,United States of America
100,Moonshine by Avantstay Modern & Stylish Estate...,1377,Palm Desert,California,United States of America
101,Days Inn by Wyndham Indio,110,Palm Desert,California,United States of America
102,The Oasis Resort,319,Palm Desert,California,United States of America


In [15]:
#SAVE CLEANED DATAFRAM 
file_output = Path(f"Resources/compiled_hotel_prices.csv")
df_cleaned.to_csv(file_output)
