## Pipeline for webscraping 
### Goals: <br> (1) get the summary table on each subpage of a search result <br> (2) click on each row in the summary table so additional information of the individual property can be colllected <br>
<img src="project2_images/screenshot_redfin_SFprops.png" style="width:300px;height:400px">

### Loading require packages

In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from bs4 import BeautifulSoup
import pandas as pd

### Defining functions for data collection from each subpage of a search result

In [2]:
def get_summary_table(soup):
    """Retrives information for property listing in the summary table
    
       Parameter: website link as a string
       
       Returns: a data frame with information from the summary table
    """
    
    #extract feature names from the summary table
    variables = soup.find(class_ = "ReactDataTable").find_all(class_ = "button-text")[1:9]
    headers = [var.text for var in variables]
    
    table = soup.find(class_ = "tableList").find_all("tr", class_ = "tableRow")
    
    #extract feature values from the summary table
    page_list = []
    for row in table:
        result = [element.text for element in row.find_all(class_ = "column")[1:9]]
        house_dict = dict(zip(headers, result))
        page_list.append(house_dict)
    
    summary_table = pd.DataFrame(page_list)
    
    return summary_table

In [3]:
def get_side_table(soup):
    """Retrives information for property from the side table and the image
    
       Parameter: html parsed by BeautifulSoup
       
       Returns: a data frame with information from the side table and the image
    """
    
    #extract date sold from the property image
    date_sold = soup.find(class_ = "Pill").text 
    sub_string = date_sold.split()[-3:] 
    date_cleaned = " ".join(sub_string)
    
    #extract zip cold from the property image
    zip_code = soup.find(class_ = "scrollable").find("a")["href"] 
    zip_cleaned = zip_code.split("/")[3]

    #extract additional feature names from the side table
    side_table_headers = soup.find(class_ = "amenities").find_all(class_ = "title")[2:5]
    headers = [row.text for row in side_table_headers] + ["Date Sold", "Zip Code"]

    #extract additional feature values from the side table
    side_table_values = soup.find(class_ = "amenities").find_all(class_ = "value")[2:5]
    values = [row.text for row in side_table_values] + [date_cleaned, zip_cleaned]

    side_table = []
    listing_part2 = dict(zip(headers, values))
    side_table.append(listing_part2)

    return side_table

In [4]:
def add_side_tables(soup, driver, side_tables):
    """Clicks through listings in the summary table to get additional information for individual listing
    
       Parameter: html parsed by BeautifulSoup
       
       Returns: a data frame with additional information for each listing
    """
    
    #extract information from image and side table from each listing
    table = soup.find(class_ = "tableList").find_all("tr", class_ = "tableRow")

    for row in range(1, len(table)):
        row_path = f"//tr[contains(@id, 'ReactDataTableRow_{row}')]"
        ind_list = driver.find_element_by_xpath(row_path)
        ind_list.click()
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        side_table = get_side_table(soup)
        side_tables += side_table

    additional_tables = pd.DataFrame(side_tables)
    return additional_tables

### Defining function to click through subpages to get all the listings available for the search result

In [5]:
def data_subpages(soup, driver, property_listings):
    """Clicks on the other subpages of a search result to collect all available listings from the search result
    
       Parameters: 
           soup: html parsed by BeautifulSoup
           driver: chrome driver 
           property_listings: a list to collect all the dataframes from each subpage of a search result
           
       Returns: a data frame with all the available listings from a search result      
    """
    
    cont = soup.find(class_ = "PagingControls").find_all(class_ = "clickable")[-1]["class"][0]
    while (cont != "disabled"):
        #click through other subpages for the search result
        page_path = "//button[contains(@data-rf-test-id, 'next')]"
        next_page = driver.find_element_by_xpath(page_path)
        next_page.click()
        time.sleep(1)

        #parse the html with BS and extract the summary table and the first side table from it
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        summary_table = get_summary_table(soup)
        side_tables = get_side_table(soup)

        #click through each listing in the summary table to extract all the other side tables
        additional_tables = add_side_tables(soup, driver, side_tables)

        #put all information from a specific page together
        from_page = pd.concat([summary_table, additional_tables], axis = 1)
        property_listings.append(from_page)

        cont = soup.find(class_ = "PagingControls").find_all(class_ = "clickable")[-1]["class"][0]
    
    return property_listings

### Putting everything together to get data available for a particular search 

In [6]:
def data_property_type(url):
    """Retrives information of available property listings for a given property-type search
    
       Parameter: website link as a string
       
       Returns: a data frame with all the available property listings
    """
    #navigate to target url with chromedriver
    chromedriver = "/Applications/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    driver.get(url)
    time.sleep(1)
    
    #convert to table view
    table_view = driver.find_element_by_xpath("//div[contains(@class, 'ModeToggler')]//span[contains(@class, 'table')]")
    table_view.click()
    time.sleep(1)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    #retrive information from the summary table, the first image, and first side table on the webpage
    summary_table = get_summary_table(soup)
    side_tables = get_side_table(soup)
    
    #retrive information from the rest of the images and side tables
    additional_tables = add_side_tables(soup, driver, side_tables)
    
    #create a dataframe for all the information from the first page of the search result
    from_page = pd.concat([summary_table, additional_tables], axis = 1)
    property_listings = [from_page]
    
    master_property_listings = data_subpages(soup, driver, property_listings)
        
    driver.quit()
    df = pd.concat(master_property_listings, axis = 0, ignore_index = True)
    
    return df

### Collecting data for houses sold in the past year

In [None]:
url = """https://www.redfin.com/city/17151/CA/San-Francisco/filter/property-type=house,include=sold-1yr"""
houses = data_property_type(url)

In [None]:
houses.info()

In [None]:
#add property type column to the data frame
property_type = ["house"] * 350
houses["prop_type"] = property_type
houses.head()

### Collecting data for condos sold in the past year

In [None]:
url = "https://www.redfin.com/city/17151/CA/San-Francisco/filter/property-type=condo,include=sold-1yr"
condos = data_property_type(url)

In [None]:
condos.info()

In [None]:
property_type = ["condo"] * 350
condos["prop_type"] = property_type
condos.head()

### Collecting data for townhouse sold in the past year

In [7]:
url = "https://www.redfin.com/city/17151/CA/San-Francisco/filter/property-type=townhouse,include=sold-1yr"
townhouses = data_property_type(url)

In [8]:
townhouses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     132 non-null    object
 1   Location    132 non-null    object
 2   Price       132 non-null    object
 3   Beds        132 non-null    object
 4   Baths       132 non-null    object
 5   Sq.Ft.      132 non-null    object
 6   $/Sq.Ft.    132 non-null    object
 7   On Redfin   132 non-null    object
 8   HOA         132 non-null    object
 9   Year Built  132 non-null    object
 10  Lot Size    132 non-null    object
 11  Date Sold   132 non-null    object
 12  Zip Code    132 non-null    object
dtypes: object(13)
memory usage: 13.5+ KB


In [9]:
property_type = ["townhouse"] * 132
townhouses["prop_type"] = property_type
townhouses.head()

Unnamed: 0,Address,Location,Price,Beds,Baths,Sq.Ft.,$/Sq.Ft.,On Redfin,HOA,Year Built,Lot Size,Date Sold,Zip Code,prop_type
0,4150 17th St #25,Corona Heights,"$1,450,000",2,2,1598,$907,19 days,$712/month,1994,—,"JAN 5, 2021",4150-17th-St-94114,townhouse
1,271 Shipley St Unit F6,South of Market,"$789,000",2,1,808,$976,34 days,$150/month,1993,—,"DEC 21, 2020",271-Shipley-94107,townhouse
2,678 Grand View Ave,Noe Valley,"$1,625,000",3,3,2300,$707,47 days,$392/month,1986,—,"DEC 8, 2020",678-Grand-View-Ave-94114,townhouse
3,28 Scott Aly,South of Market,"$835,000",1,1,—,—,60 days,$227/month,2000,—,"NOV 25, 2020",28-Scott-Aly-94107,townhouse
4,720 York #212,Inner Mission,"$800,000",1,1,618,"$1,294",62 days,$553/month,—,—,"NOV 23, 2020",720-York-St-94110,townhouse


### Collecting data for multi-family units sold in the past year

In [None]:
url = "https://www.redfin.com/city/17151/CA/San-Francisco/filter/property-type=multifamily,include=sold-1yr"
multifamily = data_property_type(url)

In [None]:
multifamily.info()

In [None]:
property_type = ["multi_fam"] * 350
multifamily["prop_type"] = property_type
multifamily.head()

In [None]:
#combine all the data together into a single dataframe
master = [houses, condos, townhouses, multifamily]
df = pd.concat(master, axis = 0, ignore_index = True)

In [None]:
#save data
df.to_pickle("/Users/sarazzzz/Desktop/Metis/CAMP/Metis_project2/SFproperty_df")