## Pipeline for webscraping from Redfin 
### goals: <br> (1) get the summary table on each page of a search result <br> (2) click on each row in the summary table so additional information of the individual property can be collected 
<img src="https://github.com/sarazong/Metis_project2/blob/master/project2_images/screenshot_redfin_SFprops.png" alt="scrape_example" style="width:600px; height:700px"/>

In [1]:
# import require modules and functions
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
def get_summary_table(soup):
    """Retrives information for property listing in the summary table
    
       Parameter: website link as a string
       
       Returns: a data frame with information from the summary table
    """
    
    variables = soup.find(class_ = "ReactDataTable").find_all(class_ = "button-text")[1:9]
    headers = [var.text for var in variables]
    
    table = soup.find(class_ = "tableList").find_all("tr", class_ = "tableRow")
    
    page_list = []
    for row in table:
        result = [element.text for element in row.find_all(class_ = "column")[1:9]]
        house_dict = dict(zip(headers, result))
        page_list.append(house_dict)
    
    summary_table = pd.DataFrame(page_list)
    
    return summary_table

In [3]:
def get_side_table(soup):
    """Retrives information for property from the side table and the image
    
       Parameter: html parsed by BeautifulSoup
       
       Returns: a data frame with information from the side table and the image
    """
    
    date_sold = soup.find(class_ = "Pill").text 
    sub_string = date_sold.split()[-3:] 
    date_cleaned = " ".join(sub_string)
    
    zip_code = soup.find(class_ = "scrollable").find("a")["href"] 
    zip_cleaned = zip_code.split("/")[3]
    #zip_cleaned = zip_inter.split("-")[-1]

    side_table_headers = soup.find(class_ = "amenities").find_all(class_ = "title")[2:5]
    headers = [row.text for row in side_table_headers] + ["Date Sold", "Zip Code"]

    side_table_values = soup.find(class_ = "amenities").find_all(class_ = "value")[2:5]
    values = [row.text for row in side_table_values] + [date_cleaned, zip_cleaned]

    side_table = []
    listing_part2 = dict(zip(headers, values))
    side_table.append(listing_part2)

    return side_table

In [4]:
def data_property_type(url):
    """Retrives information of available property listings for a given property-type search
    
       Parameter: website link as a string
       
       Returns: a data frame with all the available property listings
    """
    
    #open up the url
    chromedriver = "/Applications/chromedriver"
    driver = webdriver.Chrome(chromedriver)
    driver.get(url)
    time.sleep(1)
    
    #convert to table view
    table_view = driver.find_element_by_xpath("//div[contains(@class, 'ModeToggler')]//span[contains(@class, 'table')]")
    table_view.click()
    time.sleep(1)
    
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    #retrive information from the summary table, the first image, and first side table on the webpage
    summary_table = get_summary_table(soup)
    side_tables = get_side_table(soup)
    
    #retrive information from the rest of the images and side tables
    table = soup.find(class_ = "tableList").find_all("tr", class_ = "tableRow")
    len(table)
    for row in range(1, len(table)):
        row_path = f"//tr[contains(@id, 'ReactDataTableRow_{row}')]"
        ind_list = driver.find_element_by_xpath(row_path)
        ind_list.click()
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        side_table = get_side_table(soup)
        side_tables += side_table   
    additional_tables = pd.DataFrame(side_tables)
    
    #create a dataframe for all the information from the first page of the search result
    from_page = pd.concat([summary_table, additional_tables], axis = 1)
    property_listings = [from_page]
    
    #retrive information from the rest of the pages of the search result
    cont = soup.find(class_ = "PagingControls").find_all(class_ = "clickable")[-1]["class"][0]
    while (cont != "disabled"):
        #Click through each page for the specific property search result
        page_path = "//button[contains(@data-rf-test-id, 'next')]"
        next_page = driver.find_element_by_xpath(page_path)
        next_page.click()
        time.sleep(1)

        #Parse the html with BS and extract the summary table and the first side table from it
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        summary_table = get_summary_table(soup)
        side_tables = get_side_table(soup)

        #Click through each listing in the summary table to extract all the other side tables
        table = soup.find(class_ = "tableList").find_all("tr", class_ = "tableRow")
        for row in range(1, len(table)):
            row_path = f"//tr[contains(@id, 'ReactDataTableRow_{row}')]"
            ind_list = driver.find_element_by_xpath(row_path)
            ind_list.click()
            time.sleep(1)

            soup = BeautifulSoup(driver.page_source, 'html.parser')
            side_table = get_side_table(soup)
            side_tables += side_table

        additional_tables = pd.DataFrame(side_tables)

        #Put all information from a specific page together
        from_page = pd.concat([summary_table, additional_tables], axis = 1)
        property_listings.append(from_page)

        cont = soup.find(class_ = "PagingControls").find_all(class_ = "clickable")[-1]["class"][0]
        
    driver.quit()
    df = pd.concat(property_listings, axis = 0, ignore_index = True)
    
    return df

In [7]:
url = """https://www.redfin.com/city/17151/CA/San-Francisco/filter/property-type=house,include=sold-1yr"""
houses = data_property_type(url)

In [21]:
houses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     350 non-null    object
 1   Location    350 non-null    object
 2   Price       350 non-null    object
 3   Beds        350 non-null    object
 4   Baths       350 non-null    object
 5   Sq.Ft.      350 non-null    object
 6   $/Sq.Ft.    350 non-null    object
 7   On Redfin   350 non-null    object
 8   HOA         350 non-null    object
 9   Year Built  350 non-null    object
 10  Lot Size    350 non-null    object
 11  Date Sold   350 non-null    object
 12  Zip Code    350 non-null    object
dtypes: object(13)
memory usage: 35.7+ KB


In [22]:
property_type = ["house"] * 350
houses["prop_type"] = property_type
houses.head()

Unnamed: 0,Address,Location,Price,Beds,Baths,Sq.Ft.,$/Sq.Ft.,On Redfin,HOA,Year Built,Lot Size,Date Sold,Zip Code,prop_type
0,637 Hamilton St,Portola,"$1,200,000",4,2,1428,$840,15 days,,1951,"3,400 Sq. Ft.","JAN 5, 2021",637-Hamilton-St-94134,house
1,817 Guerrero St,Eureka Valley/Dolore,"$2,200,000",2,2,—,—,40 days,,1900,"2,495 Sq. Ft.","DEC 11, 2020",817-Guerrero-St-94110,house
2,1583 46th Ave,Outer Sunset,"$1,250,000",3,2,—,—,49 days,,1925,"2,063 Sq. Ft.","DEC 2, 2020",1583-46th-Ave-94122,house
3,4450 24th St,Noe Valley,"$1,760,000",3,2,1550,"$1,135",188 days,,1900,"2,945 Sq. Ft.","JUL 16, 2020",4450-24th-St-94114,house
4,1601 15th Ave,Golden Gate Heights,"$2,150,000",4,4,3315,$649,281 days,,1973,"5,104 Sq. Ft.","APR 14, 2020",1601-15th-Ave-94122,house


In [7]:
url = "https://www.redfin.com/city/17151/CA/San-Francisco/filter/property-type=condo,include=sold-1yr"
condos = data_property_type(url)

In [8]:
condos.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     350 non-null    object
 1   Location    350 non-null    object
 2   Price       350 non-null    object
 3   Beds        350 non-null    object
 4   Baths       350 non-null    object
 5   Sq.Ft.      350 non-null    object
 6   $/Sq.Ft.    350 non-null    object
 7   On Redfin   350 non-null    object
 8   HOA         350 non-null    object
 9   Year Built  350 non-null    object
 10  Lot Size    350 non-null    object
 11  Date Sold   350 non-null    object
 12  Zip Code    350 non-null    object
dtypes: object(13)
memory usage: 35.7+ KB


In [9]:
property_type = ["condo"] * 350
condos["prop_type"] = property_type
condos.head()

Unnamed: 0,Address,Location,Price,Beds,Baths,Sq.Ft.,$/Sq.Ft.,On Redfin,HOA,Year Built,Lot Size,Date Sold,Zip Code,prop_type
0,1781 Oak St #3,Haight Ashbury,"$1,011,000",2,1,772,"$1,310",50 days,$250/month,1924,—,"DEC 1, 2020",1781-Oak-St-94117,condo
1,821 Folsom St #314,Yerba Buena,"$949,725",2,2,1105,$859,56 days,$657/month,2004,—,"NOV 25, 2020",821-Folsom-St-94107,condo
2,4356 23rd St,Noe Valley,"$1,125,000",2,1,960,"$1,172",68 days,$308/month,1906,—,"NOV 13, 2020",4356-23rd-St-94114,condo
3,400 Beale St #811,South Beach,"$985,000",2,2,990,$995,85 days,$894/month,2002,—,"OCT 27, 2020",400-Beale-St-94105,condo
4,1285 4th Ave,Inner Sunset,"$1,650,000",3,2,—,—,89 days,$439/month,1907,—,"OCT 23, 2020",1285-4th-Ave-94122,condo


In [10]:
url = "https://www.redfin.com/city/17151/CA/San-Francisco/filter/property-type=townhouse,include=sold-1yr"
townhouses = data_property_type(url)

In [11]:
townhouses.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132 entries, 0 to 131
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     132 non-null    object
 1   Location    132 non-null    object
 2   Price       132 non-null    object
 3   Beds        132 non-null    object
 4   Baths       132 non-null    object
 5   Sq.Ft.      132 non-null    object
 6   $/Sq.Ft.    132 non-null    object
 7   On Redfin   132 non-null    object
 8   HOA         132 non-null    object
 9   Year Built  132 non-null    object
 10  Lot Size    132 non-null    object
 11  Date Sold   132 non-null    object
 12  Zip Code    132 non-null    object
dtypes: object(13)
memory usage: 13.5+ KB


In [12]:
property_type = ["townhouse"] * 132
townhouses["prop_type"] = property_type
townhouses.head()

Unnamed: 0,Address,Location,Price,Beds,Baths,Sq.Ft.,$/Sq.Ft.,On Redfin,HOA,Year Built,Lot Size,Date Sold,Zip Code,prop_type
0,4150 17th St #25,Corona Heights,"$1,450,000",2,2,1598,$907,15 days,$712/month,1994,—,"JAN 5, 2021",4150-17th-St-94114,townhouse
1,271 Shipley St Unit F6,South of Market,"$789,000",2,1,808,$976,30 days,$150/month,1993,—,"DEC 21, 2020",271-Shipley-94107,townhouse
2,678 Grand View Ave,Noe Valley,"$1,625,000",3,3,2300,$707,43 days,$392/month,1986,—,"DEC 8, 2020",678-Grand-View-Ave-94114,townhouse
3,28 Scott Aly,South of Market,"$835,000",1,1,—,—,56 days,$227/month,2000,—,"NOV 25, 2020",28-Scott-Aly-94107,townhouse
4,720 York #212,Inner Mission,"$800,000",1,1,618,"$1,294",58 days,$553/month,—,—,"NOV 23, 2020",720-York-St-94110,townhouse


In [13]:
url = "https://www.redfin.com/city/17151/CA/San-Francisco/filter/property-type=multifamily,include=sold-1yr"
multifamily = data_property_type(url)

In [14]:
multifamily.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Address     350 non-null    object
 1   Location    350 non-null    object
 2   Price       350 non-null    object
 3   Beds        350 non-null    object
 4   Baths       350 non-null    object
 5   Sq.Ft.      350 non-null    object
 6   $/Sq.Ft.    350 non-null    object
 7   On Redfin   350 non-null    object
 8   HOA         350 non-null    object
 9   Year Built  350 non-null    object
 10  Lot Size    350 non-null    object
 11  Date Sold   350 non-null    object
 12  Zip Code    350 non-null    object
dtypes: object(13)
memory usage: 35.7+ KB


In [15]:
property_type = ["multi_fam"] * 350
multifamily["prop_type"] = property_type
multifamily.head()

Unnamed: 0,Address,Location,Price,Beds,Baths,Sq.Ft.,$/Sq.Ft.,On Redfin,HOA,Year Built,Lot Size,Date Sold,Zip Code,prop_type
0,216 Lily St,Hayes Valley,"$1,950,000",5,5,3052,$639,16 days,,1988,"1,620 Sq. Ft.","JAN 4, 2021",216-Lily-St-94102,multi_fam
1,351 29th Ave,Central Richmond,"$2,200,000",6,4,2946,$747,65 days,,1911,"3,000 Sq. Ft.","NOV 16, 2020",351-29th-Ave-94121,multi_fam
2,2437-2441 Post St,Lower Pacific Height,"$1,900,000",9,3,4260,$446,175 days,,1895,"3,438 Sq. Ft.","JUL 29, 2020",2437-Post-St-94115,multi_fam
3,3066 25th StSold by: Redfin,Inner Mission,"$1,750,000",3,3,1973,$887,196 days,,1900,"2,596 Sq. Ft.","JUL 8, 2020",3066-25th-St-94110,multi_fam
4,3171 Cesar Chavez,Bernal Heights,"$1,065,000",5,2,2000,$533,306 days,,1911,"2,086 Sq. Ft.","MAR 20, 2020",3171-Cesar-Chavez-94110,multi_fam


In [23]:
master = [houses, condos, townhouses, multifamily]
df = pd.concat(master, axis = 0, ignore_index = True)

In [24]:
df.to_pickle("/Users/sarazzzz/Desktop/Metis/CAMP/Metis_project2/SFproperty_df")