In [20]:
import undetected_chromedriver as uc
from selenium.webdriver.remote.webdriver import By
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm
import re

## Zillow Zip Search

In [2]:
def clean_prices(price):
    """
    Convert a formatted price string such as "$551,000" into an integer.

    price: price text; every "$" and "," is removed before parsing

    Raises ValueError if anything other than an integer literal remains.
    """
    # Delete both symbols in a single pass, then parse what is left
    stripped = price.translate(str.maketrans("", "", "$,"))
    return int(stripped)

def zip_avg_price(zip):
    """
    Return the average listing price shown on Zillow's results page for a zip code.

    Opens a Chrome session via undetected_chromedriver, loads
    https://www.zillow.com/homes/<zip>/ and averages the text of every <span>
    whose data-test attribute is "property-card-price".

    zip: 5 digit zip code as a string (e.g. "07662")

    Returns the mean price as a float.

    Raises ValueError if zip is not a 5 digit string, or if no listing prices
    were found on the page.
    """
    # Check the type first: len() and .isdigit() fail with TypeError/AttributeError
    # on non-strings, which would hide the intended ValueError
    if not isinstance(zip, str) or len(zip) != 5 or not zip.isdigit():
        raise ValueError("Zip must be a 5 digit string!")

    # Instantiate Chrome driver and send GET request
    driver = uc.Chrome()
    try:
        url = f"https://www.zillow.com/homes/{zip}/"
        driver.get(url)

        # Identify property listing prices
        prices = []
        for span in driver.find_elements_recursive(by=By.TAG_NAME, value="span"):
            attr = span.get_attribute(name="data-test")
            if attr == "property-card-price":
                prices.append(span.text)
    finally:
        # Always release the browser, even if scraping failed. quit() ends the
        # whole session; close() would only close the current window.
        driver.quit()

    # Cast prices to integers
    prices = [clean_prices(p) for p in prices]

    # Fail with a clear message instead of a bare ZeroDivisionError
    if not prices:
        raise ValueError(f"No property listing prices found for zip {zip}")

    # Compute average property price
    return sum(prices) / len(prices)

In [3]:
# Example run for one zip code; this launches a Chrome session and requests the Zillow results page
zip_avg_price("07662")

551542.8571428572

## Apartments.com City Search

In [5]:

# City slug as it appears in the apartments.com URL path
city = "manhattan-ny"
url = f"https://www.apartments.com/{city}/"
# Start a Chrome session via undetected_chromedriver and load the city search page;
# `driver` stays open and is reused by the cells below
driver = uc.Chrome()
driver.get(url)

In [11]:
# Parse the rendered search page. The parser is named explicitly so the parse tree does not
# depend on which optional parsers are installed, and bs4 does not emit its parser-guessing warning.
my_soup = BeautifulSoup(driver.page_source, "html.parser")

In [13]:
def is_valid_url(url):
    """
    Tell whether a value is a string beginning with 'https://'.

    url: candidate value (may be None or any non-string)
    """
    # Non-strings (e.g. a missing href, which comes back as None) are never valid
    if not isinstance(url, str):
        return False
    return url.startswith('https://')

def first_five_pages(soup, url):
    """
    Build the list of search-result page URLs reachable from the first page.

    soup: search results page soup
    url: URL of the first results page; always the first item returned

    Returns [url] followed by every https:// link found inside the
    <nav id="paging"> element, in document order. When the page has no
    paging nav, only [url] is returned.
    """
    pages = [url]
    nav = soup.find('nav', {'id': 'paging'})
    if nav is None:
        # No pagination on the page: the first page is the only page.
        # Returning a list here (not None) keeps the return type uniform for callers.
        return pages

    for link in nav.find_all('a'):
        href = link.get('href')
        # Anchors without an href, or with a non-https target, are ignored
        if is_valid_url(href):
            pages.append(href)
    return pages

# Sanity check: the city URL followed by the page links found in the pagination nav
first_five_pages(my_soup, url)

['https://www.apartments.com/manhattan-ny/',
 'https://www.apartments.com/manhattan-ny/2/',
 'https://www.apartments.com/manhattan-ny/3/',
 'https://www.apartments.com/manhattan-ny/4/',
 'https://www.apartments.com/manhattan-ny/5/']

In [14]:
def get_page_listings(soup):
    """
    Collect the listing-detail URLs from one search-results page.

    soup: search results page soup

    Returns the href of each listing's 'property-link' anchor, in page order.
    Returns an empty list when the page has no 'placardContainer' list;
    entries without a property link or without an href are skipped.
    """
    # Read from the `soup` argument. The global `my_soup` was read here before,
    # so the argument was ignored.
    container = soup.find('div', {'id': 'placardContainer'})
    placard_list = container.find('ul') if container is not None else None
    if placard_list is None:
        return []

    urls = []
    for listing in placard_list.find_all('li', {'class': 'mortar-wrapper'}):
        link = listing.find('a', {'class': 'property-link'})
        href = link.get('href') if link is not None else None
        if href:
            urls.append(href)
    return urls

# Listing-detail URLs taken from the search-results soup built earlier
listing_urls = get_page_listings(my_soup)

### Listing

In [17]:
def parse_floor_plan_info(soup):
    """
    Extract the floor plan-level fields shared by every unit of a floor plan.

    soup: floor plan soup

    Returns (bed, bath, sqft, features):
      bed, bath, sqft: the first three comma-separated fields of the
        'detailsTextWrapper' span, stripped; a field that is absent is None
      features: stripped text of each amenity <li> under 'unitDetails',
        or [] when the floor plan lists none
    """
    details_text = soup.find('span', {'class': 'detailsTextWrapper'}).text
    # Split on field-separating commas only. A comma that sits between a digit and
    # exactly three digits is a thousands separator (e.g. "1,103 sq ft") and must
    # stay inside its field; a plain split(',') would cut that value down to "1".
    floor_plan_info = [
        field.strip()
        for field in re.split(r'(?<!\d),|,(?!\d{3}(?!\d))', details_text)
    ]
    # Pad with None so bed/bath/sqft can always be unpacked when fields are missing
    bed, bath, sqft = (floor_plan_info + [None] * 3)[:3]

    features = []
    features_elem = soup.find('div', {'class': 'unitDetails'})
    if features_elem is not None:
        amenities = features_elem.find('ul', {'class': 'allAmenities'})
        amenity_list = amenities.find('ul') if amenities is not None else None
        # A details block without an amenities list yields no features instead of crashing
        if amenity_list is not None:
            features = [elem.text.strip() for elem in amenity_list.find_all('li')]

    return bed, bath, sqft, features

def parse_unit_info(soup):
    """
    Pull the unit-level fields out of one entry of a floor plan's unit grid.

    soup: unit soup

    Returns (unit_id, price, sqft2, availability), each a stripped string.
    """
    grid = soup.find('div', {'class': 'grid-container js-unitExtension'})
    # Fields are read by column position: 0 = unit button, 1 = price, 2 = sqft, 3 = availability
    columns = grid.find_all('div', {'class': re.compile('column')})

    # NOTE(review): the DataFrame displayed later in this notebook shows an empty unit_id
    # on every row, so the button's last child may be whitespace rather than the unit
    # label — confirm against the page markup.
    unit_button = columns[0].find('button', {'class': 'unitBtn active'})
    unit_id = unit_button.contents[-1].strip()

    # For price and sqft the value is taken from the last <span> in the column
    price_spans = columns[1].find_all('span')
    price = price_spans[-1].text.strip()

    sqft_spans = columns[2].find_all('span')
    sqft2 = sqft_spans[-1].text.strip()

    date_span = columns[3].find('span', {'class': 'dateAvailable'})
    availability = date_span.contents[-1].strip()

    return (unit_id, price, sqft2, availability)

def get_listing_units(soup):
    """
    Loops through floor plans and corresponding units for a given listing

    soup: listing soup

    Returns a list with one dict per unit, with keys unit_id, bed, bath, price,
    sqft, availability and features. Returns an empty list when the listing
    page has no 'all' tab or no floor plans with a unit grid.
    """
    listing_units = []
    # Read from the `soup` argument. The global `listing_soup` was read here before,
    # so the argument was ignored.
    all_tab = soup.find('div', {'data-tab-content-id': 'all'})
    if all_tab is None:
        # Listing page without the expected pricing tab: nothing to collect
        return listing_units

    floor_plans = all_tab.find_all('div', {'class': 'pricingGridItem multiFamily hasUnitGrid'})
    for floor_plan in floor_plans:
        # Get floor plan-level info
        bed, bath, sqft, features = parse_floor_plan_info(floor_plan)
        units = floor_plan.find('div', {'class': 'unitGridContainer mortar-wrapper'}).find('ul').find_all('li')
        for unit in units:
            # Get unit-level info
            unit_id, price, sqft2, availability = parse_unit_info(unit)
            unit_dict = {'unit_id': unit_id,
                         'bed': bed,
                         'bath': bath,
                         'price': price,
                         'sqft': sqft2 or sqft, # sqft can come from floor plan info and/or unit info; prioritize unit info
                         'availability': availability,
                         'features': list(features)} # copy so units do not share one mutable list
            listing_units.append(unit_dict)

    return listing_units

In [24]:
# Visit every listing URL from the search page and collect its units.
# all_units is reset here so re-running the cell does not duplicate rows.
all_units = []
# The loop variable is `listing_url`, not `url`, so the search-page `url` defined
# earlier in the notebook is not overwritten by the last listing visited.
for listing_url in tqdm(listing_urls):
    print(listing_url)
    driver.get(listing_url)
    # Name the parser explicitly so the parse does not depend on which parsers are installed
    listing_soup = BeautifulSoup(driver.page_source, "html.parser")
    listing_units = get_listing_units(listing_soup)
    all_units.extend(listing_units)

In [22]:
# One row per scraped unit; columns come from the keys of the unit dicts
df = pd.DataFrame(all_units)

In [23]:
# Display the scraped units table
df

Unnamed: 0,unit_id,bed,bath,price,sqft,availability,features
0,,Studio,1 bath,"$3,150",501,Oct 15,"[Air Conditioning, Washer/Dryer, Heating]"
1,,1 bed,1 bath,"$3,595",661,Oct 1,"[Air Conditioning, Washer/Dryer, Heating]"
2,,1 bed,1 bath,"$3,750",648,Oct 1,"[Air Conditioning, Washer/Dryer, Heating]"
3,,1 bed,1 bath,"$3,785",647,Oct 1,"[Air Conditioning, Washer/Dryer, Heating]"
4,,2 beds,1 bath,"$5,050",840,Oct 1,"[Air Conditioning, Washer/Dryer, Heating]"
5,,2 beds,2 baths,"$5,375",1103,Oct 1,"[Air Conditioning, Washer/Dryer, Heating]"
6,,2 beds,2 baths,"$5,495",1073,Oct 1,"[Air Conditioning, Washer/Dryer, Heating]"


In [1]:
# quit() ends the whole browser session; close() would only close the current
# window and can leave the browser/driver running.
driver.quit()