In [220]:
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import re
import pandas as pd
from tqdm import tqdm

In [208]:
driver = uc.Chrome()

## City Search

In [5]:
URL = "https://www.apartments.com/manhattan-ny/"

In [6]:
driver.get(URL)

In [18]:
my_soup = BeautifulSoup(driver.get(URL))

In [30]:
def is_valid_url(url):
    return isinstance(url, str) and url.startswith('https://')

def first_five_pages(soup):
    nav = soup.find('nav', {'id': 'paging'})
    if nav:
        return [URL] + [page.get('href') for page in nav.find_all('a') if is_valid_url(page.get('href'))]

first_five_pages(my_soup)

['https://www.apartments.com/manhattan-ny/',
 'https://www.apartments.com/manhattan-ny/2/',
 'https://www.apartments.com/manhattan-ny/3/',
 'https://www.apartments.com/manhattan-ny/4/',
 'https://www.apartments.com/manhattan-ny/5/']

In [189]:
def get_page_listings(soup):
    listings = my_soup.find('div', {'id': 'placardContainer'}).find('ul').find_all('li', {'class': 'mortar-wrapper'})
    return [listing.find('a', {'class': 'property-link'}).get('href') for listing in listings]

listing_urls = get_page_listings(my_soup)

In [190]:
len(listings)

25

In [191]:
listing_urls

['https://www.apartments.com/lyra-new-york-ny/jjl8rp5/', 'https://www.apartments.com/8-spruce-new-york-ny/1l3y464/', 'https://www.apartments.com/park-towers-south-new-york-ny/xc7j4tn/', 'https://www.apartments.com/eos-new-york-ny/zls36xz/', 'https://www.apartments.com/via-57-west-new-york-ny/7tc4rkx/', 'https://www.apartments.com/70-w-45th-st-new-york-ny/dxtefkl/', 'https://www.apartments.com/20-broad-street-new-york-ny/lvvpj7e/', 'https://www.apartments.com/the-helena-new-york-ny/6het1tr/', 'https://www.apartments.com/frank-57-west-new-york-ny/yh2kx5r/', 'https://www.apartments.com/10-hanover-square-new-york-ny/1wxy6j5/', 'https://www.apartments.com/95-wall-new-york-ny/b815kt8/', 'https://www.apartments.com/180-water-st-new-york-ny/htl8ej0/', 'https://www.apartments.com/63-wall-street-new-york-ny/xq2dqmm/', 'https://www.apartments.com/view-34-new-york-ny/938hg25/', 'https://www.apartments.com/the-smile-new-york-ny/4mjv8tk/', 'https://www.apartments.com/yorkshire-towers-new-york-ny/bzg

## Listing

In [222]:
def parse_floor_plan_info(soup):
    """
    soup: floor plan soup
    """
    floor_plan_info = soup.find('span', {'class': 'detailsTextWrapper'}).text.split(',')
    bed = floor_plan_info[0].strip()
    bath = floor_plan_info[1].strip()
    sqft = floor_plan_info[2].strip() if len(floor_plan_info) >= 3 else None
    features_elem = soup.find('div', {'class': 'unitDetails'})
    
    if features_elem:
        features = [elem.text.strip() for elem in features_elem.find('ul', {'class': 'allAmenities'}).find('ul').find_all('li')]
    else:
        features = []
    
    return bed, bath, sqft, features

def parse_unit_info(soup):
    """
    soup: unit soup
    """
    unit_info = soup.find('div', {'class': 'grid-container js-unitExtension'}).find_all('div', {'class': re.compile('column')})
    unit_id = unit_info[0].find('button', {'class': 'unitBtn active'}).contents[-1].strip()
    price = unit_info[1].find_all('span')[-1].text.strip()
    sqft2 = unit_info[2].find_all('span')[-1].text.strip()
    availability = unit_info[3].find('span', {'class': 'dateAvailable'}).contents[-1].strip()
    
    return unit_id, price, sqft2, availability

def get_listing_units(soup):
    """
    Loops through floor plans and corresponding units for a given listing
    
    soup: listing soup
    """
    listing_units = []
    floor_plans = listing_soup.find('div', {'data-tab-content-id': 'all'}).find_all('div', {'class': 'pricingGridItem multiFamily hasUnitGrid'})
    for floor_plan in floor_plans:
        # Get floor plan-level info
        bed, bath, sqft, features = parse_floor_plan_info(floor_plan)
        units = floor_plan.find('div', {'class': 'unitGridContainer mortar-wrapper'}).find('ul').find_all('li')
        for unit in units:
            # Get unit-level info
            unit_id, price, sqft2, availability = parse_unit_info(unit)
            unit_dict = {'unit_id': unit_id,
                         'bed': bed,
                         'bath': bath,
                         'price': price,
                         'sqft': sqft2 if sqft2 else sqft, # sqft can come from floor plan info and/or unit info; prioritize unit info
                         'availability': availability,
                         'features': features}
            listing_units.append(unit_dict)
    
    return listing_units

In [223]:
all_units = []
for url in tqdm(listing_urls):
    print(url)
    driver.get(url)
    listing_soup = BeautifulSoup(driver.page_source)
    listing_units = get_listing_units(listing_soup)
    all_units.extend(listing_units)

  0%|                                                                                                                                                                           | 0/25 [00:00<?, ?it/s]

https://www.apartments.com/lyra-new-york-ny/jjl8rp5/


  4%|██████▌                                                                                                                                                            | 1/25 [00:00<00:15,  1.50it/s]

https://www.apartments.com/8-spruce-new-york-ny/1l3y464/


  8%|█████████████                                                                                                                                                      | 2/25 [00:01<00:15,  1.49it/s]

https://www.apartments.com/park-towers-south-new-york-ny/xc7j4tn/


 12%|███████████████████▌                                                                                                                                               | 3/25 [00:02<00:15,  1.40it/s]

https://www.apartments.com/eos-new-york-ny/zls36xz/


 16%|██████████████████████████                                                                                                                                         | 4/25 [00:02<00:14,  1.44it/s]

https://www.apartments.com/via-57-west-new-york-ny/7tc4rkx/


 20%|████████████████████████████████▌                                                                                                                                  | 5/25 [00:03<00:13,  1.51it/s]

https://www.apartments.com/70-w-45th-st-new-york-ny/dxtefkl/


 20%|████████████████████████████████▌                                                                                                                                  | 5/25 [00:03<00:15,  1.28it/s]


AttributeError: 'NoneType' object has no attribute 'find'

In [213]:
df = pd.DataFrame(all_units)

In [214]:
df

Unnamed: 0,unit_id,bed,bath,price,sqft,availability,features
0,23D,Studio,1 bath,"$3,500",,Now,"[Storage Units, Dishwasher, Microwave, Renovat..."
1,18R,1 bed,1 bath,"$4,300",,Now,"[Storage Units, Dishwasher, Microwave, Corner ..."
2,30C,1 bed,1 bath,"$4,500",,Now,"[Storage Units, Dishwasher, Microwave, Corner ..."
3,24M,1 bed,1 bath,"$4,550",,Now,"[Storage Units, Dishwasher, Microwave, Corner ..."
4,27R,1 bed,1 bath,"$4,400",,Feb. 28,"[Storage Units, Dishwasher, Microwave, Corner ..."
5,9L,1 bed,1 bath,"$4,800",,Now,"[Storage Units, Dishwasher, Microwave, Hardwoo..."
6,37A,2 beds,2 baths,"$6,037",,Now,"[Storage Units, Dishwasher, Microwave, City Vi..."
7,37J,2 beds,2 baths,"$6,300",,Now,"[Storage Units, Dishwasher, Microwave, City Vi..."
