<a href="https://colab.research.google.com/github/ror32/ApartmentsRent/blob/master/ApartmentsRentLosAngeles.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gazpacho



## Get the HTML data with gazpacho

In [0]:
from gazpacho import get

# Listing pages are addressed as <base>/page-<n>; start with the first one.
base = 'https://www.apartmentlist.com/ca/los-angeles'
num = 1
url = f'{base}/page-{num}'

# Download the raw HTML of that page.
html = get(url)

## Pass the captured HTML to a Soup parser

In [0]:
from gazpacho import Soup

# Wrap the downloaded markup so it can be queried with .find().
soup = Soup(html)

## Find the HTML tags that contain apartment data

In [0]:
# Collect the listing-card divs. Later cells index and iterate the result,
# so several matches are expected here.
rows = soup.find('div', {'class': 'ListingCard'})

## Capture a single row (and inspect for good measure)


In [5]:
# Take the second listing card and read the first <span> of its address block.
row = rows[1]
row.find('div', {'class': 'css-17xjl8p'}).find('span')[0].text


'6250 Canoga Ave,'

In [6]:
# The property name is the text of the link inside this container.
row.find('div', {'class': 'css-16tgeys'}).find('a').text

'Triana'

In [7]:
# Switch to the first listing card and list its unit types with their prices.
row = rows[0]
apts = (
    row
    .find('div', {'class': 'css-fyffd6'})
    .find('div', {'class': 'css-1oxqqna'})
)
for apt in apts:
    # One line per unit type: "<bedrooms> <price>".
    print(
        apt.find('div', {'class': 'css-xjvzth'}).text,
        apt.find('div', {'class': 'css-ajwnv4'}).text,
    )


Studio $1,480
1 Bedroom $1,595
2 Bedrooms $1,945


In [8]:
# Free-text description of the listing.
row.find('div', {'class': 'css-11wmgwu'}).text

'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, patio/balcony. Near California State, Northridge, shopping, dining, entertainment (Burbank). Easy access to the 405 and 101.'

## Wrap these find operations into a function

In [0]:
def parse_row(row):
    """Extract one record per priced unit type from a single listing card.

    Args:
        row: element for one listing card; it only needs to support the
            ``find(tag, attrs)`` / ``.text`` interface used below.

    Returns:
        A list of ``(name, description, address, city, bedrooms, sqft, price)``
        tuples, one for every unit type whose price text is not ``'Ask'``.
        ``name`` and ``description`` are ``None`` when their text is empty.
        The list is empty when no unit type on the card has a listed price.

    Raises:
        AttributeError: an expected element is missing, i.e. a ``find`` call
            returned ``None`` and was then dereferenced. Callers use this to
            skip incomplete cards.
    """
    name = row.find('div', {'class': 'css-16tgeys'}).find('a').text or None

    # Look the address container up once; its first two spans hold the street
    # address and the city, either of which may end with a comma.
    spans = row.find('div', {'class': 'css-17xjl8p'}).find('span')
    address, city = (spans[i].text.rstrip(',') for i in range(2))

    apts = row.find('div', {'class': 'css-fyffd6'}).find('div', {'class': 'css-1oxqqna'})
    # A lone match is not wrapped in a list, so normalise before iterating.
    if not isinstance(apts, list):
        apts = [apts]

    description = row.find('div', {'class': 'css-11wmgwu'}).text or None

    data = []
    for apt in apts:
        bedrooms = apt.find('div', {'class': 'css-xjvzth'}).text
        price = apt.find('div', {'class': 'css-ajwnv4'}).text

        # Unit types without a published price are left out of the result.
        if price == 'Ask':
            continue

        sqft = apt.find('div', {'class': 'css-o1qo1i'}).text
        data.append((name, description, address, city, bedrooms, sqft, price))

    return data


In [10]:
# Try the parser on the card inspected above.
parse_row(row)

[('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, patio/balcony. Near California State, Northridge, shopping, dining, entertainment (Burbank). Easy access to the 405 and 101.',
  '8111 Reseda Blvd',
  'Los Angeles',
  'Studio',
  '450 sqft',
  '$1,480'),
 ('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, patio/balcony. Near California State, Northridge, shopping, dining, entertainment (Burbank). Easy access to the 405 and 101.',
  '8111 Reseda Blvd',
  'Los Angeles',
  '1 Bedroom',
  '687 sqft',
  '$1,595'),
 ('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, pa

## Make sure that the function works for all the captured rows

In [11]:
# Records parsed from the page fetched above. The spelling of the name is
# kept because a later cell refers to it.
appartments = []

for row in rows:
    try:
        appartments.extend(parse_row(row))
    except (AttributeError, TypeError):
        # A card that lacks an expected element makes a lookup return None:
        # dereferencing it raises AttributeError, indexing it raises TypeError.
        # Either way the card is incomplete, so skip it and keep going.
        continue

appartments


[('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, patio/balcony. Near California State, Northridge, shopping, dining, entertainment (Burbank). Easy access to the 405 and 101.',
  '8111 Reseda Blvd',
  'Los Angeles',
  'Studio',
  '450 sqft',
  '$1,480'),
 ('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, patio/balcony. Near California State, Northridge, shopping, dining, entertainment (Burbank). Easy access to the 405 and 101.',
  '8111 Reseda Blvd',
  'Los Angeles',
  '1 Bedroom',
  '687 sqft',
  '$1,595'),
 ('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, pa

## Bundle up the logic so that it can be applied to multiple pages

In [0]:
def scrape_page(num, base='https://www.apartmentlist.com/ca/los-angeles'):
    """Download one listing page and parse every listing card on it.

    Args:
        num: page number, interpolated into the URL as ``<base>/page-<num>``.
        base: URL prefix of the listing pages; defaults to the Los Angeles
            listings so existing ``scrape_page(num)`` calls are unaffected.

    Returns:
        A flat list of the tuples produced by ``parse_row`` for all cards on
        the page. The list is empty when the page has no listing cards.
    """
    url = f'{base}/page-{num}'
    soup = Soup(get(url))
    rows = soup.find('div', {'class': 'ListingCard'})

    # `find` does not always hand back a list: no match gives None and a lone
    # match gives the bare element, neither of which can be looped over.
    if rows is None:
        return []
    if not isinstance(rows, list):
        rows = [rows]

    data = []
    for row in rows:
        try:
            data.extend(parse_row(row))
        except (AttributeError, TypeError):
            # A card that lacks an expected element makes a lookup return None:
            # dereferencing it raises AttributeError, indexing it raises
            # TypeError. Skip the incomplete card instead of aborting the page.
            continue
    return data

In [13]:
# Smoke-test the helper on the first page.
scrape_page(1)

[('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, patio/balcony. Near California State, Northridge, shopping, dining, entertainment (Burbank). Easy access to the 405 and 101.',
  '8111 Reseda Blvd',
  'Los Angeles',
  'Studio',
  '450 sqft',
  '$1,480'),
 ('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, patio/balcony. Near California State, Northridge, shopping, dining, entertainment (Burbank). Easy access to the 405 and 101.',
  '8111 Reseda Blvd',
  'Los Angeles',
  '1 Bedroom',
  '687 sqft',
  '$1,595'),
 ('Northview-Southview Apartments',
  'Studio, one- and two-bedroom apartments in pet-friendly community with pool, gym, elevator, bbq/grill, carport. Modern kitchen, hardwoods, walk-in closets, pa

## Scrape each page

In [0]:
import time

# First batch: pages 2 through 30.
data_1 = []
for num in range(2, 31):
    dt = scrape_page(num)
    data_1 += dt
    # Wait one second before requesting the next page.
    time.sleep(1)

In [0]:
# Second batch: pages 31 through 60.
data_2 = []
for num in range(31, 61):
    dt = scrape_page(num)
    data_2 += dt
    # Wait one second before requesting the next page.
    time.sleep(1)

In [0]:
# Third batch: pages 61 through 89.
data_3 = []
for num in range(61, 90):
    dt = scrape_page(num)
    data_3 += dt
    # Wait one second before requesting the next page.
    time.sleep(1)

In [0]:
# Join the records parsed from `rows` with the three scraped batches into one list.
data = appartments + data_1 + data_2 + data_3

## Stuff the captured data into a pandas DataFrame

In [26]:
import pandas as pd

# One DataFrame row per scraped tuple; the column order mirrors the tuple layout.
df = pd.DataFrame(
    data,
    columns=['name', 'description', 'address', 'city', 'bedrooms', 'sqft', 'price'],
)
# Show five random rows as a quick sanity check.
df.sample(5)

Unnamed: 0,name,description,address,city,bedrooms,sqft,price
1650,6241 CRESCENT,"Elegant & luxurious Top floor, single level, S...",6241 Crescent Park W,Los Angeles,2 Bedrooms,2810 sqft,"$6,500"
2336,7858 JAMIESON Avenue,"Remodeled 4 bedrooms, 2 bath with a pool, wi-f...",7858 Jamieson Avenue,Los Angeles,4 Bedrooms,1550 sqft,"$5,995"
1252,919 S Sherbourne Dr Apt 2,"Spacious, 3 Bedroom, hardwood floors, first fl...",919 South Sherbourne Drive,Los Angeles,3 Bedrooms,1200 sqft,"$3,300"
1778,17352 West SUNSET,Designer Done and beautifully remodeled! This ...,17352 Sunset Boulevard,Los Angeles,1 Bedroom,697 sqft,"$3,400"
896,5535 Canoga Ave,RESORT STYLE COMMUNITY WITH ALL AMENITIES! Bri...,5535 Canoga Avenue,Los Angeles,1 Bedroom,610 sqft,"$1,865"


## Mount Google Drive

In [27]:
from google.colab import drive
# Mount Google Drive at the relative path 'drive'; an authorization code is requested interactively.
drive.mount('drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at drive


## Copy the csv file to Google Drive

In [0]:
# Write the frame to a CSV in the working directory. The row index is written
# as well, because index=False is not passed.
df.to_csv('apartments_rent_LosAngeles_2_1_2020.csv')
# Copy the file into the mounted Drive folder.
!cp apartments_rent_LosAngeles_2_1_2020.csv "drive/My Drive/"

# New Section