# Import and Scrape RedFin Data

In [1]:
import pandas as pd
import numpy as np
import pickle
import requests
from bs4 import BeautifulSoup

In [2]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import re
import string
import requests
import json
from pymongo import MongoClient
from bson.objectid import ObjectId

In [3]:
def get_content(driver, url):
    """Use a Selenium Driver to get content from the url."""
    driver.get(url)
    content = (driver.page_source).encode('ascii', 'ignore') 
    return content

def create_soup(content):
    """Create a BeautifulSoup object from the content."""
    soup = BeautifulSoup(content,"lxml")
    return soup

def fill_unit_info(row, new_record):
    """Takes in a dataframe row with basic info about a listing gathered from Redfin.\
        Then estimates the number of units and number of bedrooms for each unit from the square footage\
        and other listed information. Returns a dictionary object ready to be inserted\
        into MongoDB."""
    if row['baths'] == 1:
        if row['beds'] == 1:
            est_size = 750
        elif row['beds'] == 2:
            est_size = 1100
        elif row['beds'] == 3:
            est_size = 1400
        else:
            est_size = 1000
        if new_record['square_feet'] != np.nan and new_record['square_feet'] > 1:
            guess = round(new_record['square_feet']/est_size)
            if guess <= 4:
                units = guess
            else:
                units = 4
            new_record['num_units'] = units
            new_record['unit_1_beds'] = row['beds']
            new_record['unit_2_beds'] = row['beds']
            if units >= 3:
                new_record['unit_3_beds'] = row['beds']
            if units == 4:
                new_record['unit_4_beds'] = row['beds']
            new_record['beds'] = row['beds'] * units
        else:
            print('insufficient data to fill unit info')
    elif row['baths'] >= 2 and row['baths'] <= 4:
        units = round(row['baths'] - .5)
        new_record['num_units'] = units
        beds, extra_rooms = divmod(row['beds'], units)
        if extra_rooms == 0:
            new_record['unit_1_beds'] = beds
        else:
            new_record['unit_1_beds'] = beds + extra_rooms
        new_record['unit_2_beds'] = beds
        if units >= 3:
            new_record['unit_3_beds'] = beds
        if units == 4:
            new_record['unit_4_beds'] = beds
    elif row['baths'] > 4:
        units = 4
        new_record['num_units'] = units
        beds, extra_rooms = divmod(row['beds'], units)
        if extra_rooms == 0:
            new_record['unit_1_beds'] = beds
        else:
            new_record['unit_1_beds'] = beds + extra_rooms
        new_record['unit_2_beds'] = beds
        if units >= 3:
            new_record['unit_3_beds'] = beds
        if units == 4:
            new_record['unit_4_beds'] = beds
    else:
        print("insufficient info")
    return new_record

def get_data(row):
    """Adds the data from a dataframe row representing one listing gathered from Redfin to\
        a dictionary. Then calls fill_unit_info to estimate additional unit info. Returns\
        a dictionary ready to be inserted into MongoDB."""
    new_record = {}
    new_record['price'] = row['price']
    new_record['property_type'] = row['property_type']
    new_record['address'] = row['address']
    new_record['city'] = row['city']
    new_record['state'] = row['state']
    new_record['zip'] = str(row['zip']).replace('.0','')
    new_record['beds'] = row['beds']
    new_record['baths'] = row['baths']
    new_record['location'] = row['location']
    new_record['square_feet'] = row['square_feet']
    new_record['lot_size'] = row['lot_size']
    new_record['year_built'] = row['year_built']
    new_record['days_on_market'] = row['days_on_market']
    new_record['cost_square_feet'] = row['cost_square_feet']
    new_record['url'] = row['url']
    new_record['source'] = row['source']
    new_record['mls_num'] = row['mls_num']
    new_record['latitude'] = row['latitude']
    new_record['longitude'] = row['longitude']
    fill_unit_info(row, new_record)
    print("Added " + row['mls_num'] )
    return new_record
    
def scrape(row, soup):
    """Takes in a row of data representing one listing and the BeautifulSoup object from that listings\
        webpage. Extracts selected info from the content and returns a dictionary."""
    new_record = {}
    try:
        num_units_raw = soup.find('li', attrs={'data-reactid' : '104'}).text
        new_record['num_units'] = int(num_units_raw.split(": ")[1])
    except:
        pass
    try:
        annual_income_raw = soup.find('span', attrs={'data-reactid' : '109'}).text
        new_record['annual_income'] = float(annual_income_raw.replace('$', '').replace(',',''))
    except:
        new_record['annual_income'] = np.nan
    try:
        unit_rent_raw = soup.find('span', attrs={'data-reactid' : '149'}).text
        new_record['unit_rent'] = float(unit_rent_raw.replace('$',''))
    except:
        new_record['unit_rent'] = np.nan
    try:
        new_record['description_text'] = soup.find('p', attrs={'data-reactid' : '12'}).text
    except:
        new_record['description_text'] = np.nan
    try:
        price_raw = soup.find('span', attrs={'data-reactid' : '34'}).text
        if row['price'] > 1.0:
            new_record['price'] = row['price']
        else:
            new_record['price'] = float(price_raw.replace(',',''))
    except:
        new_record['price'] = np.nan
    print("Scraped MLS " + row['mls_num'] )
    return new_record

# The web framework gets post_id from the URL and passes it as a string
def get(post_id):
    # Convert from string to ObjectId:
    document = listings.find_one({'_id': ObjectId(post_id)})

In [4]:
# Import raw RedFin CSVs

# red_fin_df = pd.read_csv('../data/raw/MN_HennepinCo_2017-02-27.csv')
# red_fin_df = pd.read_csv('../data/raw/MN_RamseyCo_2017-02-27.csv')
# red_fin_df = pd.read_csv('../data/raw/IL_SChicago_2017-02-27.csv')
# red_fin_df = pd.read_csv('../data/raw/IL_NChicago_2017-02-27.csv')
# red_fin_df = pd.read_csv('../data/raw/CA_SanDiego_2017-02-27.csv')
# red_fin_df = pd.read_csv('../data/raw/MI_Detroit_2017-02-27.csv')
# red_fin_df = pd.read_csv('../data/raw/OR_Portland_2017-02-27.csv')
# red_fin_df = pd.read_csv('../data/raw/WI_Madison_2017-02-27.csv')
# red_fin_df = pd.read_csv('../data/raw/TX_Dallas_2017-03-03.csv')
# red_fin_df = pd.read_csv('../data/raw/MA_Boston_2017-03-03.csv')
# red_fin_df = pd.read_csv('../data/raw/FL_Tampa_2017-03-03.csv')
# red_fin_df = pd.read_csv('../data/raw/WA_Seattle_2017-03-03.csv')
# red_fin_df = pd.read_csv('../data/raw/AZ_Phoenix_2017-03-03.csv')
# red_fin_df = pd.read_csv('../data/raw/NC_Raleigh_2017-03-03.csv')
red_fin_df = pd.read_csv('../data/raw/MN_Minneapolis_2017-03-08.csv')

print(red_fin_df.shape)

(37, 27)


In [5]:
columns = red_fin_df.columns
columns

Index(['SALE TYPE', 'SOLD DATE', 'PROPERTY TYPE', 'ADDRESS', 'CITY', 'STATE',
       'ZIP', 'PRICE', 'BEDS', 'BATHS', 'LOCATION', 'SQUARE FEET', 'LOT SIZE',
       'YEAR BUILT', 'DAYS ON MARKET', '$/SQUARE FEET', 'HOA/MONTH', 'STATUS',
       'NEXT OPEN HOUSE START TIME', 'NEXT OPEN HOUSE END TIME',
       'URL (SEE http://www.redfin.com/buy-a-home/comparative-market-analysis FOR INFO ON PRICING)',
       'SOURCE', 'MLS#', 'FAVORITE', 'INTERESTED', 'LATITUDE', 'LONGITUDE'],
      dtype='object')

In [6]:
new_columns = ['SALE_TYPE', 'SOLD_DATE', 'PROPERTY_TYPE', 'ADDRESS', 'CITY', 'STATE',
       'ZIP', 'PRICE', 'BEDS', 'BATHS', 'LOCATION', 'SQUARE_FEET', 'LOT_SIZE',
       'YEAR_BUILT', 'DAYS_ON_MARKET', 'COST_SQUARE_FEET', 'HOA_PER_MONTH', 'STATUS',
       'NEXT_OPEN_HOUSE_START_TIME', 'NEXT_OPEN_HOUSE_END_TIME',
       'URL', 'SOURCE', 'MLS_NUM', 'FAVORITE', 'INTERESTED', 'LATITUDE', 'LONGITUDE']

new_columns_lower = [col.lower() for col in new_columns]

In [7]:
# Replace column names with snake case names
red_fin_df.columns = new_columns_lower

In [8]:
# Convert data types
red_fin_df['mls_num'] = red_fin_df['mls_num'].astype(str, copy=False)
red_fin_df['mls_num'].replace(".0", "", inplace=True, regex=True)

# Remove larger buildings with more than 4 units
red_fin_df = red_fin_df[red_fin_df['property_type'] == 'Multi-Family (2-4 Unit)']
red_fin_df = red_fin_df.copy(deep=True)
red_fin_df.head()

Unnamed: 0,sale_type,sold_date,property_type,address,city,state,zip,price,beds,baths,...,status,next_open_house_start_time,next_open_house_end_time,url,source,mls_num,favorite,interested,latitude,longitude
0,MLS Listing,,Multi-Family (2-4 Unit),2742 18th Ave S,Minneapolis,MN,55407,219000,5,2.0,...,Active,,,http://www.redfin.com/MN/Minneapolis/2742-18th...,NORTHSTARMLS,4791247,N,Y,44.952342,-93.249002
1,MLS Listing,,Multi-Family (2-4 Unit),3544 Irving Ave,Minneapolis,MN,55408,895000,6,5.0,...,Active,,,http://www.redfin.com/MN/Minneapolis/3544-Irvi...,NORTHSTARMLS,47770,N,Y,44.938181,-93.30252
2,MLS Listing,,Multi-Family (2-4 Unit),2919 Fillmore St NE,Minneapolis,MN,55418,219900,4,2.0,...,Active,,,http://www.redfin.com/MN/Minneapolis/2919-Fill...,NORTHSTARMLS,47972,N,Y,45.020978,-93.241847
3,MLS Listing,,Multi-Family (2-4 Unit),3016 14th Ave S,Minneapolis,MN,55407,268900,5,2.0,...,Active,March-12-2017 11:00 AM,March-12-2017 01:00 PM,http://www.redfin.com/MN/Minneapolis/3016-14th...,NORTHSTARMLS,41341,N,Y,44.947774,-93.255354
4,MLS Listing,,Multi-Family (2-4 Unit),3331 Morgan Ave,Minneapolis,MN,55412,109900,4,2.0,...,Active,,,http://www.redfin.com/MN/Minneapolis/3331-Morg...,NORTHSTARMLS,40429,N,Y,45.015977,-93.304816


In [12]:
# Initiate MongoDB client and add rows with estimated info to MongoDB
client = MongoClient()
db = client.property_investor
listings = db.listings
cursor = listings.find()
listing_df = pd.DataFrame(list(cursor))

data = []
    
for index, row in red_fin_df.iterrows():
    record = get_data(row)
    data.append(record)

result = db.listings.insert_many(data)
print(result.inserted_ids)

Added 4791247
Added 47770
Added 47972
Added 41341
Added 40429
Added 4699186
Added 497
Added 42149
Added 42161
Added 4785355
Added 4774396
Added 40837
Added 4784872
Added 4779626
Added 41825
Added 41617
Added 4794443
Added 412
Added 4757583
Added 4753786
Added 47848
Added 40744
Added 4797251
Added 47954
Added 47737
Added 47949
Added 4754989
Added 4797624
Added 47582
Added 41176
Added 4794112
Added 4799289
Added 4784255
Added 4784863
Added 47848
Added 4784896
Added 4784842
[ObjectId('58c0b7918e259c06c43ed6ae'), ObjectId('58c0b7918e259c06c43ed6af'), ObjectId('58c0b7918e259c06c43ed6b0'), ObjectId('58c0b7918e259c06c43ed6b1'), ObjectId('58c0b7918e259c06c43ed6b2'), ObjectId('58c0b7918e259c06c43ed6b3'), ObjectId('58c0b7918e259c06c43ed6b4'), ObjectId('58c0b7918e259c06c43ed6b5'), ObjectId('58c0b7918e259c06c43ed6b6'), ObjectId('58c0b7918e259c06c43ed6b7'), ObjectId('58c0b7918e259c06c43ed6b8'), ObjectId('58c0b7918e259c06c43ed6b9'), ObjectId('58c0b7918e259c06c43ed6ba'), ObjectId('58c0b7918e259c06c43

In [13]:
# Dump raw dictionary for each city to a picklefile
with open('../data/interim/0308_MN_Minneapolis.pkl', 'wb') as picklefile:
    pickle.dump(data, picklefile)

In [102]:
# Initiate driver for scraping
from random import randint
from time import sleep

chromedriver = "./chromedriver"
driver = webdriver.Chrome(chromedriver)

In [107]:
cursor = listings.find()
listing_df = pd.DataFrame(list(cursor))
listing_df.shape
listing_df.head()

Unnamed: 0,_id,address,annual_income,baths,beds,city,cost_square_feet,days_on_market,description_text,hoa_per_month,...,square_feet,state,unit_1_beds,unit_2_beds,unit_3_beds,unit_4_beds,unit_rent,url,year_built,zip
0,58b5f1b0dcc9d8d448be2325,3039 Bryant Ave S,66048.0,1.0,4.0,Minneapolis,174.0,1.0,Great investment opportunity in the heart of U...,,...,2952.0,MN,1.0,1.0,1.0,1.0,775.0,http://www.redfin.com/MN/Minneapolis/3039-Brya...,1913.0,55408
1,58b5f1b0dcc9d8d448be2326,3307 29th Ave S,19704.0,2.0,3.0,Minneapolis,123.0,1.0,"New windows, siding, foundation, updated bathr...",,...,1695.0,MN,2.0,1.0,,,,http://www.redfin.com/MN/Minneapolis/3307-29th...,1900.0,55406
2,58b5f1b0dcc9d8d448be2328,3540 Bloomington Ave,90960.0,1.0,8.0,Minneapolis,59.0,3.0,"great investment property , great location. Cu...",,...,4992.0,MN,2.0,2.0,2.0,2.0,,http://www.redfin.com/MN/Minneapolis/3540-Bloo...,1928.0,55407
3,58b5f1b0dcc9d8d448be2329,3028 Fillmore St NE,64860.0,3.0,5.0,Minneapolis,125.0,3.0,Beautiful Audobon Park duplex that is owner oc...,,...,3394.0,MN,3.0,2.0,1.0,,,http://www.redfin.com/MN/Minneapolis/3028-Fill...,1935.0,55418
4,58b5f1b0dcc9d8d448be232a,3945 Van Nest Ave,49536.0,1.0,3.0,Minneapolis,120.0,3.0,This classic Minneapolis duplex is filled with...,,...,2289.0,MN,1.0,1.0,1.0,,,http://www.redfin.com/MN/Minneapolis/3945-Van-...,1891.0,55409


In [103]:
# Scrape data and add to MongoDB
data = []
for index, row in listing_df.iterrows():
    print(row['_id'])
    if type(row['description_text']) == float or len(row['description_text']) < 7:
        content = get_content(driver, row['url'])
        soup = create_soup(content)
        data = scrape(row, soup)
        print(data)
        db.listings.update_one(
            {'_id': ObjectId(row['_id'])},
            {'$set': data}, 
            upsert=False)
        print("New: ", data['description_text'])
        sleep(randint(4,8))
    else:
        print("Existing: ", row['description_text'][0:20])

58b5f1b0dcc9d8d448be2325
Existing:  Great investment opp
58b5f1b0dcc9d8d448be2326
Existing:  New windows, siding,
58b5f1b0dcc9d8d448be2328
Existing:  great investment pro
58b5f1b0dcc9d8d448be2329
Existing:  Beautiful Audobon Pa
58b5f1b0dcc9d8d448be232a
Existing:  This classic Minneap
58b5f1b0dcc9d8d448be232b
Existing:  Great investment opp
58b5f1b0dcc9d8d448be232c
Existing:  Hard to find NE Minn
58b5f1b0dcc9d8d448be232d
Existing:  AMAZING INVESTMENT O
58b5f1b0dcc9d8d448be232e
Existing:  Great Investment opp
58b5f1b0dcc9d8d448be232f
Existing:  Great investment opp
58b5f1b0dcc9d8d448be2331
Scraped MLS 4792211
{'num_units': 2, 'annual_income': 1500.0, 'unit_rent': 2.0, 'description_text': nan, 'price': 225000.0}
New:  nan
58b5f1b0dcc9d8d448be2332
Existing:  Good cash flow. New 
58b5f1b0dcc9d8d448be2333
Existing:  Currently Owner Occu
58b5f1b0dcc9d8d448be2334
Existing:  Great owner-occupant
58b5f1b0dcc9d8d448be2336
Existing:  Very Popular Rental 
58b5f1b0dcc9d8d448be2338
Existing:  Seller 

KeyboardInterrupt: 

In [106]:
# Update existing records in MongoDB with new data from scrape
for index, row in listing_df.iterrows():
    print("Updating: ", row['_id'])
    data = get_data(row)
    db.listings.update_one(
        {'_id': ObjectId(row['_id'])},
        {'$set': data}, 
        upsert=False)

Updating:  58b5f1b0dcc9d8d448be2325
Added 4798563
Updating:  58b5f1b0dcc9d8d448be2326
Added 4798267
Updating:  58b5f1b0dcc9d8d448be2328
Added 4797798
Updating:  58b5f1b0dcc9d8d448be2329
Added 4797624
Updating:  58b5f1b0dcc9d8d448be232a
Added 4797595
Updating:  58b5f1b0dcc9d8d448be232b
Added 4797251
Updating:  58b5f1b0dcc9d8d448be232c
Added 4797250
Updating:  58b5f1b0dcc9d8d448be232d
Added 4796672
Updating:  58b5f1b0dcc9d8d448be232e
Added 4794443
Updating:  58b5f1b0dcc9d8d448be232f
Added 4794112
Updating:  58b5f1b0dcc9d8d448be2331
Added 4792211
Updating:  58b5f1b0dcc9d8d448be2332
Added 4791247
Updating:  58b5f1b0dcc9d8d448be2333
Added 4791098
Updating:  58b5f1b0dcc9d8d448be2334
Added 4789195
Updating:  58b5f1b0dcc9d8d448be2336
Added 4787440
Updating:  58b5f1b0dcc9d8d448be2338
Added 4784896
Updating:  58b5f1b0dcc9d8d448be2339
Added 4784890
Updating:  58b5f1b0dcc9d8d448be233a
Added 4784872
Updating:  58b5f1b0dcc9d8d448be233b
Added 4784863
Updating:  58b5f1b0dcc9d8d448be233c
Added 4784842
