### This notebook runs the Craigslist scrape and matches each listing's Craigslist neighborhood to the neighborhoods used by the algorithm.
### It then adds the algorithm's projections to the dataframe for each property.

In [1]:
import pandas as pd
import numpy as np
import geopandas as gp
import pickle

In [2]:
from requests import get
from bs4 import BeautifulSoup

# Fetch the first page of Craigslist real-estate-for-sale results from the
# `sfc` search (search_distance=5 around postal code 94114).
# hasPic=1 uses Craigslist's own filter to drop posts that have no picture.
response = get('https://sfbay.craigslist.org/search/sfc/reb?hasPic=1&search_distance=5&postal=94114&availabilityMode=0&housing_type=1&housing_type=2&housing_type=3&housing_type=4&housing_type=6&housing_type=9&sale_date=all+dates')

# Fail loudly on an HTTP error status instead of parsing an error page,
# which would otherwise silently produce an empty list of posts.
response.raise_for_status()

html_soup = BeautifulSoup(response.text, 'html.parser')

# Each search result is an <li class="result-row">; this is the list the
# rest of the notebook iterates over.
posts = html_soup.find_all('li', class_='result-row')


from time import sleep
import re
from random import randint #avoid throttling by not sending too many requests one after the other
from warnings import warn
from time import time
from IPython.core.display import clear_output
import numpy as np

# Read the total number of matching posts from the search legend; it is the
# upper bound for pagination.
results_num = html_soup.find('div', class_='search-legend')
results_total = int(results_num.find('span', class_='totalcount').text)

# Result pages are addressed by offset (s=0, s=120, s=240, ...), so the page
# offsets must step by 120 rather than enumerate every integer.
# NOTE(review): `pages` and `iterations` are not used by the code visible in
# this notebook -- only the first results page is parsed.
POSTS_PER_PAGE = 120
pages = np.arange(0, results_total + 1, POSTS_PER_PAGE)

iterations = 0

# Parallel accumulators: one entry per parsed post, in the same order.
post_timing = []
post_hoods = []
post_title_texts = []
bedroom_counts = []
sqfts = []
post_links = []
post_prices = []

def _parse_price(price_text):
    """Turn listing price text such as "$1,399,000" into an int.

    Surrounding whitespace, the currency symbol and thousands separators are
    removed first. Returns NaN when nothing numeric remains, so one odd
    listing cannot abort the whole scrape.
    """
    cleaned = price_text.strip().replace("$", "").replace(",", "")
    try:
        return int(cleaned)
    except ValueError:
        return np.nan


def _parse_sqft(token):
    """Turn a token such as "1250ft2" into the int 1250, or NaN if malformed."""
    try:
        return int(token[:-3])  # drop the trailing "ft2"
    except ValueError:
        return np.nan


def _parse_housing(housing_tag):
    """Return (bedroom_count, sqft) from a post's 'housing' span.

    housing_tag is the <span class="housing"> element, or None when the post
    has none. Either value is NaN when it is not present. The bedroom count
    is returned as the text left after removing "br" (a string, not a number).
    """
    if housing_tag is None:
        return np.nan, np.nan

    housing_text = housing_tag.text
    tokens = housing_text.split()
    if not tokens:
        # empty span: nothing to index into
        return np.nan, np.nan

    # the first element is square footage, so there is no bedroom count
    if 'ft2' in tokens[0]:
        return np.nan, _parse_sqft(tokens[0])

    bedroom_count = housing_text.replace("br", "").split()[0]

    # more than two tokens: element 0 is the bedroom count and element 2 the
    # square footage
    if len(tokens) > 2:
        sqft = _parse_sqft(tokens[2]) if 'ft2' in tokens[2] else np.nan
        return bedroom_count, sqft

    # exactly two tokens: bedroom count but no square footage
    if len(tokens) == 2:
        return bedroom_count, np.nan

    return np.nan, np.nan


for post in posts:
    hood_tag = post.find('span', class_='result-hood')
    if hood_tag is None:
        # posts without a neighborhood are skipped entirely
        continue

    # posting date/time, taken from the 'datetime' attribute of the <time> tag
    post_datetime = post.find('time', class_='result-date')['datetime']

    # neighborhood text with the surrounding parentheses removed
    post_hood = re.sub(r'[\(\)]', '', hood_tag.text)

    # title text and link both come from the same anchor
    post_title = post.find('a', class_='result-title hdrlnk')
    post_title_text = post_title.text
    post_link = post_title['href']

    # price text is read from the first anchor of the post
    post_price = _parse_price(post.a.text)

    bedroom_count, sqft = _parse_housing(post.find('span', class_='housing'))

    # Append only after every field has been parsed, so a failure part-way
    # through a post cannot leave the parallel lists with different lengths.
    post_timing.append(post_datetime)
    post_hoods.append(post_hood)
    post_title_texts.append(post_title_text)
    post_links.append(post_link)
    post_prices.append(post_price)
    bedroom_counts.append(bedroom_count)
    sqfts.append(sqft)

import pandas as pd

# Assemble the per-post lists into one table; each key becomes a column and
# the columns appear in the key order written here.
listing_columns = {
    'posted': post_timing,
    'neighborhood': post_hoods,
    'post title': post_title_texts,
    'number bedrooms': bedroom_counts,
    'sqft': sqfts,
    'URL': post_links,
    'price': post_prices,
}
craigslist_options = pd.DataFrame(listing_columns)

In [3]:
# Visit each listing's own page to collect its map coordinates and a photo link.
MIN_PRICE = 100000  # listings priced below this are dropped at the end of the cell

Latitude = []
Longitude = []
Picture = []
for listing_url in craigslist_options['URL']:
    page_link = get(listing_url)
    page_link_soup = BeautifulSoup(page_link.text, 'html.parser')

    # Photo: the src of the last <img> inside the first gallery slide.
    # A listing without that slide or image gets None rather than crashing
    # or silently reusing the previous listing's picture.
    FrontPic = page_link_soup.find('div', class_='slide first visible')
    slide_images = FrontPic.find_all('img') if FrontPic is not None else []
    Picture.append(slide_images[-1].get('src') if slide_images else None)

    # Coordinates are attributes on the map div; a missing map or attribute
    # is recorded as NaN.
    MapPic = page_link_soup.find('div', id='map')
    if MapPic is None:
        Latitude.append(np.nan)
        Longitude.append(np.nan)
    else:
        Latitude.append(MapPic.get('data-latitude', np.nan))
        Longitude.append(MapPic.get('data-longitude', np.nan))

    # Pause briefly between requests to avoid being throttled.
    sleep(randint(1, 3))

craigslist_options['Latitude'] = Latitude
craigslist_options['Longitude'] = Longitude
craigslist_options['Picture Link'] = Picture

# Keep only listings priced at or above MIN_PRICE. The explicit copy makes the
# result an independent frame, so later column assignments do not act on a
# slice of the unfiltered table.
craigslist_options = craigslist_options[craigslist_options['price'] >= MIN_PRICE].copy()
craigslist_options

Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price,Latitude,Longitude,Picture Link
0,2020-04-29 10:28,mission district,Treat Street 4 Bedroom 2 Bathroom,4.0,1250.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1399000,37.7587,-122.433,https://images.craigslist.org/00B0B_3qHgjjC1UI...
1,2020-05-03 13:05,glen park,Great home deals around the city,4.0,2200.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1049000,37.7587,-122.433,https://images.craigslist.org/00g0g_yJSOiN53xK...
2,2020-05-05 06:58,castro / upper market,Fabulous 2 unit in Upper Castro,8.0,3832.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,3264000,37.763265,-122.442534,https://images.craigslist.org/01717_hDr1DmN7iw...
3,2020-05-04 12:04,mission district,Well maintained office building for sale!,,30000.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,13500000,37.764057,-122.41868,https://images.craigslist.org/00B0B_agTyaKYbYk...
6,2020-05-03 21:36,mission district,5 Unit Inner Mission Multifamily Property,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2295000,37.751145,-122.409914,https://images.craigslist.org/00k0k_hszSJut6cB...
7,2020-05-03 21:35,mission district,5 Unit Inner Mission Multifamily Property,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2295000,37.751145,-122.409914,https://images.craigslist.org/00k0k_hszSJut6cB...
10,2020-04-28 18:24,lower pac hts,LOW PAC HEIGHTS,3.0,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1999950,37.7856,-122.4358,https://images.craigslist.org/00S0S_cJFi2yRoU9...
11,2020-04-28 18:24,lower pac hts,LOW PAC HEIGHTS sun 1-5,3.0,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1999950,37.7856,-122.4358,https://images.craigslist.org/00z0z_jEQ6q7Qzrj...
19,2020-05-05 11:47,Pac Heights,Development Opportunity,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2550000,37.785613,-122.444515,https://images.craigslist.org/00909_kvIH7Kesp1...
20,2020-05-05 11:15,pacific heights,"Modern 2 BR, 2.5 BA Condo off Polk St. Walk to...",2.0,1217.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,130000,37.788588,-122.421178,https://images.craigslist.org/01212_2luRRLhZl4...


In [4]:
# Load the neighborhood boundary file and index the rows by their 'name'
# column so neighborhoods can be looked up by name.
NEIGHBORHOODS_GEOJSON = 'zillow-neighborhoods.geojson'
hoods_frame = gp.read_file(NEIGHBORHOODS_GEOJSON)
hoods_frame.index = hoods_frame['name']
hoods_frame

['Ingleside Heights', 'Ingleside Terrace', 'Yerba Buena Island', 'Inner Richmond', 'Cow Hollow', 'Central Waterfront - Dogpatch', 'Jordan Park - Laurel Heights', 'Lakeside', 'Van Ness - Civic Center', 'Pine Lake Park', 'Bayview', 'Anza Vista', 'St. Francis Wood', 'Financial District', 'Golden Gate Park', 'Duboce Triangle', 'Outer Mission', 'Inner Sunset', 'Potrero Hill', 'Miraloma Park', 'Haight-Ashbury', 'Western Addition', 'North Waterfront', 'Balboa Terrace', 'Forest Hill', 'Mission Bay', 'Inner Parkside', 'Silver Terrace', 'Central Richmond', 'Tenderloin', 'Glen Park', 'Stonestown', 'Lakeshore', 'Forest Hill Extension', 'Alamo Square', 'Westwood Park', 'Midtown Terrace', 'Lower Pacific Heights', 'Nob Hill', 'Golden Gate Heights', 'Alkatraz Island', 'Twin Peaks', 'Lone Mountain', 'Crocker Amazon', 'South of Market', 'Mission', 'Treasure Island', 'Presidio', 'South Beach', 'Eureka Valley - Dolores Heights - Castro', 'Diamond Heights', 'Visitacion Valley', 'Marina', 'Outer Parkside', 

In [5]:
from shapely.geometry import Point, Polygon

# Coordinates are converted to numbers before building points.
SampLats = pd.to_numeric(craigslist_options['Latitude'])
SampLongs = pd.to_numeric(craigslist_options['Longitude'])
coords = list(zip(SampLongs, SampLats))  # (longitude, latitude) order

# Assign every listing exactly one label: the first neighborhood polygon that
# contains its point, or the placeholder "None" when no polygon does.
# Stopping at the first match keeps SampHoods aligned one-to-one with the
# listings even if a point falls inside more than one polygon.
SampHoods = []
for coord in coords:
    loca = Point(coord)
    matched_hood = "None"
    for j, hood_shape in enumerate(hoods_frame['geometry']):
        if loca.within(hood_shape):
            # NOTE(review): the label is read from positional column 1 of
            # hoods_frame, presumably the neighborhood name -- confirm.
            matched_hood = hoods_frame.iloc[j, 1]
            break
    SampHoods.append(matched_hood)

# Work on explicit copies so the assignments below never write into a slice
# of an earlier frame.
craigslist_options = craigslist_options.copy()
craigslist_options['neighborhood'] = SampHoods
craigslist_options = craigslist_options[craigslist_options['neighborhood'] != 'None'].copy()

# Rename 'Haight-Ashbury' to 'Haight' (exact-match replacement).
craigslist_options['neighborhood'] = craigslist_options['neighborhood'].replace('Haight-Ashbury', 'Haight')

craigslist_options.index = range(0, len(craigslist_options))
craigslist_options

Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price,Latitude,Longitude,Picture Link
0,2020-04-29 10:28,Eureka Valley - Dolores Heights - Castro,Treat Street 4 Bedroom 2 Bathroom,4.0,1250.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1399000,37.7587,-122.433,https://images.craigslist.org/00B0B_3qHgjjC1UI...
1,2020-05-03 13:05,Eureka Valley - Dolores Heights - Castro,Great home deals around the city,4.0,2200.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1049000,37.7587,-122.433,https://images.craigslist.org/00g0g_yJSOiN53xK...
2,2020-05-05 06:58,Corona Heights,Fabulous 2 unit in Upper Castro,8.0,3832.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,3264000,37.763265,-122.442534,https://images.craigslist.org/01717_hDr1DmN7iw...
3,2020-05-04 12:04,Mission,Well maintained office building for sale!,,30000.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,13500000,37.764057,-122.41868,https://images.craigslist.org/00B0B_agTyaKYbYk...
4,2020-05-03 21:36,Mission,5 Unit Inner Mission Multifamily Property,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2295000,37.751145,-122.409914,https://images.craigslist.org/00k0k_hszSJut6cB...
5,2020-05-03 21:35,Mission,5 Unit Inner Mission Multifamily Property,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2295000,37.751145,-122.409914,https://images.craigslist.org/00k0k_hszSJut6cB...
6,2020-04-28 18:24,Lower Pacific Heights,LOW PAC HEIGHTS,3.0,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1999950,37.7856,-122.4358,https://images.craigslist.org/00S0S_cJFi2yRoU9...
7,2020-04-28 18:24,Lower Pacific Heights,LOW PAC HEIGHTS sun 1-5,3.0,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1999950,37.7856,-122.4358,https://images.craigslist.org/00z0z_jEQ6q7Qzrj...
8,2020-05-05 11:47,Lower Pacific Heights,Development Opportunity,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2550000,37.785613,-122.444515,https://images.craigslist.org/00909_kvIH7Kesp1...
9,2020-05-05 11:15,Van Ness - Civic Center,"Modern 2 BR, 2.5 BA Condo off Polk St. Walk to...",2.0,1217.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,130000,37.788588,-122.421178,https://images.craigslist.org/01212_2luRRLhZl4...


In [6]:
# Create the projection columns, one per horizon, initialised to zero.
for horizon_column in ('Year1', 'Year3', 'Year5', 'Year10', 'Year20'):
    row_total = len(craigslist_options.index)
    craigslist_options[horizon_column] = np.zeros((row_total, 1))

In [7]:
# Load the per-neighborhood projections. Rows are looked up by neighborhood
# name through the index; the 'Year N' columns are multiplied by the price.
# NOTE(security): unpickling can execute arbitrary code -- only load
# "output.pkl" if it comes from a trusted source.
with open("output.pkl", "rb") as predictions_file:
    prediction_df = pickle.load(predictions_file)

# Independent copy so the column writes below do not act on a slice.
craigslist_options = craigslist_options.copy()

# Listings whose neighborhood has a row in prediction_df. All other listings
# keep whatever value their projection columns already hold.
has_projection = craigslist_options['neighborhood'].isin(prediction_df.index)
matched_hoods = craigslist_options.loc[has_projection, 'neighborhood']
matched_prices = craigslist_options.loc[has_projection, 'price']

# Projected value = neighborhood multiplier * listing price, per horizon.
# The lookup is by neighborhood label, so it does not depend on the frame's
# index being exactly 0..n-1.
for horizon in (1, 3, 5, 10, 20):
    multiplier = matched_hoods.map(prediction_df['Year {}'.format(horizon)])
    craigslist_options.loc[has_projection, 'Year{}'.format(horizon)] = multiplier * matched_prices

# Keep only listings whose Year1 projection is greater than 1, then renumber
# the rows from 0.
craigslist_options = craigslist_options[craigslist_options['Year1'] > 1]
craigslist_options.index = range(0, len(craigslist_options))
craigslist_options

Unnamed: 0,posted,neighborhood,post title,number bedrooms,sqft,URL,price,Latitude,Longitude,Picture Link,Year1,Year3,Year5,Year10,Year20
0,2020-05-05 06:58,Corona Heights,Fabulous 2 unit in Upper Castro,8.0,3832.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,3264000,37.763265,-122.442534,https://images.craigslist.org/01717_hDr1DmN7iw...,3432108.0,4242803.0,6854059.0,9651579.0,14455200.0
1,2020-05-04 12:04,Mission,Well maintained office building for sale!,,30000.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,13500000,37.764057,-122.41868,https://images.craigslist.org/00B0B_agTyaKYbYk...,13598850.0,17442290.0,28353200.0,42946250.0,64305750.0
2,2020-05-03 21:36,Mission,5 Unit Inner Mission Multifamily Property,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2295000,37.751145,-122.409914,https://images.craigslist.org/00k0k_hszSJut6cB...,2311805.0,2965189.0,4820045.0,7300862.0,10931980.0
3,2020-05-03 21:35,Mission,5 Unit Inner Mission Multifamily Property,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2295000,37.751145,-122.409914,https://images.craigslist.org/00k0k_hszSJut6cB...,2311805.0,2965189.0,4820045.0,7300862.0,10931980.0
4,2020-04-28 18:24,Lower Pacific Heights,LOW PAC HEIGHTS,3.0,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1999950,37.7856,-122.4358,https://images.craigslist.org/00S0S_cJFi2yRoU9...,2031397.0,2635201.0,3961258.0,5567475.0,8058637.0
5,2020-04-28 18:24,Lower Pacific Heights,LOW PAC HEIGHTS sun 1-5,3.0,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1999950,37.7856,-122.4358,https://images.craigslist.org/00z0z_jEQ6q7Qzrj...,2031397.0,2635201.0,3961258.0,5567475.0,8058637.0
6,2020-05-05 11:47,Lower Pacific Heights,Development Opportunity,,,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,2550000,37.785613,-122.444515,https://images.craigslist.org/00909_kvIH7Kesp1...,2590096.0,3359965.0,5050730.0,7098708.0,10275020.0
7,2020-05-04 14:42,South of Market,3D TOUR LIVE corner condo at The Beacon birds ...,2.0,1230.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1295000,37.777134,-122.39453,https://images.craigslist.org/00T0T_4UcgJNig7M...,1333323.0,1760053.0,2702165.0,3707825.0,4949665.0
8,2020-05-04 14:41,South of Market,3D TOUR - Stylish corner 2Br at The Beacon wit...,2.0,1286.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1348888,37.777151,-122.393982,https://images.craigslist.org/00v0v_d8EWat6XrW...,1388805.0,1833293.0,2814608.0,3862116.0,5155632.0
9,2020-05-04 14:43,South of Market,3D TOUR! Beautiful waterfront oasis on Mission...,2.0,1300.0,https://sfbay.craigslist.org/sfc/reb/d/san-fra...,1499800,37.776414,-122.393146,https://images.craigslist.org/00P0P_JbHaDCzvZl...,1544183.0,2038400.0,3129503.0,4294205.0,5732438.0


In [8]:
# Persist the final table to disk as a pickle (protocol 4).
OUTPUT_PICKLE = 'craigslist_df.quinn'
craigslist_options.to_pickle(OUTPUT_PICKLE, protocol=4)
