In [7]:
import pandas as pd

pd.set_option('display.max_columns', None)  # None means unlimited
pd.set_option('display.max_rows', None)     # None means unlimited
pd.set_option('display.max_colwidth', None) # None means show entire content of the column

In [40]:
import geopy.distance
import pandas as pd
from geopy.geocoders import Nominatim
import pickle
import time
import uuid

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import explained_variance_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from homeharvest import scrape_property
from datetime import datetime, timedelta
from azure.storage.blob import BlobServiceClient, BlobBlock


class Config:
    def __init__(self):
        self.az_storage_conn_str = "DefaultEndpointsProtocol=https;AccountName=estateadviserstorage;AccountKey=Y52EdpNysG+MJetBBg7T+JeLfC/H8ZkB0HyGdRG+NItsVcY5KsKINikApihU4OqgERa2frz1gCVw+AStUiwuzg==;EndpointSuffix=core.windows.net"
        self.az_storage_container_name = "models"
        self.yrs_to_predict = 5
        self.hist_start_year = 2015
        self.hist_batch_incr_days = 90
        self.active_listing_days = 30
        



config = Config()


def scrape_historical_sales(location):
    date_format = "%Y-%m-%d"
    increment = timedelta(days=config.hist_batch_incr_days)
    start_date = datetime(year=config.hist_start_year, month=1, day=1)
    end_date = start_date + increment

    dataframes = []

    while start_date < datetime.utcnow():
        start_date_str = start_date.strftime(date_format)
        end_date_str = end_date.strftime(date_format)

        try:
            properties = scrape_property(
                location=location,
                listing_type="sold",  # or (for_sale, for_rent)
                date_from=start_date_str,
                date_to=end_date_str,
            )
            print(f"Start:{start_date_str} End:{end_date_str} Count:{len(properties)}")
            dataframes.append(properties)

        except ValueError as e:
            print(f"Start:{start_date_str} End:{end_date_str} Error:{str(e)}")

        start_date = end_date + timedelta(days=1)
        end_date += increment

    combined_df = pd.concat(dataframes, ignore_index=True)
    print(combined_df.info())
    return combined_df


class PropertyDatasetProcessor:
    def __init__(self, df, city):
        self.dataset = df
        self.city = city

        geolocator = Nominatim(user_agent="RealEstateAdvisor")
        location = geolocator.geocode(f"Downtown {city}")
        self.downtown_lat, self.downtown_lon = location.latitude, location.longitude

    @staticmethod
    def calc_lat_lon_dist(lat1, lon1, lat2, lon2):
        if pd.isna(lat1) or pd.isna(lon1) or pd.isna(lat2) or pd.isna(lon2):
            return None
        return round(geopy.distance.geodesic((lat1, lon1), (lat2, lon2)).km, 1)

    @staticmethod
    def calc_baths_num(full_baths, half_baths):
        if pd.isna(full_baths) and pd.isna(half_baths):
            return 0.0
        elif pd.isna(full_baths):
            return half_baths * 0.5
        elif pd.isna(half_baths):
            return full_baths
        else:
            return full_baths + 0.5 * half_baths

    def clean_dataset(self):
        dataset = self.dataset[(self.dataset.city == self.city)]

        dataset["distance_to_downtown"] = dataset.apply(
            lambda row: self.calc_lat_lon_dist(
                row["latitude"], row["longitude"], self.downtown_lat, self.downtown_lon
            ),
            axis=1,
        )
        dataset["baths"] = dataset.apply(
            lambda row: self.calc_baths_num(row["full_baths"], row["half_baths"]),
            axis=1,
        )
        dataset["sqft"] = dataset.apply(
            lambda row: (
                0.0 if pd.isna(row["sqft"]) and row["style"] == "LAND" else row["sqft"]
            ),
            axis=1,
        )
        dataset["style"] = dataset.apply(
            lambda row: "OTHER" if pd.isna(row["style"]) else row["style"], axis=1
        )
        dataset["lot_sqft"] = dataset.apply(
            lambda row: 0.0 if pd.isna(row["lot_sqft"]) else row["lot_sqft"], axis=1
        )
        dataset["hoa_fee"] = dataset.apply(
            lambda row: 0.0 if pd.isna(row["hoa_fee"]) else row["hoa_fee"], axis=1
        )
        dataset["stories"] = dataset.apply(
            lambda row: 0.0 if pd.isna(row["stories"]) else row["stories"], axis=1
        )
        dataset["beds"] = dataset.apply(
            lambda row: 0.0 if pd.isna(row["beds"]) else row["beds"], axis=1
        )
        dataset["sold_year"] = pd.to_datetime(dataset["last_sold_date"]).apply(
            lambda x: x.year
        )

        dataset.dropna(
            subset=["year_built", "sqft", "distance_to_downtown", "parking_garage"],
            inplace=True,
        )
        dataset["age"] = dataset.apply(
            lambda row: row["sold_year"] - row["year_built"], axis=1
        )

        return dataset


def train_model(dataset):
    cols_to_drop = [
        "property_url",
        "status",
        "street",
        "unit",
        "city",
        "state",
        "days_on_mls",
        "list_price",
        "list_date",
        "latitude",
        "longitude",
        "primary_photo",
        "mls",
        "mls_id",
        "price_per_sqft",
        "alt_photos",
        "style",
        "full_baths",
        "half_baths",
        "last_sold_date",
        "sold_price",
    ]

    X = dataset.drop(cols_to_drop, axis=1)
    y = dataset["sold_price"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42
    )

    rf_model = RandomForestRegressor(n_estimators=50, random_state=42)
    rf_model.fit(X_train, y_train)
    model_score = rf_model.score(X_test, y_test)
    print(f"Model trained with the score: {model_score}")

    return rf_model


def get_chunk_blocks(data, blob_client, chunk_size=4 * 1024 * 1024):
    block_list = []

    index = 0
    while index < len(data):
        chunk_data = data[index : index + chunk_size]

        if not chunk_data:
            break
        blk_id = str(uuid.uuid4())
        blob_client.stage_block(block_id=blk_id, data=chunk_data)
        block_list.append(BlobBlock(block_id=blk_id))

        index += chunk_size

    return block_list


def write_model_to_storage(model, city: str, state: str):
    serialized_model = pickle.dumps(model)

    blob_service_client = BlobServiceClient.from_connection_string(
        config.az_storage_conn_str
    )
    blob_name = f"{city.lower()}_{state.lower()}.pkl"
    blob_client = blob_service_client.get_blob_client(
        container=config.az_storage_container_name, blob=blob_name
    )

    block_list = get_chunk_blocks(
        serialized_model, blob_client, chunk_size=4 * 1024 * 1024
    )
    blob_client.commit_block_list(block_list)

    print(f"{blob_name} uploaded successfully.")

In [5]:
df = scrape_historical_sales("Miami, FL")

Start:2015-01-01 End:2015-04-01 Count:2995
Start:2015-04-02 End:2015-06-30 Count:3502
Start:2015-07-01 End:2015-09-28 Count:3387
Start:2015-09-29 End:2015-12-27 Count:3024
Start:2015-12-28 End:2016-03-26 Error:'INDUSTRIAL' is not a valid PropertyType
Start:2016-03-27 End:2016-06-24 Count:3522
Start:2016-06-25 End:2016-09-22 Error:'INDUSTRIAL' is not a valid PropertyType
Start:2016-09-23 End:2016-12-21 Count:3095
Start:2016-12-22 End:2017-03-21 Count:2947
Start:2017-03-22 End:2017-06-19 Error:'INDUSTRIAL' is not a valid PropertyType
Start:2017-06-20 End:2017-09-17 Count:2764
Start:2017-09-18 End:2017-12-16 Count:2869
Start:2017-12-17 End:2018-03-16 Count:2899
Start:2018-03-17 End:2018-06-14 Count:3659
Start:2018-06-15 End:2018-09-12 Count:4151
Start:2018-09-13 End:2018-12-11 Count:3883
Start:2018-12-12 End:2019-03-11 Count:3321
Start:2019-03-12 End:2019-06-09 Count:4149
Start:2019-06-10 End:2019-09-07 Count:4280
Start:2019-09-08 End:2019-12-06 Count:3965
Start:2019-12-07 End:2020-03-05 

In [15]:
df.head().drop(["alt_photos"], axis=1)

Unnamed: 0,property_url,mls,mls_id,status,style,street,unit,city,state,zip_code,beds,full_baths,half_baths,sqft,year_built,days_on_mls,list_price,list_date,sold_price,last_sold_date,lot_sqft,price_per_sqft,latitude,longitude,stories,hoa_fee,parking_garage,primary_photo
0,https://www.realtor.com/realestateandhomes-detail/5348699173,FLFL,A2066865,SOLD,PropertyType.SINGLE_FAMILY,3641 NW 16th St,,Miami,FL,33125,3,2,,1433,1948,7,195000,2015-03-25,205000,2015-04-01,5400.0,143,25.788917,-80.254697,,,,http://p.rdcpix.com/v07/ld41d1245-m0od-w480_h360_x2.webp?w=1080&q=75
1,https://www.realtor.com/realestateandhomes-detail/6048933050,FLFL,A2055459,SOLD,PropertyType.SINGLE_FAMILY,6031 SW 115th Ave,,Miami,FL,33173,4,2,,1848,1989,81,325000,2015-01-10,310000,2015-04-01,4958.0,168,25.71052,-80.379932,,,1.0,http://p.rdcpix.com/v02/ld8590a45-m0od-w480_h360_x2.webp?w=1080&q=75
2,https://www.realtor.com/realestateandhomes-detail/6559185457,FLFL,A2016177,SOLD,PropertyType.CONDO_TOWNHOME_ROWHOME_COOP,2280 SW 32nd Ave,Ste 210,Miami,FL,33145,1,1,,763,2004,153,170000,2014-10-30,160000,2015-04-01,,210,25.748703,-80.246643,,,1.0,http://p.rdcpix.com/v03/l10fcee44-m0od-w480_h360_x2.webp?w=1080&q=75
3,https://www.realtor.com/realestateandhomes-detail/5153160217,FLFL,A2071383,SOLD,PropertyType.SINGLE_FAMILY,5112 SW 151st Pl,,Miami,FL,33185,3,2,,2001,1981,14,259900,2015-03-18,255000,2015-04-01,4576.0,127,25.717333,-80.437491,,,1.0,http://p.rdcpix.com/v16/l14291545-m0od-w480_h360_x2.webp?w=1080&q=75
4,https://www.realtor.com/realestateandhomes-detail/6522900504,FLFL,A2033459,SOLD,PropertyType.CONDO_TOWNHOME_ROWHOME_COOP,13215 SW 143 Te,Unit 13215,Miami,FL,33186,5,3,,2400,2002,139,247500,2014-11-13,243000,2015-04-01,2500.0,101,25.634643,-80.407095,,,,http://p.rdcpix.com/v27/lebbbfb44-m0od-w480_h360_x2.webp?w=1080&q=75


In [35]:
dataset = PropertyDatasetProcessor(df, "Miami").clean_dataset()
dataset.head().drop(["alt_photos"], axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["distance_to_downtown"] = dataset.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["baths"] = dataset.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["sqft"] = dataset.apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

Unnamed: 0,property_url,mls,mls_id,status,style,street,unit,city,state,zip_code,beds,full_baths,half_baths,sqft,year_built,days_on_mls,list_price,list_date,sold_price,last_sold_date,lot_sqft,price_per_sqft,latitude,longitude,stories,hoa_fee,parking_garage,primary_photo,distance_to_downtown,baths,sold_year,age
1,https://www.realtor.com/realestateandhomes-detail/6048933050,FLFL,A2055459,SOLD,PropertyType.SINGLE_FAMILY,6031 SW 115th Ave,,Miami,FL,33173,4.0,2,,1848.0,1989,81,325000,2015-01-10,310000,2015-04-01,4958.0,168,25.71052,-80.379932,0.0,0.0,1,http://p.rdcpix.com/v02/ld8590a45-m0od-w480_h360_x2.webp?w=1080&q=75,20.2,2.0,2015,26
2,https://www.realtor.com/realestateandhomes-detail/6559185457,FLFL,A2016177,SOLD,PropertyType.CONDO_TOWNHOME_ROWHOME_COOP,2280 SW 32nd Ave,Ste 210,Miami,FL,33145,1.0,1,,763.0,2004,153,170000,2014-10-30,160000,2015-04-01,0.0,210,25.748703,-80.246643,0.0,0.0,1,http://p.rdcpix.com/v03/l10fcee44-m0od-w480_h360_x2.webp?w=1080&q=75,6.2,1.0,2015,11
3,https://www.realtor.com/realestateandhomes-detail/5153160217,FLFL,A2071383,SOLD,PropertyType.SINGLE_FAMILY,5112 SW 151st Pl,,Miami,FL,33185,3.0,2,,2001.0,1981,14,259900,2015-03-18,255000,2015-04-01,4576.0,127,25.717333,-80.437491,0.0,0.0,1,http://p.rdcpix.com/v16/l14291545-m0od-w480_h360_x2.webp?w=1080&q=75,25.5,2.0,2015,34
11,https://www.realtor.com/realestateandhomes-detail/5235282982,FLFL,A2072602,SOLD,PropertyType.SINGLE_FAMILY,16220 SW 103rd Pl,,Miami,FL,33157,3.0,2,,1732.0,2000,17,86900,2015-03-15,133500,2015-04-01,8346.0,77,25.618355,-80.361052,0.0,0.0,1,http://p.rdcpix.com/v02/lcbfd1545-m0od-w480_h360_x2.webp?w=1080&q=75,24.1,2.0,2015,15
13,https://www.realtor.com/realestateandhomes-detail/6392916898,FLFL,A2070735,SOLD,PropertyType.SINGLE_FAMILY,16259 SW 44th Ln,,Miami,FL,33185,4.0,3,,2340.0,2005,7,329000,2015-03-25,315000,2015-04-01,3096.0,135,25.722867,-80.457076,0.0,0.0,2,http://p.rdcpix.com/v07/l65be1445-m0od-w480_h360_x2.webp?w=1080&q=75,27.3,3.0,2015,10


In [36]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 35549 entries, 1 to 112668
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   property_url          35549 non-null  object 
 1   mls                   35279 non-null  object 
 2   mls_id                35279 non-null  object 
 3   status                35549 non-null  object 
 4   style                 35549 non-null  object 
 5   street                35549 non-null  object 
 6   unit                  17574 non-null  object 
 7   city                  35549 non-null  object 
 8   state                 35549 non-null  object 
 9   zip_code              35549 non-null  object 
 10  beds                  35549 non-null  float64
 11  full_baths            35543 non-null  object 
 12  half_baths            10471 non-null  object 
 13  sqft                  35549 non-null  float64
 14  year_built            35549 non-null  object 
 15  days_on_mls           2

In [37]:
model = train_model(dataset)

Model trained with the score: 0.4681595924865435


In [41]:
write_model_to_storage(model, "Miami", "FL")

miami_fl.pkl uploaded successfully.
