In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import os
from scipy import stats
import numpy as np
import sqlalchemy
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [None]:
# The connection string is read from the environment rather than hard-coded here.
uri = os.environ["DB_URI"]
engine = sqlalchemy.create_engine(uri)

# Load every row and column of the rentals table into a DataFrame.
sql = "SELECT * FROM rentals;"
data = pd.read_sql(sql, con=engine)

# Subset data

In [None]:
# Keep only apartment-like listings and the columns used further down.
allowed_property_types = ["Condo / Apartment", "Loft / Studio"]
numerical_cols = ["rent", "year_built", "num_bathrooms", "num_bedrooms", "area"]
cols_of_interest = numerical_cols + ["city", "neighborhood"]

cleaned_data = data[data.property_type.isin(allowed_property_types)]
print(len(cleaned_data))  # rows left after the property-type filter

# NOTE(review): "neighborhood" takes part in this null check although a later
# cell overwrites that column from "city" -- confirm that dropping rows with a
# missing raw neighborhood is intended.
cleaned_data = cleaned_data.loc[:, cols_of_interest].dropna()
print(len(cleaned_data))  # rows left after removing incomplete records

# Remove rows where any numeric column lies 3 or more standard deviations
# from that column's mean.
abs_z = np.abs(stats.zscore(cleaned_data.loc[:, numerical_cols]))
# Expressed as "no column is an outlier" instead of "every column is < 3":
# the two agree for finite z-scores, but a zero-variance column produces NaN
# z-scores, and NaN < 3 is False, which would otherwise discard every row.
is_outlier = (abs_z >= 3).any(axis=1)
# .copy() yields an independent frame so that later column assignments do not
# raise SettingWithCopyWarning on a filtered slice.
cleaned_data = cleaned_data[~is_outlier].copy()
print(len(cleaned_data))  # rows left after outlier removal

# Prep data

In [None]:
def extract_neigborhood_from_city(city_name):
    """Return the neighborhood embedded in a ``"City (Neighborhood)"`` label.

    Args:
        city_name: City label as a string, optionally followed by a
            parenthesised neighborhood, e.g. ``"Montreal (Plateau)"``.

    Returns:
        The text that follows the first ``"("`` (up to the next ``"("``, if
        any) with surrounding whitespace and ``")"`` characters removed.
        When the label contains no ``"("``, the trimmed label itself is
        returned.
    """
    city_parts = city_name.split("(")
    if len(city_parts) == 1:
        # No parenthesised part: use the city name, trimmed so that values
        # differing only by padding are not treated as distinct labels.
        return city_parts[0].strip()
    # Trim whitespace *before* removing ")" -- otherwise a trailing space
    # after the closing parenthesis shields it and the result keeps the ")".
    return city_parts[1].strip().strip(")").strip()

In [None]:
# Derive both columns from the untouched city labels in `data`, not from
# cleaned_data.city: this cell overwrites cleaned_data.city, so reading it back
# on a re-run would turn every neighborhood into the bare city name.
# Assumes cleaned_data still carries the row index inherited from `data`.
raw_city = data.loc[cleaned_data.index, "city"]

# assign() builds a new frame (existing columns keep their positions), which
# also avoids SettingWithCopyWarning when cleaned_data is a filtered slice.
cleaned_data = cleaned_data.assign(
    neighborhood=raw_city.apply(extract_neigborhood_from_city),
    city=raw_city.str.split("(").str[0].str.strip(),
)

In [None]:
# Map each distinct neighborhood string to an integer code, stored as "region".
label_encoder = LabelEncoder()
region_codes = label_encoder.fit_transform(cleaned_data["neighborhood"])
cleaned_data["region"] = region_codes

In [None]:
# Predictors: the numeric listing attributes plus the encoded "region" column.
features = ["year_built", "num_bathrooms", "num_bedrooms", "area", "region"]
X = cleaned_data[features]
y = cleaned_data["rent"]

# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=100,
)

# Model

In [None]:
# Small random forest: 10 bootstrapped trees, each capped at depth 10.
# random_state is fixed (same seed as the train/test split) so the fitted
# forest, and therefore the reported score, is reproducible across runs.
estimator = RandomForestRegressor(
    n_estimators=10,
    max_depth=10,
    bootstrap=True,
    random_state=100,
)

In [None]:
# Train on the training split, then display the score on the held-out rows
# (for scikit-learn regressors, `score` is the R^2 coefficient of determination).
# fit() returns the estimator itself, so the two calls can be chained.
estimator.fit(X_train, y_train).score(X_test, y_test)