# Sandbox notebook

This is just my sandbox notebook for doing throwaway work and quick experiments. I'll try to annotate as much as possible, but since it's a sandbox, no guarantees.

In [6]:
import pandas as pd
import seaborn as sns
import numpy
import os.path
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error

from math import sqrt

import matplotlib.pyplot as plt

In [7]:
housing_df = pd.read_csv(os.path.join(os.getcwd(), 'Sacramentorealestatetransactions.csv'))
housing_df.head()

Unnamed: 0,street,city,zip,state,beds,baths,sq__ft,type,sale_date,price,latitude,longitude
0,3526 HIGH ST,SACRAMENTO,95838,CA,2,1,836,Residential,Wed May 21 00:00:00 EDT 2008,59222,38.631913,-121.434879
1,51 OMAHA CT,SACRAMENTO,95823,CA,3,1,1167,Residential,Wed May 21 00:00:00 EDT 2008,68212,38.478902,-121.431028
2,2796 BRANCH ST,SACRAMENTO,95815,CA,2,1,796,Residential,Wed May 21 00:00:00 EDT 2008,68880,38.618305,-121.443839
3,2805 JANETTE WAY,SACRAMENTO,95815,CA,2,1,852,Residential,Wed May 21 00:00:00 EDT 2008,69307,38.616835,-121.439146
4,6001 MCMAHON DR,SACRAMENTO,95824,CA,2,1,797,Residential,Wed May 21 00:00:00 EDT 2008,81900,38.51947,-121.435768


In [8]:
housing_df.columns

Index(['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq__ft', 'type',
       'sale_date', 'price', 'latitude', 'longitude'],
      dtype='object')

In [9]:
numeric_columns = ['beds', 'baths', 'sq__ft', 'latitude', 'longitude']
categorical_columns = ['city', 'zip', 'type']
scaled_columns = ['beds_scaled', 'baths_scaled', 'sq__ft_scaled', 'latitude_scaled', 'longitude_scaled']

categorical_pipeline = make_pipeline(
    OneHotEncoder()
)

numeric_pipeline = make_pipeline(
    StandardScaler()
)

price_pipeline = make_pipeline(
    KBinsDiscretizer(n_bins=5, encode='ordinal')
)

full_pipeline = ColumnTransformer([
    ("categorical", categorical_pipeline, categorical_columns),
    ("numeric", numeric_pipeline, numeric_columns),
    # ("price", price_pipeline, ['price'])
])

feature_df = full_pipeline.fit_transform(housing_df)
feature_df
# pd.DataFrame.sparse.from_spmatrix(feature_df)

full_pipeline.get_feature_names_out(housing_df.columns)
# We could make a pipeline here with a one-hot encoder for the categorical columns but that's already been done

# Fit and apply the StandardScaler with the training data, then apply it to the testing data
# training_df[scaled_columns] = full_pipeline.fit_transform(training_df)
# testing_df[scaled_columns] = full_pipeline.transform(testing_df)

# # Get the price columns
# training_prices = training_df['price']
# testing_prices = testing_df['price']

# # Drop the columns we don't want to use in the regression model
# columns_to_drop = ['street', 'city', 'zip', 'state', 'beds', 'baths', 'sq__ft', 'type', 'sale_date', 'price', 'latitude', 'longitude', 'price_bin']
# training_df = training_df.drop(columns=columns_to_drop)
# testing_df = testing_df.drop(columns=columns_to_drop)

array(['categorical__city_ANTELOPE', 'categorical__city_AUBURN',
       'categorical__city_CAMERON PARK', 'categorical__city_CARMICHAEL',
       'categorical__city_CITRUS HEIGHTS', 'categorical__city_COOL',
       'categorical__city_DIAMOND SPRINGS', 'categorical__city_EL DORADO',
       'categorical__city_EL DORADO HILLS', 'categorical__city_ELK GROVE',
       'categorical__city_ELVERTA', 'categorical__city_FAIR OAKS',
       'categorical__city_FOLSOM', 'categorical__city_FORESTHILL',
       'categorical__city_GALT', 'categorical__city_GARDEN VALLEY',
       'categorical__city_GOLD RIVER', 'categorical__city_GRANITE BAY',
       'categorical__city_GREENWOOD', 'categorical__city_LINCOLN',
       'categorical__city_LOOMIS', 'categorical__city_MATHER',
       'categorical__city_MEADOW VISTA',
       'categorical__city_NORTH HIGHLANDS',
       'categorical__city_ORANGEVALE', 'categorical__city_PENRYN',
       'categorical__city_PLACERVILLE', 'categorical__city_POLLOCK PINES',
       'cate

In [10]:
from sklearn.cluster import DBSCAN
coords = housing_df[['latitude', 'longitude']]
coords.head()

Unnamed: 0,latitude,longitude
0,38.631913,-121.434879
1,38.478902,-121.431028
2,38.618305,-121.443839
3,38.616835,-121.439146
4,38.51947,-121.435768


In [19]:
kms_per_radian = 6371.0088
epsilon = 100 / kms_per_radian

clust = DBSCAN(eps=epsilon, min_samples=1, algorithm='ball_tree', metric='haversine').fit(coords)
len(set(clust.labels_))
pd.Series(clust.labels_).value_counts().sort_index()

0     394
1     335
2      14
3      41
4      21
5       6
6       1
7      11
8       1
9       2
10      1
11      3
12      5
13      1
14      1
15      1
16      1
17     30
18      1
19     64
20      3
21      1
22      1
23      3
24      1
25      5
26      1
27      1
28      1
29      1
30      3
31      1
32      1
33      3
34      1
35      1
36      1
37      1
38      1
39      1
40      1
41      1
42      1
43      1
44      1
45      1
46      1
47      1
48      1
49      1
50      1
51      1
52      1
53      1
54      1
55      1
56      3
dtype: int64

In [22]:
import folium

map = folium.Map(location=[coords['latitude'].mean(), coords['longitude'].mean()], zoom_start=10)

coords['cluster'] = clust.labels_

for row in coords[coords.cluster == 2].values:
    data = row.tolist()
    folium.Marker(data[0:2], icon=folium.Icon(color=data[2])).add_to(map)


map