In [None]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

# Render floats in DataFrame output with thousands separators and two decimals
pd.options.display.float_format = '{:,.2f}'.format

# Make IPython display the result of every expression statement in a cell,
# not only the last one (several cells below rely on this to show two previews)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [None]:
# Load the listings CSV (the path is relative to the notebook's working directory)
airbnb_data = pd.read_csv(filepath_or_buffer='Kaggle_Data/Airbnb_listings_Austin.csv')

# Preview the first five rows
airbnb_data.head(n=5)

In [None]:
# Number of clusters requested from the fixed-k models
# (K-means and the first agglomerative run)
Kval = 6

# Missing-value count for every column
airbnb_data.isnull().sum()


In [None]:
# Import the visualisation libraries and initialise plotly's offline notebook mode
import plotly
plotly.offline.init_notebook_mode(connected=True)
# NOTE(review): wildcard import; the cells visible in this notebook only use
# the explicit `go` alias imported below
from plotly.graph_objs import *
from plotly import tools
import plotly.graph_objects as go
import seaborn as sns

In [None]:
# Pairwise correlation of the numeric columns only.
# Calling .corr() on the whole frame raises on pandas >= 2.0 as soon as the
# frame contains a non-numeric column (the listings file is expected to hold
# text columns such as the listing name), so restrict to numeric dtypes first.
# select_dtypes is used instead of corr(numeric_only=True) because it also
# works on pandas releases that predate that keyword.
correl = airbnb_data.select_dtypes(include='number').corr()

# Rows of z are labelled by y and columns of z by x
trace = go.Heatmap(z=correl.values,
                   x=correl.columns.values,
                   y=correl.index.values)
data = [trace]
plotly.offline.iplot(data, filename='basic-heatmap')

In [None]:
# List the available column names (the clustering features are picked from these)
airbnb_data.columns

In [None]:
from sklearn.preprocessing import StandardScaler

# Cluster on geographic position only; standardize both coordinates so that
# neither dominates the Euclidean distances used by the models below
cols = ['latitude', 'longitude']
sc = StandardScaler()
subset_data = pd.DataFrame(
    sc.fit_transform(airbnb_data[cols]),
    index=airbnb_data.index,
    columns=cols,
)

In [None]:
# Preview the standardized latitude/longitude frame
subset_data.head()

In [None]:
from sklearn.cluster import KMeans

# K-means with Kval clusters on the standardized coordinates; the fixed
# random_state makes the random initialisation reproducible
km = KMeans(
    n_clusters=Kval,
    init='random',
    n_init=10,
    max_iter=500,
    tol=1e-04,
    random_state=50,
)

# fit() returns the estimator itself and labels_ holds the same assignments
# that fit_predict() returns
data_km = pd.DataFrame(km.fit(subset_data).labels_, index=subset_data.index)

In [None]:
# Preview the raw K-means labels (a single column, still named 0 at this point)
data_km.head()

In [None]:
# Name the label column and attach the ORIGINAL (unscaled) coordinates so the
# scatter plot axes are real latitude/longitude values.
#
# Only the 'cluster' column is kept before merging, which makes this cell safe
# to re-run: without it a second execution would merge the coordinates again
# and produce latitude_x/latitude_y columns, breaking the plotting cell.
#
# Note: km.cluster_centers_ live in the standardized space of subset_data. To
# overlay them on these axes map them back with sc.inverse_transform(), or
# merge subset_data[['latitude', 'longitude']] here instead.
data_km = (
    data_km.rename(columns={0: 'cluster'})[['cluster']]
    .merge(airbnb_data[['latitude', 'longitude']], left_index=True, right_index=True)
)
data_km.head()

In [None]:
# pd.set_option('display.max_rows', None)
# data_km.sort_values(by = ['cluster'])

In [None]:
# One scatter trace per K-means cluster so every cluster gets its own legend entry
plot_data = []
for clus in set(data_km['cluster']):
    df = data_km.loc[data_km['cluster'] == clus]
    plot_data.append(
        go.Scatter(
            x=df['longitude'],
            y=df['latitude'],
            mode='markers',
            name=f'cluster {clus}',
        )
    )

# Cluster centres are intentionally not drawn: km.cluster_centers_ holds
# (latitude, longitude) in standardized units, which would not line up with
# the coordinates plotted here unless they are transformed back first.

layout = go.Layout(
    title='Clustering',
    xaxis={'title': 'longitude'},
    yaxis={'title': 'latitude'},
)
fig = go.Figure(data=plot_data, layout=layout)
plotly.offline.iplot(fig)
    


In [None]:
# Elbow method: fit K-means for a range of cluster counts and plot the inertia
# (sum of squared distances of samples to their closest centre) against k.
cluster_counts = range(2, 30)
distortions = []
for i in cluster_counts:
    # A separate name is used so the Kval model stored in `km` by the earlier
    # cell is not silently replaced by the last model of this sweep.
    km_elbow = KMeans(n_clusters = i,
                      # k-means++ picks the initial centres one at a time,
                      # favouring points far from the centres already chosen
                      init = 'k-means++',
                      n_init = 10,
                      max_iter = 500,
                      random_state = 50)
    km_elbow.fit(subset_data)
    distortions.append(km_elbow.inertia_)

plt.plot(cluster_counts, distortions, marker = 'o')
plt.xlabel('Number of Clusters')
plt.ylabel('Distortion')
plt.tight_layout()
plt.show()

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Complete-linkage agglomerative clustering into Kval clusters.
# The distance is left at its Euclidean default on purpose: the `affinity`
# keyword was deprecated in scikit-learn 1.2 and removed in 1.4 (renamed to
# `metric`), so omitting it keeps this cell working on old and new releases.
ac = AgglomerativeClustering(n_clusters = Kval,
                             linkage = 'complete')
data_ac = pd.DataFrame(ac.fit_predict(subset_data), index = subset_data.index)
data_ac.head()

In [None]:
# Name the label column and attach the standardized coordinates the model was
# fitted on (merge airbnb_data[['latitude', 'longitude']] instead to plot the
# real coordinates).
#
# Only the 'cluster' column is kept before merging, which makes this cell safe
# to re-run: without it a second execution would merge the coordinates again
# and produce latitude_x/latitude_y columns, breaking the plotting cell.
data_ac = (
    data_ac.rename(columns={0: 'cluster'})[['cluster']]
    .merge(subset_data[['latitude', 'longitude']], left_index=True, right_index=True)
)
data_ac.head()

In [None]:
# One scatter trace per agglomerative cluster so every cluster gets its own legend entry
plot_data = []
for clus in set(data_ac['cluster']):
    df = data_ac.loc[data_ac['cluster'] == clus]
    plot_data.append(
        go.Scatter(
            x=df['longitude'],
            y=df['latitude'],
            mode='markers',
            name=f'cluster {clus}',
        )
    )

layout = go.Layout(
    title='Clustering',
    xaxis={'title': 'longitude'},
    yaxis={'title': 'latitude'},
)
fig = go.Figure(data=plot_data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
# Now let the distance threshold decide the number of clusters instead of
# prescribing it. The threshold is crucial and is expressed in the
# standardized units of subset_data. The original note records that 5.0 gives
# 6 clusters; 4.0 is used here (inspect ac.n_clusters_ after fitting to see
# how many clusters it produces).
# `affinity` is omitted on purpose: it was deprecated in scikit-learn 1.2 and
# removed in 1.4 (renamed to `metric`); Euclidean distance is the default.
ac = AgglomerativeClustering(n_clusters = None,
                             distance_threshold = 4.0,
                             linkage = 'complete')
data_ac = pd.DataFrame(ac.fit_predict(subset_data), index = subset_data.index)
data_ac.head()

# Name the label column and attach the standardized coordinates the model was
# fitted on (merge airbnb_data[['latitude', 'longitude']] instead to plot the
# real coordinates)
data_ac = (
    data_ac.rename(columns={0: 'cluster'})
    .merge(subset_data[['latitude', 'longitude']], left_index=True, right_index=True)
)
data_ac.head()

In [None]:
# One scatter trace per cluster found by the distance-threshold run
plot_data = []
for clus in set(data_ac['cluster']):
    df = data_ac.loc[data_ac['cluster'] == clus]
    plot_data.append(
        go.Scatter(
            x=df['longitude'],
            y=df['latitude'],
            mode='markers',
            name=f'cluster {clus}',
        )
    )

layout = go.Layout(
    title='Clustering',
    xaxis={'title': 'longitude'},
    yaxis={'title': 'latitude'},
)
fig = go.Figure(data=plot_data, layout=layout)
plotly.offline.iplot(fig)

In [None]:
from sklearn.cluster import DBSCAN

# Density-based clustering on the standardized coordinates; eps is therefore
# expressed in standardized units
db = DBSCAN(eps=0.3, min_samples=5, metric='euclidean')

# fit() returns the estimator itself and labels_ holds the same assignments
# that fit_predict() returns
data_db = pd.DataFrame(db.fit(subset_data).labels_, index=subset_data.index)
data_db.head()

In [None]:
# Name the label column and attach the standardized coordinates the model was
# fitted on (merge airbnb_data[['latitude', 'longitude']] instead to plot the
# real coordinates).
#
# Only the 'cluster' column is kept before merging, which makes this cell safe
# to re-run: without it a second execution would merge the coordinates again
# and produce latitude_x/latitude_y columns, breaking the plotting cell.
data_db = (
    data_db.rename(columns={0: 'cluster'})[['cluster']]
    .merge(subset_data[['latitude', 'longitude']], left_index=True, right_index=True)
)
data_db.head()

In [None]:
# One scatter trace per DBSCAN label. DBSCAN assigns the label -1 to noise
# points, so the 'cluster -1' trace holds the samples not assigned to any cluster.
# Labels are sorted so the legend order is deterministic.
plot_data = []
for clus in sorted(set(data_db['cluster'])):
    # Filter data_db itself: the original indexed data_ac with a data_db mask,
    # which only worked while both frames shared the same index and coordinates
    df = data_db[data_db['cluster'] == clus]
    plot_data.append(go.Scatter(x = df['longitude'], y = df['latitude'],
                                name = 'cluster ' + str(clus), mode = 'markers'))

layout = go.Layout(xaxis = dict(title='longitude'), yaxis = dict(title='latitude'),
                   title = 'Clustering')
fig = go.Figure(data = plot_data, layout = layout)
plotly.offline.iplot(fig)