File: clustering.ipynb -- Unsupervised Clustering of Airbnb Data => Gentrification Labels <br>
Author: Shomik Jain <br>
Date: 2/02/2020

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn import metrics
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [2]:
file = '../data/classification_data.csv'
avg_data = pd.read_csv(file)

In [3]:
avg_data.head()

Unnamed: 0,zipcode,agg_count_log,price_log,person_capacity,review_rating_location,review_length_log,crime_words_perc_old,sentiment_neg_avg,lda2,lda3,...,gini_index,edu_bachelors,age_25_34,square_feet_log,listings_count_log,gentrifying,non_gentrifying,higher_income,crime_score_norm,gentrification_score
0,10001.0,7.732475,5.252437,3.142,9.711,3.432707,0.028249,0.018556,0.184196,0.297205,...,0.604282,0.681601,0.225217,6.358853,3.866979,0.0,0.0,1.0,0.526317,0.484911
1,10002.0,8.731934,5.091838,3.020333,9.261333,3.444472,0.037337,0.019438,0.216034,0.293421,...,0.566964,0.318629,0.178341,6.643259,4.381943,1.0,0.0,0.0,0.48192,0.378736
2,10003.0,8.604905,5.206409,3.180333,9.789,3.461813,0.035863,0.016667,0.211683,0.320224,...,0.55366,0.788286,0.273808,6.406351,4.562832,1.0,0.0,0.0,0.399749,0.567569
3,10004.0,4.946277,5.206644,3.217333,9.745333,3.426781,0.010071,0.01269,0.201332,0.302027,...,0.463547,0.843616,0.286839,7.799511,2.517237,0.0,0.0,1.0,0.001156,0.637875
4,10005.0,5.944353,5.181536,2.967333,9.582,3.371756,0.013019,0.012908,0.202391,0.342784,...,0.486856,0.881869,0.46502,6.531429,2.665439,0.0,0.0,1.0,0.028404,0.679579


In [4]:
avg_data.columns

Index(['zipcode', 'agg_count_log', 'price_log', 'person_capacity',
       'review_rating_location', 'review_length_log', 'crime_words_perc_old',
       'sentiment_neg_avg', 'lda2', 'lda3', 'lda4', 'lda5', 'lda_pca1',
       'lda_pca2', 'd2v_pca1', 'd2v_pca2', 'crime_score', 'crime_score_log',
       'race_index', 'gini_index', 'edu_bachelors', 'age_25_34',
       'square_feet_log', 'listings_count_log', 'gentrifying',
       'non_gentrifying', 'higher_income', 'crime_score_norm',
       'gentrification_score'],
      dtype='object')

In [5]:
avg_data = avg_data[(avg_data['gentrifying']==1) | (avg_data['non_gentrifying']==1)]

In [6]:
features = [
'listings_count_log',
'agg_count_log',
'price_log',
'review_rating_location',
'review_length_log',
'crime_words_perc_old',
'sentiment_neg_avg',
'lda_pca1',
'lda_pca2',
'd2v_pca1',
'd2v_pca2', 
]

In [7]:
from sklearn.cluster import KMeans

X_data = avg_data[features]
y_data = avg_data['gentrifying']

clustering = KMeans(n_clusters=2, n_init=5).fit(X_data.values)

pred1 = clustering.labels_
pred2 = []
for p in pred1:
    pred2.append(1-p)
    
pred1_score = accuracy_score(y_data, pred1)
pred2_score = accuracy_score(y_data, pred2)

pred = pred2
if pred1_score > pred2_score:
    pred = pred1
    
accuracy = accuracy_score(y_data, pred)

print(accuracy)

precision = metrics.precision_score(y_data, pred, average=None)
recall = metrics.recall_score(y_data, pred, average=None)
f1 = metrics.f1_score(y_data, pred, average=None)

print(precision, recall, f1)

0.8421052631578947
[0.625 1.   ] [1.         0.78571429] [0.76923077 0.88      ]


In [8]:
from sklearn.cluster import DBSCAN

X_data = avg_data[features]
y_data = avg_data['gentrifying']

clustering = DBSCAN(eps=0.9).fit(X_data.values)

pred = []
for p in pred1:
    if p <= 0:
        pred.append(1)
    else:
        pred.append(0)    
        
accuracy = accuracy_score(y_data, pred)

print(accuracy)

precision = metrics.precision_score(y_data, pred, average=None)
recall = metrics.recall_score(y_data, pred, average=None)
f1 = metrics.f1_score(y_data, pred, average=None)

print(precision, recall, f1)

0.8421052631578947
[0.625 1.   ] [1.         0.78571429] [0.76923077 0.88      ]


In [9]:
from sklearn.cluster import AgglomerativeClustering

X_data = avg_data[features]
y_data = avg_data['gentrifying']

clustering = AgglomerativeClustering(n_clusters=2).fit(X_data.values)

pred1 = clustering.labels_
pred2 = []
for p in pred1:
    pred2.append(1-p)
    
pred1_score = accuracy_score(y_data, pred1)
pred2_score = accuracy_score(y_data, pred2)

pred = pred2
if pred1_score > pred2_score:
    pred = pred1
    
accuracy = accuracy_score(y_data, pred)

print(accuracy)

precision = metrics.precision_score(y_data, pred, average=None)
recall = metrics.recall_score(y_data, pred, average=None)
f1 = metrics.f1_score(y_data, pred, average=None)

print(precision, recall, f1)

0.9122807017543859
[0.77777778 0.97435897] [0.93333333 0.9047619 ] [0.84848485 0.9382716 ]


In [10]:
from sklearn.cluster import OPTICS

X_data = avg_data[features]
y_data = avg_data['gentrifying']

clustering = OPTICS().fit(X_data.values)

pred = []
for p in pred1:
    if p <= 0:
        pred.append(1)
    else:
        pred.append(0)    
        
accuracy = accuracy_score(y_data, pred)

print(accuracy)

precision = metrics.precision_score(y_data, pred, average=None)
recall = metrics.recall_score(y_data, pred, average=None)
f1 = metrics.f1_score(y_data, pred, average=None)

print(precision, recall, f1)

0.9122807017543859
[0.77777778 0.97435897] [0.93333333 0.9047619 ] [0.84848485 0.9382716 ]


In [11]:
from sklearn.mixture import GaussianMixture

X_data = avg_data[features]
y_data = avg_data['gentrifying']

clustering = GaussianMixture().fit(X_data.values)

pred = []
for p in pred1:
    if p <= 0:
        pred.append(1)
    else:
        pred.append(0)    
        
accuracy = accuracy_score(y_data, pred)

print(accuracy)

precision = metrics.precision_score(y_data, pred, average=None)
recall = metrics.recall_score(y_data, pred, average=None)
f1 = metrics.f1_score(y_data, pred, average=None)

print(precision, recall, f1)

0.9122807017543859
[0.77777778 0.97435897] [0.93333333 0.9047619 ] [0.84848485 0.9382716 ]


In [12]:
from sklearn.cluster import Birch

X_data = avg_data[features]
y_data = avg_data['gentrifying']

clustering = Birch(n_clusters=2).fit(X_data.values)

pred = []
for p in pred1:
    if p <= 0:
        pred.append(1)
    else:
        pred.append(0)    
        
accuracy = accuracy_score(y_data, pred)

print(accuracy)

precision = metrics.precision_score(y_data, pred, average=None)
recall = metrics.recall_score(y_data, pred, average=None)
f1 = metrics.f1_score(y_data, pred, average=None)

print(precision, recall, f1)

0.9122807017543859
[0.77777778 0.97435897] [0.93333333 0.9047619 ] [0.84848485 0.9382716 ]


In [13]:
from sklearn.cluster import SpectralClustering

X_data = avg_data[features]
y_data = avg_data['gentrifying']

clustering = SpectralClustering(n_clusters=2).fit(X_data.values)

pred1 = clustering.labels_
pred2 = []
for p in pred1:
    pred2.append(1-p)
    
pred1_score = accuracy_score(y_data, pred1)
pred2_score = accuracy_score(y_data, pred2)

pred = pred2
if pred1_score > pred2_score:
    pred = pred1
    
accuracy = accuracy_score(y_data, pred)

print(accuracy)

precision = metrics.precision_score(y_data, pred, average=None)
recall = metrics.recall_score(y_data, pred, average=None)
f1 = metrics.f1_score(y_data, pred, average=None)

print(precision, recall, f1)

0.9122807017543859
[0.77777778 0.97435897] [0.93333333 0.9047619 ] [0.84848485 0.9382716 ]


In [14]:
# Output Data for Clustering Map

zipcodes = list(avg_data['zipcode'].unique())
y_data = list(avg_data['gentrifying'])
pred = list(pred)

predictions = pd.DataFrame(columns=['region', 'value'])
actual = pd.DataFrame(columns=['region', 'value'])

predictions['region'] = zipcodes
actual['region'] = zipcodes

predictions['value'] = pred
actual['value'] = y_data

predictions['region'] = predictions['region'].astype('int64')
actual['region'] = actual['region'].astype('int64')

actual.to_csv('clustering_real.csv', index=False)
predictions.to_csv('clustering_pred.csv', index=False)