In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import nltk
import os
import seaborn as sns

import altair as alt
alt.data_transformers.disable_max_rows()
alt.themes.enable('fivethirtyeight')

import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

# supress warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

from scipy.stats import ttest_ind

from sklearn.cluster import KMeans


from textblob import TextBlob

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
root_path = '/content/drive/My Drive/airbnb/nyc-airbnb/'
files = os.listdir(root_path)
dates = [f[4:] for f in files if f.startswith('nyc')]
dates = sorted(dates)

In [4]:
dates

['2022-11-02',
 '2022-12-04',
 '2023-01-04',
 '2023-02-05',
 '2023-03-06',
 '2023-04-05',
 '2023-05-03',
 '2023-06-05',
 '2023-07-03',
 '2023-08-04',
 '2023-09-05',
 '2023-10-01',
 '2023-11-01']

In [5]:
dfs = {}
for date in dates:
  path = f'{root_path}nyc-{date}/listings.csv.gz'
  temp = pd.read_csv(path, compression = 'gzip')
  temp['price'] = temp['price'].str.replace("$", "").str.replace(",", "").astype(float)
  dfs[date] = temp

In [6]:
df_all = []
for i in range(len(dates)):
  curr_date = dates[i]
  next_date = dates[i+1] if i < len(dates)-1 else ""
  date_df = dfs[curr_date]
  date_df["date"] = curr_date
  date_df["next_date"] = next_date
  df_all.append(date_df)
df_all = pd.concat(df_all)
next_df = df_all[["id","date"]].rename(columns={"date": "next_date"})
next_df["drop"] = 0
df_all = pd.merge(df_all, next_df, on=["id","next_date"], how="left")
df_all["drop"] = df_all["drop"].fillna(1)
# df_all = df_all[(~df_all["date"].isin(["2023-08-04","2023-10-01"]))]
df_all = df_all[(~df_all["date"].isin(["2023-08-04","2023-09-05","2023-11-01"]))]

one_hot_encoded_df = pd.get_dummies(df_all['neighbourhood_group_cleansed'], prefix='neighborhood_')
df_all = df_all.join(one_hot_encoded_df)
df_all = df_all[df_all["price"] != 0]
df_all["log_price"] = np.log(df_all["price"])

In [7]:
df_all.groupby("date")["drop"].sum()

date
2022-11-02    1327.0
2022-12-04    1809.0
2023-01-04    2818.0
2023-02-05    2196.0
2023-03-06    2124.0
2023-04-05    2171.0
2023-05-03    2124.0
2023-06-05    1825.0
2023-07-03    2043.0
2023-10-01    1487.0
Name: drop, dtype: float64

In [8]:
df_all.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [9]:
df_before_law = df_all[df_all['date'] <= '2023-07-03']
df_before_law['constant'] = 1
df_before_law['host_since'] = pd.to_datetime(df_before_law['host_since'])
df_before_law['host_years'] = 2023 - df_before_law['host_since'].dt.year

In [10]:
df_all['date_dummy'] = (df_all['date']>= '2023-10-01').astype(int)
df_all['constant'] = 1
df_all['host_since'] = pd.to_datetime(df_all['host_since'])
df_all['host_years'] = 2023 - df_all['host_since'].dt.year

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm

neighborhood_dummy = [#'neighborhood__Bronx',
                      'neighborhood__Brooklyn',
                      'neighborhood__Queens',
                      ]

x = df_before_law.copy()

x = x[x["neighbourhood_group_cleansed"].isin(["Brooklyn","Manhattan","Queens"])]
x.loc[x["review_scores_rating"] < 4.5, "rating"] = 0
x.loc[(x["review_scores_rating"] >= 4.5) & (x["review_scores_rating"] < 4.8), "rating"] = 1
x.loc[(x["review_scores_rating"] >= 4.8), "rating"] = 2
x["host_listings_count"] = np.minimum(x["host_listings_count"], 5)
x["beds"] = np.minimum(x["beds"], 5)
x["num_rating_per_year"] = np.log((x["number_of_reviews"]+1) / np.maximum(x["host_years"],1))

scaler = StandardScaler()
scaler_needed_columns = ['log_price',
                         'availability_30',
                         'beds',
                         'host_listings_count',
                         'rating',
                         'host_years'
                         ]

x[scaler_needed_columns] = scaler.fit_transform(x[scaler_needed_columns])

## nights that greater than 30days becomes 1, otherwise 0
x["minimum_nights_30"] = x["minimum_nights"] // 30
x["minimum_nights_30"] = np.minimum(x["minimum_nights_30"], 1)
x["maximum_nights_30"] = x["maximum_nights"] // 30
x["maximum_nights_30"] = np.minimum(x["maximum_nights_30"], 1)
x["accommodates_2"] = np.minimum(x["accommodates"], 5)
x['instant_bookable_'] = x['instant_bookable'].map({'f': 0, 't': 1}).fillna(0)
x['host_is_superhost_'] = x['host_is_superhost'].map({'f':0, 't':1}).fillna(0)

initial_columns = ['constant'] + neighborhood_dummy +\
                  [
                   'maximum_nights_30',
                   'instant_bookable_',
                   'host_is_superhost_'
                   ] + scaler_needed_columns
columns = initial_columns

x = x[x[columns].notnull().all(axis=1)]
X = x[columns]
y = x['drop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
sm_model = sm.Logit(y_train, X_train)
sm_res = sm_model.fit()
sm_res.summary()

Optimization terminated successfully.
         Current function value: 0.657111
         Iterations 5


0,1,2,3
Dep. Variable:,drop,No. Observations:,419356.0
Model:,Logit,Df Residuals:,419344.0
Method:,MLE,Df Model:,11.0
Date:,"Sun, 10 Dec 2023",Pseudo R-squ.:,0.05199
Time:,07:46:24,Log-Likelihood:,-275560.0
converged:,True,LL-Null:,-290680.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
constant,0.4468,0.009,49.156,0.000,0.429,0.465
neighborhood__Brooklyn,-0.2933,0.008,-38.949,0.000,-0.308,-0.279
neighborhood__Queens,-0.3668,0.010,-36.720,0.000,-0.386,-0.347
maximum_nights_30,-0.3410,0.008,-41.096,0.000,-0.357,-0.325
instant_bookable_,-0.0808,0.008,-10.013,0.000,-0.097,-0.065
host_is_superhost_,-0.2366,0.008,-28.223,0.000,-0.253,-0.220
log_price,0.1700,0.004,38.710,0.000,0.161,0.179
availability_30,0.2548,0.003,76.743,0.000,0.248,0.261
beds,-0.0992,0.004,-26.571,0.000,-0.106,-0.092


In [12]:
def fdr(pvals, level=0.05):
  thresholds = pvals.sort_values().rank() / pvals.size * level
  significance = pd.DataFrame({"pvalues": pvals, "thresholds": thresholds})
  significance["significant"] = False
  significance.loc[significance["pvalues"] < significance["thresholds"], "significant"] = True
  return significance
fdr(sm_res.pvalues)

Unnamed: 0,pvalues,thresholds,significant
availability_30,0.0,0.016667,True
beds,1.4546929999999998e-155,0.041667,True
constant,0.0,0.016667,True
host_is_superhost_,3.0172510000000004e-175,0.0375,True
host_listings_count,0.0,0.016667,True
host_years,0.0,0.016667,True
instant_bookable_,1.3309680000000002e-23,0.05,True
log_price,0.0,0.016667,True
maximum_nights_30,0.0,0.016667,True
neighborhood__Brooklyn,0.0,0.016667,True


In [13]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
VIFs = pd.Series(
    [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    index=X_train.columns
)
print(VIFs)

constant                  9.268344
neighborhood__Brooklyn    1.342978
neighborhood__Queens      1.416176
maximum_nights_30         1.042806
instant_bookable_         1.079477
host_is_superhost_        1.146712
log_price                 1.523006
availability_30           1.191060
beds                      1.330291
host_listings_count       1.180820
rating                    1.146976
host_years                1.178624
dtype: float64


In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import statsmodels.api as sm

neighborhood_dummy = [
                      'neighborhood__Brooklyn',
                      'neighborhood__Queens',
                      ]

x = df_all.copy()

x = x[x["neighbourhood_group_cleansed"].isin(["Brooklyn","Manhattan","Queens"])]
x.loc[x["review_scores_rating"] < 4.5, "rating"] = 0
x.loc[(x["review_scores_rating"] >= 4.5) & (x["review_scores_rating"] < 4.8), "rating"] = 1
x.loc[(x["review_scores_rating"] >= 4.8), "rating"] = 2
x["host_listings_count"] = np.minimum(x["host_listings_count"], 5)
x["beds"] = np.minimum(x["beds"], 5)
x["num_rating_per_year"] = np.log((x["number_of_reviews"]+1) / np.maximum(x["host_years"],1))

scaler = StandardScaler()
scaler_needed_columns = ['log_price',
                         'availability_30',
                         'beds',
                         'host_listings_count',
                         'rating',
                         'host_years'
                         ]

x[scaler_needed_columns] = scaler.fit_transform(x[scaler_needed_columns])

## nights that greater than 30days becomes 1, otherwise 0
x["minimum_nights_30"] = x["minimum_nights"] // 30
x["minimum_nights_30"] = np.minimum(x["minimum_nights_30"], 1)
x["maximum_nights_30"] = x["maximum_nights"] // 30
x["maximum_nights_30"] = np.minimum(x["maximum_nights_30"], 1)
x["accommodates_2"] = np.minimum(x["accommodates"], 5)
x['instant_bookable_'] = x['instant_bookable'].map({'f': 0, 't': 1}).fillna(0)
x['host_is_superhost_'] = x['host_is_superhost'].map({'f':0, 't':1}).fillna(0)

initial_columns = ['constant'] + neighborhood_dummy +\
                  [
                   'maximum_nights_30',
                   'instant_bookable_',
                   'host_is_superhost_'
                   ] + scaler_needed_columns

# add dummy to the columns
# remove maximum_nights_30 because few listings's maximum nights are less than 30 days.
dummy_columns = []
for column in initial_columns:
  if column == "maximum_nights_30":
    continue
  x[f'{column}_chg'] = x[column] * x['date_dummy']
  dummy_columns.append(f'{column}_chg')

columns = initial_columns+dummy_columns

x = x[x[columns].notnull().all(axis=1)]
X = x[columns]
y = x['drop']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)
sm_model = sm.Logit(y_train, X_train)
sm_res = sm_model.fit()
sm_res.summary()

Optimization terminated successfully.
         Current function value: 0.655888
         Iterations 5


0,1,2,3
Dep. Variable:,drop,No. Observations:,460274.0
Model:,Logit,Df Residuals:,460251.0
Method:,MLE,Df Model:,22.0
Date:,"Sun, 10 Dec 2023",Pseudo R-squ.:,0.05375
Time:,07:46:59,Log-Likelihood:,-301890.0
converged:,True,LL-Null:,-319040.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
constant,0.4472,0.009,49.465,0.000,0.429,0.465
neighborhood__Brooklyn,-0.2790,0.008,-37.145,0.000,-0.294,-0.264
neighborhood__Queens,-0.3346,0.010,-33.694,0.000,-0.354,-0.315
maximum_nights_30,-0.3397,0.008,-41.337,0.000,-0.356,-0.324
instant_bookable_,-0.0892,0.008,-11.083,0.000,-0.105,-0.073
host_is_superhost_,-0.2196,0.008,-26.351,0.000,-0.236,-0.203
log_price,0.1825,0.004,41.429,0.000,0.174,0.191
availability_30,0.2619,0.003,79.477,0.000,0.255,0.268
beds,-0.1024,0.004,-27.575,0.000,-0.110,-0.095


In [15]:
### Using False discovery rate to set a significance threshold for multiple comparison
fdr(sm_res.pvalues)

Unnamed: 0,pvalues,thresholds,significant
availability_30,0.0,0.007609,True
availability_30_chg,6.144592e-11,0.034783,True
beds,2.229099e-167,0.019565,True
beds_chg,4.799155000000001e-22,0.030435,True
constant,0.0,0.007609,True
constant_chg,6.382157e-08,0.041304,True
host_is_superhost_,4.99799e-153,0.021739,True
host_is_superhost__chg,4.360308e-49,0.026087,True
host_listings_count,0.0,0.007609,True
host_listings_count_chg,0.9288888,0.05,False


In [16]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
VIFs = pd.Series(
    [variance_inflation_factor(X.values, i) for i in range(X.shape[1])],
    index=X_train.columns
)
print(VIFs)

constant                      10.074190
neighborhood__Brooklyn         1.473076
neighborhood__Queens           1.550866
maximum_nights_30              1.056621
instant_bookable_              1.177717
host_is_superhost_             1.266760
log_price                      1.679007
availability_30                1.299684
beds                           1.459057
host_listings_count            1.299681
rating                         1.258800
host_years                     1.296932
constant_chg                   3.253317
neighborhood__Brooklyn_chg     2.406321
neighborhood__Queens_chg       1.850864
instant_bookable__chg          1.438228
host_is_superhost__chg         1.663993
log_price_chg                  1.716381
availability_30_chg            1.283344
beds_chg                       1.460703
host_listings_count_chg        1.266026
rating_chg                     1.263135
host_years_chg                 1.272637
dtype: float64
