In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import nltk
import os
import seaborn as sns

import altair as alt
alt.data_transformers.disable_max_rows()
alt.themes.enable('fivethirtyeight')

import plotly.graph_objects as go
import plotly.io as pio
pio.templates.default = "plotly_white"

# suppress warnings
import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

from scipy.stats import ttest_ind

from sklearn.cluster import KMeans


from textblob import TextBlob

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive; the data
# directory used by the following cells lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Discover the available snapshots. Each snapshot sits in a folder named
# "nyc-<YYYY-MM-DD>" under `root_path`; `dates` ends up as the sorted list of
# the date parts.
root_path = '/content/drive/My Drive/airbnb/nyc-airbnb/'
files = os.listdir(root_path)
# Test for the full "nyc-" prefix so the check agrees with the number of
# characters stripped; an entry such as "nyc.zip" must not produce a date.
snapshot_prefix = 'nyc-'
dates = sorted(f[len(snapshot_prefix):] for f in files if f.startswith(snapshot_prefix))

In [4]:
# Show the snapshot dates found on disk, in ascending order.
dates

['2022-11-02',
 '2022-12-04',
 '2023-01-04',
 '2023-02-05',
 '2023-03-06',
 '2023-04-05',
 '2023-05-03',
 '2023-06-05',
 '2023-07-03',
 '2023-08-04',
 '2023-09-05',
 '2023-10-01']

In [5]:
# Load the listings file of every snapshot into `dfs`, keyed by snapshot date.
dfs = {}
for date in dates:
  path = f'{root_path}nyc-{date}/listings.csv.gz'
  temp = pd.read_csv(path, compression = 'gzip')
  # `price` is text containing "$" and "," characters. Strip both with an
  # explicit regex character class: a bare "$" is an end-of-string anchor when
  # interpreted as a regex, and the default of `regex` differs between pandas
  # versions, so the intent is spelled out instead of relying on the default.
  temp['price'] = temp['price'].str.replace(r"[$,]", "", regex=True).astype(float)
  dfs[date] = temp

In [6]:
# Stack the snapshots into one frame and label every listing with `drop`:
# 1 when its id does not appear in the following snapshot, 0 when it does.
snapshot_frames = []
for i, curr_date in enumerate(dates):
  # The last snapshot has no successor. "" never equals a real date, so all of
  # its rows would get drop == 1; that snapshot is removed further down.
  next_date = dates[i + 1] if i < len(dates) - 1 else ""
  date_df = dfs[curr_date]
  # Note: these two columns are written into the frames stored in `dfs` too.
  date_df["date"] = curr_date
  date_df["next_date"] = next_date
  snapshot_frames.append(date_df)
df_all = pd.concat(snapshot_frames)

# (id, snapshot date) pairs re-keyed as "next_date": a left merge on
# (id, next_date) then finds the listings still present one snapshot later.
next_df = df_all[["id", "date"]].rename(columns={"date": "next_date"})
next_df["drop"] = 0
df_all = pd.merge(df_all, next_df, on=["id", "next_date"], how="left")
# No match in the next snapshot -> NaN from the merge -> listing dropped.
df_all["drop"] = df_all["drop"].fillna(1)
# 2023-10-01 is the last snapshot (no successor, see above).
# NOTE(review): the reason for also excluding 2023-08-04 is not visible in
# this notebook section -- confirm and document it.
df_all = df_all[~df_all["date"].isin(["2023-08-04", "2023-10-01"])]

# Borough indicator columns. The prefix ends in "_" and get_dummies adds its own
# separator, hence the double underscore in names like "neighborhood__Bronx".
# dtype=int keeps the indicators numeric (0/1): pandas >= 2.0 would otherwise
# return bool columns, which mix badly with float columns in a design matrix.
one_hot_encoded_df = pd.get_dummies(df_all['neighbourhood_group_cleansed'],
                                    prefix='neighborhood_', dtype=int)
df_all = df_all.join(one_hot_encoded_df)
# Zero prices would give log(0) = -inf. NaN prices pass this filter and simply
# produce a NaN log_price. Copy so the column assignment below writes to an
# independent frame rather than to a slice.
df_all = df_all[df_all["price"] != 0].copy()
df_all["log_price"] = np.log(df_all["price"])

In [7]:
# Peek at the `calculated_host_listings_count` column of the combined frame.
df_all['calculated_host_listings_count']

0         3
1         1
2         2
3         1
4         2
         ..
466370    1
466371    1
466372    1
466373    1
466374    1
Name: calculated_host_listings_count, Length: 422582, dtype: int64

In [8]:
# Number of listings per snapshot date with drop == 1, i.e. listings whose id
# is missing from the following snapshot.
df_all.groupby("date")["drop"].sum()

date
2022-11-02    1327.0
2022-12-04    1809.0
2023-01-04    2818.0
2023-02-05    2196.0
2023-03-06    2124.0
2023-04-05    2171.0
2023-05-03    2124.0
2023-06-05    1825.0
2023-07-03    2043.0
2023-09-05    2554.0
Name: drop, dtype: float64

In [9]:
# List the column names of the combined frame.
df_all.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
       'description', 'neighborhood_overview', 'picture_url', 'host_id',
       'host_url', 'host_name', 'host_since', 'host_location', 'host_about',
       'host_response_time', 'host_response_rate', 'host_acceptance_rate',
       'host_is_superhost', 'host_thumbnail_url', 'host_picture_url',
       'host_neighbourhood', 'host_listings_count',
       'host_total_listings_count', 'host_verifications',
       'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
       'neighbourhood_cleansed', 'neighbourhood_group_cleansed', 'latitude',
       'longitude', 'property_type', 'room_type', 'accommodates', 'bathrooms',
       'bathrooms_text', 'bedrooms', 'beds', 'amenities', 'price',
       'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
       'maximum_minimum_nights', 'minimum_maximum_nights',
       'maximum_maximum_nights', 'minimum_nights_avg_ntm',
       'maximum_nights_avg_ntm', 'ca

In [10]:
# Snapshots up to and including 2023-07-03. Dates are ISO "YYYY-MM-DD" strings,
# so the string comparison orders them chronologically.
# Take an explicit copy: the columns added below must go to an independent
# frame, not to a slice of `df_all` (chained assignment).
df_before_law = df_all[df_all['date'] <= '2023-07-03'].copy()
# Column of ones, used as the intercept term in the regression design matrix.
df_before_law['constant'] = 1
df_before_law['host_since'] = pd.to_datetime(df_before_law['host_since'])
# Host tenure in whole calendar years, measured against 2023.
df_before_law['host_years'] = 2023 - df_before_law['host_since'].dt.year

In [11]:
# Model helper columns on the combined frame.
# date_dummy: 1 for snapshots dated 2023-09-05 or later, 0 otherwise (ISO date
# strings compare chronologically).
is_late_snapshot = df_all['date'] >= '2023-09-05'
df_all['date_dummy'] = is_late_snapshot.astype(int)
# Column of ones used as the regression intercept.
df_all['constant'] = 1
# Parse the host sign-up date, then derive tenure in calendar years up to 2023.
df_all['host_since'] = pd.to_datetime(df_all['host_since'])
df_all['host_years'] = 2023 - df_all['host_since'].dt.year

In [12]:
# Logit of `drop` on the snapshots in `df_before_law`: standardized
# listing/host features, a few binned/flag features and borough indicators,
# fitted with statsmodels on a SMOTE-resampled training split.
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Borough indicators entering the model; 'neighborhood__Manhattan' is left out,
# so together with the constant it acts as the baseline category.
neighborhood_dummy = [
    'neighborhood__Bronx',
    'neighborhood__Brooklyn',
    'neighborhood__Queens',
    'neighborhood__Staten Island',
]

# Continuous predictors that are z-scored.
# Left out of the model: availability_60, availability_365, the
# review_scores_* sub-scores (accuracy, cleanliness, checkin, communication,
# location, value) and raw accommodates (a binned version is used instead).
scaler_needed_columns = [
    'log_price',
    'availability_30',
    'beds',
    'host_listings_count',
    'review_scores_rating',
    'host_years',
]

scaler = StandardScaler()
x = df_before_law.copy()
x[scaler_needed_columns] = scaler.fit_transform(x[scaler_needed_columns])

# Stay-length flags: whole multiples of 30 nights, capped at 1 -- so 1 when the
# limit is 30 nights or more, 0 when it is between 0 and 29.
# (Computed here but not included in `columns`.)
for nights_col in ("minimum_nights", "maximum_nights"):
  x[f"{nights_col}_30"] = np.minimum(x[nights_col] // 30, 1)

# Guest capacity in steps of two, capped at 3 (capacity of 6 or more).
x["accommodates_2"] = np.minimum(x["accommodates"] // 2, 3)

# 't'/'f' text flags -> 1/0; any other value, including missing, becomes 0.
flag_codes = {'f': 0, 't': 1}
x['instant_bookable_'] = x['instant_bookable'].map(flag_codes).fillna(0)
x['host_is_superhost_'] = x['host_is_superhost'].map(flag_codes).fillna(0)

# Design-matrix columns, in the order they appear in the summary table.
initial_columns = [
    *scaler_needed_columns,
    'constant',
    'accommodates_2',
    'instant_bookable_',
    'host_is_superhost_',
    *neighborhood_dummy,
]

# No interaction terms with `date_dummy` in this model.
columns = initial_columns

# Complete cases only: drop every row with a missing value in a model column.
x = x.dropna(subset=columns)
X = x[columns]
y = x['drop']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Resample the training split with SMOTE before fitting.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

sm_model = sm.Logit(y_train, X_train)
sm_res = sm_model.fit()

Optimization terminated successfully.
         Current function value: 0.664113
         Iterations 5


In [13]:
# Display the statsmodels summary table of the fitted result held in `sm_res`.
sm_res.summary()

0,1,2,3
Dep. Variable:,drop,No. Observations:,442270.0
Model:,Logit,Df Residuals:,442256.0
Method:,MLE,Df Model:,13.0
Date:,"Mon, 27 Nov 2023",Pseudo R-squ.:,0.04189
Time:,19:55:17,Log-Likelihood:,-293720.0
converged:,True,LL-Null:,-306560.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
log_price,0.1582,0.005,34.932,0.000,0.149,0.167
availability_30,0.2828,0.003,89.164,0.000,0.277,0.289
beds,-0.0058,0.004,-1.371,0.170,-0.014,0.002
host_listings_count,-0.0995,0.008,-11.846,0.000,-0.116,-0.083
review_scores_rating,-0.0267,0.003,-7.944,0.000,-0.033,-0.020
host_years,-0.2418,0.003,-73.956,0.000,-0.248,-0.235
constant,0.3097,0.009,36.131,0.000,0.293,0.327
accommodates_2,-0.1442,0.006,-25.559,0.000,-0.155,-0.133
instant_bookable_,0.0834,0.008,11.039,0.000,0.069,0.098


In [14]:
# Logit of `drop` on all snapshots in `df_all`, with every predictor also
# interacted with `date_dummy` so coefficients may differ for the late
# snapshots. Fitted with statsmodels on a SMOTE-resampled training split.
import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Borough indicators entering the model; 'neighborhood__Manhattan' is left out,
# so together with the constant it acts as the baseline category.
neighborhood_dummy = [
    'neighborhood__Bronx',
    'neighborhood__Brooklyn',
    'neighborhood__Queens',
    'neighborhood__Staten Island',
]

scaler = StandardScaler()
x = df_all.copy()
# Cap the host's listing count at 2 before scaling (values above 2 become 2).
x["host_listings_count"] = np.minimum(x["host_listings_count"], 2)

# Continuous predictors that are z-scored.
scaler_needed_columns = [
    'log_price',
    'availability_30',
    'beds',
    'host_listings_count',
    'review_scores_rating',
    'number_of_reviews',
    'host_years',
]
x[scaler_needed_columns] = scaler.fit_transform(x[scaler_needed_columns])

# Stay-length flags: whole multiples of 30 nights, capped at 1 -- so 1 when the
# limit is 30 nights or more, 0 when it is between 0 and 29.
# (Computed here but not included in `columns`.)
x["minimum_nights_30"] = np.minimum(x["minimum_nights"] // 30, 1)
x["maximum_nights_30"] = np.minimum(x["maximum_nights"] // 30, 1)

# Guest capacity in steps of two, capped at 3 (capacity of 6 or more).
x["accommodates_2"] = np.minimum(x["accommodates"] // 2, 3)

# 't'/'f' text flags -> 1/0; any other value, including missing, becomes 0.
tf_to_int = {'f': 0, 't': 1}
x['instant_bookable_'] = x['instant_bookable'].map(tf_to_int).fillna(0)
x['host_is_superhost_'] = x['host_is_superhost'].map(tf_to_int).fillna(0)

# Main-effect columns, in the order they appear in the summary table.
initial_columns = [
    *scaler_needed_columns,
    'constant',
    'accommodates_2',
    'instant_bookable_',
    'host_is_superhost_',
    *neighborhood_dummy,
]

# One "<name>_dummy" interaction per main effect: the column times date_dummy.
# 'constant_dummy' is therefore date_dummy itself.
dummy_columns = [f'{column}_dummy' for column in initial_columns]
for column, dummy_name in zip(initial_columns, dummy_columns):
  x[dummy_name] = x[column] * x['date_dummy']

columns = initial_columns + dummy_columns

# Complete cases only: drop every row with a missing value in a model column.
x = x.dropna(subset=columns)
X = x[columns]
y = x['drop']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Resample the training split with SMOTE before fitting.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

sm_model = sm.Logit(y_train, X_train)
sm_res = sm_model.fit()
# Predictions on the held-out split are not computed in this cell:
#y_pred_prob = sm_res.predict(X_test)
#p_pred = sm_res.predict_proba(X_test)

Optimization terminated successfully.
         Current function value: 0.650127
         Iterations 5


In [None]:
# Share of listings per snapshot date that are missing from the next snapshot.
# `drop` is 0/1 with no missing values, so the group mean is dropped / total.
df_all.groupby("date")["drop"].mean()

date
2022-11-02    0.033583
2022-12-04    0.043587
2023-01-04    0.065958
2023-02-05    0.051415
2023-03-06    0.049506
2023-04-05    0.050228
2023-05-03    0.049069
2023-06-05    0.041906
2023-07-03    0.046735
2023-09-05    0.064737
Name: drop, dtype: float64

In [None]:
# Row counts per (date_dummy, capped host_listings_count) combination.
# Note: this rebinds `x` to a fresh copy of `df_all`.
x = df_all.copy()
# Same cap as in the interaction model: counts above 2 are recorded as 2.
x["host_listings_count"] = np.minimum(x["host_listings_count"], 2)
x.groupby(["date_dummy", "host_listings_count"]).size()

date_dummy  host_listings_count
0           1.0                    181679
            2.0                    201406
1           0.0                         1
            1.0                     18099
            2.0                     21347
dtype: int64

In [15]:
# Display the statsmodels summary table of the most recently fitted `sm_res`.
sm_res.summary()

0,1,2,3
Dep. Variable:,drop,No. Observations:,485286.0
Model:,Logit,Df Residuals:,485256.0
Method:,MLE,Df Model:,29.0
Date:,"Mon, 27 Nov 2023",Pseudo R-squ.:,0.06207
Time:,19:57:29,Log-Likelihood:,-315500.0
converged:,True,LL-Null:,-336370.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
log_price,0.1896,0.005,40.063,0.000,0.180,0.199
availability_30,0.2594,0.003,79.344,0.000,0.253,0.266
beds,-0.0109,0.005,-2.412,0.016,-0.020,-0.002
host_listings_count,0.2066,0.003,61.479,0.000,0.200,0.213
review_scores_rating,-0.0026,0.003,-0.750,0.453,-0.009,0.004
number_of_reviews,-0.0606,0.003,-20.466,0.000,-0.066,-0.055
host_years,-0.2393,0.003,-70.292,0.000,-0.246,-0.233
constant,0.3743,0.009,42.290,0.000,0.357,0.392
accommodates_2,-0.1750,0.006,-29.646,0.000,-0.187,-0.163
