In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
from IPython.core.display import display, HTML
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

%reload_ext autoreload
%autoreload 1
%matplotlib inline

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import os
import seaborn as sns
import pandas as pd
import math

from Utils.UtilsViz import *
from Utils.DataUtils import *

In [3]:
# Reference coordinates — presumably [latitude, longitude] pairs; confirm against their use.
US_coord = [37.0902, -102]
NY_COORD = [40.7128, -74.0060]

# Directory that holds the NY data files. The location is machine-specific, so
# it can be overridden with the NY_DATAPATH environment variable; the original
# hard-coded path remains the default when the variable is not set.
_DEFAULT_NY_DATAPATH = "C:\\Users\\sriharis\\OneDrive\\UChicago\\DataMining\\project\\NYData\\"
ny_datapath = os.environ.get("NY_DATAPATH", _DEFAULT_NY_DATAPATH)

In [4]:
# List the entries of the NY data directory (displayed as the cell output).
os.listdir(ny_datapath)

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'C:\\Users\\Ssrih\\OneDrive\\UChicago\\DataMining\\project\\NYData\\'

In [None]:
# Load listings.csv from the NY data directory into the `listings` DataFrame.
listings = pd.read_csv(os.path.join(ny_datapath, "listings.csv"))

In [None]:
# Preview the first rows of the listings table.
listings.head()

Drop min and max nights columns

# Price preprocessing 

In [None]:
# Convert the price strings (carrying a '$' sign and ',' separators) to floats.
# The dtype guard keeps this cell re-runnable: once the column is numeric the
# .str accessor is no longer available, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(listings['price']):
    listings['price'] = (
        listings['price']
        .str.strip()                          # surrounding whitespace; .strip('') removed nothing
        .str.strip('$')                       # currency sign
        .str.replace(',', '', regex=False)    # literal thousands separator
        .astype('float')
    )

In [None]:
# Percentile ranks 0..100 at which the price distribution is sampled.
percentiles = list(range(101))
price_values = listings['price'].values

# Map each percentile rank to the price found at that percentile.
price_percentile = {rank: np.percentile(price_values, rank) for rank in percentiles}

# One row per percentile rank (the index) and a single column of prices,
# shown as a bar chart.
price_percentile = pd.DataFrame.from_dict(price_percentile, orient='index')
price_percentile.plot(kind='bar', figsize=(25,9), grid=True)

In [None]:
# Trim extreme prices: keep only listings whose price lies between the values
# stored at rows 1 and 99 of the percentile table (both bounds inclusive).
lower_bound = price_percentile.iloc[1, 0]
upper_bound = price_percentile.iloc[99, 0]

# .copy() makes the filtered frame independent of the original, so columns
# assigned to `listings` afterwards do not hit pandas' chained-assignment
# (SettingWithCopy) path.
listings = listings[listings["price"].between(lower_bound, upper_bound)].copy()
listings["price"].describe()

In [None]:
# Recompute the price at every percentile rank 0..100 on the current listings.
percentiles = list(range(0, 101))
price_percentile = pd.DataFrame.from_dict(
    dict((rank, np.percentile(listings['price'].values, rank)) for rank in percentiles),
    orient='index',
)

# Bar chart with one bar per percentile rank.
price_percentile.plot(kind='bar', figsize=(25,9), grid=True)

In [None]:
# Plot the price column with the project's plot_dist helper (kde=False is passed through).
plot_dist(data=listings, colname="price", kde=False)

In [None]:
def get_logprice(price):
    """Return the natural logarithm of ``price``.

    Zero and negative prices have no real logarithm and are mapped to 0.0.
    Any other value (including NaN) is passed to ``np.log`` unchanged.
    """
    return 0.0 if price <= 0.0 else np.log(price)
# Add a price_log column by applying get_logprice to every price, then plot
# that column with plot_dist.
listings['price_log'] = listings['price'].apply(get_logprice)
plot_dist(data=listings, colname="price_log", kde=False)

In [None]:
# Remove the price_log column again. errors="ignore" keeps the cell
# re-runnable: dropping a column that is already gone is a no-op instead of a
# KeyError.
listings = listings.drop(columns="price_log", errors="ignore")

# Amenities

Let's have a look at the distribution of amenities

In [None]:
# Amenities
def get_num_amenities(row):
    """Count the amenities listed in one raw ``amenities`` value.

    The first and last characters of the string are treated as delimiters
    (presumably braces, e.g. ``'{TV,Wifi}'`` — confirm against the data) and the
    text between them is split on commas.

    Parameters
    ----------
    row : str
        Raw amenities string for one listing.

    Returns
    -------
    int
        Number of comma-separated entries. An empty set (nothing between the
        delimiters) and a missing / non-string value both count as 0.

    Notes
    -----
    A comma inside a single amenity name is still counted as a separator.
    """
    # Missing values are not strings and cannot be sliced.
    if not isinstance(row, str):
        return 0
    inner = row[1:-1].strip()
    # "".split(",") yields [''], which would wrongly count as one amenity.
    if not inner:
        return 0
    return len(inner.split(","))

# Store the per-listing amenity count in a new column, then plot that column
# with the project's plot_dist helper.
listings["num_amenities"] = listings["amenities"].apply(get_num_amenities)
g = plot_dist(data=listings, colname="num_amenities", xlabel="number of amenities", ylabel="count", kde=False, title="Distribution of Number of amenities in dataset")

How does price behave based on number of amenities?

Let's filter out all prices above $500.

In [None]:
# Single wide panel: price against the raw amenity count, drawn by the
# project's plot_box helper.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(25, 8))
g = plot_box(
    data=listings,
    x="num_amenities",
    y="price",
    agg_rule="median",
    ax=ax,
)

Let's group the number of amenities into bins and see how the price behaves across them.

In [None]:
# Bin edges 0, 5, 10, ..., 95; each listing is labelled with the interval its
# amenity count falls into.
# NOTE(review): with pd.cut's defaults, counts outside (0, 95] become NaN.
bins = [edge for edge in range(0, 100, 5)]
listings['amenities_binned'] = pd.cut(listings['num_amenities'], bins=bins)

In [None]:
# Price against the binned amenity count, drawn by the project's plot_box helper.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 8))
g = plot_box(
    data=listings,
    x="amenities_binned",
    y="price",
    agg_rule="median",
    xlabel="Number of amenities",
    ylabel="Price of listing",
    title="Price change over number of amenities listed",
    ax=ax,
)

# Accommodates 

Does the price vary significantly as accommodation increases?

In [None]:
# Price against the `accommodates` column, drawn by the project's plot_box helper.
f, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 12))
g = plot_box(
    data=listings,
    x="accommodates",
    y="price",
    agg_rule="median",
    xlabel="accommodates",
    ylabel="price",
    title="Price variation over number of people accommodated",
    ax=ax,
)

#  Bathrooms, Bedrooms and Beds

Manhattan is definitely the most expensive.

In [None]:
# Summary statistics for the three room-related columns.
cols = ["bathrooms", "bedrooms", "beds"]
listings[cols].describe()

Bathrooms and bedrooms may not add as much value to the dataset when over 75% of the column is just 1. 

Beds can be filled with the median, 1.0

# Security Deposit

In [None]:
# Convert the security-deposit strings (carrying a '$' sign and ',' separators)
# to floats. Missing values pass through the .str methods as NaN, so no
# dropna() is needed before the conversion.
# The dtype guard keeps this cell re-runnable: once the column is numeric the
# .str accessor is no longer available, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(listings['security_deposit']):
    listings['security_deposit'] = (
        listings['security_deposit']
        .str.strip('$')
        .str.replace(',', '', regex=False)
        .astype('float')
    )

In [None]:
# Keep the rows where both price and security deposit are present, then show
# the correlation matrix of the two columns.
tmp = listings.loc[:, ["price", "security_deposit"]].dropna()
tmp.corr()

In [None]:
# Overlay the kernel density estimates of price and security deposit on one axis.
# Each curve gets a legend label, and the x axis gets a shared label, because
# both series are drawn on the same axis and would otherwise only be
# distinguishable by colour.
f, ax = plt.subplots(1, 1, figsize=(25, 8))
sns.distplot(tmp["price"], ax=ax, hist=False, kde=True, color="blue", label="price")
sns.distplot(tmp["security_deposit"], ax=ax, hist=False, kde=True, color="green", label="security_deposit")
ax.set_xlabel("amount ($)")
ax.set_title("Density of price vs. security deposit")
ax.legend();

Missing security deposits will definitely have to be filled with the median; nothing else can really help here.

# Cleaning Fee

In [None]:
# Convert the cleaning-fee strings (carrying a '$' sign and ',' separators) to
# floats; missing values stay NaN.
# The dtype guard keeps this cell re-runnable: once the column is numeric the
# .str accessor is no longer available, so the conversion is skipped.
if not pd.api.types.is_numeric_dtype(listings['cleaning_fee']):
    listings['cleaning_fee'] = (
        listings['cleaning_fee']
        .str.strip('$')
        .str.replace(',', '', regex=False)
        .astype('float')
    )

In [None]:
# Keep the rows where both price and cleaning fee are present, then show the
# correlation matrix of the two columns.
tmp = listings[["price", "cleaning_fee"]].dropna(axis=0, how="any")
tmp.corr()

In [None]:
# Overlay the histograms of price and cleaning fee on one axis.
# Each histogram gets a legend label, and the x axis gets a shared label,
# because both series are drawn on the same axis and would otherwise only be
# distinguishable by colour.
f, ax = plt.subplots(1, 1, figsize=(25, 8))
sns.distplot(tmp["price"], ax=ax, hist=True, kde=False, label="price")
sns.distplot(tmp["cleaning_fee"], ax=ax, hist=True, kde=False, color="green", label="cleaning_fee")
ax.set_xlabel("amount ($)")
ax.set_title("Distribution of price vs. cleaning fee")
ax.legend();

Missing cleaning fees will definitely have to be filled with the median; nothing else can really help here.

# Neighbourhood Group

In [None]:
# Distinct values of the neighbourhood_group_cleansed column.
listings["neighbourhood_group_cleansed"].unique()

In [None]:
# Three side-by-side panels, all grouped by neighbourhood group.
f, ax = plt.subplots(nrows=1, ncols=3, figsize=(30, 8))

# Panel 1: violin plot of price. Its y-limits are remembered so that the
# cleaning-fee panel can reuse the same scale.
g = sns.violinplot(data=listings, x="neighbourhood_group_cleansed", y="price", ax=ax[0])
t = g.set_title("Price vs.Neighbourhood Group")
t = g.set_ylabel("Price")
t = g.set_xlabel("Neighbourhood Group")
matching_ylim = g.get_ylim()

# Panel 2: box plot of security deposit, with the y axis limited to 0..2000.
g = sns.boxplot(data=listings, x="neighbourhood_group_cleansed", y="security_deposit", ax=ax[1])
t = g.set_title("Neighbourhood Group vs. Security Deposit")
t = g.set_ylim([0, 2000])

# Panel 3: violin plot of cleaning fee, drawn on the price panel's y scale.
g = sns.violinplot(data=listings, x="neighbourhood_group_cleansed", y="cleaning_fee", ax=ax[2])
t = g.set_title("Neighbourhood Group vs. Cleaning Fee")
t = g.set_ylim(matching_ylim)

# Property Type and Bedrooms

Group the lower-frequency property types together into a single "Other" category.

In [None]:
# Property types that keep their own category; every other type is grouped
# into "Other".
strings = ("Apartment", "House", "Townhouse", "Loft", "Condominium", "Serviced apartment")

# Case-sensitive substring match against any of the kept names. The names hold
# no regex metacharacters, so joining them with "|" is a safe pattern.
# Missing values count as non-matching and therefore end up in "Other".
is_kept = listings['property_type'].str.contains("|".join(strings), regex=True, na=False)

# "Houseboat" contains the substring "House" but belongs in "Other".
is_houseboat = listings['property_type'] == 'Houseboat'

# A single boolean mask replaces the row-by-row loop and the temporary `prop`
# column, and leaves the cell safe to re-run.
listings.loc[~is_kept | is_houseboat, 'property_type'] = 'Other'

In [None]:
# Two panels: price by bedrooms (left) and price by property type (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))

# Left panel: drawn by the project's plot_box helper.
plot_box(
    data=listings,
    x="bedrooms",
    y="price",
    agg_rule="median",
    title="price vs Bedrooms",
    ax=ax[0],
)

# Right panel: violin plot of price for each property type.
g = sns.violinplot(data=listings, x="property_type", y="price", ax=ax[1])
ty = g.set(title="Price vs Property Type")
t = g.set_ylabel("Price")
t = g.set_xlabel("Property Type")

In [None]:
# Two panels: security deposit by bedrooms (left) and by property type (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 8))

# Left panel: drawn by the project's plot_box helper; ylim=2000 is passed through.
plot_box(
    data=listings,
    x="bedrooms",
    y="security_deposit",
    agg_rule="median",
    ylim=2000,
    title="Security Deposit vs Bedrooms",
    ax=ax[0],
)

# Right panel: box plot per property type, with the y axis limited to 0..2000.
g = sns.boxplot(data=listings, x="property_type", y="security_deposit", ax=ax[1])
g.set_title("Security deposit vs Property Type")
yl = g.set_ylim(0, 2000)

In [None]:
# Two panels: cleaning fee by bedrooms (left) and by property type (right).
f, ax = plt.subplots(nrows=1, ncols=2, figsize=(30, 8))

# Left panel: drawn by the project's plot_box helper.
plot_box(
    data=listings,
    x="bedrooms",
    y="cleaning_fee",
    agg_rule="median",
    title="cleaning_fee vs Bedrooms",
    ax=ax[0],
)

# Right panel: box plot per property type, with the y axis limited to 0..500.
g = sns.boxplot(data=listings, x="property_type", y="cleaning_fee", ax=ax[1])
g.set_title("cleaning_fee vs Property Type")
yl = g.set_ylim(0, 500)