In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from folium.plugins import HeatMap
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the listings export into the main working frame.
nyc_data = pd.read_csv('/kaggle/input/d/arthbr11/new-york-city-airbnb-open-data/listings.csv')

# Columns kept in the reduced frame (note: this list includes 'host_name').
features = ['name', 'host_id', 'host_name', 'neighbourhood_group_cleansed', 'neighbourhood_cleansed', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'maximum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']

# Take an explicit copy so that later column assignments on the reduced frame
# do not operate on a slice of nyc_data (SettingWithCopyWarning).
# The copy is taken before the price conversion below, so its 'price' column
# is still unconverted at this point.
nyc_data_filtered = nyc_data[features].copy()

# Remove '$' and ',' from the price text and convert it to a number.
# A raw-string pattern is used because '\$' in a normal string literal is an
# invalid escape sequence.
nyc_data['price'] = pd.to_numeric(nyc_data['price'].replace({r'[$,]': ''}, regex=True))

In [None]:
# Convert the reduced frame's price column to numeric ('$' and ',' removed).
# assign() builds a new frame instead of writing into nyc_data_filtered, which
# was created as a column selection of nyc_data; this avoids
# SettingWithCopyWarning. The raw-string pattern avoids the invalid '\$' escape.
nyc_data_filtered = nyc_data_filtered.assign(
    price=pd.to_numeric(nyc_data_filtered['price'].replace({r'[$,]': ''}, regex=True))
)
nyc_data_filtered.info()

In [None]:
# Print the column names, non-null counts and dtypes of the full listings frame
nyc_data.info()

In [None]:
# Drop the columns that are not of interest and/or raise privacy concerns.
# drop() returns a new frame and errors='ignore' skips columns that are already
# gone, so re-running this cell no longer raises KeyError.
# NOTE(review): nyc_data_filtered was built before this step from a column list
# that includes 'host_name', so that frame still carries the host names.
nyc_data = nyc_data.drop(columns=['id', 'host_name', 'last_review'], errors='ignore')

# Show the first 5 rows
nyc_data.head()

In [None]:
# Count the missing entries in each column
nyc_data.isna().sum()

In [None]:
# Missing 'reviews_per_month' values are replaced with 0; other columns are untouched.
# See https://www.kaggle.com/dgomonov/data-exploration-on-nyc-airbnb
nyc_data['reviews_per_month'] = nyc_data['reviews_per_month'].fillna(0)
nyc_data.head()

In [None]:
# Keep only the listings whose price is strictly positive, then summarise.
has_positive_price = nyc_data['price'] > 0
nyc_data = nyc_data.loc[has_positive_price]
nyc_data.describe()

#### Visualization

In [None]:
# Split the listings by neighbourhood group. Each sub_N keeps every column;
# each price_subN is a one-column ('price') DataFrame for the same rows.
borough_col = nyc_data['neighbourhood_group_cleansed']

sub_1 = nyc_data.loc[borough_col == 'Brooklyn']
sub_2 = nyc_data.loc[borough_col == 'Manhattan']
sub_3 = nyc_data.loc[borough_col == 'Queens']
sub_4 = nyc_data.loc[borough_col == 'Staten Island']
sub_5 = nyc_data.loc[borough_col == 'Bronx']

price_sub1, price_sub2, price_sub3, price_sub4, price_sub5 = (
    borough_rows[['price']] for borough_rows in (sub_1, sub_2, sub_3, sub_4, sub_5)
)

# All price frames in one list, ordered Brooklyn, Manhattan, Queens, Staten Island, Bronx
price_list_by_n = [price_sub1, price_sub2, price_sub3, price_sub4, price_sub5]

In [None]:
# Build a table of price order statistics (min, quartiles, max) with one
# column per neighbourhood group.

# Neighbourhood-group names, in the same order as the frames in price_list_by_n
nei_list = ['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx']

# One summary frame per neighbourhood group, with columns ['Stats', <group name>].
# Pairing names and frames with zip replaces the five hard-coded positional
# renames, and the chained (non-inplace) calls avoid mutating an .iloc slice.
p_l_b_n_2 = []
for group_name, group_prices in zip(nei_list, price_list_by_n):
    summary = (
        group_prices.describe(percentiles=[.25, .50, .75])
        .iloc[3:]  # skip the first three describe() rows (count, mean, std)
        .reset_index()
        .rename(columns={'index': 'Stats', 'price': group_name})
    )
    p_l_b_n_2.append(summary)

# Finalizing the dataframe: index every summary by 'Stats' and join them side by side
stat_df = [summary.set_index('Stats') for summary in p_l_b_n_2]
stat_df = stat_df[0].join(stat_df[1:])
stat_df

In [None]:
# The statistics table shows some extreme prices, so they are left out here
# to keep the visualization readable.

# Sub-dataframe without the extreme values: only listings priced below 400
sub_6 = nyc_data.loc[nyc_data['price'] < 400.0]

# Violin plot of the density and distribution of prices per neighbourhood group
viz_1 = sns.violinplot(x='neighbourhood_group_cleansed', y='price', data=sub_6)
viz_1.set_title('Density and distribution of prices for each neighberhood_group')

In [None]:
# Count of listings per room type, with one bar colour per neighbourhood group.
title = 'Room type location per Neighbourhood Group'
viz_2 = sns.catplot(
    data=nyc_data,
    x='room_type',
    hue='neighbourhood_group_cleansed',
    kind='count',
)
plt.title(title)
plt.ioff()

In [None]:
# The ten most frequent neighbourhoods and their listing counts
nyc_data['neighbourhood_cleansed'].value_counts().head(10)

In [None]:
# Combine neighbourhood, neighbourhood group and room type in one visualization.

# Neighbourhoods selected for the sub-dataframe (a fixed, hand-written list)
selected_neighbourhoods = [
    'Williamsburg', 'Bedford-Stuyvesant', 'Harlem', 'Bushwick', 'Upper West Side',
    "Hell's Kitchen", 'East Village', 'Upper East Side', 'Crown Heights', 'Midtown',
]
in_selection = nyc_data['neighbourhood_cleansed'].isin(selected_neighbourhoods)
sub_7 = nyc_data.loc[in_selection]

# One count panel per room type; bars are neighbourhoods coloured by neighbourhood group
viz_2 = sns.catplot(
    data=sub_7,
    x='neighbourhood_cleansed',
    hue='neighbourhood_group_cleansed',
    col='room_type',
    kind='count',
)
viz_2.set_xticklabels(rotation=90)

In [None]:
# Rebuild the per-neighbourhood-group price statistics table (min, quartiles, max).

# Neighbourhood-group names, in the same order as the frames in price_list_by_n
nei_list = ['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx']

# Each entry ends up with columns ['Stats', <group name>]. Names and frames are
# paired with zip instead of five hard-coded positional renames, and nothing
# is modified in place on an .iloc slice.
p_l_b_n_2 = [
    group_prices.describe(percentiles=[.25, .50, .75])
    .iloc[3:]  # skip the first three describe() rows (count, mean, std)
    .reset_index()
    .rename(columns={'index': 'Stats', 'price': group_name})
    for group_name, group_prices in zip(nei_list, price_list_by_n)
]

# Finalizing the dataframe: index by 'Stats' and join all groups side by side
stat_df = [summary.set_index('Stats') for summary in p_l_b_n_2]
stat_df = stat_df[0].join(stat_df[1:])
stat_df

In [None]:
# Exclude properties with a listed price of 0.
# Any '$' or ',' characters are removed before the float conversion.
# The pattern is a raw string because '\$' in a normal string literal is an
# invalid escape sequence.
price_as_float = nyc_data['price'].replace({r'[$,]': ''}, regex=True).astype(float)
nyc_data = nyc_data.loc[price_as_float > 0]

In [None]:
# Examine the dataset: summary statistics of the numeric columns after the price filter
nyc_data.describe()

In [None]:
# Recode numeric columns as low/high (or low/medium/high) categories.
# https://datascience.stackexchange.com/questions/29093/continuous-variable-to-categorical-by-quartiles
data_encoded = nyc_data.copy()

# Quantile-based splits: q=2 cuts at the median, q=3 at the terciles.
data_encoded['minimum_nights'] = pd.qcut(
    nyc_data['minimum_nights'], q=2,
    labels=["minimum_nights_low", "minimum_nights_high"])
# The middle label previously read "minimum_nights_medium" (copy-paste slip).
data_encoded['number_of_reviews'] = pd.qcut(
    nyc_data['number_of_reviews'], q=3,
    labels=["number_of_reviews_low", "number_of_reviews_medium", "number_of_reviews_high"])
data_encoded['reviews_per_month'] = pd.qcut(
    nyc_data['reviews_per_month'], q=2,
    labels=["reviews_per_month_low", "reviews_per_month_high"])

# Fixed split: (0, 2] is "low", everything above 2 is "high".
# The upper edge is open-ended (np.inf) instead of the hard-coded 327, so a
# host with more than 327 listings is labelled "high" rather than becoming NaN.
data_encoded['calculated_host_listings_count'] = pd.cut(
    nyc_data['calculated_host_listings_count'],
    bins=[0, 2, np.inf],
    labels=["calculated_host_listings_count_low", "calculated_host_listings_count_high"])

data_encoded['availability_365'] = pd.qcut(
    nyc_data['availability_365'], q=2,
    labels=["availability_low", "availability_high"])

In [None]:
# Missing values per column after the categorical recoding
data_encoded.isna().sum()

In [None]:
# Preview the first five recoded rows
data_encoded.head(5)

In [None]:
sns.set_palette("muted")
# NOTE: the wildcard import is kept only because later cells call bare pylab
# names (e.g. figure(...)); this cell uses the explicit plt / Axes API.
from pylab import *

# Price as float, with any '$' / ',' characters removed (raw-string pattern:
# '\$' in a normal string literal is an invalid escape sequence).
price_values = nyc_data['price'].replace({r'[$,]': ''}, regex=True).astype(float)

# (values to plot, upper x-axis limit) for each panel, filled row by row.
dist_panels = [
    (price_values, 2000),
    (nyc_data['minimum_nights'], 500),
    (nyc_data['number_of_reviews'], 400),
    (nyc_data['reviews_per_month'], 30),
    (nyc_data['calculated_host_listings_count'], 150),
    (nyc_data['availability_365'], 500),
]

# Create the 2x3 grid up front and draw into each Axes explicitly. Creating a
# single Axes and then calling subplot(2, 3, k) left an extra full-figure Axes
# underneath the six panels.
f, ax = plt.subplots(2, 3, figsize=(24, 18))
for panel_ax, (values, x_max) in zip(ax.flat, dist_panels):
    sns.distplot(values, ax=panel_ax)
    panel_ax.set_xlim(0, x_max)

plt.tight_layout()  # avoid overlap of plots
plt.draw()  # this call had been swallowed into the trailing comment

In [None]:
# NOTE: the wildcard import is kept only because later cells call bare pylab
# names (e.g. figure(...)); this cell uses the explicit plt / Axes API.
from pylab import *

# Price as float, with any '$' / ',' characters removed (raw-string pattern:
# '\$' in a normal string literal is an invalid escape sequence).
price_values = nyc_data['price'].replace({r'[$,]': ''}, regex=True).astype(float)

# One box plot per variable, filled row by row.
box_panels = [
    price_values,
    nyc_data['minimum_nights'],
    nyc_data['number_of_reviews'],
    nyc_data['reviews_per_month'],
    nyc_data['calculated_host_listings_count'],
    nyc_data['availability_365'],
]

# Create the 2x3 grid up front and draw into each Axes explicitly. Creating a
# single Axes and then calling subplot(2, 3, k) left an extra full-figure Axes
# underneath the six panels.
f, ax = plt.subplots(2, 3, figsize=(24, 18))
for panel_ax, values in zip(ax.flat, box_panels):
    sns.boxplot(y=values, ax=panel_ax)

plt.tight_layout()  # avoid overlap of plots
plt.draw()

In [None]:
# Colour-blind friendly palettes: the same seven colours, preceded by either
# grey (cbPalette) or black (cbbPalette).
_shared_colours = ["#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]
cbPalette = ["#999999"] + _shared_colours
cbbPalette = ["#000000"] + _shared_colours

# The grey variant becomes the active seaborn palette.
sns.set_palette(cbPalette)

In [None]:
# Number of listings in each neighbourhood group.
title = 'Properties per Neighbourhood Group'
# plt.figure is called explicitly rather than the bare figure() name that only
# exists after a pylab wildcard import.
plt.figure(figsize=(10, 10), dpi=80)
# The column is passed by keyword (x=..., data=...); a positional data vector
# is deprecated / rejected by newer seaborn releases.
sns.countplot(x='neighbourhood_group_cleansed', data=nyc_data)

plt.title(title)
plt.ioff()

In [None]:
# Number of listings for each room type.
title = 'Properties per Room Type'
# plt.figure is called explicitly rather than the bare figure() name that only
# exists after a pylab wildcard import.
plt.figure(figsize=(10, 10), dpi=80)
# The column is passed by keyword (x=..., data=...); a positional data vector
# is deprecated / rejected by newer seaborn releases.
sns.countplot(x='room_type', data=nyc_data)
plt.title(title)
plt.ioff()

In [None]:
# Heat map of the pairwise correlations between the numeric columns.
plt.figure(figsize=(20,10))
title = 'Correlation matrix of numerical variables'
# nyc_data_filtered also holds text columns (name, room_type, ...). Recent
# pandas versions raise when corr() meets them, so the numeric (and boolean)
# columns are selected explicitly first.
numeric_columns = nyc_data_filtered.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_columns.corr(), square=True, cmap='RdYlGn')
plt.title(title)
plt.ioff()

In [None]:
# See https://www.kaggle.com/biphili/hospitality-in-era-of-airbnb
# Two longitude/latitude scatter plots of the listings: the first coloured by
# neighbourhood group, the second by room type.
# x / y are passed by keyword; newer seaborn releases reject positional vectors.
title = 'Neighbourhood Group Location'
plt.figure(figsize=(15,9))
sns.scatterplot(data=nyc_data, x='longitude', y='latitude',
                hue='neighbourhood_group_cleansed').set_title(title)
plt.ioff()

title = 'Room type location per Neighbourhood Group'
plt.figure(figsize=(15,9))
sns.scatterplot(data=nyc_data, x='longitude', y='latitude',
                hue='room_type').set_title(title)
plt.ioff()

In [None]:
# Interactive density map of the listing coordinates.
map_center = [40.7128, -74.0060]
heat_gradient = {0.2: 'blue', 0.4: 'purple', 0.6: 'orange', 1.0: 'red'}

# Rows without a latitude or longitude are left out of the heat layer.
listing_coords = nyc_data[['latitude', 'longitude']].dropna()

m = folium.Map(map_center, zoom_start=11)
HeatMap(listing_coords, radius=8, gradient=heat_gradient).add_to(m)
display(m)

In [None]:
#https://jakevdp.github.io/PythonDataScienceHandbook/04.14-visualization-with-seaborn.html
#http://seaborn.pydata.org/tutorial/color_palettes.html

# Box plot of listing price for each neighbourhood group.
x = 'neighbourhood_group_cleansed'
y = 'price'
title = 'Price per Neighbourhood Group'

f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=nyc_data_filtered, x=x, y=y, ax=ax)
ax.set_title(title)
plt.ioff()

In [None]:
# Bar chart of the median listing price per neighbourhood group, with bars
# ordered from the lowest to the highest median.
title = 'Median Price per Neighbourhood Group'
result = (
    nyc_data_filtered.groupby('neighbourhood_group_cleansed')['price']
    .median()
    .reset_index()
    .sort_values('price')
)
# barplot aggregates with the mean unless told otherwise; estimator=np.median
# makes the bar heights match both the title and the ordering computed above.
sns.barplot(x='neighbourhood_group_cleansed', y="price", data=nyc_data_filtered,
            estimator=np.median, order=result['neighbourhood_group_cleansed'])
plt.title(title)
plt.ioff()

In [None]:
# https://stackoverflow.com/questions/54132989/is-there-a-way-to-change-the-color-and-shape-indicating-the-mean-in-a-seaborn-bo
# Price per neighbourhood group, drawn separately for the listings priced
# under 155 and for those priced above 170. The mean is marked with a white square.
x = 'neighbourhood_group_cleansed'
y = 'price'
mean_marker = {"marker": "s", "markerfacecolor": "white", "markeredgecolor": "black"}

# Price < 155: notched boxes
title = 'Price per neighbourhood_group for Properties under $155'
data_filtered = nyc_data_filtered[nyc_data_filtered['price'] < 155]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data_filtered, x=x, y=y, notch=True, showmeans=True,
            meanprops=dict(mean_marker), ax=ax)
ax.set_title(title)
plt.ioff()

# Price > 170: plain boxes
title = 'Price per neighbourhood_group for Properties more than $170'
data_filtered = nyc_data_filtered[nyc_data_filtered['price'] > 170]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data_filtered, x=x, y=y, notch=False, showmeans=True,
            meanprops=dict(mean_marker), ax=ax)
ax.set_title(title)
plt.ioff()

In [None]:

# Price per room type, drawn separately for the listings priced under 155 and
# for those priced above 170. The mean is marked with a white square.
x = 'room_type'
y = 'price'
mean_marker = {"marker": "s", "markerfacecolor": "white", "markeredgecolor": "black"}

# Price < 155: notched boxes
title = 'Price per Room Type for properties under $155'
data_filtered = nyc_data_filtered[nyc_data_filtered['price'] < 155]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data_filtered, x=x, y=y, notch=True, showmeans=True,
            meanprops=dict(mean_marker), ax=ax)
ax.set_title(title)
plt.ioff()

# Price > 170: plain boxes
title = 'Price per Room Type for properties more than $170'
data_filtered = nyc_data_filtered[nyc_data_filtered['price'] > 170]
f, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=data_filtered, x=x, y=y, notch=False, showmeans=True,
            meanprops=dict(mean_marker), ax=ax)
ax.set_title(title)
plt.ioff()

<font size="12">**4. Modeling: first model**</font>

<font size="6">**4.1 Data Preprocessing**</font>


In [None]:
# Prepare the modeling frame from the cleaned listings.
# The free-text 'name' column is dropped first. drop() returns a new frame and
# errors='ignore' tolerates the column already being gone, so the cell can be
# re-run without a KeyError.
nyc_data = nyc_data.drop(columns=['name'], errors='ignore')

# Columns used for modeling: identifiers/location, room type, the target
# ('price') and the numeric listing attributes.
index = ['host_id','neighbourhood_group_cleansed', 'neighbourhood_cleansed','latitude','longitude','room_type','price','minimum_nights','number_of_reviews','reviews_per_month','calculated_host_listings_count','availability_365']
dataavailable = nyc_data[index]

# Independent copies: data_copy keeps every remaining column, df_copy only the
# modeling columns (this is the frame transformed in the following cells).
data_copy = nyc_data.copy()
df_copy = dataavailable.copy()
df_copy.head()

In [None]:
# Missing values per modeling column
df_copy.isna().sum()

In [None]:
# log10 transform of the numeric listing attributes.
# A tiny offset is added first so that a value of 0 does not produce -inf
# (a zero therefore maps to log10(1e-9) = -9).
log_offset = 0.000000001
for numeric_col in ['minimum_nights', 'number_of_reviews', 'reviews_per_month',
                    'calculated_host_listings_count', 'availability_365']:
    df_copy[numeric_col] = np.log10(df_copy[numeric_col] + log_offset)
df_copy.head()

In [None]:
# One-hot encode the categorical columns one after another; the first level of
# each column is dropped (drop_first=True).
for categorical_col in ['room_type', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed']:
    df_copy = pd.get_dummies(df_copy, columns=[categorical_col], drop_first=True)

In [None]:
# Lower-price subset: listings priced strictly below $155
is_low_price = df_copy['price'] < 155
data_filtered_low = df_copy[is_low_price]


In [None]:
# Higher-price subset: listings priced strictly above $170
is_high_price = df_copy['price'] > 170
data_filtered_high = df_copy[is_high_price]

<font size="12">**4.2 Multiple Linear Regression**</font>

<font size="6">**4.2.1 Modeling on lower price dataset**</font>

In [None]:
# Separate features and target for the lower-price subset.
# The target is log10(price); every other column is a feature.
y = np.log10(data_filtered_low['price'].values)
X = data_filtered_low.drop(columns='price').values

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Fit a multiple linear regression on the training part
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test part
y_pred = lr.predict(X_test)

In [None]:
# Undo the log10 transform (10 ** value) and round to whole units so that
# actual and predicted prices can be compared side by side.
actual_price = np.round(np.power(10, y_test), 0)
predicted_price = np.round(np.power(10, y_pred), 0)
df = pd.DataFrame({'Actual': actual_price, 'Predicted': predicted_price})
df.head(10)

In [None]:
# https://towardsdatascience.com/a-beginners-guide-to-linear-regression-in-python-with-scikit-learn-83a8f7ae2b4f
# https://www.theanalysisfactor.com/assessing-the-fit-of-regression-models/
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# y was log10-transformed before fitting, so every figure printed here is on
# the log10(price) scale. The labels now say so; they previously read
# "Price mean" / "Price std".
train_pred = lr.predict(X_train)
test_pred = lr.predict(X_test)

print('log10(price) mean:', np.round(np.mean(y), 2))
print('log10(price) std:', np.round(np.std(y), 2))
print('RMSE (log10 scale):', np.round(np.sqrt(metrics.mean_squared_error(y_test, test_pred)), 2))
print('R2 score train:', np.round(r2_score(y_train, train_pred, multioutput='variance_weighted'), 2))
print('R2 score test:', np.round(r2_score(y_test, test_pred, multioutput='variance_weighted'), 2))

### 4.2.2 Modeling the higher price dataset

In [None]:
# Separate features and target for the higher-price subset.
# The target is log10(price); every other column is a feature.
y = np.log10(data_filtered_high['price'].values)
X = data_filtered_high.drop(columns='price').values

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, test_size=0.3)

# Fit a multiple linear regression on the training part
lr = LinearRegression().fit(X_train, y_train)

# Predictions for the held-out test part
y_pred = lr.predict(X_test)

In [None]:
# Undo the log10 transform (10 ** value) and round to whole units so that
# actual and predicted prices can be compared side by side.
actual_price = np.round(np.power(10, y_test), 0)
predicted_price = np.round(np.power(10, y_pred), 0)
df = pd.DataFrame({'Actual': actual_price, 'Predicted': predicted_price})
df.head(10)

In [None]:
# https://towardsdatascience.com/a-beginners-guide-to-linear-regression-in-python-with-scikit-learn-83a8f7ae2b4f
# https://www.theanalysisfactor.com/assessing-the-fit-of-regression-models/
from sklearn.metrics import mean_squared_error
from sklearn import metrics
from sklearn.metrics import r2_score

# y was log10-transformed before fitting, so every figure printed here is on
# the log10(price) scale. The labels now say so; they previously read
# "Price mean" / "Price std".
train_pred = lr.predict(X_train)
test_pred = lr.predict(X_test)

print('log10(price) mean:', np.round(np.mean(y), 2))
print('log10(price) std:', np.round(np.std(y), 2))
print('RMSE (log10 scale):', np.round(np.sqrt(metrics.mean_squared_error(y_test, test_pred)), 2))
print('R2 score train:', np.round(r2_score(y_train, train_pred, multioutput='variance_weighted'), 2))
print('R2 score test:', np.round(r2_score(y_test, test_pred, multioutput='variance_weighted'), 2))