In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw listings file.
nyc_data = pd.read_csv('/kaggle/input/d/arthbr11/new-york-city-airbnb-open-data/listings.csv')

# Columns kept in the reduced working frame used by the price plots.
features = [
    'name', 'host_id', 'host_name',
    'neighbourhood_group_cleansed', 'neighbourhood_cleansed',
    'latitude', 'longitude', 'room_type', 'price',
    'minimum_nights', 'maximum_nights',
    'number_of_reviews', 'reviews_per_month',
    'calculated_host_listings_count', 'availability_365',
]
# Explicit copy: the subset is modified later, and writing into a plain slice
# of nyc_data can trigger pandas' SettingWithCopyWarning.
nyc_data_filtered = nyc_data[features].copy()

# Strip '$' and thousands separators from the price text, then parse it as a
# number. The raw string avoids the invalid '\$' escape sequence.
nyc_data['price'] = pd.to_numeric(
    nyc_data['price'].replace({r'\$': '', ',': ''}, regex=True)
)

In [None]:
# Parse the price column of the reduced frame into numbers ('$' and ','
# removed first). `assign` returns a new frame rather than writing into what
# may be a slice of nyc_data, and the raw string avoids the invalid '\$' escape.
nyc_data_filtered = nyc_data_filtered.assign(
    price=pd.to_numeric(
        nyc_data_filtered['price'].replace({r'\$': '', ',': ''}, regex=True)
    )
)
nyc_data_filtered.info()

In [None]:
# Summarise the full frame: column names, non-null counts and dtypes
nyc_data.info()

In [None]:
# Drop the columns that are not of interest and/or cause privacy issues.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it no longer raises KeyError once the columns are gone.
nyc_data = nyc_data.drop(columns=['id', 'host_name', 'last_review'], errors='ignore')

# Show the first 5 rows
nyc_data.head()

In [None]:
# Count the missing values in every column of the full frame
nyc_data.isnull().sum()

In [None]:
# Replace missing 'reviews_per_month' values with 0; no other column is filled.
# See https://www.kaggle.com/dgomonov/data-exploration-on-nyc-airbnb
fill_values = {'reviews_per_month': 0}
nyc_data = nyc_data.fillna(value=fill_values)

In [None]:
# Exclude properties with a listed price of 0.
# 'price' was already parsed to a numeric column right after loading, so the
# '$' / ',' string clean-up is not repeated here. Rows with a missing price
# compare False and are dropped too, exactly as before.
# The explicit copy makes the result an independent frame, so later in-place
# edits of nyc_data do not raise SettingWithCopyWarning.
nyc_data = nyc_data.loc[nyc_data['price'] > 0].copy()

In [None]:
# Examine the dataset: summary statistics (count, mean, std, quartiles, ...) of the numeric columns
nyc_data.describe()

In [None]:
# Recode continuous columns as ordered categories.
# https://datascience.stackexchange.com/questions/29093/continuous-variable-to-categorical-by-quartiles
data_encoded = nyc_data.copy()

# Quantile-based (equal-frequency) bins: one bin per label, lowest first.
quantile_labels = {
    'minimum_nights': ["minimum_nights_low", "minimum_nights_high"],
    # The middle label used to read "minimum_nights_medium" (copy-paste slip).
    'number_of_reviews': ["number_of_reviews_low", "number_of_reviews_medium", "number_of_reviews_high"],
    'reviews_per_month': ["reviews_per_month_low", "reviews_per_month_high"],
    'availability_365': ["availability_low", "availability_high"],
}
for column, labels in quantile_labels.items():
    data_encoded[column] = pd.qcut(nyc_data[column], q=len(labels), labels=labels)

# Fixed-edge bins: (0, 2] is "low", everything above 2 is "high".
# The upper edge is open-ended instead of the hard-coded 327, so a host with
# more listings than that is still labelled "high" rather than becoming NaN.
data_encoded['calculated_host_listings_count'] = pd.cut(
    nyc_data['calculated_host_listings_count'],
    bins=[0, 2, np.inf],
    labels=["calculated_host_listings_count_low", "calculated_host_listings_count_high"],
)

In [None]:
# Count the missing values per column after the recoding
data_encoded.isnull().sum()

In [None]:
# Show the first 5 rows of the recoded frame
data_encoded.head()

In [None]:
sns.set_palette("muted")
# NOTE(review): the star import is retained only because other cells call
# pylab names such as `figure` unqualified; this cell no longer relies on it.
from pylab import *

# (column, upper x-axis limit) for each panel, in reading order.
dist_panels = [
    ('price', 2000),
    ('minimum_nights', 500),
    ('number_of_reviews', 400),
    ('reviews_per_month', 30),
    ('calculated_host_listings_count', 150),
    ('availability_365', 500),
]

# Build the 2x3 grid in one call. Previously plt.subplots() created a single
# full-figure axes and subplot(2, 3, k) then added the panels on top of it.
f, grid = plt.subplots(2, 3, figsize=(24, 18))
for ax, (column, x_max) in zip(grid.flat, dist_panels):
    # 'price' is already numeric (parsed right after loading), so it is plotted
    # directly like the other columns.
    # NOTE(review): distplot is deprecated since seaborn 0.11; switch to
    # histplot(..., kde=True) once the environment's seaborn version allows it.
    sns.distplot(nyc_data[column], ax=ax)
    ax.set_xlim(0, x_max)

plt.tight_layout()  # avoid overlap of plots
plt.draw()  # was accidentally fused into the comment on the line above

In [None]:
# One box plot per numeric column, laid out on a 2x3 grid in reading order.
box_columns = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Build the grid in one call. Previously plt.subplots() created a single
# full-figure axes and subplot(2, 3, k) then added the panels on top of it.
f, grid = plt.subplots(2, 3, figsize=(24, 18))
for ax, column in zip(grid.flat, box_columns):
    # 'price' is already numeric (parsed right after loading), so it needs no
    # string clean-up here.
    sns.boxplot(y=nyc_data[column], ax=ax)

plt.tight_layout()  # avoid overlap of plots
plt.draw()

In [None]:
# Colour-blind friendly palettes. Both share the same seven hues and differ
# only in their first entry.
# Grey-first variant:
cbPalette = ["#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7"]
# Black-first variant:
cbbPalette = ["#000000", *cbPalette[1:]]

# Make the grey-first variant the active seaborn palette.
# (sns.palplot(sns.color_palette(<palette>)) previews either one.)
sns.set_palette(cbPalette)

In [None]:
# Number of listings in each neighbourhood group.
title = 'Properties per Neighbourhood Group'
plt.figure(figsize=(20, 10), dpi=80)  # qualified call: no reliance on `from pylab import *`
# Pass the column by keyword: seaborn >= 0.12 no longer accepts the plotted
# variable as a positional argument.
sns.countplot(x='neighbourhood_group_cleansed', data=nyc_data)

plt.title(title)
plt.show()  # render the figure (plt.ioff() only toggled interactive mode)

In [None]:
# Number of listings of each room type.
title = 'Properties per Room Type'
plt.figure(figsize=(20, 10), dpi=80)  # qualified call: no reliance on `from pylab import *`
# Pass the column by keyword: seaborn >= 0.12 no longer accepts the plotted
# variable as a positional argument.
sns.countplot(x='room_type', data=nyc_data)
plt.title(title)
plt.show()  # render the figure (plt.ioff() only toggled interactive mode)

In [None]:
plt.figure(figsize=(20, 10))
title = 'Correlation matrix of numerical variables'
# Restrict to numeric columns explicitly: the reduced frame also holds text
# columns, and DataFrame.corr() raises on those in pandas >= 2.0 instead of
# silently skipping them.
numeric_part = nyc_data_filtered.select_dtypes(include='number')
sns.heatmap(numeric_part.corr(), square=True, cmap='RdYlGn')
plt.title(title)
plt.show()  # render the figure (plt.ioff() only toggled interactive mode)

In [None]:
# See https://www.kaggle.com/biphili/hospitality-in-era-of-airbnb
# Two longitude/latitude scatter plots of the listings, coloured first by
# neighbourhood group and then by room type.
# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional
# arguments.
title = 'Neighbourhood Group Location'
plt.figure(figsize=(15, 9))
sns.scatterplot(
    x='longitude', y='latitude', hue='neighbourhood_group_cleansed', data=nyc_data
).set_title(title)

title = 'Room type location per Neighbourhood Group'
plt.figure(figsize=(15, 9))
sns.scatterplot(
    x='longitude', y='latitude', hue='room_type', data=nyc_data
).set_title(title)
plt.show()  # render both figures (plt.ioff() only toggled interactive mode)

In [None]:
# Listing counts per room type, split by neighbourhood group.
# The title previously said "location", copied from the scatter plots; this is
# a count plot.
title = 'Room type count per Neighbourhood Group'
sns.catplot(x='room_type', kind="count", hue="neighbourhood_group_cleansed", data=nyc_data)
plt.title(title)
plt.show()  # render the figure (plt.ioff() only toggled interactive mode)

In [None]:
# https://jakevdp.github.io/PythonDataScienceHandbook/04.14-visualization-with-seaborn.html
# http://seaborn.pydata.org/tutorial/color_palettes.html

# Box plot of the listing price for each neighbourhood group.
x = 'neighbourhood_group_cleansed'
y = 'price'
title = 'Price per Neighbourhood Group'

f, ax = plt.subplots(figsize=(8, 6))
# Draw on the axes created above explicitly rather than on the implicit
# "current" axes.
sns.boxplot(x=x, y=y, data=nyc_data_filtered, ax=ax)
ax.set_title(title)
plt.show()  # render the figure (plt.ioff() only toggled interactive mode)

In [None]:
title = 'Median Price per Neighbourhood Group'
# Median price of each neighbourhood group, sorted ascending; used to order the bars.
result = (
    nyc_data_filtered
    .groupby('neighbourhood_group_cleansed')['price']
    .median()
    .reset_index()
    .sort_values('price')
)
# barplot aggregates with the mean by default, so the bars did not match the
# "Median" title or the median-based ordering. Ask for the median explicitly.
sns.barplot(
    x='neighbourhood_group_cleansed',
    y='price',
    data=nyc_data_filtered,
    order=result['neighbourhood_group_cleansed'],
    estimator=np.median,
)
plt.title(title)
plt.show()  # render the figure (plt.ioff() only toggled interactive mode)

In [None]:
# https://stackoverflow.com/questions/54132989/is-there-a-way-to-change-the-color-and-shape-indicating-the-mean-in-a-seaborn-bo
x = 'neighbourhood_group_cleansed'
y = 'price'

# The mean is drawn as a white square with a black edge in both figures.
mean_marker = {"marker": "s", "markerfacecolor": "white", "markeredgecolor": "black"}
listing_price = nyc_data_filtered['price']

# (title, row mask, notched boxes?) for each price band.
# NOTE(review): both comparisons are strict, so listings priced at exactly
# 175 appear in neither figure.
price_bands = [
    ('Price per neighbourhood_group for Properties under $175', listing_price < 175, True),
    ('Price per neighbourhood_group for Properties more than $175', listing_price > 175, False),
]

# One figure per band; the stray bare `f` expression between the two halves
# had no effect and is gone.
for title, in_band, notched in price_bands:
    data_filtered = nyc_data_filtered.loc[in_band]
    f, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=x, y=y, data=data_filtered, notch=notched, showmeans=True,
                meanprops=mean_marker, ax=ax)
    ax.set_title(title)
plt.show()  # render both figures (plt.ioff() only toggled interactive mode)

<font size="12">**4. Modeling: first model**</font>

<font size="6">**4.1 Data Preprocessing**</font>


In [None]:
# Drop the 'name' column and keep a snapshot of the frame as it is at this point.
# Rebinding with errors='ignore' (instead of inplace=True) keeps the cell
# idempotent: re-running it no longer raises KeyError once 'name' is gone.
nyc_data = nyc_data.drop(columns=['name'], errors='ignore')
data_copy = nyc_data.copy()

In [None]:
# log10 transform of the count-like columns.
# https://stackoverflow.com/questions/30794525/adding-one-to-all-the-values-in-a-dataframe
# A tiny offset is added first so that zeros do not produce -inf.
# NOTE(review): with this offset a zero maps to -9, far from the other values;
# confirm this is intended (log10(x + 1) is a common alternative).
# NOTE(review): the cell overwrites the columns, so running it twice applies
# the transform twice.
log_offset = 0.000000001
log_columns = [
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
for column in log_columns:
    nyc_data[column] = np.log10(nyc_data[column] + log_offset)

In [None]:
# One-hot encode the categorical columns one after another, dropping the first
# level of each.
for categorical in ('room_type', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed'):
    nyc_data = pd.get_dummies(nyc_data, columns=[categorical], drop_first=True)

In [None]:
# Keep the listings priced strictly below $175 (no lower bound is applied here;
# rows priced at exactly 175 are excluded by the strict comparison)
data_filtered_low = nyc_data.loc[(nyc_data['price'] < 175)]

In [None]:
# Keep the listings priced strictly above $175
data_filtered_high = nyc_data.loc[(nyc_data['price'] > 175)]

<font size="12">**4.2 Multiple Linear Regression**</font>

<font size="6">**4.2.1 Modeling on lower price dataset**</font>

In [None]:
# Separate the lower-price subset into features and target.
# Features: every column except 'price', as a plain array.
# assumes all remaining columns are numeric — TODO confirm
X = data_filtered_low.drop(columns='price').values
# Target: log10 of the price.
y = np.log10(data_filtered_low['price'].values)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit a multiple linear regression on the training part.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predictions for the held-out rows (same log10 scale as y).
y_pred = lr.predict(X_test)

In [None]:
# Undo the log10 transform and round to whole numbers, then show actual and
# predicted prices side by side.
actual_price = np.round(10 ** y_test, 0)
predicted_price = np.round(10 ** y_pred, 0)
df = pd.DataFrame({'Actual': actual_price, 'Predicted': predicted_price})
df.head(10)

In [None]:
# NOTE(review): this cell performs the same split / fit / predict / compare
# steps on the lower-price subset as the cells just before it; confirm whether
# it is still needed.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Features are every column except 'price'; the target is log10(price).
X = data_filtered_low.drop(columns='price').values
y = np.log10(data_filtered_low['price'].values)

# 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit the multiple linear regression and predict the held-out rows.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# Compare predicted and actual values after undoing the log10 transform.
# https://towardsdatascience.com/a-beginners-guide-to-linear-regression-in-python-with-scikit-learn-83a8f7ae2b4f
# https://stackoverflow.com/questions/19100540/rounding-entries-in-a-pandas-dafaframe
actual_price = np.round(10 ** y_test, 0)
predicted_price = np.round(10 ** y_pred, 0)
df = pd.DataFrame({'Actual': actual_price, 'Predicted': predicted_price})
df.head(10)