# 1. Import Libraries and load dataset

In [None]:
import numpy as np
import pandas as pd
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for entry in files:
        print(os.path.join(root, entry))

In [None]:
# Read the Istanbul Airbnb listings CSV into the working DataFrame.
listings_csv = '/kaggle/input/airbnb-istanbul-dataset/AirbnbIstanbul.csv'
abnb = pd.read_csv(listings_csv)

# 2. Initial Analysis and Data Cleaning

In [None]:
# Preview the first ten listings.
abnb.head(n=10)

In [None]:
# (rows, columns) of the listings table as loaded.
abnb.shape

We have **16251** observations and **16** variables

In [None]:
# Summary of column dtypes and non-null counts.
abnb.info()

In [None]:
# missingno bar chart: visual check of how complete each column is.
msno.bar(abnb)

The bar plot of missing data shows that the entire neighbourhood_group variable is **NULL**.
last_review and reviews_per_month each have roughly 50% NULL values.

In [None]:
# neighbourhood_group is dropped as an unusable column. Rebinding the result
# (instead of inplace=True) together with errors='ignore' keeps this cell safe
# to re-run: a second execution no longer raises KeyError for the missing column.
abnb = abnb.drop(columns=['neighbourhood_group'], errors='ignore')

In [None]:
# Parse last_review into pandas datetimes.
abnb['last_review'] = pd.to_datetime(abnb['last_review'])

In [None]:
# Missing reviews_per_month is filled with 0. Assign the result back to the
# column: calling fillna(inplace=True) on the attribute-accessed Series is
# chained mutation, which pandas 2.x warns about and which leaves `abnb`
# unchanged once Copy-on-Write is active.
abnb['reviews_per_month'] = abnb['reviews_per_month'].fillna(0)

In [None]:
# Replace missing host names with a placeholder. Assigning back to the column
# (rather than chained fillna(inplace=True)) guarantees the DataFrame itself is
# updated, including under pandas Copy-on-Write.
abnb['host_name'] = abnb['host_name'].fillna('Not Stated')

In [None]:
# Replace missing listing names with a placeholder. Bracket access plus
# assignment avoids the chained fillna(inplace=True) pattern, which does not
# reliably write through to `abnb` (and is a no-op under Copy-on-Write).
abnb['name'] = abnb['name'].fillna('No Description')

In [None]:
# Re-check dtypes and non-null counts after the cleaning cells above.
abnb.info()

**Questions:**

1. How many different types of **room_type** are available?

In [None]:
# Distinct values present in room_type.
abnb['room_type'].unique()

2. How many distinct **neighbourhood** are there?

In [None]:
# Distinct values present in neighbourhood.
abnb['neighbourhood'].unique()

Identify duplicate **host_id** and **neighbourhood**

In [None]:
# Keep every listing whose (host_id, neighbourhood) pair occurs more than once,
# ordered by host_id.
is_repeated_pair = abnb.duplicated(subset=['host_id', 'neighbourhood'], keep=False)
dup_abnb = abnb.loc[is_repeated_pair].sort_values(by='host_id')
dup_abnb.head(n=10)

In [None]:
# Show just the two key columns for the first ten repeated pairs.
host_area_pairs = dup_abnb.loc[:, ['host_id', 'neighbourhood']]
host_area_pairs.head(n=10)

# 3. Exploratory data analysis and visualization

In [None]:
# Count, per host, the listings collected in dup_abnb (rows whose
# host_id/neighbourhood pair is repeated).
dup_dict = dup_abnb.host_id.value_counts()
# value_counts() already returns the counts in descending order, so the ten
# most frequent hosts are simply its first ten labels -- no manual sort (and
# no mid-notebook `import operator`) is needed. Ties at the cut-off are broken
# by value_counts' own ordering.
top10 = dup_dict.head(10).index.tolist()
top10_hosts = abnb[abnb.host_id.isin(top10)]
# Listings per host name among those ten hosts.
top10_hosts.host_name.value_counts()


**Top 10 hosts: average number of reviews**

In [None]:
# Mean number_of_reviews per host_name among the top-10 hosts.
reviews_by_host = top10_hosts.groupby('host_name')['number_of_reviews']
top10_host_numrv = reviews_by_host.mean().reset_index()
names = top10_host_numrv['host_name']
n = top10_host_numrv['number_of_reviews']
top10_host_numrv

In [None]:
# Horizontal bars: mean review count for each top-10 host name.
f, ax = plt.subplots(figsize=(12, 8))
ax.barh(y=names, width=n)
# Red reference line: mean review count across all listings.
overall_mean_reviews = abnb['number_of_reviews'].mean()
ax.axvline(x=overall_mean_reviews, color='r')
ax.set_title('Mean # of Reviews of Top10 Multiple Property Owners', fontsize=20, pad=20)
plt.show()


**Average price set by top 10 hosts**

In [None]:
# Mean listing price per host_name among the top-10 hosts.
price_by_host = top10_hosts.groupby('host_name')['price']
top10_hosts_price = price_by_host.mean().reset_index()
names = top10_hosts_price['host_name']
price = top10_hosts_price['price']
top10_hosts_price

In [None]:
# Horizontal bars: mean price for each top-10 host name.
f, ax = plt.subplots(figsize=(12, 8))
ax.barh(y=names, width=price)
# Red reference line: mean price across all listings.
overall_mean_price = abnb['price'].mean()
ax.axvline(x=overall_mean_price, color='r')
ax.set_title('Mean Prices set by Top10 Multiple Property Owners', fontsize=20, pad=20)
plt.show()

**Top 10 crazy prices of neighbourhood**

In [None]:
# The ten most expensive listings. nlargest() already returns its rows ordered
# by descending price, so the full sort that used to precede it was redundant.
tp10_price = abnb.nlargest(10, 'price')
neighbourhood, price = tp10_price['neighbourhood'], tp10_price['price']
tp10_price[['neighbourhood', 'price']]

In [None]:
f, ax = plt.subplots(figsize=(12, 8))
# Several of the ten priciest listings can share a neighbourhood. Passing the
# names straight to barh() puts equal names on the same categorical tick, so
# their bars are drawn on top of each other. Give each listing its own row and
# use the neighbourhood names only as tick labels.
rows = np.arange(len(price))
ax.barh(rows, price)
ax.set_yticks(rows)
ax.set_yticklabels(neighbourhood)
# Red reference line: mean price across all listings.
ax.axvline(abnb.price.mean(), color='r')
ax.set_title('Prices set by Top neighbourhoods', fontsize=20, pad=20)
plt.show()

Identifying outliers

In [None]:
# Mean listing price (value recorded by the author: 354.72389391422064).
abnb.price.mean()


In [None]:
# Median listing price, for comparison with the mean.
abnb['price'].median()

In [None]:
# Box plot of price for each room type.
f, ax = plt.subplots(figsize=(12, 8))
abnb.boxplot(column='price', by='room_type', rot=90, ax=ax)

# Blank out the automatic "grouped by" super-title and set our own title.
plt.suptitle('')
ax.set_title('Price Grouped by Room Type', fontsize=15)
ax.title.set_y(1.05)

ax.set_xlabel('', fontsize=15)
ax.set_ylabel('Price (TL)', fontsize=15)
plt.show()

**The mean price (354) is far above the median price (190), which indicates that the price variable has many high outliers, as the boxplot also shows. Visually, very few observations have price > 1000, so those can be considered outliers.**

In [None]:
# Scatter of price against number_of_reviews.
f, ax = plt.subplots(figsize=(12, 8))
abnb.plot.scatter(x='number_of_reviews', y='price', ax=ax)
ax.set_xlabel('number_of_reviews', fontsize=15, labelpad=10)
ax.set_ylabel('Price', fontsize=15, labelpad=10)
ax.set_title('Price Vs No. of reviews', fontsize=20, pad=20)
plt.show()


In [None]:
# Box plot of price for each neighbourhood.
f, ax = plt.subplots(figsize=(12, 8))
abnb.boxplot(column='price', by='neighbourhood', rot=90, ax=ax)

# Blank out the automatic "grouped by" super-title and set our own title.
plt.suptitle('')
ax.set_title('Price Grouped by Neighbourhood', fontsize=15)
ax.title.set_y(1.05)

ax.set_xlabel('', fontsize=15)
ax.set_ylabel('Price (TL)', fontsize=15)
plt.show()

In [None]:
# Store the two string columns (room_type and neighbourhood) as pandas
# categoricals, then confirm the new dtypes.
for label in ('room_type', 'neighbourhood'):
    abnb[label] = abnb[label].astype('category')
abnb.info()

In [None]:
# Total number of listings per room type.
# The column is passed by keyword: from seaborn 0.12 on, the only positional
# argument countplot accepts is `data`, so a positional Series is no longer
# read as the x variable. The figure is created explicitly instead of being
# resized afterwards through plt.gcf().
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(x='room_type', data=abnb, palette="plasma", ax=ax)
ax.set_title('Total number of listings per room type')
plt.show()

In [None]:
# Scatter of price against minimum_nights.
f, ax = plt.subplots(figsize=(12, 8))
abnb.plot.scatter(x='minimum_nights', y='price', ax=ax)
ax.set_xlabel('minimum_nights', fontsize=15, labelpad=10)
ax.set_ylabel('Price', fontsize=15, labelpad=10)
ax.set_title('Price Vs Minimum number of nights', fontsize=20, pad=20)
plt.show()

In [None]:
# Scatter of price against availability_365.
f, ax = plt.subplots(figsize=(12, 8))
abnb.plot.scatter(x='availability_365', y='price', ax=ax)
ax.set_xlabel('availability_365', fontsize=15, labelpad=10)
ax.set_ylabel('Price', fontsize=15, labelpad=10)
ax.set_title('Price Vs 365 days availability', fontsize=20, pad=20)
plt.show()


In [None]:
# Scatter of price against calculated_host_listings_count.
f, ax = plt.subplots(figsize=(12, 8))
abnb.plot.scatter(x='calculated_host_listings_count', y='price', ax=ax)
ax.set_xlabel('calculated_host_listings_count', fontsize=15, labelpad=10)
ax.set_ylabel('Price', fontsize=15, labelpad=10)
ax.set_title('Price Vs number of listings per host', fontsize=20, pad=20)
plt.show()

**Finally, let's look at the correlations between the different predictors and the response variable, price.**

In [None]:
# Pairwise correlation between the numeric features (identifier columns such as
# id, host_id and host_name are left out).
# last_review is deliberately not selected: it was converted to datetime
# earlier, so it is not a numeric feature. Older pandas silently dropped such
# columns inside corr(); since pandas 2.0 non-numeric columns are no longer
# dropped by default, so it must not be passed in.
abnb_numerical_colns = abnb[['latitude', 'longitude', 'price', 'minimum_nights',
                             'number_of_reviews', 'reviews_per_month',
                             'calculated_host_listings_count', 'availability_365']]
corr = abnb_numerical_colns.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True)
plt.show()

**We can see that there is a high correlation between number of reviews and reviews per month; however, the other variables don't show any significant correlation.**

# 4. Data preprocessing for model selection and price prediction 

In the exploratory data analysis, we saw that very few observations have a price greater than 1000, and that those prices are very large, which can jeopardize our machine learning model's predictions. Hence, we'll remove these outliers from our data.

In [None]:
# Keep only listings priced below 1000, then report the remaining size.
below_cutoff = abnb['price'] < 1000
abnb_unbiased = abnb.loc[below_cutoff]
abnb_unbiased.shape

In [None]:
# Column dtypes and non-null counts of the price-filtered frame.
abnb_unbiased.info()

Selecting the right predictor variables: id, name, host_id and host_name are basically identifiers, so we remove them for modeling. We also remove the latitude and longitude variables because neighbourhood already describes the location. Finally, we remove neighbourhood itself, as there are 39 distinct neighbourhoods and it would be difficult to consider each one of them for modeling.

In [None]:
# Candidate predictor columns for the price models.
predictor_cols = [
    'room_type',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
abnb_predictors = abnb_unbiased[predictor_cols]
abnb_predictors.head(n=10)

#a) One hot encoding for Room type

In [None]:
# One indicator column per room type (names prefixed with "dummy"), appended to
# the predictors in place of the original room_type column.
dummy_roomtype = pd.get_dummies(abnb_predictors['room_type'], prefix='dummy')
with_dummies = pd.concat([abnb_predictors, dummy_roomtype], axis=1)
abnb_predictors = with_dummies.drop(columns=['room_type'])
abnb_predictors

In [None]:
# b) Standardizing the dataset
from sklearn import preprocessing

# Target: listing price of the filtered rows.
y = abnb_unbiased['price']
# Features: every predictor column passed through sklearn's scale().
# NOTE(review): scaling happens before any train/test split, so the scaling
# statistics come from all rows -- confirm that this is intended.
X = preprocessing.scale(abnb_predictors)
print(X)
print(y)

In [None]:
# Wrap the scaled matrix back into a labelled DataFrame.
# Column names are taken from abnb_predictors (the frame that was scaled)
# rather than from a hard-coded position -> name map, so the labels stay
# correct if the set of room-type dummies changes. Reusing its index keeps X
# aligned row-for-row with y, which also comes from the filtered listings.
# np.asarray() makes the cell safe to re-run when X is already a DataFrame.
X = pd.DataFrame(np.asarray(X),
                 columns=abnb_predictors.columns,
                 index=abnb_predictors.index)
X.head()



In [None]:
# Peek at the first few target (price) values.
y.head()

# 5. Applying machine learning models 

5.1 Prepare a Linear Regression Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Hold out 20% of the rows for testing (split seed 353).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=353
)

# Fit an ordinary least-squares model on the training portion.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

**Now let's predict and evaluate the model**

In [None]:
from sklearn.metrics import r2_score

# Predict on the held-out rows and report R^2 for the linear model.
y_pred1 = regressor.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred1)

R-Square of **21.27%** is not great in terms of accuracy but as a starting model not bad too.

5.2 Preparing a Decision Tree Regression

In [None]:
from sklearn.tree import DecisionTreeRegressor
# NOTE(review): this re-split uses seed 105 while the linear-regression cell
# used seed 353, so the linear model is scored on a different test set than
# the tree-based models; keep that in mind when comparing R^2 values.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2,random_state=105)
# random_state is fixed so the fitted tree (and therefore its R^2) is
# reproducible; without it, sklearn may break ties between equally good
# splits differently from run to run.
DTree=DecisionTreeRegressor(min_samples_leaf=.0001, random_state=0)
DTree.fit(X_train,y_train)

In [None]:
from sklearn.metrics import r2_score

# Predict on the held-out rows and report R^2 for the decision tree.
y_pred2 = DTree.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred2)

R-Square of **3.59%** is not good at all.

In [None]:
# 5.3 Preparing a random forest regressor
from sklearn.ensemble import RandomForestRegressor

# 100 trees with a fixed seed, trained on the current training split.
RForest = RandomForestRegressor(n_estimators=100, random_state=0)
RForest.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import r2_score

# Predict on the held-out rows and report R^2 for the random forest.
y_pred3 = RForest.predict(X_test)
r2_score(y_true=y_test, y_pred=y_pred3)

The random forest R-square of **23.02%** is better than that of the linear and decision tree models.

# 6. Revised model with improvements

We want to see if there exists any multi-collinearity between predictor variables and if yes removing one of them can help improve our model.

In [None]:
# Kendall rank correlation between the predictor columns, shown as an
# annotated heatmap. (The stray `abnb_predictors.columns` expression that sat
# in the middle of this cell produced no output and has been removed.)
corr = abnb_predictors.corr(method='kendall')

plt.figure(figsize=(12, 10))
sns.heatmap(corr, annot=True)
plt.show()

We can see that there is a high correlation between **number of reviews** and **reviews per month**. Also, there is a high negative correlation between **entire home apartments** and **private room**. So, we will consider dropping the **number_of_reviews** and **dummy_Private room** variables.

In [None]:
# Preview the price-filtered listings before re-selecting predictors.
abnb_unbiased.head()

In [None]:
# Revised predictor set: the same columns as before minus number_of_reviews.
revised_cols = [
    'room_type',
    'minimum_nights',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
abnb_predictors = abnb_unbiased[revised_cols]
# Rows with index labels 0 through 3.
abnb_predictors.loc[0:3]