# Berlin Airbnb housing prices

## 1. Import required library

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

import geopy.distance
import geopandas as gpd
import geoplot

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_squared_error

## 2. Data Understanding

### 2.1 Read data with pandas

In [2]:
df = pd.read_csv('../input/airbnb-berlin-july-2021/listings_berlin.csv')
df.head()

### 2.2 Explore Dataset Information

In [3]:
# check dataset info
df.info()

In [4]:
# check dataset shape
df.shape

In [5]:
# describe numeric column
df.describe()

## 3. Exploratory Data Analysis (EDA)

### 3.1 Check and remove missing values

In [6]:
# check missing values
df.isna().sum()

In [7]:
# Drop every row that has a missing value in at least one of these columns.
# A single vectorised dropna replaces the original nested loop, which dropped
# one index label at a time and rebuilt the frame for each missing row.
misvalue = ['last_review', 'reviews_per_month', 'name', 'host_name']

df = df.dropna(subset=misvalue)

In [8]:
# recheck dataset info
df.info()

In [9]:
df.shape

### 3.2 Target analysis

In [10]:
# Remove price outliers with the 1.5 * IQR rule.
Q1 = df['price'].quantile(0.25)
Q3 = df['price'].quantile(0.75)
IQR = Q3 - Q1
is_outlier = (df['price'] < (Q1 - 1.5 * IQR)) | (df['price'] > (Q3 + 1.5 * IQR))
# reset_index returns a new frame, so its result has to be assigned back;
# the original call discarded it and left the old index in place.
df = df[~is_outlier].reset_index(drop=True)
df.head()

In [11]:
# analyze the distribution of the target variable, i.e. 'price'
sns.displot(x='price', data=df, kde=True, color='#ff4125')

### 3.3 Feature engineering

#### 3.3.1 Add new feature which is distance to city center

The value of a property generally increases the more central it is. So including the distance to the center of the city may be beneficial. I proxied the city centre by using the coordinates for the Mitte borough, which is the center of Berlin.

In [12]:
# center of Berlin city, Mitte borough
center = (52.5373, 13.3603)

In [13]:
# Geodesic distance in km from every listing to the city centre.
# The whole column is built in one pass and attached with assign(): the
# original chained write `df['dist_to_center'][i] = ...` triggers
# SettingWithCopyWarning and has no effect when pandas copy-on-write is on.
df = df.assign(
    dist_to_center=[
        geopy.distance.distance(center, (lat, lon)).km
        for lat, lon in zip(df['latitude'], df['longitude'])
    ]
)

In [14]:
df.info()

#### 3.3.2 Separate DataFrame depending on datatype

**Categorical features**

In [15]:
cat_df = df.select_dtypes(include='object')

In [16]:
catcol = list(cat_df.columns)
catcol

**Numerical features**

In [17]:
num_df = df.select_dtypes(include='number')

In [18]:
numcol = list(num_df.columns)
numcol

#### 3.3.3 Univariate Analysis

In this section, univariate analysis is performed. I focus on the features that are most important for the target, i.e. those that have a high correlation with it.

For the numeric features, I used 'displot' and 'boxplot' to analyze their distribution.

Similarly for categorical features the most reasonable way to visualize the distribution is to use a 'countplot' which shows the relative counts for each category or class. Can use a pie-plot also to be a bit more fancy.

**Numeric feature**

Analyze numeric features

In [19]:
# function to plot numerical feature
def plot_num(feature):
    """Draw a histogram of one column of the notebook-level ``df``.

    feature: name of the column to plot on the x axis.
    """
    sns.histplot(x=feature, data=df, color='#ff4125')

In [20]:
# check minimum nights feature
df['minimum_nights'].unique()

In [21]:
# count the sum of every unique element
Counter(df['minimum_nights']).most_common()

In [22]:
# keep only listings whose minimum stay is at most 365 nights
# (rows where minimum_nights is greater than 365 are removed)
exceeds_one_year = df['minimum_nights'] > 365
df = df[~exceeds_one_year]

In [23]:
df.shape

In [24]:
# plot minimum nights feature
plot_num('minimum_nights')

Majority of the airbnb housing have low minimum nights to rent

In [25]:
# check number of reviews feature
plot_num('number_of_reviews')

In [26]:
# check reviews per month feature
plot_num('reviews_per_month')

In [27]:
# check calculated host listings count feature
plot_num('calculated_host_listings_count')

In [28]:
# check availability 365 feature
plot_num('availability_365')

In [29]:
# using geopandas to read geojson file
berlin = gpd.read_file('../input/airbnb-berlin-july-2021/neighbourhoods.geojson')

In [30]:
# create geoplot for berlin city map
geoplot.polyplot(berlin, facecolor = 'green', figsize=(20, 20)) 
plt.scatter(df['longitude'], df['latitude'], c=df['price'], cmap = 'Reds')

The more central the location, the higher the price and the number of Airbnb listings.

**Categorical features**

Analyze categorical features

In [31]:
# function to visualize categorical data
def plot_cat(feature):
    """Draw a count plot of one categorical column of the notebook-level ``df``.

    feature: name of the column whose categories are counted. The x tick
    labels are rotated by 270 degrees.
    """
    # Draw the bars once; the original called countplot twice and painted
    # the same plot twice onto the same axes.
    ax = sns.countplot(data=df, x=feature)
    ax.tick_params(axis='x', labelrotation=270)

In [32]:
plot_cat('neighbourhood_group')

In [33]:
plot_cat('room_type')

#### 3.3.4 Bivariate analysis

In this section, bivariate analysis is performed. I have plotted various numeric as well as categorical features against the target, i.e. 'price'.

**Numerical features**

In [34]:
# function to visualize bivariate analysis for numerical feature
def bi_num(feature):
    """Scatter one column of the notebook-level ``df`` against ``price``.

    feature: name of the column shown on the x axis; ``price`` is on the y axis.
    """
    _, axes = plt.subplots()
    axes.scatter(x=df[feature], y=df['price'])
    axes.set_ylabel('price')
    axes.set_xlabel(feature)
    plt.show()

In [35]:
bi_num('minimum_nights')

In [36]:
bi_num('number_of_reviews')

In [37]:
bi_num('reviews_per_month')

In [38]:
bi_num('calculated_host_listings_count')

In [39]:
bi_num('availability_365')

From the plots of the numerical features, we can conclude that these features are only weakly correlated with the Airbnb housing price.

**Categorical features**

In [40]:
# function to visualize bivariate analysis of the categorical feature
def bi_cat(feature):
    """Box-plot ``price`` per category of one column of the notebook-level ``df``.

    feature: name of the categorical column shown on the x axis.
    """
    # catplot returns a FacetGrid (not an Axes), hence the name.
    grid = sns.catplot(kind='box', data=df, x=feature, y='price', height=5, aspect=1.5)
    grid.set_xticklabels(rotation=270)

In [41]:
bi_cat('neighbourhood_group')

In [42]:
bi_cat('room_type')

Entire home/apartment and Hotel room have higher prices

#### 3.3.5 Most correlated feature to the target

In [43]:
# Correlate the numeric columns only: DataFrame.corr() raises on text columns
# in pandas >= 2.0 (older versions silently skipped them).
cor_mat = df.select_dtypes(include='number').corr()
# Rows ordered by their correlation with the target, strongest first.
cor_to_tar = cor_mat.sort_values('price', ascending=False)

In [44]:
print("The most relevant features (numeric) for the target are :")
cor_to_tar.price

The majority of the columns have low correlation to the price value. Even newly created column, the distance to center, has the lowest correlation to the price value.

## 4. Data Preparation

### 4.1 Select the feature

In [45]:
df.head()

Drop the columns id, name, host_id, host_name, neighbourhood, latitude, and longitude. Set the name of the housing as the index.

In [46]:
# use the listing name as the row label
df = df.set_index('name')
df.head()

Separate target from the features

In [47]:
# Columns used as model inputs.
selected_features = ['neighbourhood_group', 'room_type', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'dist_to_center']

# Target is taken before the frame is narrowed down to the feature columns.
y = df['price']
# .copy() makes the feature table an independent frame, so later column
# assignments do not write into a slice of the original data
# (avoids SettingWithCopyWarning).
df = df[selected_features].copy()
df.head()

In [48]:
df.shape

### 4.2 Normalize data

In [49]:
column_numeric = df.select_dtypes(include='number').keys()
column_numeric

In [50]:
scaler = MinMaxScaler()

In [51]:
# Fit the scaler on the numeric columns and transform them in one step.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# in section 4.4, so the test rows influence the scaling (data leakage).
# Consider splitting first and fitting the scaler on the training rows only.
scaled = scaler.fit_transform(df[column_numeric])

In [52]:
# Write the scaled values back: column i of `scaled` belongs to the i-th name
# in `column_numeric`. assign() returns a new frame, so nothing is written
# into a slice of another frame and no manual counter is needed.
df = df.assign(**{column: scaled[:, i] for i, column in enumerate(column_numeric)})

In [53]:
df.head()

### 4.3 Convert categorical feature to numerical

In [54]:
df = pd.get_dummies(df)
df.head()

In [55]:
df.info()

### 4.4 Split train and test data

In [56]:
# Hold out 20% of the listings for testing. Rows are shuffled with a fixed
# seed so the split is reproducible and the test set is not simply the last
# 20% of the file order.
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=.2, shuffle=True, random_state=42)

In [57]:
print('X train shape  :', x_train.shape)
print('X test shape   :', x_test.shape)
print('Y train shape  :', y_train.shape)
print('Y test shape   :', y_test.shape)

## 5. Modeling
Lastly it is the time to apply various regression models and check how are we doing. I have used various regression models from the scikit.

Parameter tuning using GridSearchCV is also done to improve performance of some algos.

The evaluation metric that I have used is the Root Mean Squared Error (RMSE) between the 'Actual price' and the 'Predicted price', which is also the evaluation metric used by Kaggle.

To get a better idea, one may also use K-fold cross-validation instead of the plain hold-out set approach to validation.

### 5.1 Linear Regression

In [58]:
# fit linear regression model to train data
lr = LinearRegression()
lr.fit(x_train, y_train)

In [59]:
# predict test data
lrpred = lr.predict(x_test)

In [60]:
lr_mse = np.sqrt(mean_squared_error(y_test,lrpred))
lr_mse

### 5.2 Lasso and GridSearchCV Tuning

#### 5.2.1 Lasso Model

In [61]:
lasso=Lasso()
lasso.fit(x_train,y_train)

In [62]:
lassopred=lasso.predict(x_test)

In [63]:
lasso_mse = np.sqrt(mean_squared_error(y_test,lassopred))
lasso_mse

#### 5.2.2 Lasso GridSearchCV Tuning

In [64]:
params_dict={'alpha':[0.001, 0.005, 0.01,0.05,0.1,0.5,1]}
lasso_CV=GridSearchCV(estimator=Lasso(),param_grid=params_dict,scoring='neg_mean_squared_error',cv=10)
lasso_CV.fit(x_train, y_train)

In [65]:
lassoCVpred = lasso_CV.predict(x_test)

In [66]:
lassoCV_mse = np.sqrt(mean_squared_error(y_test,lassoCVpred))
lassoCV_mse

### 5.3 Ridge Regression and GridSearchCV Tuning
#### 5.3.1 Ridge Regression Model

In [67]:
ridge=Ridge()
ridge.fit(x_train,y_train)

In [68]:
ridgepred = ridge.predict(x_test)

In [69]:
ridge_mse = np.sqrt(mean_squared_error(y_test,ridgepred))
ridge_mse

#### 5.3.2 Ridge Regression GridSearchCV Tuning

In [70]:
params_dict={'alpha':[0.1, 0.15, 0.20,0.25,0.30,0.35,0.4,0.45,0.50,0.55,0.60]}
ridge_CV=GridSearchCV(estimator=Ridge(),param_grid=params_dict,scoring='neg_mean_squared_error',cv=10)
ridge_CV.fit(x_train,y_train)

In [71]:
ridgeCVpred = ridge_CV.predict(x_test)

In [72]:
ridgeCV_mse = np.sqrt(mean_squared_error(y_test,ridgeCVpred))
ridgeCV_mse

### 5.4 Gradient Boosting

In [73]:
# A fixed random_state makes the fitted model, and therefore the reported
# error, reproducible across runs.
gb = GradientBoostingRegressor(random_state=42)
gb.fit(x_train, y_train)

In [74]:
gbpred = gb.predict(x_test)

In [75]:
gb_mse = np.sqrt(mean_squared_error(y_test,gbpred))
gb_mse

So far, the Gradient Boosting Regressor has the lowest RMSE (the `*_mse` variables above hold root mean squared errors).

## 6. Model Evaluation

In [76]:
# actual vs. gradient-boosting predicted price for each test listing
comparison = {
    'Name': y_test.index,
    'Actual Price': y_test.values,
    'Predicted Price': gbpred,
}
result = pd.DataFrame(comparison)
result.head()

From the result above, we can say that the model hasn't reached a good outcome yet. Other algorithms such as AdaBoost, RandomForest, etc. may perform better. The majority of the columns have low correlation with the price. Even the newly created column, the distance to the center, has the lowest correlation with the price. Other features, including amenities such as bedroom count, bed count, maximum guests, swimming pool, wifi, air conditioning, gym, breakfast, etc., may explain the price better.