<h1 style="background-color:skyblue;font-family:sans-serif;font-size:320%;text-align:center">Madrid: Data Analysis and Price Prediction</h1>

In [None]:
# Show the title image that ships with the Kaggle dataset.
# NOTE(review): this path uses the directory "madridairbnbdata", while the CSV is
# later read from "/kaggle/input/madrid-airbnb-data/" — confirm both locations exist.
from IPython.display import Image
import os
Image("../input/madridairbnbdata/Madrid.jpg")

<h2 style="background-color:skyblue;font-family:sans-serif;font-size:300%;text-align:center">Table of Contents</h2>

* [1. First Steps](#1)
    * [1.1 Libraries](#1.1)
    * [1.2 Data Exploration](#1.2)
* [2. Data Analysis](#2)
    * [2.1 Price per Room Type](#2.1)
    * [2.2 Room Type Distribution over Districts](#2.2)
    * [2.3 Influence of Reviews on the Price](#2.3)
* [3. Data Preprocessing](#3)    
* [4. Model](#4) 

<a id="1"></a>
<h2 style="background-color:skyblue;font-family:sans-serif;font-size:300%;text-align:center">First Steps</h2>

<a id="1.1"></a>
<h3 style="background-color:skyblue;font-family:sans-serif;font-size:230%;text-align:center">Libraries</h3>

In [None]:
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt

from datetime import datetime
from scipy import stats
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Airbnb listings into the frame that the rest of the notebook works on.
listings = pd.read_csv("/kaggle/input/madrid-airbnb-data/listings.csv")

<a id="1.2"></a>
<h3 style="background-color:skyblue;font-family:sans-serif;font-size:230%;text-align:center">Data Exploration</h3>

In [None]:
# Number of rows and columns in the raw data
listings.shape

In [None]:
 # Preview the first rows
 listings.head()

In [None]:
# Summary statistics of the numeric columns
listings.describe()

In [None]:
# Column dtypes and non-null counts (shows which columns have missing values)
listings.info()

In [None]:
# Are the missing values in reviews_per_month explained by listings that
# have no reviews at all (number_of_reviews == 0)?
#
# `reviews_per_month == None` is an element-wise comparison that is False for
# every row, so it selected nothing and the sum was 0 no matter what the data
# looked like. `isna()` selects the rows that are actually missing.

listings.loc[listings["reviews_per_month"].isna(), "number_of_reviews"].sum()

# Conclusion: a result of 0 means that every listing with a missing
# reviews_per_month has zero reviews, i.e. the answer is yes.

In [None]:
# Are there any duplicates in the data?
# Counts the rows that are identical to an earlier row across all columns.

listings.duplicated().sum()

# Conclusion: No

In [None]:
# Check if the price in the data is a price per night or a price per visit.
# If it were per visit, the mean price should rise with the minimum number of nights.

# Select "price" before aggregating: calling .mean() on the whole grouped frame
# also tries to average the text columns, which raises a TypeError on
# pandas >= 2.0 (and computes means nobody needs on older versions).
price_per_minimum_nights = listings.groupby("minimum_nights")["price"].mean()
price_per_minimum_nights.plot(kind="line", color="indigo")
plt.ylabel("Price")
plt.title("Are Prices per Night or per Visit?")

# Conclusion: The price seems to be per night

In [None]:
# Take a look at the distributions
def distribution_plot(col, boundaries=(0, 100), data_type="numeric", rot=0, data=None):
    """
    Description: Plots the distribution of one feature and shows the figure.

    Numeric features are drawn as a 20-bin histogram restricted to
    `boundaries`; any other `data_type` is drawn as a count plot with one bar
    per category.

    Arguments:
        col: name of the column to plot
        boundaries: (lower, upper) range of the histogram; only used when
            data_type is "numeric"
        data_type: "numeric" for a histogram, any other value for a count plot
        rot: rotation of the x tick labels in degrees
        data: dataframe that holds `col`; defaults to the notebook-level
            `listings` frame, so existing calls keep working unchanged

    Returns:
        None. The figure is rendered with plt.show().
    """
    # Resolve the frame at call time so the default follows the current `listings`.
    frame = listings if data is None else data

    plt.figure(figsize=(4,2))
    if data_type == "numeric":
        frame[col].hist(range=boundaries, bins=20, color="purple", edgecolor="indigo",
                        linewidth=1)
    else:
        # Pass the column as `x` by keyword: newer seaborn versions interpret the
        # first positional argument as `data`, not as the variable to count.
        sns.countplot(x=frame[col], palette="plasma")

    plt.grid(False)
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.title("Distribution of the " + col)
    plt.xticks(rotation=rot)
    plt.show()



# Numeric features together with the value range their histogram should cover.
numeric_feature_ranges = [
    ("price", (0, 500)),
    ("minimum_nights", (0, 35)),
    ("number_of_reviews", (0, 100)),
    ("reviews_per_month", (0, 10)),
    ("calculated_host_listings_count", (0, 20)),
    ("availability_365", (0, 365)),
]
for feature, value_range in numeric_feature_ranges:
    distribution_plot(feature, boundaries=value_range)

# Categorical features are shown as count plots with vertical tick labels.
for feature in ("neighbourhood_group", "room_type"):
    distribution_plot(feature, data_type="not numeric", rot=90)

<a id="2"></a>
<h2 style="background-color:skyblue;font-family:sans-serif;font-size:300%;text-align:center">Data Analysis</h2>

Before preprocessing the data further for modeling, we can already perform some exploratory analysis.

<a id="2.1"></a>
<h3 style="background-color:skyblue;font-family:sans-serif;font-size:230%;text-align:center">Price per Room Type</h3>

In [None]:
# Which price can be charged for which room type?
# Select "price" before aggregating: .mean() on the whole grouped frame also
# tries to average the text columns, which raises a TypeError on pandas >= 2.0.
price_per_room_type = listings.groupby("room_type")["price"].mean()
price_per_room_type.sort_values(ascending=False)

In [None]:
# Bar chart of the mean price of each room type.
bar_colors = ["indigo", "purple", "lightsalmon", "peachpuff"]
fig, ax = plt.subplots(figsize=(8, 5))
price_per_room_type.plot(kind="bar", color=bar_colors, ax=ax)
ax.set_ylabel("mean price")
ax.set_title("Which price can I take for my room type?")

<a id="2.2"></a>
<h3 style="background-color:skyblue;font-family:sans-serif;font-size:230%;text-align:center">Room Type Distribution over Districts</h3>

In [None]:
# Is there a gap in the market for a room_type in any district?

plt.figure(figsize=(8,5))
# x and y are passed by keyword together with `data`: seaborn >= 0.12 accepts
# only `data` positionally, so the former positional call raises a TypeError there.
sns.scatterplot(data=listings, x="longitude", y="latitude", hue="room_type",
                palette="CMRmap")

# The whole city seems to be well covered.
# There are fewer private rooms in the center than in the outer parts.
# Shared rooms and hotel rooms are not very common. This could correspond to a lower demand.

<a id="2.3"></a>
<h3 style="background-color:skyblue;font-family:sans-serif;font-size:230%;text-align:center">Influence of Reviews on the Price</h3>

In [None]:
# Can flats with at least one review charge higher prices?
# Select "price" before aggregating: .mean() on the whole grouped frame also
# tries to average the text columns, which raises a TypeError on pandas >= 2.0.
price_per_number_of_reviews = listings.groupby("number_of_reviews")["price"].mean()
price_per_number_of_reviews.sort_values(ascending=False)

# Answer: Surprisingly, they do not.
# Possible reason 1: Cheaper apartments are booked more often and therefore get more reviews.
# Possible reason 2: Apartments with more reviews are older (older furniture)
#                    and therefore less attractive.
# Possible reason 3: Reviews are not necessarily good. Former guests could have complained.

In [None]:
# Line chart of the mean price against the number of reviews.
fig, ax = plt.subplots(figsize=(8, 5))
price_per_number_of_reviews.plot(kind="line", color="indigo", ax=ax)
ax.set_title("Lead More Reviews to Higher Prices?")
ax.set_ylabel("Price")


<a id="3"></a>
<h2 style="background-color:skyblue;font-family:sans-serif;font-size:300%;text-align:center">Data Preprocessing</h2>

In [None]:
# Create a new column that counts the days since the last review.
# Missing values mean that there has not been a review yet. They are filled with
# the placeholder date 2015-01-01 so that they end up with a high day count.
# NOTE(review): the count is taken relative to datetime.now(), so the values of
# this feature change with the day the notebook is run — consider a fixed
# reference date if results must be reproducible.

listings["last_review"] = listings["last_review"].fillna("2015-01-01")
listings["last_review"] = pd.to_datetime(listings["last_review"])
listings["days_since_last_review"] = (datetime.now() - listings["last_review"]).dt.days

In [None]:
# Distribution of the days_since_last_review
# The right bin represents listings without any review
# NOTE(review): the plotted range (400, 2300) is hard-coded, while the day counts
# are computed relative to the run date — verify that the window still covers
# the data (including the placeholder bin) when the notebook is re-run.

plt.figure(figsize=(8,5))
listings["days_since_last_review"].hist(range=(400,2300), bins=20, color="purple",
                                        edgecolor="indigo", linewidth=1)
plt.grid(False)
plt.xlabel("Days since last Review")
plt.ylabel("Count")
plt.title("Distribution of the Days since the last Review")

In [None]:
# Fill missing values in reviews_per_month with 0 (no reviews per month)
listings["reviews_per_month"] = listings["reviews_per_month"].fillna(0)

In [None]:
# Delete columns we do not need:
listings_slim = listings.drop(["id", "name", "host_name", "last_review", "neighbourhood_group", "neighbourhood"], axis=1)

# With latitude and longitude we still have location information in the data.
# `id` would come out as a relatively important feature in the model. It is dropped
# anyway, because there is no sound reason why an identifier should explain the price.
# NOTE(review): if the data also has a host_id column (not visible here), it is
# not in the drop list and stays in as a numeric feature — confirm that is intended.

In [None]:
# Get rid of outliers: keep only rows whose value lies between the 5% and the
# 95% quantile for every column listed in `cols`.

cols = ["price", "minimum_nights", "calculated_host_listings_count",
        "reviews_per_month", "number_of_reviews"]

# The filter used to be applied to `listings`, which no later step reads, so
# `listings_slim` (the frame that is actually modelled) kept all its outliers.
# The mask is built from the untouched `listings` frame, which shares its index
# and these columns with `listings_slim`; that keeps the bounds independent of
# earlier filtering and makes the cell safe to re-run.
within_bounds = pd.Series(True, index=listings.index)
for col in cols:
    lower_bound = listings[col].quantile(0.05)
    upper_bound = listings[col].quantile(0.95)
    # Inclusive bounds: with strict comparisons every row that sits exactly on a
    # quantile is dropped, which removes all rows at a column's minimum whenever
    # the 5% quantile coincides with that minimum.
    within_bounds &= listings[col].between(lower_bound, upper_bound)

# .copy() gives an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning.
listings_slim = listings_slim.loc[
    within_bounds.reindex(listings_slim.index, fill_value=False)
].copy()

In [None]:
# Plot before transformation
stats.probplot(listings_slim["price"], plot=plt)

# Power Transformer
# Pick the numeric columns through the public select_dtypes API instead of the
# private DataFrame._get_numeric_data(), which may change without notice.
numeric_cols = list(listings_slim.select_dtypes(include="number").columns)
# NOTE(review): the transformer is fitted on all rows before the train/test split,
# and "price" (the later target) is transformed along with the features, so the
# model scores further down are in transformed units.
pt = PowerTransformer(method="yeo-johnson")
listings_slim[numeric_cols] = pt.fit_transform(listings_slim[numeric_cols])

In [None]:
# After transformation: probability plot of the power-transformed price
stats.probplot(listings_slim["price"], plot=plt)

In [None]:
# Restructure categorical data: one-hot encode the remaining non-numeric columns.
# drop_first=True drops the first level of each encoded column.
listings_preprocessed = pd.get_dummies(listings_slim,drop_first=True)
listings_preprocessed.head()

In [None]:
# Check dtypes and non-null counts of the model-ready frame
listings_preprocessed.info()

In [None]:
# Annotated heatmap of the pairwise correlations between all preprocessed columns
plt.figure(figsize=(10,10))
sns.heatmap(listings_preprocessed.corr(), square=True, annot=True)

# Conclusion: Most of the features do not correlate strongly with the price.

<a id="4"></a>
<h2 style="background-color:skyblue;font-family:sans-serif;font-size:300%;text-align:center">Model</h2>

In [None]:
# Separate the response variable ("price") from the explanatory variables
target_name = "price"
y = listings_preprocessed[target_name]
X = listings_preprocessed.drop(columns=[target_name])

In [None]:
# Split into train and test: 20% of the rows are held out, with a fixed seed
# so that the split is the same on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Cross validation to find the best estimator and its most suitable parameters.
# Each estimator is paired (by position) with its own parameter grid and
# searched with 5-fold cross validation on the training data.

# The random forest gets a fixed seed so that the search gives the same result
# on every run, in line with the seeded train/test split.
estimators = [Ridge(), Lasso(), RandomForestRegressor(random_state=42)]

# NOTE(review): Ridge's `tol` only matters for its iterative solvers — confirm
# that the solver picked by default actually uses it, otherwise this grid
# dimension only repeats identical fits.
params = [{"alpha": [0.1, 0.5, 1, 5, 10, 20],
           "tol": [0.1, 0.5, 0.9]},

          {"alpha": [0.1, 0.5, 1, 5, 10, 20],
           "max_iter": [1000, 2000]},

          {"max_depth": [21, 22, 23, 24, 25],
           "min_samples_split": [5, 6, 7, 8, 9, 10]}]

for estimator, param in zip(estimators, params):

    estimators_cv = GridSearchCV(estimator, param_grid=param, cv=5).fit(X_train, y_train)

    # Best refitted estimator, its mean cross-validated score and its parameters
    print(estimators_cv.best_estimator_)
    print(estimators_cv.best_score_)
    print(estimators_cv.best_params_)

In [None]:
# Fit the final random forest and evaluate it on the held-out test data.
# A fixed random_state makes the fitted model, its scores and the feature
# importances reproducible between runs.
rf = RandomForestRegressor(n_estimators=500, max_depth=25, min_samples_split=5,
                           random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

print("Trainscore R^2: {}".format(rf.score(X_train, y_train)))
print("Testscore R^2: {}".format(rf.score(X_test, y_test)))
# The target was power-transformed during preprocessing, so this error is
# expressed in transformed units, not in the original price currency.
rmse = np.sqrt(mean_squared_error(y_test,y_pred))
print("Testdata Root Mean Squared Error: {}".format(rmse))

In [None]:
# Horizontal bar chart of the random forest's feature importances, smallest first.
# Sorting the labelled Series keeps every importance attached to its own feature
# name. The tick labels used to be taken from listings_preprocessed.columns,
# which still contains "price" and therefore does not line up with the columns
# the model was trained on (X.columns).
feature_importance = pd.Series(rf.feature_importances_, index=X.columns).sort_values()
pos = np.arange(len(feature_importance)) + .5
plt.figure(figsize=(8,5))
plt.barh(pos, feature_importance.values, align="center",
         color=["peachpuff", "lightsalmon", "salmon", "palevioletred", "purple", "indigo"])
plt.yticks(pos, feature_importance.index)
plt.title("Feature Importance")