In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the plotting and modelling libraries.
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

## First look at the data

In [None]:
# Load the listings table; the path is relative to the notebook's working directory.
flat_data = pd.read_csv('../input/madrid-airbnb-data/listings.csv')

In [None]:
# Print the (rows, columns) shape, then preview the first rows.
print(f'train: {flat_data.shape}')
flat_data.head()

In [None]:
# Column names, dtypes and non-null counts.
flat_data.info()

It looks like 'name', 'host_name' and 'host_id' are not useful features for predicting the rent price.
Latitude and longitude look interesting, but I think they carry the same information as neighbourhood.

In [None]:
# Missing values per column: absolute count and share of rows in percent.
is_null_data = flat_data.isnull()
total = is_null_data.sum()
percent = total / is_null_data.count() * 100

# Keep only the columns that actually contain missing values.
missing_data = pd.DataFrame({'Total Missing': total, 'Percent': percent})
missing_data = missing_data.loc[missing_data['Total Missing'] > 0]

missing_data.sort_values(by='Percent', ascending=False)

The number of missing values is small, but I plan to drop the last two columns anyway.

### Statistics and outliers

In [None]:
sns.set()
plt.figure(figsize=(8,5))

# Bar chart of the number of distinct values in each categorical feature.
columns = ['neighbourhood', 'room_type']
count_uniq = [flat_data[column].nunique() for column in columns]

sns.barplot(x=columns, y=count_uniq)
plt.xticks(rotation='vertical');

There are a lot of distinct values for the neighbourhood area.
It may make sense to try Frequency Encoding for it. <br>
Update: Frequency Encoding was removed from the code because I dropped the neighbourhood column.

In [None]:
# Box plot of price for every value of each categorical feature.
for col in columns:
    ax = sns.boxplot(x=flat_data[col], y=flat_data.price)
    ax.set_ylabel("price", fontsize=12)
    ax.set_title(col, fontsize=14)
    plt.xticks(rotation='vertical')
    plt.show()

We have more expensive and cheaper areas, which is expected.
Also, some room types should be more expensive than others: for example, an entire home sounds better than a private room, yet the price for a private room is sometimes higher.

In [None]:
# Number of listings for every value of each categorical feature.
for col in columns:
    # Pass the column by keyword: newer seaborn releases no longer accept the
    # data vector as a positional `x` argument.
    sns.countplot(x=flat_data[col])
    plt.xticks(rotation='vertical')
    plt.title(col, fontsize=14)
    plt.show()

It may make sense to reduce the number of areas.
That could help the Random Forest Regressor model, which may not cope well with so many distinct values.
Update: combining neighbourhoods did not improve the result.

In [None]:
# Scatter plot of every selected numerical feature against price, to spot outliers.
# NOTE(review): the positional slice [2:-1] depends on the CSV column order;
# confirm it selects the intended features.
numerical_columns = flat_data.select_dtypes(exclude=['object']).columns[2:-1]
for col in numerical_columns:
    # x/y must be keywords: seaborn >= 0.12 rejects positional data vectors.
    sns.scatterplot(x=flat_data[col], y=flat_data.price)
    plt.title(col, fontsize=14)
    plt.show()

In [None]:
# Distribution of every selected numerical feature.
# NOTE: seaborn has deprecated distplot in favour of histplot/displot.
for col in numerical_columns:
    ax = sns.distplot(flat_data[col])
    ax.set_title(col, fontsize=14)
    plt.show()

#### Is there a correlation between neighbourhood and price?

In [None]:
# Map of the listings: marker size encodes price, colour encodes neighbourhood.
plt.figure(figsize=(14,10))
# x/y must be keywords: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=flat_data['longitude'], y=flat_data['latitude'],
                size=flat_data['price'],
                hue=flat_data['neighbourhood'], legend=False);

In [None]:
# Map of the listings coloured by price.
plt.figure(figsize=(14,10))
# x/y must be keywords: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=flat_data['longitude'], y=flat_data['latitude'],
                hue=flat_data['price']);

Only a few points have a price higher than average.

In [None]:
# Median price per neighbourhood, ordered from cheapest to most expensive.
flat_grp = (
    flat_data.loc[:, ['neighbourhood', 'price']]
    .groupby('neighbourhood', as_index=False)
    .median()
    .sort_values(by='price')
)

In [None]:
# Median price per neighbourhood, in the ascending order computed for flat_grp.
# x/y must be keywords: seaborn >= 0.12 rejects positional data vectors.
sns.lineplot(x=flat_grp['neighbourhood'], y=flat_grp['price'])
plt.xticks(rotation='vertical');

There is a strong correlation between price and neighbourhood area.

#### Is there a correlation between reviews and price?

In [None]:
# Parse the last_review column into pandas datetimes.
flat_data['last_review'] = pd.to_datetime(flat_data['last_review'])

In [None]:
# Review-related columns plus the target; the last column (price) is the y axis.
rewiev_data = flat_data[['number_of_reviews', 'last_review', 'reviews_per_month', 'price']]

# One line plot of price against each review feature.
for col in rewiev_data.columns[:-1]:
    # x/y must be keywords: seaborn >= 0.12 rejects positional data vectors.
    sns.lineplot(x=flat_data[col], y=flat_data.price)
    plt.title(col, fontsize=14)
    plt.show()

I don't see a strong correlation between price and reviews. 

#### Why is host id an important feature?

In [None]:
#number of flats for each host
numb_host_flats = flat_data['host_id'].value_counts()

In [None]:
# Per-host summary: number of listings and mean price, largest hosts first.
# 'size' counts every row of the group, whereas counting the 'name' column
# would skip listings whose name is missing and so undercount a host's flats.
grp_host = (
    flat_data.groupby('host_id')
    .agg(num_of_flats=('price', 'size'),
         mean_price=('price', 'mean'))
    .sort_values('num_of_flats', ascending=False)
)

grp_host.head()

In [None]:
# Mean price against number of listings per host.
# x/y must be keywords: seaborn >= 0.12 rejects positional data vectors.
sns.scatterplot(x=grp_host['num_of_flats'], y=grp_host['mean_price']);

It looks like the more flats a host has, the lower the price.
So there is an influence.

In [None]:
# Pairwise correlation between the numerical features (including price).
plt.figure(figsize=(10, 6))
# Restrict to numeric/boolean columns explicitly: pandas >= 2.0 no longer drops
# object and datetime columns silently in DataFrame.corr() and raises instead.
numeric_corr = flat_data.select_dtypes(include=['number', 'bool']).corr()
sns.heatmap(numeric_corr, cbar=True, annot=True, square=True, annot_kws={'size': 8},
            cmap=["#4C72B0", "#708EBF", "#9CAFD1", "#C3CDE2", "#EAEAF2"]);

I didn't find a strong linear correlation with the target variable, so I think it is a good idea to try Random Forest Regression.

## Data preprocessing

In [None]:
# Remove columns not directly related to the accommodation, plus neighbourhood,
# since latitude and longitude are kept.
flat_data = flat_data.drop(
    columns=['id', 'host_name', 'name', 'last_review', 'neighbourhood']
)

In [None]:
# Fill missing review values with 0. Note that fillna(0) is applied to the
# whole frame, so a NaN in any other remaining column is replaced by 0 as well.
flat_data = flat_data.fillna(0)

In [None]:
# Search for outliers in a single column using the interquartile-range (IQR) rule.
def outliers_searche(df, factor=1.5):
    """Return the entries of ``df`` that lie outside the IQR fences.

    Parameters
    ----------
    df : pandas.Series
        Numeric values to inspect (the notebook passes a single column).
    factor : float, default 1.5
        Multiple of the interquartile range added above the third quartile and
        subtracted below the first quartile to build the fences.

    Returns
    -------
    pandas.Series
        The subset of ``df`` (original index preserved) strictly below the
        lower fence or strictly above the upper fence.
    """
    q_25, q_75 = np.quantile(df, [0.25, 0.75])
    iqr = q_75 - q_25
    x_min = q_25 - iqr * factor
    x_max = q_75 + iqr * factor
    # Strict comparisons: a value lying exactly on a fence is not an outlier.
    # With inclusive comparisons a column whose IQR is zero (both fences collapse
    # onto the quartile value) would have every one of its rows flagged.
    return df[(df < x_min) | (df > x_max)]

In [None]:
# Drop, in place, every row whose price was flagged by outliers_searche.
flat_data.drop(
    index=outliers_searche(flat_data['price']).index,
    inplace=True,
)

We have only one object column at this step, and it has just 4 distinct values, so we can one-hot encode it with dummies.

In [None]:
# One-hot encode the remaining categorical column(s) of the frame.
flat_data = pd.get_dummies(flat_data)

In [None]:
# Preview the frame after preprocessing.
flat_data.head()

## Model

I chose Random Forest (RF) because it is robust to outliers and doesn't need normally distributed data.

In [None]:
# Target is the price column; every other column is a feature.
y = flat_data['price']
X = flat_data.drop(columns='price')

In [None]:
# Hold out one third of the rows for testing; random_state fixes the shuffle.
xTrain, xTest, yTrain, yTest = train_test_split(X, y, test_size=1/3, random_state=0)

In [None]:
# Base estimator for the hyper-parameter search. The seed makes the bootstrap
# samples and feature sub-sampling repeatable, so Restart & Run All reproduces
# the same forest.
model = RandomForestRegressor(random_state=0)

In [None]:
# Show the estimator's current hyper-parameters.
model.get_params()

In [None]:
# Candidate values for the randomized hyper-parameter search.
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 20)]
# 1.0 means "use all features", which is what 'auto' meant for regressors.
# The 'auto' string was deprecated in scikit-learn 1.1 and removed in 1.3,
# where it makes the fit fail.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 150, num = 11)]
min_samples_split = [2, 5, 10, 20]
min_samples_leaf = [1, 2, 4, 10, 20]
bootstrap = [True, False]

# Mapping of RandomForestRegressor parameter name -> list of values to sample.
parametrs = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Randomized search with 5-fold cross-validation over the candidate values.
# The seed fixes which parameter combinations are sampled, so the selected
# model is the same on every run.
rnd_search_cv = RandomizedSearchCV(model, parametrs, cv=5, random_state=0)

In [None]:
# Run the search on the training split only.
rnd_search_cv.fit(xTrain, yTrain)

In [None]:
# Predict prices for the held-out rows with the best estimator found by the search.
predictions = rnd_search_cv.best_estimator_.predict(xTest)

In [None]:
# Regression metrics on the held-out split.
mae = metrics.mean_absolute_error(yTest, predictions)
mse = metrics.mean_squared_error(yTest, predictions)
rmse = np.sqrt(mse)  # root of the MSE computed above
r2 = metrics.r2_score(yTest, predictions)

print('MAE:', mae)    # Mean Absolute Error
print('MSE:', mse)    # Mean Squared Error
print('RMSE:', rmse)  # Root Mean Squared Error
print('R2:', r2)      # Coefficient of determination

## Feature importance

In [None]:
# Feature importances of the best estimator, labelled with the feature column
# names and sorted from most to least important.
pd.Series(rnd_search_cv.best_estimator_.feature_importances_, index=X.columns).sort_values(ascending=False)