In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Enumerate every file found under the Kaggle input directory, one full path per line.
input_file_paths = (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the listings CSV into the working frame used by the rest of the notebook.
LISTINGS_CSV = '/kaggle/input/us-airbnb-open-data/AB_US_2020.csv'
data = pd.read_csv(LISTINGS_CSV)

# Preview the first five rows (head() defaults to 5).
data.head()

In [None]:
# NOTE: set_index returns a new frame by default and the result is not assigned here,
# so this cell only displays an `id`-indexed view; `data` itself keeps `id` as a regular column.
data.set_index('id')

In [None]:
import folium
from sklearn.cluster import KMeans

# Cluster the listings on their coordinates only.
MapModel = data[['latitude', 'longitude']]

# Fixed seed so the cluster assignment is repeatable between runs.
kmeans = KMeans(n_clusters=15, random_state=42).fit(MapModel)

# Number of listings assigned to each cluster, computed in a single vectorised pass
# instead of re-scanning the full label array once per cluster.
cluster_sizes = np.bincount(kmeans.labels_, minlength=kmeans.n_clusters)

# One circle per cluster centre; the popup reports how many listings belong to it.
cluster_map = folium.Map([41.8781, -87.6298], zoom_start=4)
for (center_lat, center_lon), num in zip(kmeans.cluster_centers_, cluster_sizes):
    folium.CircleMarker([center_lat, center_lon],
                        radius=15,
                        popup=str(num) + ' Listings Associated with this Cluster',
                        fill_color="#3db7e4",
                        ).add_to(cluster_map)
cluster_map

In [None]:
# Numeric Features Distribution Analysis
numeric_features = data.select_dtypes(include=['int64', 'float64']).columns
nominal_features = data.select_dtypes(include=['object'])
# Skip the first numeric column (presumably the `id` identifier - confirm against the CSV header).
numeric_features = numeric_features.delete(0)

# 2x4 grid of KDE plots; figsize is (width, height) in inches.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(25, 15))

# Pair each panel with one feature in row-major order. zip stops at the shorter
# sequence, so fewer than eight numeric columns leaves panels empty instead of
# raising IndexError, and extra columns beyond the eighth are simply not plotted.
for ax, feature in zip(axes.flat, numeric_features):
    data[feature].plot(kind='kde', ax=ax)
    ax.set_title(feature + ' Distribution', fontsize=16, fontweight='bold')

In [None]:
# Removing Outliers !!!
# Keep only listings whose price lies between the 25th and 75th percentile
# (both ends included) and that also satisfy the review / host / stay limits below.
lower_bound = .25
upper_bound = .75
price_low = data['price'].quantile(lower_bound)
price_high = data['price'].quantile(upper_bound)

# `between` is inclusive on both ends by default; the boolean `inclusive=True`
# form is deprecated since pandas 1.3 and rejected by pandas 2.x, so it is omitted.
keep_rows = (
    data['price'].between(price_low, price_high)
    & (data['number_of_reviews'] > 0)
    & (data['number_of_reviews'] < 200)
    & (data['calculated_host_listings_count'] < 10)
    & (data['minimum_nights'] < 10)
    # A comparison with NaN is False, so rows without reviews_per_month are dropped here too.
    & (data['reviews_per_month'] < 5)
)
iqr = data[keep_rows]

In [None]:
# Continue with the filtered rows under the name `data`, as an independent copy,
# then release the temporary `iqr` name.
data = iqr.copy(deep=True)
del iqr

In [None]:
# Columns removed before modelling.
drop_list = ['name', 'neighbourhood_group', 'host_id', 'host_name', 'last_review']
# Pass the labels directly: handing `drop` a sub-DataFrame only worked because iterating
# a frame yields its column names. Reassigning instead of inplace keeps the cell chainable.
data = data.drop(columns=drop_list)

In [None]:
# Display the dtype of every column currently in `data`.
data.dtypes

In [None]:
# Discard every row that still has at least one missing value.
data = data.dropna(axis=0, how='any')

In [None]:
### categorizing data
from sklearn.preprocessing import LabelEncoder

# Text columns that are replaced by integer codes.
to_categorical_list = ['neighbourhood', 'room_type', 'city']

# One fitted encoder per column, kept so the integer codes can be mapped back to the
# original labels later (label_encoders[col].inverse_transform(...)). A single reused
# encoder would only remember the classes of the last column it was fitted on.
label_encoders = {}
for i in to_categorical_list:
    labelencoder = LabelEncoder()
    # fit_transform accepts the string column directly; no prior cast to 'category' is needed.
    data[i] = labelencoder.fit_transform(data[i])
    label_encoders[i] = labelencoder

# Preview the encoded frame without rendering every row.
data.head()

In [None]:
# Target column.
y = data['price']

# Features: every remaining column except the target and the `id` row identifier.
# `id` is still a regular column at this point (the earlier set_index result was never
# assigned), and an arbitrary identifier should not be fed to the model as a predictor.
# errors='ignore' keeps this cell working if `id` has already been moved to the index.
X = data.drop(columns=['price', 'id'], errors='ignore')

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the seed fixes which rows land in each part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter search space for the XGBoost regressor.
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        # subsample is the fraction of rows drawn per tree and must lie in (0, 1];
        # values above 1 are rejected by XGBoost and make those candidates fail to fit.
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5]
        }
xgb = XGBRegressor()
# 5 randomly sampled candidates, each scored with 5-fold cross-validation on the training split.
rs = RandomizedSearchCV(xgb, param_distributions=params, n_iter=5, n_jobs=-1, cv=5, verbose=3, random_state=42)
rs.fit(X_train, y_train)

In [None]:
# Score the best estimator found by the search on both splits.
best_model = rs.best_estimator_
y_pred0 = best_model.predict(X_train)
y_pred = best_model.predict(X_test)

print(rs.best_params_)
# RMSE on the training split first, then on the held-out split.
for y_true, y_hat in ((y_train, y_pred0), (y_test, y_pred)):
    print(np.sqrt(mean_squared_error(y_true, y_hat)))

In [None]:
# The randomized search below is wrapped in a string literal, so it does not execute.
# According to the author, running it produced these best parameters:
# n_estimators=70, min_samples_leaf=4, max_features='log2', bootstrap=False

from sklearn.ensemble import RandomForestRegressor
"""
n_estimators = [int(x) for x in np.arange(start = 10, stop = 100, step = 10)]
max_features = [0.5,'auto', 'sqrt','log2']
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
#First create the base model to tune
m = RandomForestRegressor()
#Fit the random search model
m_random = RandomizedSearchCV(estimator = m, param_distributions = random_grid, n_iter = 15, cv = 5, verbose=2, random_state=42, n_jobs = -1)
m_random.fit(X_train, y_train)
m_random.best_params_
"""

In [None]:
# Random forest with fixed hyperparameters.
m = RandomForestRegressor(
    n_estimators=70,
    min_samples_leaf=4,
    max_features='log2',
    bootstrap=False,
    # max_features='log2' samples features at random for each split; seeding makes the
    # reported score repeatable, matching the seed used elsewhere in the notebook.
    random_state=42,
)
m.fit(X_train, y_train)
y_pred_rf = m.predict(X_test)
# RMSE on the held-out split.
print(np.sqrt(mean_squared_error(y_test, y_pred_rf)))