### Context
Since its inception in 2008, Airbnb has disrupted the traditional hospitality industry as more travellers choose it as their primary means of accommodation. Airbnb offers travellers a more distinctive and personalized accommodation experience.


## Task Details
As of October 2020, this data set contains nearly 226,029 Airbnb listings in the U.S. The purpose of this task is to predict the price of U.S. Airbnb rentals based on the data provided and any external dataset(s) with relevant information.

## Expected Submission
Users should submit a CSV file containing each listing from the data set and its model-predicted price:

id,  price <br>
49091, 83 <br>
50646, 81 <br>
56334, 69 <br>
...



In [None]:
# from zipfile import ZipFile
# import os
# 
# with ZipFile('us-airbnb-open-data.zip') as f :
#     f.extractall(path ='Airbnb-data')
#     
# data_dir = 'Airbnb-data'
# os.listdir(data_dir)

In [None]:
#csv_path = data_dir + '/AB_US_2020.csv'
#csv_path

In [None]:
import warnings

# Silence every warning from here on. This runs before pandas/numpy are
# imported, so warnings raised while importing them are hidden as well.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

# Let displayed frames show up to 200 columns / 200 rows before truncating.
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_rows', 200)

# Read the listings table from the input directory; the bare name on the last
# line makes the notebook render the frame as the cell output.
data = pd.read_csv('../input/us-airbnb-open-data/AB_US_2020.csv')
data

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Default figure size (width, height in inches) for plots created after this point.
plt.rc('figure', figsize=(13, 10))

In [None]:
# Annotated correlation heatmap of the numeric columns.
# Since pandas 2.0, DataFrame.corr() no longer skips non-numeric columns by
# default and raises if the frame contains text columns, so the numeric/bool
# columns are selected explicitly. This matches what older pandas versions
# computed by default and works on every pandas version.
numeric_data = data.select_dtypes(include=['number', 'bool'])

# The trailing semicolon keeps the notebook from echoing the returned Axes object.
sns.heatmap(numeric_data.corr(), annot = True, fmt = ".2f", cmap = 'Blues');

In [None]:
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split as tts
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
import numpy as np


# Target is the 'price' column; every remaining column is a candidate feature.
y = data['price']
X = data.drop(columns='price')

# Hold out 25% of the rows for validation before any preprocessing is fitted.
# The fixed random_state makes the split reproducible.
X_train, X_valid, y_train, y_valid = tts(
    X,
    y,
    test_size=0.25,
    random_state=42,
)


#1. Identify numerical and categorical columns.
# Both lists are derived from the training split only, so the cardinality
# filter below is not influenced by rows that were held out for validation.

# Numeric features: columns stored as int64 or float64.
numerical_cols = [cname for cname in X_train.columns
                  if X_train[cname].dtype in ['int64', 'float64']]

# Categorical features: object/bool columns with fewer than 50 distinct values.
# Object/bool columns with 50 or more distinct values end up in neither list.
# The dtype test comes first so nunique() is only computed for object/bool columns.
categorical_cols = [cname for cname in X_train.columns
                    if X_train[cname].dtype in ['object', 'bool']
                    and X_train[cname].nunique() < 50]


#2. Set up the transformers for numerical & categorical data.
# Pipelines are valuable for cleaning up machine learning code and avoiding errors,
# and are especially useful for workflows with sophisticated data preprocessing.

# Numeric gaps are filled with a constant (SimpleImputer's default fill value
# for numeric data is 0).
numerical_transformer = SimpleImputer(strategy='constant')

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current spelling first and fall back to
# the legacy one so the notebook runs on both old and new releases.
try:
    onehot_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    onehot_encoder = OneHotEncoder(sparse = False, handle_unknown = 'ignore')

# Categorical gaps are filled with the most frequent value, then one-hot
# encoded into a dense array. Categories not seen during fit are ignored
# (handle_unknown='ignore') instead of raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy = 'most_frequent')),
    ('onehot', onehot_encoder)
])


#3. Transforming columns, both numeric & categorical.
# Only the listed columns are passed through; with ColumnTransformer's default
# remainder='drop', every other column is discarded.
preprocessor = ColumnTransformer(transformers = [
    ('num',  numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols)
])


#4. Configure the model and its hyperparameters.
# According to the original author, these values were found with GridSearchCV;
# the search itself is not shown here because it was too lengthy.
xgb_params = dict(
    base_score = 0.5,
    booster = 'gbtree',
    colsample_bylevel = 1,
    colsample_bynode = 1,
    colsample_bytree = 0.6,
    gamma = 0.0,
    gpu_id = -1,
    importance_type = 'gain',
    interaction_constraints = '',
    learning_rate = 0.02,
    max_delta_step = 0,
    max_depth = 4,
    min_child_weight = 0.0,
    n_estimators = 1250,
    n_jobs = 0,
    num_parallel_tree = 1,
    random_state = 0,
    reg_alpha = 0,
    reg_lambda = 1,
    scale_pos_weight = 1,
    subsample = 0.8,
    tree_method = 'exact',
    validate_parameters = 1,
    verbosity = None,
)
model = XGBRegressor(**xgb_params)


#5. Putting it together: preprocessing and model chained in a single pipeline,
# so the transformers are fitted on the training split only.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)
                     ])

# The `model__` prefix routes `verbose=False` to the fit() call of the 'model' step.
clf.fit(X_train, y_train, model__verbose=False)
preds = clf.predict(X_valid)


# RMSE on the validation split, computed as sqrt(MSE). The `squared=False`
# argument of mean_squared_error was deprecated in scikit-learn 1.4 and removed
# in 1.6, whereas taking the square root works on every version.
print('RMSE:', np.sqrt(mean_squared_error(y_valid, preds)))

In [None]:
# Predict a price for every listing in the data set (training and validation
# rows alike), since the submission must contain each listing.
preds = clf.predict(X)

# The expected submission header is `id, price`, so the prediction column is
# named 'price' in lowercase to match that format.
op = pd.DataFrame({'id': X['id'],
                   'price': preds})

# index=False keeps the DataFrame's row index out of the CSV.
op.to_csv('submission.csv', index = False)