In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import os
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the listings CSV into `df` and preview its first rows.
csv_path = '/kaggle/input/us-airbnb-open-data/AB_US_2020.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# (rows, columns) of the freshly loaded frame.
df.shape

In [None]:
# Per-column share of missing values, expressed as a percentage of all rows.
df.isna().sum() / len(df) * 100

In [None]:
# Remove columns that are not used further in this notebook.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell idempotent: re-running it after the columns are already
# gone no longer raises a KeyError.
df = df.drop(columns=['neighbourhood_group', 'last_review', 'name'], errors='ignore')

In [None]:
# Data type of each remaining column.
df.dtypes

In [None]:
# Names of the columns currently present in the frame.
df.columns

In [None]:
# Replace missing reviews_per_month values with the column mean.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on the selected column is chained in-place assignment, which recent pandas
# versions warn about and which does not modify `df` under Copy-on-Write.
df['reviews_per_month'] = df['reviews_per_month'].fillna(df['reviews_per_month'].mean())

In [None]:
# Shape the frame would have if every row containing a missing value were
# dropped; the result is only displayed, `df` itself is not modified.
df.dropna().shape

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Annotated heatmap of pairwise correlations between the numeric columns.
plt.rcParams['figure.figsize'] = (10, 10)
# `df` still contains text columns at this point; calling df.corr() on them
# raises in recent pandas versions, so restrict to numeric columns explicitly.
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='coolwarm', annot=True)

In [None]:
# Target is the listing price; every other column is kept as a candidate feature.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# Split features and target into train and test parts; the fixed
# random_state makes the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Sort the feature columns into the two groups handled by the preprocessor:
#   - numerical_cols:   int64 / float64 columns
#   - categorical_cols: object / bool columns with fewer than 50 distinct values
# Columns matching neither rule end up in neither list.
numerical_cols = []
categorical_cols = []
for cname in X.columns:
    col_dtype = X[cname].dtype
    if col_dtype in ['int64', 'float64']:
        numerical_cols.append(cname)
    if col_dtype in ['object', 'bool'] and X[cname].nunique() < 50:
        categorical_cols.append(cname)

# Numeric columns: impute missing entries with a constant. No fill_value is
# given, so the imputer's own default is used (presumably 0 for numeric
# data; confirm against the scikit-learn docs).
numerical_transformer = SimpleImputer(strategy='constant')

# Categorical columns: impute with the most frequent value, then one-hot
# encode with handle_unknown='ignore'.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its transformer.
column_transformers = [
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

# Gradient-boosted tree regressor with every hyperparameter spelled out
# explicitly, one per line, so individual settings are easy to locate and tune.
model = XGBRegressor(
    base_score=0.5,
    booster='gbtree',
    colsample_bylevel=1,
    colsample_bynode=1,
    colsample_bytree=0.6,
    gamma=0.0,
    gpu_id=-1,
    importance_type='gain',
    interaction_constraints='',
    learning_rate=0.02,
    max_delta_step=0,
    max_depth=4,
    min_child_weight=0.0,
    n_estimators=1250,
    n_jobs=0,
    num_parallel_tree=1,
    random_state=0,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    subsample=0.8,
    tree_method='exact',
    validate_parameters=1,
    verbosity=None,
)

# Chain preprocessing and the regressor so both are fitted together on the
# training split and applied consistently at prediction time.
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('model', model),
]
clf = Pipeline(steps=pipeline_steps)

# `model__verbose` is forwarded to the fit call of the 'model' step.
clf.fit(X_train, y_train, model__verbose=False)

# Predictions for the held-out test split.
preds = clf.predict(X_test)

# Root-mean-squared error on the held-out test split.
# The square root is taken explicitly rather than via `squared=False`, which
# newer scikit-learn releases deprecate and then remove from
# mean_squared_error; this form works regardless of version.
print('RMSE:', np.sqrt(mean_squared_error(y_test, preds)))

In [None]:
# Predict a price for every row of X and write id/price pairs to
# submission.csv. Note that `preds` is rebound here and that X is the full
# feature frame, i.e. it also contains the rows the pipeline was fitted on.
preds = clf.predict(X)
output = pd.DataFrame({
    'id': X['id'],
    'price': preds,
})
output.to_csv('submission.csv', index=False)