# 1. Dataset Preparation

In [None]:
!pip install catboost

In [None]:
# Dataframe
import numpy as np
import pandas as pd
from math import sqrt

# Preprocessing
import category_encoders as ce

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
from plotly import tools
init_notebook_mode(connected=True)  
import plotly.figure_factory as ff
sns.set_style("whitegrid")

# Outlier Detection
from scipy import stats

# Sklearn utilities
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.inspection import permutation_importance

# Regression model
from catboost import Pool, CatBoostRegressor, cv
from catboost.utils import eval_metric

# Feature Importance
import shap

In [None]:
# Load the Singapore Airbnb listings into a DataFrame
listings_csv = '../input/singapore-airbnb/listings.csv'
df = pd.read_csv(listings_csv)

# 2. Data Understanding

In [None]:
# Preview the first five rows
df.head(n=5)

In [None]:
# Summary statistics, one row per numeric column
df.describe().T

In [None]:
# Print a summary of the DataFrame (columns, dtypes, non-null counts)
df.info()

# 3. Data Preprocessing

## 3.1 Features Removal

In [None]:
# Count missing values per column and show only the columns that have any
null_value_stats = df.isna().sum()
null_value_stats[null_value_stats.ne(0)]

In [None]:
# Remove the columns that are not needed to predict price
df.drop(columns=['name', 'id', 'host_name', 'host_id', 'last_review'], inplace=True)

# Fill missing reviews_per_month values with 0
df.fillna({"reviews_per_month": 0}, inplace=True)

In [None]:
# Show correlation between the numeric features.
# The frame still contains string columns at this point (e.g. room_type), and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so restrict the
# computation to numeric columns explicitly.
plt.figure(figsize=(12, 5))
numeric_df = df.select_dtypes(include="number")
sns.heatmap(numeric_df.corr(), annot=True, fmt="f")
plt.show()

## 3.2 Outlier Detection

In [None]:
# Numeric columns inspected for outliers (one subplot per entry below)
numeric_features = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'calculated_host_listings_count',
    'availability_365',
]

In [None]:
# Plot the distribution of the target (price)
sns.histplot(data=df, x="price")

In [None]:
# Remove outliers from the target value (price): keep only rows whose absolute
# price z-score is below 0.7.
price_abs_z = np.abs(stats.zscore(df["price"]))
# True for rows to discard (anything that is not strictly inside the threshold)
is_outlier = np.asarray(~(price_abs_z < 0.7))
# Translate the boolean mask into index *labels* before dropping:
# DataFrame.drop(index=...) expects labels, not positions, so this stays
# correct even if the frame no longer has a default 0..n-1 index.
df.drop(index=df.index[is_outlier], inplace=True)

In [None]:
# Re-plot the price distribution after the outlier rows were removed
sns.histplot(data=df, x="price")

In [None]:
# Check outliers with one boxplot per numeric feature on a 2x4 grid
plt.figure(figsize=(18, 10))

# Iterate over the feature list itself instead of a hard-coded range so the
# loop cannot drift out of sync with numeric_features.
for position, feature in enumerate(numeric_features, start=1):
    plt.subplot(2, 4, position)
    df.boxplot(column=feature)

In [None]:
# Check outliers with one histogram per numeric feature on a 2x4 grid
plt.figure(figsize=(18, 10))

# Iterate over the feature list itself instead of a hard-coded range so the
# loop cannot drift out of sync with numeric_features.
for position, feature in enumerate(numeric_features, start=1):
    plt.subplot(2, 4, position)
    sns.histplot(df[feature])

In [None]:
# Check latitude outliers using the z-score.
# The mean and standard deviation are computed once (instead of once per row)
# and the comparison is vectorised.
latitudes = df['latitude']
lat_mean = latitudes.mean()
lat_std = latitudes.std(ddof=0)  # population std, same as np.std
lat_zscores = (latitudes - lat_mean) / lat_std

# Only values more than 3 standard deviations ABOVE the mean are flagged.
# NOTE(review): use lat_zscores.abs() if low-side outliers should be caught too.
outliers = latitudes[lat_zscores > 3].tolist()

print(len(outliers))

In [None]:
# Replace the flagged latitude outliers with the median latitude.
# The replacement is limited to the latitude column so that equal values in
# any other column are left untouched.
df_cleaned = df.copy()
is_lat_outlier = df_cleaned['latitude'].isin(outliers)
df_cleaned.loc[is_lat_outlier, 'latitude'] = np.median(df['latitude'])

In [None]:
# Hosts on Airbnb offer stays of at most one year (365 days), so rows with a
# larger minimum_nights are discarded.
# Filter df_cleaned (not df): rebuilding from df would throw away the cleaning
# already applied to df_cleaned. The copy makes the result an independent
# frame so later column assignments do not act on a slice.
df_cleaned = df_cleaned[df_cleaned['minimum_nights'] <= 365].copy()

In [None]:
# List the distinct categories present in room_type
df_cleaned['room_type'].unique()

In [None]:
# Integer code for each room_type category, numbered from 1 in this order
room_dict = {
    room_name: code
    for code, room_name in enumerate(
        ('Entire home/apt', 'Private room', 'Shared room'), start=1
    )
}

# Encode room_type with the integer codes from room_dict.
# assign() returns a new frame, which avoids writing a column into what may be
# a filtered slice of another DataFrame (SettingWithCopyWarning).
# Categories missing from room_dict become NaN.
df_cleaned = df_cleaned.assign(room_type=df_cleaned['room_type'].map(room_dict))

In [None]:
# List the distinct categories present in neighbourhood
df_cleaned['neighbourhood'].unique()

In [None]:
# Binary-encode the neighbourhood column: fit the encoder, then transform
binary = ce.BinaryEncoder(cols=['neighbourhood'])
binary.fit(df_cleaned)
df_cleaned = binary.transform(df_cleaned)

In [None]:
# Preview the first 10 rows of the encoded frame
df_cleaned.head(n=10)

In [None]:
# Specify features and target values by column NAME.
# Positional indices are fragile here: the number of columns the binary
# encoder emits for neighbourhood depends on how many categories it saw, which
# shifts the position of every later column (including price).
target_col = 'price'
# neighbourhood_group is kept as the first feature because it is
# label-encoded afterwards through X[:, 0].
feature_cols = ['neighbourhood_group'] + [
    col for col in df_cleaned.columns
    if col not in (target_col, 'neighbourhood_group')
]
X = df_cleaned[feature_cols].values
# Double brackets keep y two-dimensional (n_rows, 1), like the original slice
y = df_cleaned[[target_col]].values

In [None]:
# List the distinct categories present in neighbourhood_group
df_cleaned['neighbourhood_group'].unique()

In [None]:
# Label-encode the first feature column of X in place
# (presumably neighbourhood_group — verify against the column selection above)
label = LabelEncoder()
label.fit(X[:, 0])
X[:, 0] = label.transform(X[:, 0])

# 4. Modeling

In [None]:
# Split the data into 80% training and 20% testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a CatBoost Pool
train_pool = Pool(data=X_train, label=y_train)
test_pool = Pool(data=X_test, label=y_test)

In [None]:
# Hyperparameters passed to the CatBoost regressor
params = dict(
    iterations=6000,
    learning_rate=0.002,
    random_seed=42,
    logging_level='Silent',
    early_stopping_rounds=500,
)

In [None]:
# Build the CatBoost regressor from the hyperparameters stored in `params`
model = CatBoostRegressor(**params)

In [None]:
# Train the model and plot its training curves.
# NOTE(review): test_pool is passed as the eval set here (params also sets
# early_stopping_rounds) and the same test split is scored in the evaluation
# step, so the reported metrics may be optimistic — consider holding out a
# separate validation split for eval_set.
model.fit(
    train_pool, 
    eval_set=test_pool, 
    verbose=False, 
    plot=True
)

# 5. Evaluation

In [None]:
# Predict on the test split and report the error metrics
preds = model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
# RMSE is the square root of the MSE computed above
rmse = sqrt(mse)

print(f"MAE: {mae}")
print(f"RMSE: {rmse}")
print(f"MSE: {mse}")