## This notebook focuses on testing an xgboost model without resorting to feature selection or dimensionality reduction.

### Importing Libraries

In [14]:
import numpy as np
import pandas as pd
import seaborn as sns
from re import sub
from decimal import Decimal
from matplotlib import pyplot as plt
import datetime
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk import word_tokenize, WordNetLemmatizer
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import Imputer, StandardScaler
from math import sqrt
import tests as t
import re
import nltk
from sklearn.preprocessing import OneHotEncoder
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor
import folium

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import xgboost as xgb
pd.set_option('display.max_columns', 106)

import collections

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sousa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sousa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sousa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Loading data

In [2]:
porto = pd.read_csv(r"C:\Users\sousa\Desktop\github\Airbnb Tale of 2 cities\Data Portugal\porto_listings.csv")
lisbon = pd.read_csv(r"C:\Users\sousa\Desktop\github\Airbnb Tale of 2 cities\Data Portugal\lisbon_listings.csv")

### Defining target

In [3]:
X_porto = porto.drop('price', 1)
y_porto = porto['price']

X_lisbon = lisbon.drop('price', 1)
y_lisbon = lisbon['price']

### Splitting Data

In [4]:
# split our data
X_porto_train, X_porto_test, y_porto_train, y_porto_test = train_test_split(X_porto, y_porto, test_size=0.2)

X_lisbon_train, X_lisbon_test, y_lisbon_train, y_lisbon_test = train_test_split(X_lisbon, y_lisbon, test_size=0.2)

### Scaling data

In [5]:
#scaling the data
sc = StandardScaler()
X_porto_train = sc.fit_transform(X_porto_train)
X_porto_test  = sc.transform(X_porto_test)

X_lisbon_train = sc.fit_transform(X_lisbon_train)
X_lisbon_test  = sc.transform(X_lisbon_test)

### Prediction Model: XGboost

In [16]:
booster = xgb.XGBRegressor()

param_grid = {'n_estimators': [100, 200, 300, 400],
              'learning_rate': [0.01, 0.05, 0.1], 
              'max_depth': [3, 4, 5, 6, 7],
              'colsample_bytree': [0.6, 0.7, 1],
              'gamma': [0.0, 0.1, 0.2]}

booster_grid_search = GridSearchCV(booster, param_grid, cv=3, n_jobs=-1)

In [17]:
# train the tuned random forest porto
booster_grid_search.fit(X_porto_train, y_porto_train)

print(booster_grid_search.best_params_)

{'colsample_bytree': 0.6, 'gamma': 0.0, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 400}


In [18]:
# train the tuned random forest lisbon
booster_grid_search.fit(X_lisbon_train, y_lisbon_train)

print(booster_grid_search.best_params_)

{'colsample_bytree': 0.6, 'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 6, 'n_estimators': 400}


In [9]:
#instantiating the models based on the hyperparameters found by grid search
booster_porto = xgb.XGBRegressor(colsample_bytree=0.6, gamma=0.0, learning_rate=0.1, 
                           max_depth=7, n_estimators=200, random_state=4)

booster_lisbon = xgb.XGBRegressor(colsample_bytree=0.7, gamma=0.2, learning_rate=0.1, 
                           max_depth=7, n_estimators=200, random_state=4)

In [10]:
#training
booster_porto.fit(X_porto_train, y_porto_train)

booster_lisbon.fit(X_lisbon_train, y_lisbon_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0.2,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=7, min_child_weight=1, missing=None, n_estimators=200,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=4,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [11]:
#prediction
y_pred_train_porto = booster_porto.predict(X_porto_train)
y_pred_test_porto = booster_porto.predict(X_porto_test)

y_pred_train_lisbon = booster_lisbon.predict(X_lisbon_train)
y_pred_test_lisbon = booster_lisbon.predict(X_lisbon_test)

In [15]:
#metrics to determine quality of model
RMSE_porto = np.sqrt(mean_squared_error(y_porto_test, y_pred_test_porto))
RMSE_lisbon = np.sqrt(mean_squared_error(y_lisbon_test, y_pred_test_lisbon))

MSE_porto = mean_squared_error(y_porto_test, y_pred_test_porto)
MSE_lisbon = mean_squared_error(y_lisbon_test, y_pred_test_lisbon)

r2_porto = r2_score(y_porto_test, y_pred_test_porto)
r2_lisbon = r2_score(y_lisbon_test, y_pred_test_lisbon)



print(f"RMSE_porto: {round(RMSE_porto, 4)}")
print(f"RMSE_lisbon: {round(RMSE_lisbon, 4)}")

print(f"MSE_porto: {round(MSE_porto, 4)}")
print(f"MSE_lisbon: {round(MSE_lisbon, 4)}")

print(f"r2_porto: {round(r2_porto, 4)}")
print(f"r2_lisbon: {round(r2_lisbon, 4)}")

RMSE_porto: 37.5776
RMSE_lisbon: 43.4376
MSE_porto: 1412.079
MSE_lisbon: 1886.8287
r2_porto: 0.4858
r2_lisbon: 0.6729
