In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Understanding

In [None]:
# Load the listings CSV; 'last_review' is parsed into datetimes while reading.
csv_path = '../input/us-airbnb-open-data/AB_US_2020.csv'
read_options = {'parse_dates': ['last_review'], 'low_memory': False}
airbnb_data = pd.read_csv(csv_path, **read_options)
airbnb_data.head()

In [None]:
# Number of missing values in each column.
airbnb_data.isna().sum()

In [None]:
# Percentage of missing values in 'neighbourhood_group'
# (the per-column NaN counts show that more than half of it is missing).
missing_count = airbnb_data['neighbourhood_group'].isnull().sum()
total_rows = airbnb_data.shape[0]
print('neighbourhood_group column has',100*missing_count/total_rows,'% of NaNs')

In [None]:
# Pair every column name with its number of distinct values.
[(col_name, col_values.nunique()) for col_name, col_values in airbnb_data.items()]

In [None]:
# Display the summary statistics that DataFrame.describe() produces for airbnb_data.
airbnb_data.describe()

In [None]:
# Work on a copy so the freshly loaded airbnb_data frame stays untouched.
df = airbnb_data.copy()

import datetime 
# Replace each 'last_review' date with its ordinal day number
# (number of days counted from 01/01/0001, which is day 1).
df['last_review'] = df['last_review'].map(datetime.datetime.toordinal)
# Per the original author's note, missing dates (NaN/NaT) come out as 1 here, so 1
# acts as a "never reviewed" marker.
# NOTE(review): confirm this still holds on the pandas version in use.
df['last_review']

# NLP analysis

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# English stop words from the NLTK corpus, stored as a set for membership checks.
stop_words = set(stopwords.words('english')) 

In [None]:
# Normalise listing names to lowercase strings.
# Missing names are replaced with '' first: astype(str) alone turns NaN into the
# literal text 'nan', which would later be tokenised and counted as a real word.
# Bracket access is used for the assignment so the column is always the target
# (attribute-style df.name = ... only works while a 'name' column already exists).
df['name'] = df['name'].fillna('').astype(str).str.lower()
df['name']

In [None]:
import re
# Matches every character that is not a lowercase letter or whitespace.
# Written as a raw string: '\s' inside a normal string literal is an invalid escape
# sequence and triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
regex = re.compile(r'[^a-z\s]')
# Remove the non-alphabetical characters from every name.
df['name'] = df['name'].apply(lambda text: regex.sub('', text))

# Tokenise each name and keep only the words that are not stop words and are
# longer than 2 characters (single pass instead of tokenise-then-filter).
df['unigrams'] = df['name'].apply(
    lambda text: [word for word in nltk.word_tokenize(text)
                  if word not in stop_words and len(word) > 2])
df['unigrams']

In [None]:
# Flatten all token lists into one Series and tally each word, most frequent first.
word_totals = pd.Series(np.concatenate(df['unigrams'].tolist())).value_counts()
count = pd.DataFrame({'unigrams': word_totals.index.tolist(),
                      'count': word_totals.values.tolist()})
# Keep the 30 most frequent words.
most_common = count.head(30)
most_common

In [None]:
# Build 'common_names': the first word of each listing name that belongs to the
# most_common vocabulary, or 'Null' when the name contains none of those words.
#
# The vocabulary is materialised once as a set; the previous version rebuilt
# list(most_common.unigrams) for every single word of every name. The 'Null' fallback
# is chosen in the same pass, which removes the row-by-row chained assignment
# df['common_names'][i] = [...] (a SettingWithCopyWarning source that has no effect
# when pandas copy-on-write is enabled).
common_vocabulary = set(most_common['unigrams'])
df['common_names'] = df['name'].apply(
    lambda text: next((word for word in text.split() if word in common_vocabulary),
                      'Null'))
df['common_names'].head()

In [None]:
# The token lists are no longer needed once 'common_names' exists.
df = df.drop(columns='unigrams')

# Remove Outliers

In [None]:
# Continue on a separate copy so df itself is left untouched.
df1 = df.copy()
# Names of the int64/float64 columns used for plotting.
numerical_cols = []
for column_name, column_dtype in df1.dtypes.items():
    if column_dtype in ('int64', 'float64'):
        numerical_cols.append(column_name)
# 'id' is left out of the plots.
numerical_cols.remove('id')

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# Draw one KDE panel per numerical column, three panels per row.
# The grid is sized from len(numerical_cols): a fixed 3x3 grid silently drops every
# column after the ninth and raises IndexError when there are fewer than nine.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, squeeze=False)
fig.set_figheight(17 * n_grid_rows / 3)  # keeps the 17-inch height for three rows
fig.set_figwidth(25)
panels = list(axes.flat)
for ax, col_name in zip(panels, numerical_cols):
    df1[col_name].plot(kind='kde', ax=ax)
    ax.set_title(col_name + ' distribution', fontsize=16, fontweight='bold')
# Hide the panels left over when the column count is not a multiple of three.
for ax in panels[len(numerical_cols):]:
    ax.set_visible(False)

In [None]:
# Columns trimmed one after another; each cut-off is computed on the rows that
# survived the previous cuts.
columns = ['price', 'minimum_nights', 'calculated_host_listings_count',
           'reviews_per_month', 'number_of_reviews', 'last_review']

for col in columns:
    # Keep only the rows strictly below this column's 92nd percentile.
    upper_bound = df1[col].quantile(0.92)
    print('92% of',col, 'values are under ',upper_bound)
    below_cutoff = df1[col].lt(upper_bound)
    df1 = df1.loc[below_cutoff]

# Keep only the rows whose price is present.
df1 = df1.loc[df1['price'].notnull()]

**Now the graphs of price, minimum_nights, calculated_host_listings_count, reviews_per_month, number_of_reviews and last_review look much better!**

In [None]:
# Draw one KDE panel per numerical column, three panels per row.
# The grid is sized from len(numerical_cols): a fixed 3x3 grid silently drops every
# column after the ninth and raises IndexError when there are fewer than nine.
n_grid_cols = 3
n_grid_rows = max(1, -(-len(numerical_cols) // n_grid_cols))  # ceiling division
fig, axes = plt.subplots(nrows=n_grid_rows, ncols=n_grid_cols, squeeze=False)
fig.set_figheight(17 * n_grid_rows / 3)  # keeps the 17-inch height for three rows
fig.set_figwidth(25)
panels = list(axes.flat)
for ax, col_name in zip(panels, numerical_cols):
    df1[col_name].plot(kind='kde', ax=ax)
    ax.set_title(col_name + ' distribution', fontsize=16, fontweight='bold')
# Hide the panels left over when the column count is not a multiple of three.
for ax in panels[len(numerical_cols):]:
    ax.set_visible(False)

In [None]:
# Correlation heatmap of the numeric features.
# Non-numeric columns are excluded explicitly: since pandas 2.0 DataFrame.corr() no
# longer drops them silently and raises on text columns. select_dtypes is used
# rather than corr(numeric_only=True) so the cell also runs on older pandas versions.
corr = df1.select_dtypes(include='number').corr()
plt.figure(figsize=(20,10))
sns.heatmap(corr, annot=True)
plt.show()

# Preprocessing categorical data

In [None]:
# Collect the object-dtype columns with fewer than 40 distinct values.
# ("Cardinality" is the number of unique values in a column.)
categorical_cols = []
for column_name in df1.columns:
    column = df1[column_name]
    if column.dtype == "object" and column.nunique() < 40:
        categorical_cols.append(column_name)
categorical_cols

In [None]:
from sklearn.preprocessing import LabelEncoder

# Make a copy to avoid changing the original data
labeled_df1 = df1.copy()
# Fill NaNs in 'neighbourhood_group' so LabelEncoder only sees strings.
# Plain assignment replaces df1[col].fillna(..., inplace=True): the chained in-place
# form acts on a temporary object when pandas copy-on-write is enabled, leaving the
# NaNs in place and making the encoder fail on mixed float/str values.
df1['neighbourhood_group'] = df1['neighbourhood_group'].fillna('Others')

# Encode each categorical column with its own encoder and keep the fitted encoders,
# so the integer codes can be mapped back to labels with inverse_transform later.
# (A single shared encoder only remembers the classes of the last column it was fit on.)
label_encoders = {}
for col in categorical_cols:
    label_encoder = LabelEncoder()
    labeled_df1[col] = label_encoder.fit_transform(df1[col])
    label_encoders[col] = label_encoder

# Create train and validation sets

In [None]:
# Target is the price; every other column is a candidate feature.
y = labeled_df1['price']
X = labeled_df1.drop(columns=['price'])

from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a validation set.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X, y, train_size=0.8, test_size=0.2, random_state=0)

# Names of the int64/float64 feature columns.
numerical_cols = []
for column_name, column_dtype in X_train_full.dtypes.items():
    if column_dtype in ('int64', 'float64'):
        numerical_cols.append(column_name)
# Restrict both splits to those columns.
X_train = X_train_full[numerical_cols]
X_valid = X_valid_full[numerical_cols]

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer 

# Missing numerical values are replaced by the column mean.
numerical_transformer = SimpleImputer(strategy='mean')

# Apply the imputer to the numerical columns.
preprocessor = ColumnTransformer(
    transformers=[('num', numerical_transformer, numerical_cols)])

# RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score

# Define the model. random_state is fixed (same seed as the train/validation split)
# because a random forest bootstraps rows and samples features at random; unseeded,
# the scores printed below change on every run.
model = RandomForestRegressor(random_state=0)
# Bundle preprocessing, scaling and the model in one pipeline.
my_pipeline = Pipeline(steps=[('preprocessor',preprocessor),
                              ('scaler', MinMaxScaler()),
                              ('model',model)])
# Preprocess the training data and fit the model.
my_pipeline.fit(X_train, y_train)
# Preprocess the validation data and get predictions.
rf_preds = my_pipeline.predict(X_valid)

# Evaluate the model on the validation split.
mse = mean_squared_error(y_valid, rf_preds)
print('MSE:', mse)
rmse = np.sqrt(mse)
print('RMSE:', rmse)
r2 = r2_score(y_valid, rf_preds)
print('R2 Score',r2)

In [None]:
# Predicted vs. actual prices for the random forest, with a fitted trend line.
rf_data = pd.DataFrame({'Predicted Labels': rf_preds,
                        'Actual Labels': y_valid})

axis_columns = {'x': "Actual Labels", 'y': "Predicted Labels"}
ax = sns.scatterplot(data=rf_data, color='green', **axis_columns)
# Overlay the regression line on the same axes.
sns.regplot(data=rf_data, scatter=False, ax=ax, color='blue', **axis_columns)
plt.title('RandomForestRegressor')
plt.show()

# Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression behind the same preprocessing and scaling steps.
model = LinearRegression()
pipeline_steps = [('preprocessor', preprocessor),
                  ('scaler', MinMaxScaler()),
                  ('model', model)]
my_pipeline = Pipeline(steps=pipeline_steps)
# Fit on the training split, then predict the validation split.
lin_preds = my_pipeline.fit(X_train, y_train).predict(X_valid)

# Validation metrics.
mse = mean_squared_error(y_valid, lin_preds)
print("MSE:", mse)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
r2 = r2_score(y_valid, lin_preds)
print('R2 Score',r2)

In [None]:
# Predicted vs. actual prices for linear regression, with a fitted trend line.
lin_data = pd.DataFrame({'Predicted Labels': lin_preds,
                         'Actual Labels': y_valid})

axis_columns = {'x': "Actual Labels", 'y': "Predicted Labels"}
ax = sns.scatterplot(data=lin_data, color='green', **axis_columns)
# Overlay the regression line on the same axes.
sns.regplot(data=lin_data, scatter=False, ax=ax, color='blue', **axis_columns)
plt.title('LinearRegression')
plt.show()

# XGBRegressor

In [None]:
from xgboost import XGBRegressor
# XGBRegressor created without any explicit hyper-parameters.
model = XGBRegressor()
# Same preprocessing and scaling steps, with the booster as the final step.
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', MinMaxScaler()),
    ('model', model),
])
# Fit on the training split.
my_pipeline.fit(X_train, y_train)
# Predict the validation split.
xgb_preds = my_pipeline.predict(X_valid)

# Validation metrics.
mse = mean_squared_error(y_valid, xgb_preds)
print("MSE:", mse)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
r2 = r2_score(y_valid, xgb_preds)
print('R2 Score',r2)

In [None]:
# Predicted vs. actual prices for the default XGBRegressor, with a fitted trend line.
xgb_data = pd.DataFrame({'Predicted Labels': xgb_preds,
                         'Actual Labels': y_valid})

axis_columns = {'x': "Actual Labels", 'y': "Predicted Labels"}
ax = sns.scatterplot(data=xgb_data, color='green', **axis_columns)
# Overlay the regression line on the same axes.
sns.regplot(data=xgb_data, scatter=False, ax=ax, color='blue', **axis_columns)
plt.title('XGBRegressor')
plt.show()

**XGBRegressor with default parameters got the lowest RMSE. Let's find better parameters for it using GridSearchCV and cross-validation.**

In [None]:
# Repeated k-fold cross-validation with a fixed random_state keeps the folds reproducible.
from sklearn.model_selection import RepeatedKFold
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=0)

from sklearn.model_selection import GridSearchCV
# Candidate values for the pipeline's 'model' step.
parameters = {
    'model__n_estimators': [750, 800, 850],
    'model__learning_rate': [0.01, 0.1],
}

# Exhaustive search over the grid, scored by negative mean squared error.
grid = GridSearchCV(estimator=my_pipeline, param_grid=parameters, cv=cv,
                    scoring='neg_mean_squared_error')
grid.fit(X_train, y_train)
print('Best score and parameter combination = ')

print(grid.best_score_)
print(grid.best_params_)
# Predict the validation split with the fitted search object.
xgb_pred = grid.predict(X_valid)

# Validation metrics.
mse = mean_squared_error(y_valid, xgb_pred)
print("MSE:", mse)
rmse = np.sqrt(mse)
print("RMSE:", rmse)
r2 = r2_score(y_valid, xgb_pred)
print('R2 Score',r2)

In [None]:
# Predicted vs. actual prices for the tuned XGBRegressor, with a fitted trend line.
xgb_tuned_data = pd.DataFrame({'Predicted Labels': xgb_pred,
                               'Actual Labels': y_valid})

axis_columns = {'x': "Actual Labels", 'y': "Predicted Labels"}
ax = sns.scatterplot(data=xgb_tuned_data, color='green', **axis_columns)
# Overlay the regression line on the same axes.
sns.regplot(data=xgb_tuned_data, scatter=False, ax=ax, color='blue', **axis_columns)
plt.title('XGBRegressor after tuning')
plt.show()

# Conclusion

**Linear Regression showed the worst result.
The best result came from XGBRegressor with n_estimators = 850 and learning_rate = 0.1,
so we will use this model to save our results.**

In [None]:
# Put predictions next to the true prices; reset_index() turns the validation rows'
# index into a regular 'index' column.
results = pd.DataFrame({'Prediction': xgb_pred,
                        'Real_price': y_valid}).reset_index()
results

In [None]:
# Write the results table to a CSV file, without the DataFrame index.
output_file = 'Price_Predictions.csv'
results.to_csv(output_file, index=False)