In [None]:
import re
import nltk
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly
import plotly.express as px
import ipywidgets as widgets
from plotly.offline import init_notebook_mode
from ipywidgets import interact, interactive, fixed, interact_manual
from datetime import datetime
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from statsmodels.graphics.gofplots import qqplot
from scipy import stats
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation,Dropout
from tensorflow.keras.callbacks import EarlyStopping
pd.set_option('display.max_columns', None)

In [None]:
# Enable plotly's offline rendering inside the notebook.
init_notebook_mode(connected = True)

In [None]:
# Load the raw listings; the path is relative to the notebook's working directory.
data = pd.read_csv('../input/airbnb-price-prediction/train.csv')
data.head()

# Splits

In [None]:
# Hold out 25% of all rows as the validation set; the fixed seed keeps the split reproducible.
df, validation_df  = train_test_split(data,
                                test_size=0.25,
                                random_state = 101)

In [None]:
# Split the remaining rows again: 25% of them become the test set, the rest the training set.
df_train, df_test  = train_test_split(df,
                                test_size=0.25,
                                random_state = 101)

# EDA

In [None]:
# Column dtypes and non-null counts of the training split.
df_train.info()

In [None]:
# Summary statistics of the numeric columns of the training split.
df_train.describe()

In [None]:
# Distribution of the target (log_price) on the training split.
plt.figure(figsize = (15, 8))
sns.distplot(df_train['log_price'])
plt.title('Price distribution')
plt.show()


# Same distribution after standardising: subtract the mean and divide by the standard deviation.
plt.figure(figsize = (15, 8))
sns.distplot((df_train['log_price']-np.mean(df_train['log_price'])) / np.std(df_train['log_price']))
plt.title('Price distribution converted to z')
plt.show()

In [None]:
# Q-Q plot of log_price using statsmodels' qqplot with its default settings.
qqplot(df_train['log_price'])

The log-price distribution looks roughly Gaussian.

In [None]:
# Peek at listings recorded with zero bedrooms.
df_train[df_train['bedrooms'] == 0].head()

In [None]:
# Heatmap of the null mask: one cell per (row, column) entry, coloured by whether it is missing.
plt.figure(figsize=(12,8))
sns.heatmap(df_train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

Columns with a lot of missing data:

In [None]:
# Year of each listing's first review. Dates that are missing or fail to parse
# become NaT (errors='coerce') and are then bucketed as year 0 by fillna(0).
years_of_first_review = pd.DataFrame({
    'year of first review':pd.to_datetime(df_train['first_review'], format='%Y-%m-%d', errors='coerce').dt.year.fillna(0),
    'log_price': df_train['log_price']
})
plt.figure(figsize=(12,4))

# Number of rows per first-review year (including the year-0 bucket).
sns.countplot(x="year of first review", data=years_of_first_review)
plt.title('Row count')
plt.show()

# log_price distribution per first-review year.
plt.figure(figsize=(12,8))
sns.boxplot(data=years_of_first_review,orient='v', x = 'year of first review', y = 'log_price')
plt.title('Years of first review and price')
plt.show()

In [None]:
# Maps a boolean "is missing" flag to a readable label.
na_or_not = lambda x: 'na' if x else 'value'

# Label every training row by whether first_review is missing, next to its log_price.
first_review_error_status= pd.DataFrame({
    'first review error status':df_train['first_review'].isna().apply(na_or_not),
    'log_price': df_train['log_price']
})

# How many rows have / lack a first_review value.
plt.figure(figsize=(12,3))
sns.countplot(y='first review error status', data=first_review_error_status, orient='h')
plt.title('Row count')
plt.show()

# log_price distribution for each of the two groups.
plt.figure(figsize=(12,3))
sns.boxplot(data=first_review_error_status,orient='h', y = 'first review error status', x = 'log_price')
plt.title('Price distribution for na and notna values in first_review column')
plt.show()

In [None]:
# KDE of log_price, one curve per first_review status.
# FacetGrid creates its own figure (sized by height/aspect), so no plt.figure()
# call is made here; a preceding plt.figure() would only render as an extra,
# empty figure in the cell output.
g = sns.FacetGrid(first_review_error_status, hue="first review error status", height = 5, aspect = 2)
g.map(sns.kdeplot, "log_price")
plt.legend()
plt.title('Price distribution for na and notna values in first_review column')
plt.show()

In [None]:
# Year of each listing's last review. Dates that are missing or fail to parse
# become NaT (errors='coerce') and are then bucketed as year 0 by fillna(0).
years_of_last_review = pd.DataFrame({
    'year of last review':pd.to_datetime(df_train['last_review'], format='%Y-%m-%d', errors='coerce').dt.year.fillna(0),
    'log_price': df_train['log_price']
})

# Number of rows per last-review year (including the year-0 bucket).
plt.figure(figsize=(12,4))
sns.countplot(x="year of last review", data=years_of_last_review)
plt.title('Row count')
plt.show()

# log_price distribution per last-review year.
plt.figure(figsize=(12,8))
sns.boxplot(data=years_of_last_review,orient='v', x = 'year of last review', y = 'log_price')
plt.title('Years of last review and price')
plt.show()

In [None]:
# Property types that occur in more than 50 training rows.
property_type_counts = df_train['property_type'].value_counts()
popular_property_types = list(property_type_counts[property_type_counts > 50].index)
popular_property_types

In [None]:
# Frequency of each room_type value in the training split.
df_train['room_type'].value_counts()

In [None]:
# Frequency of each bed_type value in the training split.
df_train['bed_type'].value_counts()

In [None]:
# Cancellation policies that occur in more than 100 training rows.
cancellation_policy_counts = df_train['cancellation_policy'].value_counts()
popular_cancellation_policy = list(cancellation_policy_counts[cancellation_policy_counts > 100].index)
popular_cancellation_policy

In [None]:
# Number of training rows per city.
df_train['city'].value_counts()

In [None]:
# Frequency of each raw host_response_rate value.
df_train['host_response_rate'].value_counts()

In [None]:
# Frequency of each review_scores_rating value.
df_train['review_scores_rating'].value_counts()

In [None]:
# Sample of rows where bathrooms is missing.
df_train[df_train['bathrooms'].isna()].head()

In [None]:
# Frequency of each bathrooms value (value_counts leaves out missing values by default).
df_train['bathrooms'].value_counts()

In [None]:
# Frequency of each raw host_has_profile_pic value.
df_train['host_has_profile_pic'].value_counts()

In [None]:
# Year in which each host joined. Dates that are missing or fail to parse
# become NaT (errors='coerce') and are then bucketed as year 0 by fillna(0).
years_of_host_since = pd.DataFrame({
    'year of host_since':pd.to_datetime(df_train['host_since'], format='%Y-%m-%d', errors='coerce').dt.year.fillna(0),
    'log_price': df_train['log_price']
})

# Number of rows per host_since year (including the year-0 bucket).
plt.figure(figsize=(12,4))
sns.countplot(x="year of host_since", data=years_of_host_since)
plt.title('Row count')
plt.show()

# log_price distribution per host_since year.
plt.figure(figsize=(12,8))
sns.boxplot(data=years_of_host_since,orient='v', x = 'year of host_since', y = 'log_price')
plt.title('Years of host_since and price')
plt.show()

The price distribution of rows with a missing `host_since` value (shown as year 0 above) looks similar to the one for hosts who joined in 2008. The least-bad option in this situation is therefore to fill the missing values with 2008.

In [None]:
#map

def create_map(city):
    """Show an interactive scatter map of a random sample of training listings in one city.

    Points are coloured by ``log_price`` (rounded to two decimals) and drawn
    over USGS imagery tiles.

    Parameters
    ----------
    city : str
        Value of the ``city`` column to plot (e.g. ``'NYC'``).

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # NYC and LA are sampled more sparsely than the other cities.
    fraction = 0.2 if city in {'NYC', 'LA'} else 0.6

    # assign() builds a new frame, so the sampled slice of df_train is never
    # written to (avoids SettingWithCopyWarning on the sample).
    df_temp = (
        df_train[df_train['city'] == city]
        .sample(frac=fraction, random_state=101)
        .assign(log_price=lambda sampled: np.round(sampled['log_price'], 2))
    )

    fig = px.scatter_mapbox(df_temp,
                            lat="latitude",
                            lon="longitude",
                            hover_data=["log_price"],
                            color='log_price',
                            zoom=10)

    fig.update_layout(
            title = f'Airbnb prices in {city}',
            geo_scope='usa',
            width=1000,
            height=600,
            mapbox_style="white-bg",
            mapbox_layers=[{
                 "below": 'traces',
                 "sourcetype": "raster",
                 "sourceattribution": "United States Geological Survey",
                 "source": ["https://basemap.nationalmap.gov/arcgis/rest/services/USGSImageryOnly/MapServer/tile/{z}/{y}/{x}"]
              }]
    )
    #fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    fig.update_geos(fitbounds="locations")
    fig.show()

#city = interact(lambda x: create_map(x), x=['NYC', 'LA', 'SF', 'DC', 'Chicago', 'Boston'])  #widget
create_map('NYC')

# Outliers handling

In [None]:
# Histogram of log_price as a table: '#' is the number of rows in a bin and
# 'border' is the bin's left edge. np.histogram returns one more edge than
# counts, so the final (right-most) edge is left out.
bin_counts, bin_edges = np.histogram(df_train['log_price'], bins = 50)
price_distr = pd.DataFrame({
    # Cast explicitly: the previous `price_distr.astype(...)` call discarded
    # its result, so the counts were never converted to integers.
    '#': bin_counts.astype('int32'),
    'border': bin_edges[:-1]
})
price_distr

In [None]:
# Inspect the extreme tails of log_price (below 2.5 or above 7.5), ordered by price.
df_train[(df_train['log_price'] < 2.5) | (df_train['log_price'] > 7.5)].sort_values(by = 'log_price')

In [None]:
# Remove every training row whose log_price is exactly 0.
zero_price_rows = df_train.index[df_train['log_price'] == 0]
df_train = df_train.drop(index = zero_price_rows)

For modelling purposes it is enough to delete only the records with a log price of 0.

In [None]:
# Training rows with log_price below 2.
df_train[df_train['log_price']< 2]


# Data preparation

In [None]:
# Every distinct amenity name found in the training split. Each raw value is a
# brace-wrapped, comma-separated list with optional double quotes; quotes and
# braces are stripped before splitting. Empty pieces and "translation missing"
# placeholders are skipped.
amenities_set = {
    piece.strip()
    for raw_amenities in df_train['amenities']
    for piece in re.sub(r'(\"|\{|\})', '', raw_amenities).split(',')
    if "translation missing" not in piece and piece
}

amenities_set

In [None]:
def dummification(df):
    """Replace the categorical columns of ``df`` with one-hot dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column listed in ``object_cols``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without the categorical columns, followed by the dummy columns
        produced by ``create_dummy`` for each of them.
    """
    object_cols = ['property_type', 'room_type', 'bed_type', 'cancellation_policy', 'city', 'first_review', 'neighbourhood']

    # Build all dummy blocks first and concatenate once, instead of growing a
    # frame column block by column block starting from an empty DataFrame.
    dummy_df = pd.concat([create_dummy(col, df) for col in object_cols], axis = 1)

    return pd.concat([df.drop(columns = object_cols), dummy_df], axis = 1)
        
def create_dummy(col, df):
    """One-hot encode ``df[col]`` (first category dropped) with prefixed column names.

    Each resulting column is named ``'dum: <col>: <category>'``.
    """
    encoded = pd.get_dummies(df[col], drop_first = True)
    return encoded.rename(columns = lambda name: 'dum: ' + col + ': ' + name)

def dum_col_filling(main_col_val, dum_col_name, main_col_name):
    """Return 1 if the item encoded in ``dum_col_name`` is listed in ``main_col_val``, else 0.

    Parameters
    ----------
    main_col_val : str
        Raw set-like value, e.g. ``'{"Cable TV",Kitchen}'``.
    dum_col_name : str
        Dummy column name of the form ``'<main_col_name>: <item>'``.
    main_col_name : str
        Name of the source column; used to strip the prefix from ``dum_col_name``.

    Notes
    -----
    The raw value is split into individual items (quotes and braces removed,
    comma-separated, whitespace-trimmed) and the item must match one of them
    exactly. A plain substring test on the raw string would report e.g. "TV"
    as present whenever "Cable TV" is listed.
    """
    prefix = main_col_name + ': '
    wanted_item = dum_col_name[len(prefix):] if dum_col_name.startswith(prefix) else dum_col_name

    listed_items = {
        piece.strip()
        for piece in re.sub(r'(\"|\{|\})', '', main_col_val).split(',')
    }
    return 1 if wanted_item in listed_items else 0

def set_to_dummies(df, column_name):
    """Add one 0/1 indicator column per known amenity in front of ``df``'s columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Column holding the raw set-like values; also used as the prefix of
        the indicator column names (``'<column_name>: <item>'``).

    Returns
    -------
    pandas.DataFrame
        A new frame: indicator columns first, then the original columns.
        ``df`` itself is not modified.
    """
    all_values_of_sets = amenities_set
    dummy_columns_name = [column_name + ': ' + dum_col for dum_col in all_values_of_sets if dum_col]

    # Fill each indicator column in one pass over the raw values. Element-wise
    # chained assignment (df[col][i] = ...) is slow and, with pandas
    # copy-on-write, does not write into df at all.
    raw_values = df[column_name]
    dummy_df = pd.DataFrame(
        {
            dum_col_name: [dum_col_filling(value, dum_col_name, column_name) for value in raw_values]
            for dum_col_name in dummy_columns_name
        },
        index=df.index,
        columns=dummy_columns_name,
    )
    return pd.concat([dummy_df, df], axis = 1)

def property_type_proc(val):
    """Keep property types listed in ``popular_property_types``; map everything else to 'other'."""
    return val if val in popular_property_types else 'other'
    
def cancellation_policy_proc(val):
    """Keep policies listed in ``popular_cancellation_policy``; map everything else to 'other'."""
    return val if val in popular_cancellation_policy else 'other'
    
def host_response_rate_proc(val):
    """Convert a percentage string such as '95%' to a float; missing values become -100."""
    if not pd.isna(val):
        return float(val.replace('%', ''))
    return -100

def first_review_poc(val):
    """Bucket a 'YYYY-MM-DD' first-review date into a categorical label.

    Missing values map to 'no data', years before 2014 to ' < 2014', and any
    later year to that year as a string.
    """
    if pd.isna(val):
        return 'no data'

    review_year = datetime.strptime(val,'%Y-%m-%d').year
    if review_year < 2014:
        return ' < 2014'
    return str(int(review_year))
    
def unknown_filling(val):
    """Return 'no data' for a missing value, otherwise the value unchanged."""
    return 'no data' if pd.isna(val) else val

def host_since_proc(val):
    """Return the year of a 'YYYY-MM-DD' date as an int; missing values become 2008."""
    if not pd.isna(val):
        return int(datetime.strptime(val,'%Y-%m-%d').year)
    return 2008

def true_to_1(val):
    """Map the truthy markers 'True', True and 't' to 1; everything else to 0."""
    truthy_markers = {'True', True, 't'}
    return 1 if val in truthy_markers else 0
    
def processing(df):
    """Turn a raw listings frame into the numeric feature frame used for modelling.

    Works on a copy of ``df``. Missing values are filled, categorical and
    boolean-like columns are converted value by value, amenities and the
    categorical columns are expanded into dummy columns, unused columns are
    dropped, and the remaining columns are sorted by name in descending order.
    """
    out = df.copy()

    # Placeholder values for missing entries.
    fill_values = {
        'review_scores_rating': 0,
        'bathrooms': -1,
        'bedrooms': -1,
        'beds': -1,
        'host_has_profile_pic': 'f',
        'host_identity_verified': 'f',
    }
    for column, filler in fill_values.items():
        out[column] = out[column].fillna(filler)

    # Value-by-value conversions; applied after the fills above.
    converters = {
        'property_type': property_type_proc,
        'cancellation_policy': cancellation_policy_proc,
        'host_response_rate': host_response_rate_proc,
        'first_review': first_review_poc,
        'neighbourhood': unknown_filling,
        'host_since': host_since_proc,
        'cleaning_fee': true_to_1,
        'host_has_profile_pic': true_to_1,
        'host_identity_verified': true_to_1,
        'instant_bookable': true_to_1,
    }
    for column, converter in converters.items():
        out[column] = out[column].apply(converter)

    # Amenity indicators first, then one-hot encoding of the categorical columns.
    out = dummification(set_to_dummies(out, 'amenities'))

    unused_columns = ['amenities', 'thumbnail_url', 'description', 'id', 'last_review', 'zipcode', 'name']
    out = out.drop(unused_columns, axis = 1)
    return out.sort_index(ascending=False, axis=1)

#df_d = processing(df.sample(frac=0.01,random_state=101))
df_d = processing(df_train)
df_d.head()

In [None]:
# Rows of the processed training frame that still contain any null value.
df_d[df_d.isnull().any(axis=1)]

# Modelling

In [None]:
#adding missing columns to test \ validation sets and deleting unnecessary
columns_needed = set(df_d.columns)
def columns_standardization(df):
    """Give ``df`` exactly the columns of the processed training frame.

    Columns in ``columns_needed`` that ``df`` lacks are added and filled with
    0; columns not in ``columns_needed`` are dropped. The result is sorted by
    column name in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Processed frame (e.g. a test or validation split).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # One reindex adds the missing columns (fill_value only applies to the
    # newly created ones) and drops the extras, instead of inserting and
    # dropping column by column.
    aligned = df.reindex(columns = list(columns_needed), fill_value = 0)
    return aligned.sort_index(ascending=False, axis=1)

In [None]:
# Run the test split through the same preprocessing, then align its columns with the training frame.
test = columns_standardization(processing(df_test))
test

In [None]:
# Rows of the processed test frame that still contain any null value.
test[test.isnull().any(axis=1)]

In [None]:
def show_metrics(prediction_test, prediction_train, y_test, y_train):
    """Display a one-row table of regression metrics.

    Parameters
    ----------
    prediction_test, prediction_train : array-like
        Model predictions for the test and training targets.
    y_test, y_train : array-like
        True target values.

    The table contains MAE, MSE and RMSE on the test set, RMSE divided by the
    mean target for both sets, and R² (coefficient of determination) for both
    sets.
    """
    # Compute each squared error once and derive the RMSE values from it.
    mse_test = metrics.mean_squared_error(y_test, prediction_test)
    rmse_test = np.sqrt(mse_test)
    rmse_train = np.sqrt(metrics.mean_squared_error(y_train, prediction_train))

    metrics_data = pd.DataFrame([{
        'MAE': round(metrics.mean_absolute_error(y_test, prediction_test), 2),
        'MSE': round(mse_test, 2),
        'RMSE': round(rmse_test, 2),
        'RMSE_ratio_test': round(rmse_test / np.mean(y_test), 3),
        'RMSE_ratio_train': round(rmse_train / np.mean(y_train), 3),
        # The columns are labelled R_2, so use r2_score; explained variance
        # ignores any constant offset in the residuals and is a different metric.
        'R_2_test': round(metrics.r2_score(y_test, prediction_test), 2),
        'R_2_train': round(metrics.r2_score(y_train, prediction_train), 2),
    }])
    display(metrics_data)
           


def analysis(model, X_train, X_test, y_train, y_test):
    """Evaluate a fitted regressor: metrics table, prediction scatter and error histogram.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_train, X_test : array-like
        Feature matrices for the training and test sets.
    y_train, y_test : array-like
        True targets for the training and test sets.
    """
    # Flatten predictions to 1-D. Models that return an (n, 1) array would
    # otherwise broadcast `y_test - prediction_test` into an (n, n) matrix
    # when y_test is a 1-D array.
    prediction_test = np.ravel(model.predict(X_test))
    prediction_train = np.ravel(model.predict(X_train))

    show_metrics(prediction_test, prediction_train, y_test, y_train)

    sns.regplot(x = y_test, y = prediction_test, fit_reg=False)
    plt.title('Prediction and real')
    plt.show()

    sns.distplot(y_test - prediction_test, bins = 50)
    plt.title('Error variance')
    plt.show()

In [None]:
# Ordinary least squares baseline on the processed tabular features.
# The `normalize` argument is not passed: it was deprecated in scikit-learn 1.0
# and removed in 1.2, where passing it raises a TypeError.
# NOTE(review): for a full-rank design, feature normalisation should not change
# an unregularised least-squares fit beyond numerical precision - confirm the
# metrics are unchanged on the scikit-learn version in use.
lm = LinearRegression(
        n_jobs = -1
)

lm.fit(df_d.drop('log_price', axis = 1), df_d['log_price'])

comment = ''

analysis(
    model = lm, 
    X_train = df_d.drop('log_price', axis = 1), 
    X_test = test.drop('log_price', axis = 1), 
    y_train = df_d['log_price'], 
    y_test = test['log_price']
)

In [None]:
# Random forest on the processed tabular features. The training features and
# target are built once and shared by fitting and evaluation.
rfm = RandomForestRegressor(
    n_estimators = 700,
    max_depth = 10,
    random_state = 101,
    n_jobs = -1
)
rfm_train_features = df_d.drop('log_price', axis = 1)
rfm_train_target = df_d['log_price']
rfm.fit(rfm_train_features, rfm_train_target)

comment = ''

analysis(
    model = rfm,
    X_train = rfm_train_features,
    X_test = test.drop('log_price', axis = 1),
    y_train = rfm_train_target,
    y_test = test['log_price']
)

In [None]:
# Gradient boosting with default hyper-parameters (only the seed is fixed).
gbr_train_features = df_d.drop('log_price', axis = 1)
gbr_train_target = df_d['log_price']

gbr_model = GradientBoostingRegressor(random_state = 101)
gbr_model.fit(gbr_train_features, gbr_train_target)

comment = ''

analysis(
    model = gbr_model,
    X_train = gbr_train_features,
    X_test = test.drop('log_price', axis = 1),
    y_train = gbr_train_target,
    y_test = test['log_price']
)

# Dense neural models

In [None]:
#scaling
# Fit the scaler on the training features only and reuse it for the test set.
# Both fit and transform receive plain arrays, so the scaler is not fitted
# with feature names and then applied to unnamed data (which makes recent
# scikit-learn versions warn about missing feature names).
scaler = MinMaxScaler()
X_train_sc = scaler.fit_transform(df_d.drop('log_price', axis = 1).values)
X_test_sc = scaler.transform(test.drop('log_price', axis = 1).values)
y_train = df_d['log_price'].values
y_test = test['log_price'].values

In [None]:
# Stop training once val_loss has not improved for 4 consecutive epochs.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=4)

# One hidden ReLU layer with light dropout and a single linear output.
nn_model1 = Sequential([
    Dense(64, activation = 'relu'),
    Dropout(0.1),
    Dense(1),
])
nn_model1.compile(optimizer='rmsprop', loss='mse')

# NOTE(review): the test split is also the early-stopping validation data, so
# the test metrics reported below are not fully independent of training.
nn_model1.fit(
    x = X_train_sc,
    y = y_train,
    validation_data=(X_test_sc, y_test),
    epochs = 100,
    batch_size = 128,
    callbacks=[es]
)

# Loss curves recorded during training.
pd.DataFrame(nn_model1.history.history).plot()
plt.show()

analysis(
    model = nn_model1,
    X_train = X_train_sc,
    X_test = X_test_sc,
    y_train = y_train,
    y_test = y_test
)

# Using text data from 'description' column

In [None]:
# Raw listing descriptions of the training split.
df_train['description']

In [None]:
def del_punct(text):
    """Return ``text`` with every character found in ``string.punctuation`` removed."""
    return ''.join(symbol for symbol in text if symbol not in string.punctuation)

def del_stopwords(text):
    """Split ``text`` on whitespace and return the words that are not English stop words.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        Words in their original order and casing; the stop-word comparison is
        case-insensitive.
    """
    # Load the stop-word list once per call and keep it in a set, instead of
    # reloading the list and scanning it linearly for every single word.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in text.split() if word.lower() not in english_stopwords]

def text_preparation(text):
    """Normalise a description and return its list of tokens.

    The text is lower-cased, whitespace runs collapse to a single space, digit
    runs are replaced by the placeholder 'somenumbers', punctuation is removed
    and English stop words are dropped.
    """
    normalised = re.sub(r'\s+', ' ', text.lower())
    normalised = re.sub(r'\d+', 'somenumbers', normalised)
    return del_stopwords(del_punct(normalised))


text_preparation('Enjoy a beautiful 67 contemporary residence with')

In [None]:
# Bag-of-words over the training descriptions: text_preparation is used as the
# analyzer and max_features is set to 1500.
bow_transformer = CountVectorizer(analyzer = text_preparation, max_features = 1500).fit(df_train['description'])
bow = bow_transformer.transform(df_train['description'])
bow.shape

In [None]:
# Fit the TF-IDF weights on the training bag-of-words only.
tfidf_transformer = TfidfTransformer().fit(bow)
train_tfidf = tfidf_transformer.transform(bow)

# Apply the already-fitted vocabulary and weights to the test descriptions.
bow_test = bow_transformer.transform(df_test['description'])
test_tfidf = tfidf_transformer.transform(bow_test)

In [None]:
# Random forest that predicts log_price from the TF-IDF description features only.
text_train_target = df_train['log_price']

rfm_text = RandomForestRegressor(
    n_estimators = 500,
    max_depth = 10,
    random_state = 101,
    n_jobs = -1
)
rfm_text.fit(train_tfidf, text_train_target)

comment = ''

analysis(
    model = rfm_text,
    X_train = train_tfidf,
    X_test = test_tfidf,
    y_train = text_train_target,
    y_test = df_test['log_price']
)

We can see that the listing description carries little information that can be used for price prediction, but we can still try to use it in the general model. As an experiment, we can build a stack of two models: the prediction of the text model is added as one more feature to the model built on all the other available features.

In [None]:
class StackRegression:
    """Two-stage regressor: a secondary model's predictions become one extra feature of a general model."""

    def __init__(self, model_general, model_secondary):
        """Store the two estimators; both must expose ``fit`` and ``predict``."""
        self.model_general = model_general
        self.model_secondary = model_secondary

    def fit_predict(self, X_train_general, X_train_secondary, y_train, X_test_general, X_test_secondary, y_test):
        """Fit both stages on the training data and predict for the test data.

        Parameters
        ----------
        X_train_general, X_test_general : pandas.DataFrame
            Features of the general model; left unmodified.
        X_train_secondary, X_test_secondary : array-like
            Features of the secondary model.
        y_train : array-like
            Training target used by both models.
        y_test : array-like
            Not used; kept in the signature for existing callers.

        Returns
        -------
        array-like
            Predictions of the general model for the test rows.

        Notes
        -----
        The extended feature frames are kept on the instance as
        ``X_train_full`` and ``X_test_full``.
        """
        self.model_secondary.fit(X_train_secondary, y_train)

        self.secondary_predictions_test = self.model_secondary.predict(X_test_secondary)
        # NOTE(review): these are in-sample predictions of the secondary model;
        # out-of-fold predictions would avoid leaking the training target into
        # the stacked feature.
        self.secondary_predictions_train = self.model_secondary.predict(X_train_secondary)

        # DataFrame.assign returns a NEW frame. The result must be kept,
        # otherwise the general model never sees the secondary predictions.
        self.X_train_full = X_train_general.assign(secondary_model_predictions = self.secondary_predictions_train)
        self.X_test_full = X_test_general.assign(secondary_model_predictions = self.secondary_predictions_test)

        self.model_general.fit(self.X_train_full, y_train)

        return self.model_general.predict(self.X_test_full)

In [None]:
# General model on the tabular features and secondary model on the text features.
model_general = RandomForestRegressor(
          max_depth = 10,
          n_jobs = -1, 
          random_state = 101,
          n_estimators = 700
    
)

model_for_text = RandomForestRegressor(
          max_depth = 10,
          n_jobs = -1, 
          random_state = 101,
          n_estimators = 700
    
)

stack_model = StackRegression(model_general, model_for_text)


prediction_test = stack_model.fit_predict(
    X_train_general = df_d.drop('log_price', axis = 1), 
    X_train_secondary = train_tfidf, 
    y_train = df_d['log_price'], 
    X_test_general = test.drop('log_price', axis = 1), 
    X_test_secondary = test_tfidf, 
    y_test = test['log_price']
)

# Training-set predictions come from the already-fitted general model and the
# training frame kept on the stack by fit_predict. Calling fit_predict a second
# time would refit both 700-tree forests only to reproduce the same models.
prediction_train = stack_model.model_general.predict(stack_model.X_train_full)

show_metrics(prediction_test, prediction_train, y_test = test['log_price'], y_train = df_d['log_price'])


sns.regplot(x = test['log_price'], y = prediction_test, fit_reg=False)
plt.title('Prediction and real')
plt.show()

sns.distplot(test['log_price'] - prediction_test, bins = 50)
plt.title('Error variance')
plt.show()

This stacked model shows no advantage over the simpler models, so for the final submission I prefer to use the plain random forest regressor (RFR).

# Validation

In [None]:
# Run the validation split through the same preprocessing, then align its columns with the training frame.
val_data_for_model = columns_standardization(processing(validation_df))
val_data_for_model

In [None]:
# Rows of the processed validation frame that still contain any null value.
val_data_for_model[val_data_for_model.isnull().any(axis=1)]

In [None]:
# Evaluate the already-fitted random forest on the validation split.
validation_features = val_data_for_model.drop('log_price', axis = 1)
validation_target = val_data_for_model['log_price']

analysis(
    model = rfm,
    X_train = df_d.drop('log_price', axis = 1),
    X_test = validation_features,
    y_train = df_d['log_price'],
    y_test = validation_target
)