# Airbnb Listings

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#Import libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import pyodbc
import time
import seaborn as sns
from scipy import stats
from collections import Counter
import geopy.distance
from IPython import get_ipython
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from yellowbrick.regressor import ResidualsPlot
from sklearn import preprocessing
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Read Data

In [None]:
#Read .csv file
df = pd.read_csv('../input/berlin-airbnb-data/listings_summary.csv', sep = ',')


In [None]:
# Inspect column types and per-column non-null counts.
# `show_counts` is the current name of this option; the older `null_counts`
# spelling was deprecated in pandas 1.2 and removed in pandas 2.0.
df.info(verbose=True, show_counts=True)

# Count duplicates on the primary key (single column)
df['id'].duplicated().sum()
# There are no duplicates on the primary key (listing id)


# Count fully duplicated rows (across all columns)
#df.duplicated().sum()

# Number of nans per column, as a frame of (column name, null count)
nulls = df.isnull().sum().reset_index()

#Or investigating from custom function 
def missing_values_table(df):
    """Summarise the missing values of ``df`` per column.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns the per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, with the
        columns 'Missing Values' (count) and '% of Total Values'
        (percentage of rows, rounded to one decimal), sorted by percentage
        in descending order.
    """
    # Count the nulls once and derive the percentage from that count.
    mis_val = df.isnull().sum()
    # max(..., 1) keeps an empty frame from producing NaN percentages.
    mis_val_percent = 100 * mis_val / max(len(df), 1)

    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table.columns = ['Missing Values', '% of Total Values']

    # Keep only the columns that really have missing values. Filtering on the
    # count (instead of "percentage != 0") stays correct for an empty frame.
    has_missing = mis_val_table['Missing Values'] > 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {mis_val_table_ren_columns.shape[0]}"
        " columns that have missing values."
    )
    return mis_val_table_ren_columns

missing_values_table(df)

In [None]:
missing_values_table(df)

In [None]:
df.info()

In [None]:
#Investigate column values for our analysis and ML model (We may drop some columns after the investigation)
df['price'].value_counts()
df['host_location'].value_counts() #Create new column with 2 values Germany/ No Germany
df['host_response_time'].value_counts() #Many nulls
df['host_response_rate'].value_counts() # #Drop due to imbalanced values, nulls and meaning
df['host_is_superhost'].value_counts()
df['host_has_profile_pic'].value_counts()
df['host_identity_verified'].value_counts()
df['neighbourhood_cleansed'].value_counts()
df['neighbourhood_group_cleansed'].value_counts()
df['market'].value_counts() #Drop due to imbalanced values, obvious meaning
df['property_type'].value_counts() #Drop due to imbalanced and many values
df['bed_type'].value_counts()
df['amenities'].value_counts() #Split, create new columns for main attributes
df['calendar_updated'].value_counts() #No meaning 
df['has_availability'].value_counts() #No meaning
df['is_location_exact'].value_counts() #Drop due to imbalanced values
df['requires_license'].value_counts() #Drop due to imbalanced values
df['instant_bookable'].value_counts()
df['is_business_travel_ready'].value_counts() #Drop due to imbalanced values
df['cancellation_policy'].value_counts() 
df['bed_type'].value_counts() #No meaning for almost 700 values different from real bed
df['require_guest_profile_picture'].value_counts() ##May drop due to imbalanced values, will work with this
df['require_guest_phone_verification'].value_counts() #May drop due to imbalanced values, will work with this
df['experiences_offered'].value_counts() #Contains only None value which is not considered as null


The dropped columns were selected based on the criteria below:<br>
1) They contain special values or much information that is not recorded properly<br>
2) They contain many nulls<br>
3) They have the same meaning as other columns<br>
4) They don't add value to our scope<br>
5) A combination of the above<br>

In [None]:
# Columns removed before modelling. Each label is listed exactly once
# (the earlier version repeated nine of them) and the list lives inside
# brackets, so no backslash line continuations are needed.
columns_to_drop = [
    # Location fields (neighbourhood_cleansed / neighbourhood_group_cleansed are kept)
    'street', 'neighbourhood', 'city', 'state', 'smart_location', 'zipcode',
    'country_code', 'country', 'market', 'is_location_exact',
    # Host fields
    'host_id', 'host_acceptance_rate', 'host_url', 'host_name', 'host_about',
    'host_thumbnail_url', 'host_picture_url', 'host_neighbourhood',
    'host_listings_count', 'host_verifications', 'host_response_rate',
    'host_response_time',
    # URLs and scrape metadata
    'thumbnail_url', 'xl_picture_url', 'medium_url', 'picture_url',
    'listing_url', 'scrape_id', 'last_scraped', 'calendar_last_scraped',
    # Remaining listing fields
    'jurisdiction_names', 'square_feet', 'monthly_price', 'weekly_price',
    'license', 'experiences_offered', 'first_review', 'last_review',
    'calendar_updated', 'has_availability', 'property_type',
    'requires_license', 'is_business_travel_ready', 'bed_type',
]
df.drop(columns_to_drop, axis=1, inplace=True)


- We will keep only neighbourhood_cleansed and neighbourhood_group_cleansed to describe the area of the listing, because the columns street, city, state, neighbourhood and smart_location contain incorrectly entered values. <br>
Many rows contain "Berlin" written in several languages, and values like a plain 'X', etc. <br>
- We drop host_id because the host listings count columns already exist. Monthly and weekly prices would also be good variables for prediction, but the data is insufficient (many nulls).
- We also drop the columns that would only be useful for NLP processing. <br>
We won't use them in this first attempt; they can be investigated in the future.   


In [None]:
# Free-text columns: they would only be useful with NLP processing,
# so they are left out of this first attempt.
text_columns = [
    'name', 'summary', 'space', 'description', 'neighborhood_overview',
    'notes', 'transit', 'access', 'interaction', 'house_rules',
]
df.drop(columns=text_columns, inplace=True)

In [None]:
# Inspect column types again.
# `show_counts` replaces `null_counts`, which was removed in pandas 2.0.
df.info(verbose=True, show_counts=True)

# Convert column types: the listing id is stored as a string
df['id'] = df['id'].astype(str) #df['id'] = df['id'].astype('str') for pandas version >= 1.0.0

In [None]:
#Price columns contain special character (dollar sign) and need to be conveted to float 
#First we investigate if these columns contain nans
df.price.isna().sum()
#0

In [None]:
df.cleaning_fee.isna().sum()
#7280

In [None]:
df.extra_people.isna().sum()
#0

In [None]:
df.security_deposit.isna().sum()
#9624

In [None]:
df['cleaning_fee'].value_counts()

In [None]:
df['security_deposit'].value_counts()

- There's a fair number of nans for cleaning_fee and security_deposit.<br>
We need to decide how to proceed with this information. We have 3 options.

  1. Drop these columns. 
  (We do not want to drop the rows containing nans, of course.)

  2. Replace nans with 0, assuming the host did not fill in this information because the field was not mandatory.
  This solution is preferable because this scenario is likely to be realistic. 
  There are 2233 and 6674 zero values respectively for these 2 columns (cleaning_fee and security_deposit).

  3. Replace nans with the average of the non-nan values.
  We may test this method too, but if scenario 2 is true, replacing with 0 lets us make use of these columns. 

Nan values are replaced first, before the conversion to float: nan is itself a float value,
so it would survive the conversion as nan instead of becoming 0. 


In [None]:
# Replace nans with a zero amount while the columns are still strings.
# The result is assigned back to the frame: calling fillna(inplace=True) on
# `df.cleaning_fee` acts on an intermediate object and is not guaranteed to
# update df in recent pandas versions.
df['cleaning_fee'] = df['cleaning_fee'].fillna('$0.00')
df['security_deposit'] = df['security_deposit'].fillna('$0.00')

# Columns holding amounts as text with a leading dollar sign and, possibly,
# thousands separators
price_columns = ['price', 'cleaning_fee', 'security_deposit', 'extra_people']

for column in price_columns:
    df[column] = (
        df[column]
        .str.lstrip('$')                    # clear the leading dollar sign
        .str.replace(',', '', regex=False)  # clear the commas
        .astype(float)                      # convert to float
    )

# Get a view of min, max etc. in order to check whether the columns contain
# strange values (negatives etc.)
df['price'].describe() #There are zero price listings, we will cover this below
df['cleaning_fee'].describe()

In [None]:
df['security_deposit'].describe()

In [None]:
df['extra_people'].describe()

In [None]:
missing_values_table(df)

There are some nans in the review score columns. The vast majority of these nans (19.8%) correspond to listings with 0 reviews.
With this in mind, we will first drop these columns from our predictions and investigate them after the first attempt.
Predicting price based on reviews that exist only for some listings does not make sense. 

In [None]:
# Review-related columns are set aside for this first modelling attempt
review_columns = [
    'review_scores_value', 'review_scores_checkin', 'review_scores_location',
    'review_scores_communication', 'review_scores_accuracy',
    'review_scores_cleanliness', 'review_scores_rating', 'reviews_per_month',
]
df.drop(columns=review_columns, inplace=True)

Because the majority of hosts are located in Berlin (75%), the 143 missing values for 
host_location will be replaced with DE. Below we will create a new column based on the 'Germany' and 'DE' values. 

In [None]:
# Missing host locations are replaced with 'DE'. The result is assigned back:
# fillna(inplace=True) on a selected column acts on an intermediate object and
# is not guaranteed to update df in recent pandas versions.
df['host_location'] = df['host_location'].fillna('DE')

In [None]:
# Replacements below refer to <100 records for each column.
# Every fill is assigned back to the frame: fillna(inplace=True) on a selected
# column acts on an intermediate object and is not guaranteed to update df in
# recent pandas versions. The repeated host_identity_verified fill was removed.

# host_since
# Replace these 51 records with 01/01/2015; 2015 is the mode, 2014 follows.
df['host_since'] = pd.to_datetime(df['host_since'].fillna('2015-01-01'), format='%Y-%m-%d')

# Remaining columns: column name -> replacement value
fill_values = {
    # 16315 records have value 1
    'host_total_listings_count': 1,
    # 20642 records have value f
    'host_is_superhost': 'f',
    # 24430 records have value t
    # Maybe we can drop this column if we don't see significant price differences
    'host_has_profile_pic': 't',
    # NOTE(review): the earlier comment quoted "15818 records for value t"
    # while the fill value is 'f' -- confirm which value is intended
    'host_identity_verified': 'f',
    # NOTE(review): the earlier count comment here was copied from
    # host_identity_verified; the count for 'flexible' is not recorded
    'cancellation_policy': 'flexible',
}
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

Because of the numeric type of beds, bathrooms, bedrooms combined with the fact that nans are not many,
rows with nans for these columns will be dropped.

In [None]:
# Keep only the rows where beds, bathrooms and bedrooms are all present
df = df.dropna(subset=['beds', 'bathrooms', 'bedrooms'])

## Create new columns

In [None]:
# Create amenity columns
# The amenities column is parsed in order to derive new variables from it.
# Strip the surrounding braces and the double quotes from every value first.
amanities_processed = (
    df['amenities']
    .map(lambda x: x.lstrip('{'))
    .map(lambda x: x.rstrip('}'))
    .str.replace('"','')
)

# Count how often each amenity occurs over all listings
#https://stackoverflow.com/questions/2600191/how-can-i-count-the-occurrences-of-a-list-item
l = ",".join(list(amanities_processed.values))
test = l.split(',')
final = Counter(test)

# There are 175 different amenity elements.
#https://stackoverflow.com/questions/62567406/pandas-check-if-a-substring-exists-in-another-column-then-create-a-new-column-w
# Columns are created for amenities we imagine can affect the price of a
# listing and that are not too common (i.e. wifi).
# Mapping: new column name -> text searched for in the raw amenities value
amenity_columns = {
    'hair_dryer': 'Hair dryer',
    'laptop_workspace': 'Laptop friendly workspace',
    'iron': 'Iron',
    'hot_water': 'Hot water',
    'tv': 'TV',
    'family_kid_friendly': 'Family/kid friendly',
    'refrigerator': 'Refrigerator',
    'cooking_basics': 'Cooking basics',
    'oven': 'Oven',
    'elevator': 'Elevator',
    'free_street_parking': 'Free street parking',
    'smoking': 'Smoking allowed',
}
for new_column, amenity in amenity_columns.items():
    df[new_column] = df['amenities'].map(
        lambda x, amenity=amenity: 'Yes' if amenity in x else 'No'
    )

# The raw amenities column is no longer needed
df.drop(columns=['amenities'], inplace=True)


In [None]:
#Create column which indicates distance from center based on long, lat 

#Get Berlin coordinates
#https://stackoverflow.com/questions/19412462/getting-distance-between-two-points-based-on-latitude-longitude
#https://www.latlong.net/place/berlin-germany-9966.html

def distance_center(row):
    """Return the distance in kilometres between a listing and Berlin's centre.

    ``row`` must provide 'latitude' and 'longitude'; the distance is the
    ``.km`` value of ``geopy.distance.distance`` between that point and the
    fixed Berlin coordinates below.
    """
    centre = (52.520008, 13.404954)
    listing = (row['latitude'], row['longitude'])
    separation = geopy.distance.distance(listing, centre)
    return separation.km
    
    
# Row-wise distance of every listing from the centre
df['distance'] = df.apply(distance_center, axis=1)


In [None]:
df['distance'].describe()

In [None]:
#Create new column for host_location with 2 values in/out of Germany
#Previously, we replaced nans of host location with DE. After searching the values
#we create a new column host_in_germany to inspect if hosts located to other countries differ their listings' prices.
def host_in_germany(row):
    """Return 'Yes' when the row's host_location is in Germany, else 'No'.

    A location counts as German when it contains the word 'Germany', or when
    its last comma-separated component is exactly 'DE' (the value used to
    fill missing locations). A plain substring test for 'DE' would also
    accept any unrelated location that merely contains those two letters.
    """
    location = row['host_location']
    if 'Germany' in location:
        return 'Yes'
    # Only a whole trailing 'DE' component counts as the country code
    last_component = location.rsplit(',', 1)[-1].strip()
    return 'Yes' if last_component == 'DE' else 'No'

# Row-wise flag: is the host located in Germany?
df['host_in_germany'] = df.apply(host_in_germany, axis=1)

In [None]:
df['host_in_germany'].value_counts() # We got few 1885 listings from hosts located outside from Germany

#Drop host_location
df.drop(['host_location'], axis=1, inplace=True)

In [None]:
# Create column host_total_months from today - host_since (in whole months).
# The unit is an average Gregorian month, 365.2425 / 12 = 30.436875 days,
# written as an explicit Timedelta: recent pandas versions reject the
# ambiguous numpy month unit np.timedelta64(1, 'M') in timedelta arithmetic.
# NOTE: the result depends on the day the notebook is run.
average_month = pd.Timedelta(days=30.436875)
df['host_total_months'] = (pd.to_datetime('today') - df['host_since']) / average_month
df['host_total_months'] = df['host_total_months'].astype(int)

# Drop host_since
df.drop(['host_since'], axis=1, inplace=True)

# Data Exploration

In [None]:
#EDA
#We first inspect our dependent variable
df['price'].describe()

In [None]:
#There are records with 0 price value. We will drop these rows
df = df.loc[df['price'] >0]
#6 records deleted

#Create boxplot to inspect values
ax = sns.boxplot( y="price", data=df)

In [None]:
#Limit price to 1000
ax = sns.boxplot( y="price", data=df)
ax.set_ylim([0, 1000]) 

In [None]:
# Interquartile range of price
q1, q3 = df['price'].quantile(0.25), df['price'].quantile(0.75)
iqr = q3 - q1
print(iqr)

# Upper fence: anything priced above it is treated as an outlier
upper_fence = q3 + (1.5 * iqr)
print(upper_fence)


outliers = df.loc[df['price'] > upper_fence]

We got 1641 records considered as outliers. These records may include the more expensive listings.<br>
We are going to delete these rows.

In [None]:
df_without_outliers = df.loc[df['price'] <= q3+(1.5*iqr)]

#Boxplot
ax = sns.boxplot( y="price", data=df_without_outliers)

## Correlation Matrix before removing outliers

In [None]:
# Correlation matrix for the frame that still includes the outliers
df_without_outliers.columns

# Numeric variables compared against price
numeric_features = [
    'host_total_listings_count', 'accommodates', 'bathrooms', 'bedrooms',
    'beds', 'security_deposit', 'cleaning_fee', 'guests_included',
    'extra_people', 'minimum_nights', 'maximum_nights', 'availability_30',
    'availability_60', 'availability_90', 'availability_365',
    'number_of_reviews', 'calculated_host_listings_count', 'distance',
    'host_total_months', 'price',
]
corr_df = df[numeric_features]

# Pearson correlation is the default method of DataFrame.corr
corrMatrix = corr_df.corr()
fig, ax = plt.subplots(figsize=(30, 30))
ax = sns.heatmap(data=corrMatrix, annot=True)
plt.show()

There's not a significant correlation of any variable with price

## Correlation Matrix after removing outliers

In [None]:
# Same numeric variables as before, now taken from the frame without outliers
numeric_features_no_outliers = [
    'host_total_listings_count', 'accommodates', 'bathrooms', 'bedrooms',
    'beds', 'security_deposit', 'cleaning_fee', 'guests_included',
    'extra_people', 'minimum_nights', 'maximum_nights', 'availability_30',
    'availability_60', 'availability_90', 'availability_365',
    'number_of_reviews', 'calculated_host_listings_count', 'distance',
    'host_total_months', 'price',
]
corr_df_without_outliers = df_without_outliers[numeric_features_no_outliers]

# Pearson correlation is the default method of DataFrame.corr
corr_matrix_outliers = corr_df_without_outliers.corr()
fig, ax = plt.subplots(figsize=(30, 30))
ax = sns.heatmap(data=corr_matrix_outliers, annot=True)
plt.show()

We find some notable correlations when all the outliers are excluded. <br>
Accommodates, bedrooms, beds, cleaning_fee and guests_included are positively correlated with price.<br>
Maybe the host listings counts should not be included in our list of independent variables,
because of the negative correlation for the host's total private rooms and the fact 
that all other host listing variables are positively correlated. <br>
This may be due to randomness rather than a pattern. It does not make sense at first sight. 


In [None]:
# Now check the Spearman (rank) correlations.
# The numeric/boolean columns are selected explicitly: recent pandas versions
# no longer drop non-numeric columns silently in DataFrame.corr and raise on
# string columns instead.
spearman_matrix = (
    df_without_outliers
    .select_dtypes(include=['number', 'bool'])
    .corr(method='spearman')
)
spearman_matrix_df = spearman_matrix
spearman_matrix_df

Pearson & spearman results do not differ. 

In [None]:
#Inspect price boxplots for different categorical variables
sns.boxplot(x='host_is_superhost', y='price', data=df_without_outliers)

In [None]:
sns.boxplot(x='host_has_profile_pic', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='host_identity_verified', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='neighbourhood_group_cleansed', y='price', data=df_without_outliers)
#There different price distributions per group


In [None]:
sns.boxplot(x='room_type', y='price', data=df_without_outliers) 
#Entire are clearly more expensive


In [None]:
sns.boxplot(x='instant_bookable', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='cancellation_policy', y='price', data=df_without_outliers)
#Price differs for different policies values


In [None]:
sns.boxplot(x='require_guest_profile_picture', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='require_guest_phone_verification', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='hair_dryer', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='laptop_workspace', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='iron', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='hot_water', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='tv', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='family_kid_friendly', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='refrigerator', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='cooking_basics', y='price', data=df_without_outliers)

In [None]:
sns.boxplot(x='oven', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='elevator', y='price', data=df_without_outliers)


In [None]:
sns.boxplot(x='free_street_parking', y='price', data=df_without_outliers)

In [None]:
sns.boxplot(x='smoking', y='price', data=df_without_outliers)

In [None]:
sns.boxplot(x='host_in_germany', y='price', data=df_without_outliers) 

Our new variable (host_in_germany) does not have an impact on price.

The most remarkable differences in price occur for TV and smoking.
Listings with a TV and smoke-free listings seem to be a bit more expensive.


## Pairplot for variables that seem to be crucial for predictions

In [None]:
columns_pairplot = ['price', 'accommodates', 'guests_included', 'beds']
sns.pairplot(df_without_outliers[columns_pairplot])
plt.show()

# Machine Learning Model


We will follow 2 approaches for predicting the price:
 1.  The first approach contains host info
 2.  The second approach does not contain host info. It will only contain listing data

We try this because we want to inspect whether the Airbnb platform's host data affects the price.<br>
Method 1 predicts using all the available features (host, platform and listing). 


In [None]:
#Select columns as indepentent variables
df_without_outliers.columns

## Method 1

In [None]:
cols = ['host_is_superhost', 'host_total_listings_count',
       'host_has_profile_pic', 'host_identity_verified',
       'neighbourhood_group_cleansed', 'room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'security_deposit', 'cleaning_fee', 'guests_included',
       'extra_people', 'minimum_nights', 'maximum_nights', 'availability_30',
       'availability_60', 'availability_90', 'availability_365',
       'number_of_reviews', 'instant_bookable',
       'cancellation_policy', 'require_guest_profile_picture',
       'require_guest_phone_verification', 'hair_dryer',
       'laptop_workspace', 'iron', 'hot_water', 'tv', 'family_kid_friendly',
       'refrigerator', 'cooking_basics', 'oven', 'elevator',
       'free_street_parking', 'smoking', 'distance', 'host_in_germany',
       'host_total_months']

Remove ltm reviews, long, lat

In [None]:
#Split  categorical and numeical varialbes
#categorical = list(df_without_outliers.select_dtypes(include=['object']).columns)
#numerical = list(df_without_outliers.select_dtypes(include=['float64', 'int64']).columns)

By one-hot encoding a categorical variable, we would induce sparsity into the dataset, which is undesirable. For this reason the categorical columns are label-encoded below instead.

In [None]:
# Work on a copy so the cleaned frame stays untouched
ml_data = df_without_outliers.copy()

# Label-encode every object (string) column; all other columns are left as is
le = preprocessing.LabelEncoder()
for column_name in ml_data.columns:
    if ml_data[column_name].dtype != object:
        continue
    ml_data[column_name] = le.fit_transform(ml_data[column_name])

# Independent variables (X) and dependent variable (y)
X, y = ml_data[cols], ml_data['price']

# Hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
#Grid search
regressor = RandomForestRegressor(random_state = 0)

#n_estimators = The number of trees in the forest.

#min_samples_split specifies the minimum number of samples required to split an internal node,
#while min_samples_leaf specifies the minimum number of samples required to be at a leaf node.

#max_features is the size of the random subsets of features to consider when splitting a node.
#With max_features=1.0 all features are considered at every split (m = p), so no feature subset
#selection is performed and the "random forest" is actually a bagged ensemble of ordinary
#regression trees. This is what the string "auto" used to mean for regressors; "auto" was
#removed in scikit-learn 1.3, so the equivalent fraction 1.0 is used instead.

#Bootstrap is set to True by default

param_grid = {
    "n_estimators": [20, 30, 50],
    "max_features": [1.0, "sqrt", "log2"],
    "min_samples_split": [4, 6, 8],
}

grid = GridSearchCV(regressor, param_grid, n_jobs=-1, cv=5)
grid.fit(X_train, y_train)

#The tuned model is a random forest regressor (the earlier message said "logistic regression")
print("best random forest regressor from grid search: %.3f"
      % grid.score(X_test, y_test))

print(grid.best_params_)

#score = R^2 of the prediction.


In [None]:
# Refit a forest with fixed hyper-parameters and predict the held-out test set
regressor = RandomForestRegressor(random_state=0, n_estimators=50, min_samples_split=8)
regressor.fit(X_train, y_train)

preds = regressor.predict(X_test)

# RMSE is the square root of the mean squared error of those predictions
rf_mse = mean_squared_error(y_test, preds)
rf_rmse = np.sqrt(rf_mse)
print('rf_rmse', rf_rmse)

In [None]:
# 10-fold cross-validation on the training set; the scorer returns the
# negated MSE, so the sign is flipped before taking the square root
scores = cross_val_score(
    estimator=regressor, X=X_train, y=y_train,
    cv=10, scoring='neg_mean_squared_error',
)
rmse_scores = np.sqrt(-scores)

In [None]:
rmse_scores

For every fold our rmse ranges from 16 to 18. 
Furthermore, the most important here is that the rmse is stable without big differences per fold. 


In [None]:
ml_data['price'].describe()

In [None]:
ml_data['price'].std()

We will plot the residuals to see whether our errors are randomly distributed and whether the model suffers from heteroskedasticity. Multicollinearity is explained below as well, although it cannot be read directly from a residual plot.

One of the best explanations out there: 

Multicollinearity:

You sleep well before the same tests you study well for, so you do not know what to attribute the gains for. In order to come to some conclusions, you have to try studying without sleeping, or sleeping without studying.

Heteroskedasticity:

When you study for a test, you consistently get a score between 85 and 95. When you don’t study the results are more variable; your scores are between 60 and 90. <br>

Heteroscedasticity refers to cases where the residuals have a non-constant variance

In [None]:
visualizer = ResidualsPlot(regressor)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show()   

Residual = Observed – Predicted <br>
Positive values for the residual (on the y-axis) mean the prediction was too low, and negative values mean the prediction was too high; 0 means the guess was exactly correct.
The error of our predictions increases when the actual price is between 20 and 60. <br>
The model has made some high predictions for low-priced listings and some low predictions for high-priced listings. <br>
The model has NOT made high predictions for high-priced listings (the max price in the training set was 138). <br>
The dependent variable (price) has only positive values.

In [None]:
# Feature importances of the fitted forest.
# `importance` keeps the full array; the earlier loop reused the same name as
# its loop variable and left it holding only the last feature's value.
importance = regressor.feature_importances_

# feature_name -> feature_importance
feats = dict(zip(X.columns, importance))

# The 'Gini-importance' label is kept as is.
# NOTE(review): for a regressor these are impurity-based importances, not Gini.
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=45)

In [None]:
importances

Room type, distance and accommodates seem to be the most important features.

### Calculate R^2 and Adj. R^2

In [None]:
# R^2 of the test-set predictions
R2 = r2_score(y_test, preds)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number
# of observations the R^2 was computed on and p the number of predictors.
# Both are read from the test matrix instead of being hard-coded.
n_obs, n_predictors = X_test.shape
Adj_r2 = 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_predictors - 1)
print(Adj_r2)

## Method 2 (Let's remove any host and platform information)

In [None]:
cols_method_2 = ['neighbourhood_group_cleansed','room_type', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'hair_dryer','laptop_workspace', 'iron', 'hot_water', 'tv', 'family_kid_friendly',
       'refrigerator', 'cooking_basics', 'oven', 'elevator',
       'free_street_parking', 'distance']

X = ml_data[cols_method_2]
y = ml_data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [None]:
#Grid search
regressor = RandomForestRegressor(random_state = 0)

#n_estimators = The number of trees in the forest.

#min_samples_split specifies the minimum number of samples required to split an internal node,
#while min_samples_leaf specifies the minimum number of samples required to be at a leaf node.

#max_features is the size of the random subsets of features to consider when splitting a node.
#With max_features=1.0 all features are considered at every split (m = p), so no feature subset
#selection is performed and the "random forest" is actually a bagged ensemble of ordinary
#regression trees. This is what the string "auto" used to mean for regressors; "auto" was
#removed in scikit-learn 1.3, so the equivalent fraction 1.0 is used instead.

#Bootstrap is set to True by default

param_grid = {
    "n_estimators": [20, 30, 50],
    "max_features": [1.0, "sqrt", "log2"],
    "min_samples_split": [4, 6, 8],
}

grid = GridSearchCV(regressor, param_grid, n_jobs=-1, cv=5)
grid.fit(X_train, y_train)

#The tuned model is a random forest regressor (the earlier message said "logistic regression")
print("best random forest regressor from grid search: %.3f"
      % grid.score(X_test, y_test))

print(grid.best_params_)

#score = R^2 of the prediction.

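max_features is the size of the random subsets of features to consider when splitting a node.<br>
If max_features="auto" (all features, equivalent to max_features=1.0 for a regressor), no feature subset selection is performed in the trees.<br>
If max_features != 'auto', a random subset of the features is considered at each split, which gives a 'real' random forest.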

In [None]:
# Refit a forest with fixed hyper-parameters and predict the held-out test set
regressor = RandomForestRegressor(max_features = 'sqrt', min_samples_split= 8, n_estimators = 50, random_state = 0)
regressor.fit(X_train, y_train)

preds = regressor.predict(X_test)

# RMSE of the test-set predictions. The label says "test" because preds come
# from X_test; the earlier label 'rf_train_rmse' was misleading.
rf_mse = mean_squared_error(y_test, preds)
rf_rmse = np.sqrt(rf_mse)
print('rf_test_rmse', rf_rmse)

In [None]:
# 10-fold cross-validation on the training set; the scorer returns the
# negated MSE, so the sign is flipped before taking the square root
scores_method_2 = cross_val_score(
    estimator=regressor, X=X_train, y=y_train,
    cv=10, scoring='neg_mean_squared_error',
)
rmse_scores_method_2 = np.sqrt(-scores_method_2)
rmse_scores_method_2

For every fold our rmse ranges from 18 to 20. <br>
Since this is higher than the 16 to 18 of Method 1, host and platform data do, as expected, appear to affect the price of a listing. 

In [None]:
visualizer = ResidualsPlot(regressor)
visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
visualizer.show() 

Our predictions are a bit more skewed to left and right. 

In [None]:
# Feature importances of the fitted forest.
# `importance` keeps the full array; the earlier loop reused the same name as
# its loop variable and left it holding only the last feature's value.
importance = regressor.feature_importances_

# feature_name -> feature_importance
feats = dict(zip(X.columns, importance))

# The 'Gini-importance' label is kept as is.
# NOTE(review): for a regressor these are impurity-based importances, not Gini.
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=45)

In [None]:
importances

Again, room type, distance and accommodates remain the most important features. 
The importance of distance has increased compared with the previous model.

### Calculate R^2 and Adj. R^2

In [None]:
# R^2 of the test-set predictions
R2 = r2_score(y_test, preds)

# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number
# of observations the R^2 was computed on and p the number of predictors.
# Both are read from the test matrix instead of being hard-coded.
n_obs, n_predictors = X_test.shape
Adj_r2 = 1 - (1 - R2) * (n_obs - 1) / (n_obs - n_predictors - 1)
print(Adj_r2)

In [None]:
X_train.shape

## Summary 

- Including host and platform data can help us predict a listing's price. 

- Comparing the statistics of price (mean, std, median, quantiles etc.) with the rmse of our model, we have doubts about the usefulness of the model.<br>

- The listings we deal with are not expensive enough for us to accept our model's accuracy without doubt. Imagine we schedule a 5-day trip and our accommodation cost comes to 5 * 40 (per night) = 200 euro. A fair prediction of our model would be 5 * 57 = 285 euro. 
  How should we react to such an increase?
  If the prediction was instead lower than the actual value by the same amount, how should we react when we only have to pay 5 * 23 = 115 euro? We might have some serious concerns about the quality of service there.<br>
- We can also create acceptable bins (tight ranges) for price and convert this regression into a classification problem


