**Task:**   
Predict the price of U.S. AirBnB rentals.     
Evaluate the solution by a reproducible, low root-mean-squared error (RMSE) based on cross-validation.   

**Data:**   
As of October 2020, the dataset has 226030 rows and 17 columns of Airbnb listings in the U.S.   
The dataset includes NaNs, and data is of mixed types.   

The average price is about $219.72.   
The minimum-nights requirement averages between 4 and 5 nights.  
There are 4 categories of room type: Entire home/apt, Private room, Shared room, and Hotel room. 

There is minimal correlation between price and other columns in the dataset.   

Added a column (state). Grouped prices into ranges to improve model performance.   

**Models:**   
Linear and multiple linear regressions did not perform well for this dataset.   
KNN regression performed better and so did decision tree.   
KNN classifier performed best.   







Expected Submission   
Users should submit a CSV file with each listing from the data set and the model-predicted price:   

id, price   
49091, 83   
50646, 81   
56334, 69   
...   


In [None]:
# Python 3 environment with analytics libraries installed
# as defined by the kaggle/python Docker 

import numpy as np 
import pandas as pd 

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [None]:
#other libraries for math and plotting
import seaborn as sns

import statsmodels.api as sm

import matplotlib
%matplotlib inline

import matplotlib.pyplot as plt

#map plotting libraries
import folium
from folium import plugins
from folium.plugins import HeatMap

#ML models
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn import metrics

from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsRegressor

from sklearn.linear_model import LassoCV, LassoLarsCV
from sklearn.model_selection import cross_val_score

#ignore error messages
import sys
import warnings

# Silence warnings (not errors) unless the interpreter was started with
# explicit warning options.
if len(sys.warnoptions) == 0:
    warnings.simplefilter("ignore")
    

In [None]:
# Load the raw listings file into df1 and display it
DATA_PATH = "../input/us-airbnb-open-data/AB_US_2020.csv"
df1 = pd.read_csv(DATA_PATH)
df1

In [None]:
#how many NaNs in the dataset?
df1.isnull().sum().sum()

In [None]:
df1.name.isnull().sum().sum()

In [None]:
df1.host_name.isnull().sum().sum()

In [None]:
df1.reviews_per_month.isnull().sum().sum()

In [None]:
##replace the NaNs in some columns
df1[['name', 'host_name']] = df1[['name', 'host_name']].fillna(value='None')

In [None]:
#replace NaNs with 0s
df1[['reviews_per_month']] = df1[['reviews_per_month']].fillna(value=0)

In [None]:
#over 100,000 entries are NaN in this column
df1.neighbourhood_group.isnull().sum().sum()

In [None]:
#missing dates
df1.last_review.isnull().sum().sum()

In [None]:
#dropped 2 columns with several NaNs
df2=df1.drop(['neighbourhood_group', 'last_review'], axis=1)

Added a new column: State

In [None]:
# Derive a 'state' column from the 'city' column.
city_col = df2['city']

# One boolean mask per state, in the same order as `values` below
conditions = [
    (city_col == 'New York City'),
    (city_col == 'Columbus'),
    (city_col == 'Hawaii'),
    (city_col == 'Asheville'),
    (city_col == 'Jersey City'),

    (city_col == 'Washington D.C.'),
    (city_col == 'Clark County'),
    (city_col == 'Rhode Island'),
    (city_col == 'Portland'),
    (city_col == 'Austin'),

    (city_col == 'Broward County'),
    (city_col == 'Seattle'),
    (city_col == 'Twin Cities MSA'),
    (city_col == 'New Orleans'),

    (city_col == 'Chicago'),
    (city_col == 'Nashville'),
    (city_col == 'Denver'),

    # NOTE(review): Salem is grouped with the Massachusetts cities -- confirm
    # the listings are not for Salem, OR.
    (city_col == 'Cambridge') | (city_col == 'Boston') | (city_col == 'Salem'),

    # NOTE(review): 'San Clara Country' presumably mirrors the spelling used in
    # the data file -- verify against df2['city'].unique() before changing it.
    (city_col == 'Los Angeles') | (city_col == 'Oakland') | (city_col == 'San Diego') |
    (city_col == 'San Francisco') | (city_col == 'Santa Cruz County') |
    (city_col == 'Pacific Grove') | (city_col == 'San Clara Country') | (city_col == 'San Mateo County')
    ]

# State code assigned for each condition, position by position
values = ['NY', 'OH', 'HI', 'NC', 'NJ',
          'DC', 'NV', 'RI',
          'OR', 'TX', 'FL','WA',
          'MN', 'LA', 'IL', 'TN',
         'CO', 'MA', 'CA']

# np.select uses `default` for rows that match no condition. Its built-in
# default is the integer 0, which recent NumPy releases refuse to combine with
# string choices (TypeError). An explicit string default keeps the column
# purely textual; '0' is what older releases ended up storing for such rows.
df2['state'] = np.select(conditions, values, default='0')

# display updated DataFrame
df2.head()

In [None]:
#name is an intro to the property
df2.name.unique()

In [None]:
df2.room_type.value_counts()

In [None]:
df2.describe(include='all')

In [None]:
# Bar chart of the number of listings per state
st_count = df2['state'].value_counts()
sns.set(style="darkgrid")
# x and y are passed by keyword: current seaborn releases no longer accept
# the data vectors as positional arguments.
sns.barplot(x=st_count.index, y=st_count.values, alpha=0.9)
plt.title('Frequency of States')
plt.ylabel('Occurrences', fontsize=10)
plt.xlabel('State', fontsize=12)
plt.show()

In [None]:
#find possible outliers
#boxplot of columns
boxplot = df2.boxplot(figsize=(8,7),rot=45)

In [None]:
#drop column 'host_id'
df3=df2.drop(['host_id'], axis=1)

In [None]:
# Correlation matrix of the numeric columns, shown as a heat map.
# The text columns are excluded explicitly: recent pandas releases raise on
# non-numeric data in corr() instead of silently skipping it.
corr1 = df3.select_dtypes(include=[np.number]).corr()
sns.heatmap(corr1, cmap="YlOrBr")

In [None]:
df3['room_type'].value_counts().plot(kind='barh', figsize=(6,4), 
                                     edgecolor=(0,0,0),color='tan', title='Room Type')

In [None]:
df3['state'].value_counts().plot(kind='barh', figsize=(6,6), 
                                     edgecolor=(0,0,0),color='lightblue', title='State')

In [None]:
# availability_365 plotted against price, one '+' marker per listing.
# The axis labels name the columns that are actually plotted.
df3.plot(x='price', y='availability_365', style='+', color='salmon')
plt.xlabel('Price')
plt.ylabel('Availability_365')
plt.show()

In [None]:
##map
# Base map centred on North America
US = folium.Map(location=[54.5260, -105.2551],
                   zoom_start = 3)


# [latitude, longitude] pairs for every listing, built in one vectorised step.
# Iterating with iterrows() creates a Series per row and is far slower.
heat_data = df3[['latitude', 'longitude']].values.tolist()

# Plot it on the map
HeatMap(heat_data).add_to(US)

# Display the map
US

In [None]:
df3.plot(kind='density', subplots=True, layout=(14,1), sharex=False, figsize=(10,10))
plt.show()

In [None]:
sns.lmplot(x='price', y='id', data=df3, logistic=False,  markers=["^"])

In [None]:
#drop outliers: keep only listings whose minimum_nights is below 31
# .copy() gives df3 its own data, so the columns added to it later are written
# to an independent frame rather than to a slice of the unfiltered one.
df3 = df3[df3['minimum_nights'] < 31].copy()
#verify changes
df3.shape

### Models and predictions
-**Price** is the desired prediction.

1. Linear Regression

In [None]:
#Linear regression
# Price is the quantity to predict, so it is the target (y);
# availability_365 is the single explanatory feature (X).
# Both are reshaped to column vectors.
X = df3['availability_365'].values.reshape(-1,1)
y = df3['price'].values.reshape(-1,1)

In [None]:
#split the data: 40% is held out for testing, with a fixed seed
split_parts = train_test_split(X, y, random_state=0, test_size=0.4)
X_train, X_test, y_train, y_test = split_parts

#train the model on the training portion
regressor = LinearRegression()
regressor.fit(X_train, y_train)

In [None]:
#Intercept
print(regressor.intercept_)
#Slope
print(regressor.coef_)

In [None]:
#Predict
y_pred = regressor.predict(X_test)

In [None]:
#actual value and predicted value
dfLinReg = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
dfLinReg

In [None]:
plt.scatter(X_test, y_test,  color='purple')
plt.plot(X_test, y_pred, color='yellow', linewidth=2)
plt.show()

In [None]:
#comparison of the first 20 actual and predicted values as a bar chart
first20preds = dfLinReg.head(20)
c = ('darkgreen', 'steelblue')
first20preds.plot(kind='bar', color=c, figsize=(9,6))
grid_specs = (('major', '-', '0.3', 'green'),
              ('minor', ':', '0.5', 'black'))
for grid_which, grid_style, grid_width, grid_color in grid_specs:
    plt.grid(which=grid_which, linestyle=grid_style, linewidth=grid_width, color=grid_color)
plt.show()

In [None]:
# Error metrics of the linear regression on the test split
lin_mae = metrics.mean_absolute_error(y_test, y_pred)
lin_mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', lin_mae)
print('Mean Squared Error:', lin_mse)
print('Root Mean Squared Error:', np.sqrt(lin_mse))

The above linear regression model is not ideal.   
Changes to test size and random state did not impact the results.

2- Multiple Regression

In [None]:
#Convert to Numeric
# One LabelEncoder instance is re-fitted for each text column in turn; the
# resulting integer codes are stored in a new "<column>_Cat" column.
labelencoder = LabelEncoder()
for text_col in ('room_type', 'city', 'state'):
    df3[text_col + '_Cat'] = labelencoder.fit_transform(df3[text_col])
df3.head()

In [None]:
#multiple regression

# Each feature is listed once: a repeated column is perfectly collinear with
# its twin, which makes the individual coefficients uninterpretable.
X_ = df3[['calculated_host_listings_count', 'room_type_Cat',
          'city_Cat', 'state_Cat']] # multiple variable regression.
Y = df3['price']

# with sklearn
regr = LinearRegression()
regr.fit(X_, Y)

print('Intercept: ', regr.intercept_)
print('Coefficients: ', regr.coef_)

print (regr)

In [None]:
y_pred2 = regr.predict(X_)

In [None]:
dfmult= pd.DataFrame({'Actual': Y, 'Predicted': y_pred2.flatten()})
dfmult

In [None]:
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(Y, y_pred2)))

In [None]:
#comparison of the first 20 actual and predicted prices as a bar chart
first20preds2 = dfmult.head(20)
first20preds2.plot(figsize=(9,5), kind='bar')
plt.grid(color='green', linewidth='0.3', linestyle='-', which='major')
plt.grid(color='black', linewidth='0.5', linestyle=':', which='minor')
plt.show()

3- KNN

Grouping prices into economic, low-mid, high-mid, and high to improve the model.

In [None]:
# Bucket each price into one of four named bands.
price_col = df3['price']

# One boolean mask per band, in the same order as `values` below
conditions = [
    (price_col < 100),
    (price_col >= 100) & (price_col < 250),
    (price_col >= 250) & (price_col < 600),
    (price_col >= 600) ]

# Band name assigned for each condition, position by position
values = ['economic', 'low-mid', 'high-mid','high']

# np.select uses `default` for rows matching no condition (for example a
# missing price). Its built-in default is the integer 0, which recent NumPy
# releases refuse to combine with string choices (TypeError), so a string
# default is passed explicitly.
df3['price_range'] = np.select(conditions, values, default='0')

In [None]:
df3['price_range'].value_counts()

In [None]:
labelencoder = LabelEncoder()# Assigning numerical values and storing in another column
df3['price_rng_Cat'] = labelencoder.fit_transform(df3['price_range'])

In [None]:
df4=df3[['id','minimum_nights','availability_365', 
         'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count',
         'room_type_Cat','city_Cat', 'state_Cat', 'price', 'price_rng_Cat']]

In [None]:
df4.corr().style.background_gradient(cmap='magma')

In [None]:
#K-nearest neighbors (KNN) classifier for the encoded price band

neigh = KNeighborsClassifier(n_neighbors=3)

# 'price_rng_Cat' is derived directly from 'price', so keeping 'price' among
# the features leaks the target into the model. 'id' is an identifier with no
# predictive meaning. Both are left out of the feature matrix, together with
# the target column itself.
excluded_cols = ['id', 'price', 'price_rng_Cat']
X1 = df4.drop(columns=excluded_cols).values
Y1 = df4['price_rng_Cat'].values

# Split into training and test
X_train, X_test, y_train, y_test = train_test_split(
             X1, Y1, test_size = 0.4, random_state=1)

#standardize data: the scaler is fitted on the training split only
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#fit the model
neigh.fit(X_train, y_train)

In [None]:
# Predicted class
y_pred3=neigh.predict(X_test)

In [None]:
KNNmod = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred3.flatten()})
KNNmod

In [None]:
# Calculate the accuracy of the model 
print(neigh.score(X_test, y_test)) 

In [None]:
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred3)))

In [None]:
#comparison of the first 20 actual and predicted classes as horizontal bars
first20preds3 = KNNmod.head(20)
c2 = ('darkkhaki', 'dimgray')
first20preds3.plot(kind='barh', color=c2, figsize=(9,6))
plt.grid(color='orange', linewidth='0.3', linestyle='-', which='major')
plt.grid(color='black', linewidth='0.5', linestyle=':', which='minor')
plt.show()

The above model did not work with the original price values. Changing the number of neighbors, the test size or the random state made only insignificant improvements to the **RMSE**.   
The model improved significantly after grouping prices.

In [None]:
#find possible outliers
#boxplot of columns
boxplot5 = df4.boxplot(figsize=(8,7),rot=45)

In [None]:
#find possible outliers
#boxplot of columns
boxplot6 = df4.boxplot(column=['minimum_nights', 'availability_365', 'number_of_reviews',
       'reviews_per_month', 'calculated_host_listings_count', 'room_type_Cat',
       'city_Cat', 'state_Cat', 'price_rng_Cat'],
                       figsize=(8,7),rot=45)


In [None]:
#Scatter plot 
plt.figure(figsize=(9, 6))
sns.scatterplot(
    data=df4, x="calculated_host_listings_count", y="price_rng_Cat", hue="room_type_Cat", 
    sizes=(10, 200), hue_norm=(0, 4), legend="full")

In [None]:
df4.corr().style.background_gradient(cmap='mako')

In [None]:
# Bar chart of the number of listings per encoded price band
pRange_count = df4['price_rng_Cat'].value_counts()
sns.set(style="darkgrid")
# x and y are passed by keyword: current seaborn releases no longer accept
# the data vectors as positional arguments.
sns.barplot(x=pRange_count.index, y=pRange_count.values, alpha=0.9)
plt.title('Price Range occurances')
plt.ylabel('Occurrences', fontsize=10)
plt.xlabel('Price Range', fontsize=12)
plt.show()

In [None]:
# Price values grouped by encoded price band
sns.catplot(data=df4, y='price', x='price_rng_Cat')

In [None]:
#KNN Regression: estimator plus its two-column feature matrix and target
knn = KNeighborsRegressor(algorithm='auto')
knn_feature_names = ['calculated_host_listings_count', 'room_type_Cat']
cols = df4[knn_feature_names].values

X2 = cols
Y2 = df4['price_rng_Cat'].values

In [None]:
#normalize
# axis=0 scales each feature (column) by its own maximum. The default axis=1
# would instead divide every row by that row's largest value, which mixes the
# two features together and discards their scale.
X_normalized = preprocessing.normalize(X2, norm='max', axis=0)
X_normalized

In [None]:
# Split into training and test
# A fixed random_state makes the split, and every metric computed from it,
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
             X_normalized, Y2, test_size = 0.3, random_state=0)

#fit the model
knn.fit(X_train, y_train)

In [None]:
# Predicted class
y_pred4=knn.predict(X_test)

In [None]:
KNNreg = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred4.flatten()})
KNNreg

In [None]:
# RMSE computed by hand: per-row squared error, its mean, then the square root
residuals = KNNreg['Predicted'] - KNNreg['Actual']
KNNreg['squared_error'] = residuals ** 2
mse = KNNreg['squared_error'].mean()
rmse2 = mse ** 0.5
rmse2

In [None]:
#comparison of the first 20 rows of KNNreg as a bar chart
first20preds4 = KNNreg.head(20)
c2 = ('teal', 'orchid', 'gray')
first20preds4.plot(kind='bar', color=c2, figsize=(9,6))
plt.grid(color='goldenrod', linewidth='0.3', linestyle='-', which='major')
plt.grid(color='black', linewidth='0.5', linestyle=':', which='minor')
plt.show()

The above KNN regression is not optimal for predicting price or price range.   
Changing parameters did not improve performance.

4- Decision Tree

In [None]:
# Columns used for the decision tree: df4 without 'id' and 'price'
tree_columns = [
    'minimum_nights', 'availability_365', 'number_of_reviews',
    'reviews_per_month', 'calculated_host_listings_count', 'room_type_Cat',
    'city_Cat', 'state_Cat', 'price_rng_Cat',
]
Treedf = df4[tree_columns]

In [None]:
##Decision Tree
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import plot_tree

# Feature matrix and target vector for the tree, both taken from Treedf
feature_cols = ['room_type_Cat', 'calculated_host_listings_count']
X = Treedf.loc[:, feature_cols].values # Features
y = Treedf['price_rng_Cat'].values # Target variable

In [None]:
from sklearn.preprocessing import MinMaxScaler

# 70/30 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=5,
                                                    test_size=0.3)

# The scaler learns its range from the training split and is then applied to both splits
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Create Decision Tree classifier object
# A fixed random_state makes the fitted tree reproducible from run to run.
clf = DecisionTreeClassifier(random_state=0)

# Train Decision Tree Classifier on the scaled training features
clf = clf.fit(X_train_scaled,y_train)

#Predict the response for the test dataset.
# The scaled test features are used because the tree was trained on scaled
# values; the raw X_test is on a different scale than the learned thresholds.
y_pred6 = clf.predict(X_test_scaled)

In [None]:
# Model Accuracy: how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred6))

In [None]:
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred6)))

In [None]:
Treecls = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred6.flatten()})
Treecls

In [None]:
a=Treecls.Actual
b=Treecls.Predicted
plt.plot(a, b, color='peru', linewidth=1)
plt.show()


In [None]:
#plot tree
fig = plt.figure(figsize=(25,26))
_ = tree.plot_tree(clf)


In [None]:
##Results

KNNmod

In [None]:
df4.columns

In [None]:
ids=df4[['id']]
ids

In [None]:
df4.id.nunique()

In [None]:
y_pred3.size

In [None]:
# Predicted class
y_predALL=neigh.predict(X1)

In [None]:
y_predALL

In [None]:
y_predALL.size

In [None]:
# One predicted price band per listing, indexed like `ids`.
# `ids` keeps the gapped index left by the earlier minimum_nights filter; a
# default 0..n-1 index would not line up with it when the two frames are
# combined column-wise, producing NaNs and mismatched rows.
sub = pd.DataFrame({'Price_Range_Predicted': y_predALL}, index=ids.index)

In [None]:
final=pd.concat([ids, sub], axis=1)
final

In [None]:
#nulls
final.isnull().sum().sum()


In [None]:
final=final.dropna()
final

In [None]:



#join two dataframes
#answerSubm = pd.concat([ids, sub], axis=1)
#rename 0 to target

#answerSubm=answerSubm.rename(columns={0: "target"})

#answerSubm