In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import math

In [2]:
# Read the listings CSV into a DataFrame and preview its first five rows.
dc_listings = pd.read_csv(filepath_or_buffer='dc_airbnb.csv')
dc_listings.head(n=5)

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state
0,92%,91%,26,4,Entire home/apt,1.0,1.0,2.0,$160.00,$115.00,$100.00,1,1125,0,38.890046,-77.002808,Washington,20003,DC
1,90%,100%,1,6,Entire home/apt,3.0,3.0,3.0,$350.00,$100.00,,2,30,65,38.880413,-76.990485,Washington,20003,DC
2,90%,100%,2,1,Private room,1.0,2.0,1.0,$50.00,,,2,1125,1,38.955291,-76.986006,Hyattsville,20782,MD
3,100%,,1,2,Private room,1.0,1.0,1.0,$95.00,,,1,1125,0,38.872134,-77.019639,Washington,20024,DC
4,92%,67%,1,4,Entire home/apt,1.0,1.0,1.0,$50.00,$15.00,$450.00,7,1125,0,38.996382,-77.041541,Silver Spring,20910,MD


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
dc_listings.describe()

Unnamed: 0,host_listings_count,accommodates,bedrooms,bathrooms,beds,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude
count,3723.0,3723.0,3702.0,3696.0,3712.0,3723.0,3723.0,3723.0,3723.0,3723.0
mean,13.517325,3.195004,1.210157,1.256358,1.643319,2.250067,580306.9,15.306742,38.913967,-77.023294
std,64.534408,2.012216,0.839851,0.585539,1.182117,3.622879,35195520.0,29.645586,0.021647,0.026951
min,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,38.825061,-77.110525
25%,1.0,2.0,1.0,1.0,1.0,1.0,120.0,1.0,38.901789,-77.039859
50%,1.0,2.0,1.0,1.0,1.0,2.0,1125.0,4.0,38.913375,-77.02641
75%,3.0,4.0,1.0,1.0,2.0,3.0,1125.0,16.0,38.926509,-77.002798
max,480.0,16.0,10.0,8.0,16.0,180.0,2147484000.0,362.0,38.996382,-76.913137


In [4]:
# Convert "price" from text to float by stripping the "$" and "," characters.
# regex=False makes both patterns literal strings: as a regular expression "$"
# is the end-of-string anchor, and pandas versions that default to regex=True
# only special-case single characters (emitting a FutureWarning), so relying on
# the default gives version-dependent behaviour.
dc_listings["price"] = (
    dc_listings["price"]
    .str.replace("$", "", regex=False)
    .str.replace(",", "", regex=False)
    .astype(float)
)

  dc_listings["price"] = dc_listings["price"].str.replace("$", "").str.replace(",", "").astype(float)


In [5]:
# Discard the columns that are non-numerical, non-ordinal, or otherwise not needed.
drop_columns = [
    'host_response_rate',
    'host_acceptance_rate',
    'host_listings_count',
    'room_type',
    'city',
    'state',
    'latitude',
    'longitude',
    'zipcode',
]
dc_listings = dc_listings.drop(columns=drop_columns)

In [6]:
# Step 1: drop the two fee columns, which have a huge number of missing values.
# Step 2: drop every remaining row that still contains a missing value.
# Finally preview the first five rows of the cleaned frame.
dc_listings = (
    dc_listings
    .drop(columns=['cleaning_fee', 'security_deposit'])
    .dropna(axis='index')
)
dc_listings.head(n=5)

Unnamed: 0,accommodates,bedrooms,bathrooms,beds,price,minimum_nights,maximum_nights,number_of_reviews
0,4,1.0,1.0,2.0,160.0,1,1125,0
1,6,3.0,3.0,3.0,350.0,2,30,65
2,1,1.0,2.0,1.0,50.0,2,1125,1
3,2,1.0,1.0,1.0,95.0,1,1125,0
4,4,1.0,1.0,1.0,50.0,7,1125,0


In [7]:
# Show each remaining column's dtype and non-null count to check the cleaning result.
dc_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3671 entries, 0 to 3722
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   accommodates       3671 non-null   int64  
 1   bedrooms           3671 non-null   float64
 2   bathrooms          3671 non-null   float64
 3   beds               3671 non-null   float64
 4   price              3671 non-null   float64
 5   minimum_nights     3671 non-null   int64  
 6   maximum_nights     3671 non-null   int64  
 7   number_of_reviews  3671 non-null   int64  
dtypes: float64(4), int64(4)
memory usage: 258.1 KB


In [8]:
# KNN regression on "price" using every remaining column as a feature.
# The first 75% of rows (rounded up) are used for training and the rest for
# testing. The boundary is derived from the frame length instead of being
# hard-coded, so it stays at 75% if the cleaning steps change the row count;
# for the 3671-row frame it evaluates to 2754, the value previously hard-coded.
# NOTE(review): rows are split in file order (no shuffle) and the features are
# not rescaled, so large-valued columns such as maximum_nights probably
# dominate the neighbour distances -- confirm this is intended.
train_rows = math.ceil(len(dc_listings) * 0.75)
train_df = dc_listings.iloc[:train_rows]
test_df = dc_listings.iloc[train_rows:]
features = [col for col in train_df.columns if col != 'price']

# Fit a 5-nearest-neighbour regressor (brute-force search) and score it on the
# held-out rows with MSE and RMSE.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)
knn.fit(train_df[features], train_df['price'])
all_features_predictions = knn.predict(test_df[features])
all_features_mse = mean_squared_error(test_df['price'], all_features_predictions)
all_features_rmse = math.sqrt(all_features_mse)

In [9]:
# Report the all-features model's MSE and RMSE, one value per line.
print(all_features_mse, all_features_rmse, sep="\n")

11500.774394765542
107.24166352106602


In [10]:
# KNN regression on "price" using only four size-related features.
# The first 75% of rows (rounded up) are used for training and the rest for
# testing. The boundary is derived from the frame length instead of being
# hard-coded, so it stays at 75% if the cleaning steps change the row count;
# for the 3671-row frame it evaluates to 2754, the value previously hard-coded.
# NOTE(review): rows are split in file order (no shuffle) and the features are
# not rescaled -- confirm this is intended.
train_rows = math.ceil(len(dc_listings) * 0.75)
train_df = dc_listings.iloc[:train_rows]
test_df = dc_listings.iloc[train_rows:]
features = ['accommodates', 'bedrooms', 'bathrooms', 'beds']

# Fit a 5-nearest-neighbour regressor (brute-force search) and score it on the
# held-out rows with MSE and RMSE.
knn = KNeighborsRegressor(algorithm='brute', n_neighbors=5)
knn.fit(train_df[features], train_df['price'])
four_features_predictions = knn.predict(test_df[features])
four_features_mse = mean_squared_error(test_df['price'], four_features_predictions)
four_features_rmse = math.sqrt(four_features_mse)

In [11]:
# Report the four-feature model's MSE and RMSE, one value per line.
print(four_features_mse, four_features_rmse, sep="\n")

27805.452955288984
166.7496715297784
