In [3]:
import pandas as pd

dc_listings = pd.read_csv('dc_airbnb.csv')
print(dc_listings.shape)

(3723, 19)


In [4]:
# Calculating Eucledian distance between first living space and our accommodation
import numpy as np
our_acc_value = 3
first_living_space_value = dc_listings.loc[0,'accommodates']
first_distance = np.abs(first_living_space_value - our_acc_value)
print(first_distance)

1


In [5]:
# Creating new column distance by finding the absolute value of the difference between each living space accommdation and our value
# Also, looking at the range of values we have using pd.value_counts()
dc_listings['distance'] = np.abs(dc_listings.accommodates - our_acc_value)
dc_listings.distance.value_counts().sort_index()

0      461
1     2294
2      503
3      279
4       35
5       73
6       17
7       22
8        7
9       12
10       2
11       4
12       6
13       8
Name: distance, dtype: int64

In [6]:
# So there are 461 listings with a distance of 0 which means they accomodate same number of people as we do.
# Randomizing 100% of the samples for our prediction

dc_listings = dc_listings.sample(frac=1, random_state=0)
dc_listings = dc_listings.sort_values('distance')
print(dc_listings.head())

     host_response_rate host_acceptance_rate  host_listings_count  \
2645               100%                 100%                    1   
2825               100%                 100%                    1   
2145               100%                 100%                    1   
2541                NaN                  NaN                    1   
3349                90%                 100%                    1   

      accommodates        room_type  bedrooms  bathrooms  beds     price  \
2645             3  Entire home/apt       1.0        1.0   1.0   $75.00    
2825             3  Entire home/apt       3.0        2.0   2.0  $120.00    
2145             3     Private room       1.0        2.0   2.0   $90.00    
2541             3     Private room       1.0        1.0   1.0   $50.00    
3349             3  Entire home/apt       1.0        1.0   1.0  $105.00    

     security_deposit cleaning_fee  minimum_nights  maximum_nights  \
2645         $300.00       $50.00                7        

In [7]:
# Converting price column data type from object to number
dc_listings['price'] = dc_listings.price.str.replace("\$|,",'').astype(float)

mean_price = dc_listings.price.iloc[:5].mean()
print(mean_price)

88.0


In [8]:
print(dc_listings.price.head())

2645     75.0
2825    120.0
2145     90.0
2541     50.0
3349    105.0
Name: price, dtype: float64


In [9]:
# This shows that based on just 'accommodates' feature, the price of our listing should be around the mean value of $135.2
# Evaluating our model by creating new datasets - training set and test with 75%-25% split
dc_listings.drop('distance', axis=1)
train_df = dc_listings.iloc[:2792]
test_df = dc_listings.iloc[2792:]
print(train_df.shape)
print(test_df.shape)

(2792, 20)
(931, 20)


In [10]:
# Creating method for calculating distance and predicting price
def predict_price(new_listing_value, feature_column):
    temp_df = train_df
    temp_df['distance'] = np.abs(dc_listings[feature_column] - new_listing_value)
    temp_df = temp_df.sort_values('distance')
    knn_5 = temp_df.price.iloc[:5]
    predicted_price = knn_5.mean()
    return(predicted_price)

test_df['predicted_price'] = test_df.accommodates.apply(predict_price, feature_column = 'accommodates')
print(test_df['predicted_price'].head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


2850     83.6
2279     83.6
2771    340.4
910     340.4
2434    340.4
Name: predicted_price, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


In [11]:
# Calculating root mean square error
test_df['squared_error'] = (test_df['predicted_price'] - test_df['price'])**2
mse = test_df['squared_error'].mean()
rmse = mse **(1/2)
print(rmse)

212.98927967051543


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [12]:
for feature in ['accommodates', 'bedrooms','bathrooms', 'number_of_reviews' ]:
    test_df['predicted_price'] = test_df.accommodates.apply(predict_price, feature_column = feature)
    test_df['squared_error'] = (test_df['predicted_price'] - test_df['price'])**2
    mse = test_df['squared_error'].mean()
    rmse = mse ** (1/2)
    print("Root Mean Squared Error for the {} column: {}".format(feature, rmse))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Root Mean Squared Error for the accommodates column: 212.98927967051543


Root Mean Squared Error for the bedrooms column: 216.49048609414763


Root Mean Squared Error for the bathrooms column: 216.89419042215684


Root Mean Squared Error for the number_of_reviews column: 240.21528314334847
