In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dc_listings = pd.read_csv('dc_airbnb.csv')

In [3]:
dc_listings.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state
0,92%,91%,26,4,Entire home/apt,1.0,1.0,2.0,$160.00,$115.00,$100.00,1,1125,0,38.890046,-77.002808,Washington,20003,DC
1,90%,100%,1,6,Entire home/apt,3.0,3.0,3.0,$350.00,$100.00,,2,30,65,38.880413,-76.990485,Washington,20003,DC
2,90%,100%,2,1,Private room,1.0,2.0,1.0,$50.00,,,2,1125,1,38.955291,-76.986006,Hyattsville,20782,MD
3,100%,,1,2,Private room,1.0,1.0,1.0,$95.00,,,1,1125,0,38.872134,-77.019639,Washington,20024,DC
4,92%,67%,1,4,Entire home/apt,1.0,1.0,1.0,$50.00,$15.00,$450.00,7,1125,0,38.996382,-77.041541,Silver Spring,20910,MD


In [4]:
# Let's assume we have a new listing, which can accomodate 3 people and we are trying to figure out ideal rental price
# Let's create a new column, which can help us in identifying other similar listings

dc_listings['accom_diff'] = abs(dc_listings['accommodates']-3)
dc_listings.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state,accom_diff
0,92%,91%,26,4,Entire home/apt,1.0,1.0,2.0,$160.00,$115.00,$100.00,1,1125,0,38.890046,-77.002808,Washington,20003,DC,1
1,90%,100%,1,6,Entire home/apt,3.0,3.0,3.0,$350.00,$100.00,,2,30,65,38.880413,-76.990485,Washington,20003,DC,3
2,90%,100%,2,1,Private room,1.0,2.0,1.0,$50.00,,,2,1125,1,38.955291,-76.986006,Hyattsville,20782,MD,2
3,100%,,1,2,Private room,1.0,1.0,1.0,$95.00,,,1,1125,0,38.872134,-77.019639,Washington,20024,DC,1
4,92%,67%,1,4,Entire home/apt,1.0,1.0,1.0,$50.00,$15.00,$450.00,7,1125,0,38.996382,-77.041541,Silver Spring,20910,MD,1


In [5]:
dc_listings['accom_diff'].value_counts()

1     2294
2      503
0      461
3      279
5       73
4       35
7       22
6       17
9       12
13       8
8        7
12       6
11       4
10       2
Name: accom_diff, dtype: int64

In [6]:
# There are 461 other listings that accomodate exactly 3 people. But we don't want to select top 5 such listing
# That will bias our average price to the way dataset was order
# Let's randomize the database and then select top 5 listings for which accom_diff =0

# The frac keyword argument specifies the fraction of rows to return in the random sample, so frac=1 means 
# to return all rows (in random order)
# specifying drop=True prevents .reset_index from creating a column containing the old index entries
dc_listings = dc_listings.sample(frac=1).reset_index(drop=True)
dc_listings = dc_listings.sort_values('accom_diff')
print(dc_listings.iloc[0:10][['price', 'accom_diff']])

        price  accom_diff
1707  $249.00           0
537   $259.00           0
2166  $149.00           0
1308  $159.00           0
2169  $125.00           0
3060  $569.00           0
1306  $149.00           0
3056  $125.00           0
556   $249.00           0
559   $110.00           0


In [7]:
# Let's convert price column to float before taking the mean
# remove '$' and ','

dc_listings['price'] = dc_listings['price'].str.replace('$','').str.replace(',', '').astype(float)

  dc_listings['price'] = dc_listings['price'].str.replace('$','').str.replace(',', '').astype(float)


In [8]:
mean_price = dc_listings[dc_listings['accom_diff']==0].iloc[:5]['price'].mean()
print(mean_price)

188.2
