In [16]:
import numpy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [2]:
airbnb = pd.read_csv("clean_airbnb.csv")
airbnb.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,characters_in_name,symbols_in_name,exclamation_pts_in_name,numbers_in_name,caps_in_name,words_in_name
0,2539,clean quiet apt home park,2787,john,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,...,2018-10-19,0.21,6,365,34,1,0,0,1,7
1,2595,skylit midtown castle,2845,jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,...,2019-05-21,0.38,2,355,21,0,0,0,3,3
2,3647,village harlemnew york,4632,elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,...,,0.0,1,365,35,5,1,0,25,6
3,3831,cozy entire floor brownstone,4869,lisaroxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,...,2019-07-05,4.64,1,194,31,0,0,0,4,5
4,5022,entire apt spacious studioloft central park,7192,laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,...,2018-11-19,0.1,1,0,48,2,0,0,5,7


In [3]:
airbnb.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,characters_in_name,symbols_in_name,exclamation_pts_in_name,numbers_in_name,caps_in_name,words_in_name
count,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0,48858.0
mean,19023350.0,67631690.0,40.728941,-73.95217,152.740309,7.012444,23.273098,1.091124,7.148369,112.801425,36.907119,1.017336,0.160608,0.518196,5.534856,5.76622
std,10982890.0,78623890.0,0.054528,0.046159,240.232386,20.019757,44.549898,1.59727,32.9646,131.610962,10.494872,1.576561,0.485724,0.959848,5.952159,1.814807
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
25%,9475980.0,7818669.0,40.69009,-73.98307,69.0,1.0,1.0,0.04,1.0,0.0,31.0,0.0,0.0,0.0,3.0,5.0
50%,19691140.0,30791330.0,40.72307,-73.95568,106.0,3.0,5.0,0.37,1.0,45.0,37.0,1.0,0.0,0.0,4.0,6.0
75%,29157650.0,107434400.0,40.763107,-73.93628,175.0,5.0,24.0,1.58,2.0,227.0,46.0,2.0,0.0,1.0,6.0,7.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0,179.0,47.0,9.0,14.0,45.0,27.0


We will remove data that seems unusual, like for example, a price of 0

In [4]:
airbnb.drop(airbnb[airbnb["price"] == 0].index, inplace = True)
#remove NA's.
airbnb.isna().sum()

id                                    0
name                                122
host_id                               0
host_name                            98
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10036
reviews_per_month                     0
calculated_host_listings_count        0
availability_365                      0
characters_in_name                    0
symbols_in_name                       0
exclamation_pts_in_name               0
numbers_in_name                       0
caps_in_name                          0
words_in_name                         0
dtype: int64

In [5]:
#remove any NA
airbnb.dropna(inplace=True)

It seems like most of the "minimum nights" occur between 1 and 40. We will subset this.

In [6]:
airbnb.drop(airbnb[airbnb["price"] >200 ].index, inplace = True)

In [7]:
airbnb.drop(airbnb[airbnb["minimum_nights"] > 10 ].index, inplace = True)

In [8]:
#remove availability_365
#remove name
#remove host_
airbnb.drop(["name", "host_name", "neighbourhood", "availability_365", "last_review"], axis = 1, inplace=True)


In [9]:
airbnb.shape

(29304, 17)

In [10]:
airbnb.head()

Unnamed: 0,id,host_id,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,characters_in_name,symbols_in_name,exclamation_pts_in_name,numbers_in_name,caps_in_name,words_in_name
0,2539,2787,Brooklyn,40.64749,-73.97237,Private room,149,1,9,0.21,6,34,1,0,0,1,7
3,3831,4869,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,31,0,0,0,4,5
4,5022,7192,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,48,2,0,0,5,7
5,5099,7322,Manhattan,40.74767,-73.975,Entire home/apt,200,3,74,0.59,1,41,0,0,1,8,7
7,5178,8967,Manhattan,40.76489,-73.98493,Private room,79,2,430,3.47,1,31,0,0,0,5,5


Create dummy variables 

In [11]:
le = LabelEncoder()
n = airbnb["neighbourhood_group"]
r = airbnb["room_type"]
airbnb["neighbourhood_group"] = le.fit_transform(n)
airbnb["room_type"] = le.fit_transform(r)

In [12]:
airbnb.head()

Unnamed: 0,id,host_id,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,characters_in_name,symbols_in_name,exclamation_pts_in_name,numbers_in_name,caps_in_name,words_in_name
0,2539,2787,1,40.64749,-73.97237,1,149,1,9,0.21,6,34,1,0,0,1,7
3,3831,4869,1,40.68514,-73.95976,0,89,1,270,4.64,1,31,0,0,0,4,5
4,5022,7192,2,40.79851,-73.94399,0,80,10,9,0.1,1,48,2,0,0,5,7
5,5099,7322,2,40.74767,-73.975,0,200,3,74,0.59,1,41,0,0,1,8,7
7,5178,8967,2,40.76489,-73.98493,1,79,2,430,3.47,1,31,0,0,0,5,5


In [17]:
x = airbnb.drop(["price"], axis = 1)
y = airbnb[["price"]]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)
reg = LinearRegression().fit(X_train, y_train)
reg.fit(X_train, y_train)

LinearRegression()

In [23]:
1 - reg.score(X_test, y_test)

0.5384644655585028

In [None]:
parameters = {'fit_intercept':[True,False], 'normalize':[True,False], 'copy_X':[True, False]}
grid.fit(X_train, y_train) 
grid_predictions = grid.predict(X_test)
print(grid.best_params_) 
print(classification_report(y_test, grid_predictions))
