In [47]:
import pandas as pd
import numpy as np

In [48]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder

In [49]:
# Load the listings CSV into a DataFrame. The path is relative, so the file
# must be in the notebook's working directory.
df = pd.read_csv('airbnb_ny_clean.csv')

In [50]:
# List the column labels of the freshly loaded frame.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [51]:
# Show the dtype of every column; `object` columns hold non-numeric values
# and are label-encoded further down before modelling.
df.dtypes

id                                  int64
name                               object
host_id                             int64
host_name                          object
neighbourhood_group                object
neighbourhood                      object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [52]:
# Remove the last_review field.
# `drop(..., errors='ignore')` is used instead of `del` so that re-running this
# cell after the column is already gone does not raise a KeyError.
df = df.drop(columns=['last_review'], errors='ignore')

In [53]:
# Encode the columns that contain non-numerical values (the ML model requires
# numeric values). Each column is encoded independently and written back to
# ITSELF: the encoded `neighbourhood` values must not be stored in
# `neighbourhood_group`, otherwise that column's own encoding is overwritten
# and `neighbourhood` is left as raw strings.
labelencoder = LabelEncoder()
categorical_columns = ['name', 'host_name', 'neighbourhood_group', 'neighbourhood', 'room_type']
for column in categorical_columns:
    # fit_transform re-fits the encoder on every call, so after the loop
    # `labelencoder` holds the classes of the last column (room_type).
    df[column] = labelencoder.fit_transform(df[column])

In [54]:
# Re-check the column labels after dropping last_review and encoding.
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [55]:
# Feature matrix: the numeric listing attributes; target: the listing price.
feature_columns = [
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]
x = df[feature_columns]
y = df['price']

# Replace every NaN with 0 so the estimator receives finite values only.
x = np.nan_to_num(x)
y = np.nan_to_num(y)

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=353,
)

# Fit an ordinary least-squares linear regression on the training split.
linreg = LinearRegression()
linreg.fit(x_train, y_train)

# Predict on the held-out rows and report the model score.
# NOTE: for a regressor, `score` is the coefficient of determination (R^2),
# not a classification accuracy.
y_pred = linreg.predict(x_test)
test_score = linreg.score(x_test, y_test)
print('Accuracy on test set: {}'.format(test_score))

Accuracy on test set: 0.05002470048554308


In [56]:
# We notice that the score on the test set is only about 5% (R^2 ≈ 0.05; for a
# regressor, `score` returns R^2 rather than a classification accuracy). This means
# that a linear regression model on these features may not be the best fit for the data.

In [57]:
# Compute predictions on the test set with the trained simple linear
# regression model, then put them side by side with the actual prices.
predictions = linreg.predict(x_test)
error = pd.DataFrame(
    {
        'actual': np.array(y_test).flatten(),
        'prediction': np.array(predictions),
    }
)
# Preview the first ten actual/predicted pairs.
error.head(10)

Unnamed: 0,actual,prediction
0,99,93.22288
1,75,137.937644
2,260,177.980274
3,200,168.981482
4,135,197.76919
5,35,230.315538
6,225,194.825895
7,35,105.041478
8,280,261.092508
9,125,162.517534


In [58]:
# We see that the predicted values are quite far from the actual values.
# Another linear model-building algorithm will be used for better accuracy.