## Imports

In [38]:
import os

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

## Data Load

In [51]:
# Location of the cleaned listings CSV. Set the AIRBNB_DATA_PATH environment
# variable to run this notebook on another machine; when it is unset the
# original author-local path is used, so existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    'AIRBNB_DATA_PATH',
    r'C:\Users\spider\Documents\Springboard\GitHub\NYC-Airbnb\Data Wrangling\airbnb_data_cleaned.csv',
)
data_cleaned = pd.read_csv(DATA_PATH)

## Extract Information

In [52]:
# Preview the first rows of the loaded frame.
data_cleaned.head()

Unnamed: 0,id,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2762,0.21,6,365
1,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2976,0.38,2,355
2,3647,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,0,0.0,1,365
3,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,3021,4.64,1,194
4,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2793,0.1,1,0


In [53]:
# Summarise each column's dtype and non-null count.
data_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              48895 non-null  int64  
 1   host_id                         48895 non-null  int64  
 2   neighbourhood_group             48895 non-null  object 
 3   neighbourhood                   48895 non-null  object 
 4   latitude                        48895 non-null  float64
 5   longitude                       48895 non-null  float64
 6   room_type                       48895 non-null  object 
 7   price                           48895 non-null  int64  
 8   minimum_nights                  48895 non-null  int64  
 9   number_of_reviews               48895 non-null  int64  
 10  last_review                     48895 non-null  int64  
 11  reviews_per_month               48895 non-null  float64
 12  calculated_host_listings_count  

In [54]:
# (rows, columns) of the loaded frame.
data_cleaned.shape

(48895, 14)

In [55]:
# Isolate the object-dtype (string) columns as an independent copy so they
# can be encoded without touching `data_cleaned`.
df_obj = data_cleaned.select_dtypes(include='object').copy()
df_obj.shape

(48895, 3)

In [56]:
# Preview the object-dtype columns selected above.
df_obj.head()

Unnamed: 0,neighbourhood_group,neighbourhood,room_type
0,Brooklyn,Kensington,Private room
1,Manhattan,Midtown,Entire home/apt
2,Manhattan,Harlem,Private room
3,Brooklyn,Clinton Hill,Entire home/apt
4,Manhattan,East Harlem,Entire home/apt


In [48]:
# Number of listings per neighbourhood, most frequent first.
df_obj['neighbourhood'].value_counts()

Williamsburg          3920
Bedford-Stuyvesant    3714
Harlem                2658
Bushwick              2465
Upper West Side       1971
                      ... 
Willowbrook              1
Rossville                1
Fort Wadsworth           1
New Dorp                 1
Richmondtown             1
Name: neighbourhood, Length: 221, dtype: int64

In [50]:
# One-hot encode every column of `df_obj`.
# The indicator dtype is pinned to uint8 so the encoded columns are 0/1
# integers regardless of pandas version (newer pandas defaults to bool).
dummies = pd.get_dummies(df_obj, dtype=np.uint8)
dummies.head()

Unnamed: 0,neighbourhood_group_Bronx,neighbourhood_group_Brooklyn,neighbourhood_group_Manhattan,neighbourhood_group_Queens,neighbourhood_group_Staten Island,neighbourhood_Allerton,neighbourhood_Arden Heights,neighbourhood_Arrochar,neighbourhood_Arverne,neighbourhood_Astoria,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [67]:
# Sanity check on one indicator column: the count of 1s should equal the
# number of Williamsburg listings.
dummies['neighbourhood_Williamsburg'].value_counts()

0    44975
1     3920
Name: neighbourhood_Williamsburg, dtype: int64

In [74]:
# Append the one-hot indicator columns to the right of the original columns
# (row-aligned on the shared index).
df = pd.concat([data_cleaned, dummies], axis='columns')
df.head()

Unnamed: 0,id,host_id,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,2539,2787,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,...,0,0,0,0,0,0,0,0,1,0
1,2595,2845,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,...,0,0,0,0,0,0,0,1,0,0
2,3647,4632,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,...,0,0,0,0,0,0,0,0,1,0
3,3831,4869,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,...,0,0,0,0,0,0,0,1,0,0
4,5022,7192,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,...,0,0,0,0,0,0,0,1,0,0


In [75]:
# Remove the raw categorical columns and the id / host_id identifier columns
# from the modelling frame.
# `df` is rebound rather than mutated in place, and columns that are already
# absent are ignored, so this cell can be re-run without raising KeyError.
df = df.drop(
    columns=['neighbourhood_group', 'neighbourhood', 'room_type', 'id', 'host_id'],
    errors='ignore',
)

In [76]:
# Preview the frame after the column drop.
df.head()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,neighbourhood_group_Bronx,...,neighbourhood_Williamsburg,neighbourhood_Willowbrook,neighbourhood_Windsor Terrace,neighbourhood_Woodhaven,neighbourhood_Woodlawn,neighbourhood_Woodrow,neighbourhood_Woodside,room_type_Entire home/apt,room_type_Private room,room_type_Shared room
0,40.64749,-73.97237,149,1,9,2762,0.21,6,365,0,...,0,0,0,0,0,0,0,0,1,0
1,40.75362,-73.98377,225,1,45,2976,0.38,2,355,0,...,0,0,0,0,0,0,0,1,0,0
2,40.80902,-73.9419,150,3,0,0,0.0,1,365,0,...,0,0,0,0,0,0,0,0,1,0
3,40.68514,-73.95976,89,1,270,3021,4.64,1,194,0,...,0,0,0,0,0,0,0,1,0,0
4,40.79851,-73.94399,80,10,9,2793,0.1,1,0,0,...,0,0,0,0,0,0,0,1,0,0


## Train/Test Split

In [77]:
# Hold out 25% of the rows for testing. `price` is the target; every other
# column is a feature. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='price'),
    df['price'],
    test_size=0.25,
    random_state=47,
)

In [78]:
# (rows, columns) of the train and test feature matrices, in that order.
tuple(features.shape for features in (X_train, X_test))

((36671, 237), (12224, 237))

In [79]:
# Lengths of the train and test target vectors, in that order.
tuple(target.shape for target in (y_train, y_test))

((36671,), (12224,))

In [80]:
# Per-column dtypes of the training features.
X_train.dtypes

latitude                     float64
longitude                    float64
minimum_nights                 int64
number_of_reviews              int64
last_review                    int64
                              ...   
neighbourhood_Woodrow          uint8
neighbourhood_Woodside         uint8
room_type_Entire home/apt      uint8
room_type_Private room         uint8
room_type_Shared room          uint8
Length: 237, dtype: object

In [81]:
# Per-column dtypes of the test features.
X_test.dtypes

latitude                     float64
longitude                    float64
minimum_nights                 int64
number_of_reviews              int64
last_review                    int64
                              ...   
neighbourhood_Woodrow          uint8
neighbourhood_Woodside         uint8
room_type_Entire home/apt      uint8
room_type_Private room         uint8
room_type_Shared room          uint8
Length: 237, dtype: object

### Metrics

In [82]:
# Mean of the training target, kept in `train_mean` and displayed.
train_mean = y_train.mean()
train_mean

152.05652968285565

In [83]:
# Baseline model: always predicts the mean of the training target.
# `fit` returns the estimator itself, so construction and fitting are chained.
dumb_reg = DummyRegressor(strategy='mean').fit(X_train, y_train)
dumb_reg.constant_

array([[152.05652968]])

In [84]:
# Baseline predictions for the training split; show the first five.
y_tr_pred = dumb_reg.predict(X_train)
y_tr_pred[:5]

array([152.05652968, 152.05652968, 152.05652968, 152.05652968,
       152.05652968])

In [85]:
# Baseline predictions for the test split; show the first five.
# (Previously this cell displayed `y_tr_pred`, the training predictions,
# instead of the test predictions it had just computed.)
y_te_pred = dumb_reg.predict(X_test)
y_te_pred[:5]

array([152.05652968, 152.05652968, 152.05652968, 152.05652968,
       152.05652968])

### R-Squared

In [86]:
# R-squared of the baseline on (train, test).
(
    r2_score(y_true=y_train, y_pred=y_tr_pred),
    r2_score(y_true=y_test, y_pred=y_te_pred),
)

(0.0, -8.801662748658678e-05)

### Mean Absolute Error

In [87]:
# Mean absolute error of the baseline on (train, test).
(
    mean_absolute_error(y_true=y_train, y_pred=y_tr_pred),
    mean_absolute_error(y_true=y_test, y_pred=y_te_pred),
)

(91.5998872695891, 93.96468770269202)

### Mean Squared Error

In [88]:
# Mean squared error of the baseline on (train, test).
(
    mean_squared_error(y_true=y_train, y_pred=y_tr_pred),
    mean_squared_error(y_true=y_test, y_pred=y_te_pred),
)

(50167.654326687785, 80189.57022653896)

### Scale the data

In [89]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_tr_scaled = scaler.fit_transform(X_train)
X_te_scaled = scaler.transform(X_test)