In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
from sklearn.metrics import mutual_info_score, mean_squared_error
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression, Ridge

In [3]:
data = pd.read_csv('AB_NYC_2019.csv')
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [5]:
features  = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]
df = data[features]
"""Checking description of this project's data set"""
df.describe()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [15]:
df.isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [21]:
df.fillna(0, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [22]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [23]:
"""Creating List of Numerical and Categorical columns"""
categorical = [col for col in df.columns if df[col].dtype == 'object']
numerical = [col for col in df.columns if col not in categorical]

In [24]:
"""Correlation of Numerical Columns"""
display(df[numerical].corr())

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.084788,0.033939,0.024869,-0.015389,-0.018758,0.019517,-0.010983
longitude,0.084788,1.0,-0.150019,-0.062747,0.059094,0.138516,-0.114713,0.082731
price,0.033939,-0.150019,1.0,0.042799,-0.047954,-0.050564,0.057472,0.081829
minimum_nights,0.024869,-0.062747,0.042799,1.0,-0.080116,-0.124905,0.12796,0.144303
number_of_reviews,-0.015389,0.059094,-0.047954,-0.080116,1.0,0.589407,-0.072376,0.172028
reviews_per_month,-0.018758,0.138516,-0.050564,-0.124905,0.589407,1.0,-0.047312,0.163732
calculated_host_listings_count,0.019517,-0.114713,0.057472,0.12796,-0.072376,-0.047312,1.0,0.225701
availability_365,-0.010983,0.082731,0.081829,0.144303,0.172028,0.163732,0.225701,1.0


In [25]:
"""Heatmap of Numerical Variables"""
plotCorrelationMatrix(df[numerical], 8)

NameError: name 'plotCorrelationMatrix' is not defined

In [26]:
y_train = df_train['price'].values
y_val = df_val['price'].values
y_test = df_test['price'].values

In [27]:
del df_train['price']
del df_val['price']
del df_test['price']

"""Taking care of Categorical Variables"""
dv = DictVectorizer(sparse=False)
features = categorical + numerical
features.remove('price')
train_dict = df_train[features].to_dict(orient='records')
val_dict = df_val[features].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [28]:
scores = {}
for alpha in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=alpha, random_state= 42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    score = np.sqrt(mean_squared_error(y_val, y_pred))
    scores[alpha] = score.round(3)
    print("RMSE with alpha = %s and not rounding to 3 digits: %s"%(alpha, score) )
print(" \nRMSE with rounding off to 3 digits")
print(scores)

RMSE with alpha = 0 and not rounding to 3 digits: 211.90273881166098
RMSE with alpha = 0.01 and not rounding to 3 digits: 211.90273289441876
RMSE with alpha = 0.1 and not rounding to 3 digits: 211.9026886060598
RMSE with alpha = 1 and not rounding to 3 digits: 211.90306911871275
RMSE with alpha = 10 and not rounding to 3 digits: 211.94999657439928
 
RMSE with rounding off to 3 digits
{0: 211.903, 0.01: 211.903, 0.1: 211.903, 1: 211.903, 10: 211.95}


Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number: 8.45100955605809e-18
