In [58]:
import pandas as pd
import numpy as np

In [128]:
df=pd.read_csv('AB_NYC_2019.csv')

In [129]:
df = df[[
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]]

In [130]:
df.isna()['reviews_per_month'].value_counts()

False    38843
True     10052
Name: reviews_per_month, dtype: int64

In [85]:
df['minimum_nights'].median()

3.0

In [79]:
# df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [72]:
# df['reviews_per_month'] = df['reviews_per_month'].fillna(df['reviews_per_month'].mean())

In [121]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [120]:
def train_linear_regression_reg(X, y, r=0.001):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX = XTX + r * np.eye(XTX.shape[0])

    XTX_inv = np.linalg.inv(XTX)
    w_full = XTX_inv.dot(X.T).dot(y)
    
    return w_full[0], w_full[1:]

In [119]:
def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

In [122]:
from sklearn.model_selection import train_test_split


def train_model(df_input,seed_value =42, r=None):
    df = df_input.sample(frac=1,random_state=seed_value)
    train,test = train_test_split(df,test_size=0.4)
    test,val = train_test_split(test,test_size=0.5)

    columns = ['latitude',
        'longitude',
        'minimum_nights',
        'number_of_reviews',
        'reviews_per_month',
        'calculated_host_listings_count',
        'availability_365'
          ]
    X_train = train[columns]
    X_test = test[columns]
    X_val = val[columns]
    y_train = np.log1p(train['price'])
    y_test = np.log1p(test['price'])
    y_val = np.log1p(val['price'])
    
    
    X_train['reviews_per_month'] = X_train['reviews_per_month'].fillna(0)
    X_val['reviews_per_month'] = X_val['reviews_per_month'].fillna(0)
    
    if r:
        w0, w = train_linear_regression_reg(X_train, y_train,r)
    else:
        w0, w = train_linear_regression(X_train, y_train)
    y_pred = w0 + X_val.dot(w)
    return round(rmse(y_val,y_pred),2)

In [124]:
rmse_list=[]
for r in [0, 0.000001, 0.0001, 0.001, 0.01, 0.1, 1, 5, 10]:
    rmse_r = train_model(df,r=r)
    rmse_list.append(rmse_r)
    print(r,'->',rmse_r)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


0 -> 0.65
1e-06 -> 0.64
0.0001 -> 0.64
0.001 -> 0.64
0.01 -> 0.67
0.1 -> 0.69
1 -> 0.69
5 -> 0.69
10 -> 0.69


In [126]:
rmse_list=[]
for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    rmse_r = train_model(df,seed_value=seed)
    rmse_list.append(rmse_r)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [127]:
np.std(rmse_list)

0.006633249580710806

In [131]:
df_new = df.sample(frac=1,random_state=9)
train,test = train_test_split(df_new,test_size=0.4)
test,val = train_test_split(test,test_size=0.5)

columns = ['latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
      ]
X_train = train[columns]
X_test = test[columns]
X_val = val[columns]
y_train = np.log1p(train['price'])
y_test = np.log1p(test['price'])
y_val = np.log1p(val['price'])


X_train['reviews_per_month'] = X_train['reviews_per_month'].fillna(0)
X_val['reviews_per_month'] = X_val['reviews_per_month'].fillna(0)
X_test['reviews_per_month'] = X_test['reviews_per_month'].fillna(0)

X=X_train.append(X_val)
y=y_train.append(y_val)


w0, w = train_linear_regression_reg(X, y,0.001)
y_pred = w0 + X_test.dot(w)
round(rmse(y_test,y_pred),2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
42720,40.71829,-73.98900,2,4,1.54,1,3
41538,40.79795,-73.93448,2,20,4.38,1,52
27899,40.68637,-73.91159,2,19,1.03,8,177
35531,40.85409,-73.85032,5,8,0.87,5,22
35745,40.77089,-73.93036,1,3,0.30,1,0
...,...,...,...,...,...,...,...
37587,40.72994,-74.00456,3,2,0.29,1,0
21342,40.77103,-73.91409,2,6,0.21,1,176
6247,40.73351,-73.95459,3,0,0.00,2,280
46535,40.76650,-73.98673,2,1,1.00,1,35


0 -> 0.64
1e-06 -> 0.64
0.0001 -> 0.64
0.001 -> 0.64
0.01 -> 0.65
0.1 -> 0.67
1 -> 0.68
5 -> 0.68
10 -> 0.68


In [110]:
np.std(rmse_list)

0.018121673811444562