# Cross Validation

We'll be working with a dataset from October 3, 2015 on AirBnB listings from Washington, D.C. Each row in the dataset is a specific listing that's available for renting on AirBnB in the Washington, D.C. area. Here are the column descriptions:
- `host_response_rate`: the response rate of the host 
- `host_acceptance_rate`: percentage of requests to the host that convert to rentals 
- `host_listings_count`: number of other listings the host has 
- `latitude`: latitude dimension of the geographic coordinates 
- `longitude`: longitude part of the coordinates 
- `city`: the city the living space resides in 
- `zipcode`: the zip code the living space resides in 
- `state`: the state the living space resides in 
- `accommodates`: the number of guests the rental can accommodate 
- `room_type`: the type of living space (Private room, Shared room or Entire home/apt) 
- `bedrooms`: number of bedrooms included in the rental 
- `bathrooms`: number of bathrooms included in the rental 
- `beds`: number of beds included in the rental 
- `price`: nightly price for the rental 
- `cleaning_fee`: additional fee used for cleaning the living space after the guest leaves 
- `security_deposit`: refundable security deposit, in case of damages 
- `minimum_nights`: minimum number of nights a guest can stay for the rental 
- `maximum_nights`: maximum number of nights a guest can stay for the rental 
- `number_of_reviews`: number of reviews that previous guests have left

In [1]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the listings and turn the "price" strings into floats by stripping the
# thousands separators and the currency symbol.
dc_listings = pd.read_csv("dc_airbnb.csv")
# regex=False: "$" is a regex metacharacter (end-of-string anchor), so it must be
# matched as a literal character regardless of the pandas version's regex default.
stripped_commas = dc_listings['price'].str.replace(',', '', regex=False)
stripped_dollars = stripped_commas.str.replace('$', '', regex=False)
dc_listings['price'] = stripped_dollars.astype('float')
# Shuffle the rows so the positional splits used later are random samples.
# A seeded local RandomState makes Restart & Run All reproducible without touching
# NumPy's global random state.
# NOTE(review): the outputs recorded in this notebook appear to come from an
# unseeded shuffle, so the numbers will change on the first re-run.
dc_listings = dc_listings.reindex(np.random.RandomState(1).permutation(dc_listings.index))

In [3]:
dc_listings.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state
2947,100%,75%,3,2,Private room,1.0,1.0,1.0,119.0,$50.00,$200.00,4,14,3,38.926319,-77.023209,Washington,20001,DC
1545,,,1,2,Private room,1.0,1.0,1.0,500.0,,,1,1125,0,38.940395,-76.989493,Washington,20017,DC
2521,100%,100%,1,2,Private room,1.0,1.0,1.0,50.0,$10.00,,2,1125,47,38.915613,-77.008219,Washington,20002,DC
1564,,,1,2,Private room,1.0,1.0,1.0,75.0,,$150.00,2,1125,41,38.93644,-77.028335,"Washington, D.C.",20010,DC
2475,100%,0%,2,2,Entire home/apt,1.0,1.0,1.0,110.0,,,1,1125,38,38.920021,-77.038271,Washington,20009,DC


In [4]:
# Split the shuffled listings into two halves by row position. .copy() gives each
# half its own frame, so columns added to a half later do not touch dc_listings.
# NOTE(review): 1862 is hard-coded (about half of the 3,723 rows implied by the
# fold boundaries used further down) — revisit if the dataset changes.
split_one = dc_listings.iloc[0:1862].copy()
split_two = dc_listings.iloc[1862:].copy()

### Holdout Validation

In [5]:
# Holdout pass 1: fit on the first half, evaluate on the second half.
# train_one / test_one are aliases (not copies) of split_one / split_two.
train_one, test_one = split_one, split_two

model = KNeighborsRegressor()
model.fit(train_one[["accommodates"]], train_one["price"])

# Store the predictions next to the true prices (this also adds the column to
# split_two, since test_one is the same object), then take sqrt(MSE) for the RMSE.
test_one["predicted_price"] = model.predict(test_one[["accommodates"]])
mse_one = mean_squared_error(test_one["price"], test_one["predicted_price"])
iteration_one_rmse = mse_one ** 0.5
iteration_one_rmse

135.27847116552303

In [6]:
# Holdout pass 2: swap the roles — fit on the second half, evaluate on the first.
train_two, test_two = split_two, split_one

# `model` is the estimator created in the previous cell; fit() retrains it on this split.
model.fit(train_two[["accommodates"]], train_two["price"])

# Store the predictions next to the true prices, then take sqrt(MSE) for the RMSE.
test_two["predicted_price"] = model.predict(test_two[["accommodates"]])
mse_two = mean_squared_error(test_two["price"], test_two["predicted_price"])
iteration_two_rmse = mse_two ** 0.5
iteration_two_rmse

131.65352283622329

In [8]:
# Average the RMSEs of the two holdout passes into a single error estimate.
avg_rmse = np.mean([iteration_two_rmse, iteration_one_rmse])
avg_rmse

133.46599700087316

### K-Fold Cross Validation

In [9]:
# Manually partition the shuffled data set into 5 folds by row position.
# Each consecutive pair in fold_bounds is the [start, stop) position range of one
# fold; folds are labelled 1 through 5 in that order.
fold_bounds = [0, 745, 1490, 2234, 2978, 3723]
for fold_id, (start, stop) in enumerate(zip(fold_bounds[:-1], fold_bounds[1:]), start=1):
    dc_listings.loc[dc_listings.index[start:stop], "fold"] = fold_id

In [10]:
dc_listings['fold'].value_counts()

5.0    745
2.0    745
1.0    745
4.0    744
3.0    744
Name: fold, dtype: int64

In [11]:
# Number of missing values

dc_listings['fold'].isnull().sum()

0

In [12]:
dc_listings.head()

Unnamed: 0,host_response_rate,host_acceptance_rate,host_listings_count,accommodates,room_type,bedrooms,bathrooms,beds,price,cleaning_fee,security_deposit,minimum_nights,maximum_nights,number_of_reviews,latitude,longitude,city,zipcode,state,fold
2947,100%,75%,3,2,Private room,1.0,1.0,1.0,119.0,$50.00,$200.00,4,14,3,38.926319,-77.023209,Washington,20001,DC,1.0
1545,,,1,2,Private room,1.0,1.0,1.0,500.0,,,1,1125,0,38.940395,-76.989493,Washington,20017,DC,1.0
2521,100%,100%,1,2,Private room,1.0,1.0,1.0,50.0,$10.00,,2,1125,47,38.915613,-77.008219,Washington,20002,DC,1.0
1564,,,1,2,Private room,1.0,1.0,1.0,75.0,,$150.00,2,1125,41,38.93644,-77.028335,"Washington, D.C.",20010,DC,1.0
2475,100%,0%,2,2,Entire home/apt,1.0,1.0,1.0,110.0,,,1,1125,38,38.920021,-77.038271,Washington,20009,DC,1.0


### Performing the first iteration of k-fold cross validation

In [13]:
# Train a k-nearest neighbors model on folds 2 to 5, using the accommodates column
# as the sole feature. Fold 1 is held out as the test set; it is copied so that a
# prediction column can be added to it without affecting dc_listings.

knn_1 = KNeighborsRegressor()
train_1 = dc_listings[dc_listings['fold'] != 1]
test_1 = dc_listings[dc_listings['fold'] == 1].copy()
knn_1.fit(train_1[['accommodates']], train_1['price'])

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [14]:
# Use the fitted model to predict prices for the held-out fold, then score the
# predictions with RMSE (the square root of the mean squared error).
# The column is named "predicted_price" to match the holdout validation cells.

test_1['predicted_price'] = knn_1.predict(test_1[['accommodates']])
rmse_1 = mean_squared_error(test_1['price'], test_1['predicted_price']) ** (1/2)
rmse_1

135.11380517546419

### Writing a function to train models

In [15]:
def train_and_validate(df, folds):
    """Train and score a k-nearest neighbors model once per requested fold.

    For each label in ``folds``, the rows of ``df`` carrying that label in the
    "fold" column are held out as the test set and all remaining rows are used
    for training. The model uses "accommodates" as its only feature and
    predicts "price".

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the "accommodates", "price" and "fold" columns.
    folds : iterable
        Fold labels to hold out, one validation run per label.

    Returns
    -------
    list of float
        The RMSE of each run, in the same order as ``folds``.
    """
    rmses = []
    for fold in folds:
        # Use the frame that was passed in rather than the notebook-level
        # dc_listings, so the function works on any frame with the same columns.
        held_out = df['fold'] == fold
        train = df[~held_out]
        test = df[held_out]

        knn = KNeighborsRegressor()
        knn.fit(train[['accommodates']], train['price'])
        predictions = knn.predict(test[['accommodates']])

        # RMSE is the square root of the mean squared error.
        rmses.append(mean_squared_error(test['price'], predictions) ** (1/2))

    return rmses

In [17]:
train_and_validate(dc_listings, [1])

[135.11380517546419]

In [18]:
train_and_validate(dc_listings, [2])

[115.28459258977587]

In [19]:
train_and_validate(dc_listings, [3])

[147.84535076198873]

In [20]:
train_and_validate(dc_listings, [4])

[125.91220037907794]

In [21]:
train_and_validate(dc_listings, [5])

[136.09022589404654]

In [22]:
# Run the validation once per fold (1 through 5) and average the five RMSEs.
avg_rmse = np.mean(train_and_validate(dc_listings, [1,2,3,4,5]))
avg_rmse

132.04923496007066

### Using Scikit-Learn to perform K-Fold Cross Validation

In [27]:
from sklearn.model_selection import cross_val_score, KFold

In [28]:
# Instantiate the scikit-learn model class to fit.
# Instantiate the KFold class; its parameters specify the k-fold cross-validation
# attributes: 5 splits, shuffling enabled, and a fixed random_state.

knn = KNeighborsRegressor()
kf = KFold(n_splits=5, shuffle=True, random_state=1)

In [29]:
# Use cross_val_score() to compute the chosen scoring metric once per fold of kf.
# With scoring="neg_mean_squared_error" the returned values are negated MSEs,
# which is why every score in the output is negative.

mses = cross_val_score(knn, dc_listings[['accommodates']], dc_listings['price'], scoring="neg_mean_squared_error", cv = kf)
mses 

array([-17950.76204027, -18167.5021745 , -20167.95167785, -12083.05747312,
       -17848.38768817])

In [30]:
# Drop the sign of each negated MSE and take the square root to get per-fold RMSEs.
rmses = np.sqrt(np.absolute(mses))
rmses

array([ 133.98045395,  134.78687686,  142.01391368,  109.92296154,
        133.5978581 ])

In [31]:
# Average the per-fold RMSEs into a single cross-validated error estimate.
avg_rmse = np.mean(rmses)
avg_rmse

130.86041282462162

### Varying K

In [36]:
# Repeat the cross validation for several fold counts and report, for each one,
# the mean and the standard deviation of the per-fold RMSEs.
for f in [3, 5, 7, 9, 10, 11, 13, 15, 17, 19, 21, 23]:
    kf = KFold(n_splits=f, shuffle=True, random_state=1)
    knn = KNeighborsRegressor()

    # Scores come back as negated MSEs, one per fold.
    mses = cross_val_score(
        knn,
        dc_listings[["accommodates"]],
        dc_listings["price"],
        scoring="neg_mean_squared_error",
        cv=kf,
    )
    rmses = np.sqrt(np.absolute(mses))
    avg_rmse, std_rmse = np.mean(rmses), np.std(rmses)

    print("Number of folds: ", str(f), "Average RMSE: ", str(avg_rmse), "RMSE std: ", str(std_rmse))

Number of folds:  3 Average RMSE:  140.633596376 RMSE std:  5.4920893001
Number of folds:  5 Average RMSE:  130.860412825 RMSE std:  10.9125875094
Number of folds:  7 Average RMSE:  127.838966139 RMSE std:  21.6588749411
Number of folds:  9 Average RMSE:  132.867678478 RMSE std:  22.1156858216
Number of folds:  10 Average RMSE:  132.337374603 RMSE std:  22.0060014741
Number of folds:  11 Average RMSE:  132.099802918 RMSE std:  20.3559421011
Number of folds:  13 Average RMSE:  130.519267364 RMSE std:  22.9770359305
Number of folds:  15 Average RMSE:  130.938952201 RMSE std:  30.214742871
Number of folds:  17 Average RMSE:  127.450900984 RMSE std:  33.0455494411
Number of folds:  19 Average RMSE:  131.294611375 RMSE std:  27.1212855112
Number of folds:  21 Average RMSE:  127.183155124 RMSE std:  33.4556185842
Number of folds:  23 Average RMSE:  125.012669449 RMSE std:  34.6912213608
