# Workshop 3

Starter code for workshop 3. You should have seen most of it before, but make sure you understand what it is doing!

In [1]:
# Common imports
import numpy as np
import os

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Label font sizes: axis titles slightly larger than the tick labels
for rc_group, font_size in (('axes', 14), ('xtick', 12), ('ytick', 12)):
    mpl.rc(rc_group, labelsize=font_size)

In [2]:
import pandas as pd

# Load the housing dataset from a CSV file located next to the notebook.
# NOTE(review): the preview printed below contains an "Unnamed: 0" column that
# looks like a row index saved into the CSV. Only median_house_value is dropped
# later, so this column is passed to every model as a feature — confirm that is
# intended (otherwise consider reading the file with index_col=0).
housing = pd.read_csv("workshop3.csv")
# Show the first rows as a quick sanity check of the columns and value ranges
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,342200


Split the available data 80/20 for training and testing. Don't use the test data until the very end!

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set. The fixed random_state makes the
# split reproducible, so every re-run trains and tests on the same rows.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)

median_house_value is the value we want to predict, so separate it from the other features.

In [4]:
# Inputs: every column of the training split except the target
training_features = train_set.drop(columns=["median_house_value"])
# Target: an independent copy, so later edits cannot write back into train_set
training_labels = train_set.loc[:, "median_house_value"].copy()

In [5]:
# Import the linear regression model from scikit-learn
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the full training split
model = LinearRegression()
model.fit(X=training_features, y=training_labels)

In [6]:
# Import the necessary function for calculating MSE
from sklearn.metrics import mean_squared_error
import numpy as np

# Reference point: a "model" that always predicts the mean training label
mean_label = np.mean(training_labels)
baseline_predictions = np.full(shape=len(training_labels), fill_value=mean_label)
baseline_mse = mean_squared_error(y_true=training_labels, y_pred=baseline_predictions)
baseline_rmse = np.sqrt(baseline_mse)

# Linear model, scored on the same rows it was fitted on (training error)
train_predictions = model.predict(training_features)
model_mse = mean_squared_error(y_true=training_labels, y_pred=train_predictions)
model_rmse = np.sqrt(model_mse)

# Relative reduction in RMSE that the linear model achieves over the baseline
improvement_pct = (baseline_rmse - model_rmse) / baseline_rmse * 100

# Report
print(f"Linear Model RMSE on training data: {model_rmse:.2f}")
print(f"Baseline RMSE (using mean): {baseline_rmse:.2f}")
print(f"Improvement over baseline: {improvement_pct:.2f}%")

Linear Model RMSE on training data: 69411.66
Baseline RMSE (using mean): 115051.63
Improvement over baseline: 39.67%


In [7]:
from sklearn.neighbors import KNeighborsRegressor
import numpy as np
from sklearn.metrics import mean_squared_error

# Linear model RMSE for comparison. Reuse the value computed in the previous
# cell rather than pasting the printed number, so it cannot go stale if the
# data or the model ever change.
linear_rmse = model_rmse
print(f"Linear Model RMSE: {linear_rmse:.2f}")

# KNN configurations to try, in order: (n_neighbors, weights)
knn_settings = [
    (5, 'uniform'),    # simple starting point
    (10, 'uniform'),   # a different k value
    (5, 'distance'),   # distance-based weights
]

for k, weighting in knn_settings:
    knn = KNeighborsRegressor(n_neighbors=k, weights=weighting)
    knn.fit(training_features, training_labels)
    # These scores are measured on the training data itself. With 'distance'
    # weights a training point is its own zero-distance neighbour, which
    # explains the 0.00 printed for that configuration: it is not evidence of
    # a good model, only of memorisation.
    knn_predictions = knn.predict(training_features)
    knn_rmse = np.sqrt(mean_squared_error(training_labels, knn_predictions))
    print(f"KNN (k={k}, {weighting} weights) RMSE: {knn_rmse:.2f}")

Linear Model RMSE: 69411.66
KNN (k=5, uniform weights) RMSE: 80521.65
KNN (k=10, uniform weights) RMSE: 86619.35
KNN (k=5, distance weights) RMSE: 0.00


In [8]:
from sklearn.model_selection import cross_val_score, cross_validate, KFold
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

# One KNN estimator (k=5, default uniform weights) is evaluated three ways.
# NOTE: Methods 1 and 2 pass cv=5, which for a regressor means consecutive,
# unshuffled folds. Method 3 shuffles the rows before splitting, so its
# per-fold scores differ from the first two even though the model is the same.
knn_model = KNeighborsRegressor(n_neighbors=5)
rmse_scorer = 'neg_root_mean_squared_error'

# Method 1: cross_val_score returns only the validation score of each fold
print("Method 1: Using cross_val_score")
cv_scores1 = cross_val_score(
    knn_model,
    training_features,
    training_labels,
    scoring=rmse_scorer,
    cv=5,
)
# The scorer reports negative RMSE (higher is better), so flip the sign
print(f"Validation RMSE scores: {-cv_scores1}")
print(f"Average RMSE: {-cv_scores1.mean():.2f}")
print(f"Standard deviation: {cv_scores1.std():.2f}\n")

# Method 2: cross_validate can additionally return the per-fold training scores
print("Method 2: Using cross_validate")
cv_results = cross_validate(
    knn_model,
    training_features,
    training_labels,
    scoring=rmse_scorer,
    cv=5,
    return_train_score=True,
)
cv_scores2 = -cv_results['test_score']
print(f"Validation RMSE scores: {cv_scores2}")
print(f"Average RMSE: {cv_scores2.mean():.2f}")
print(f"Standard deviation: {cv_scores2.std():.2f}")
train_scores = -cv_results['train_score']
print(f"Training RMSE scores: {train_scores}")
print(f"Average training RMSE: {train_scores.mean():.2f}\n")

# Method 3: drive the folds by hand with a (shuffled) KFold splitter
print("Method 3: Using KFold directly")
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores3 = []

for fit_rows, holdout_rows in kf.split(training_features):
    # Fit on the rows of the other four folds...
    knn_model.fit(training_features.iloc[fit_rows], training_labels.iloc[fit_rows])

    # ...and score on the fold that was held out
    holdout_predictions = knn_model.predict(training_features.iloc[holdout_rows])
    fold_mse = mean_squared_error(training_labels.iloc[holdout_rows], holdout_predictions)
    cv_scores3.append(np.sqrt(fold_mse))

print(f"Validation RMSE scores: {cv_scores3}")
print(f"Average RMSE: {np.mean(cv_scores3):.2f}")
print(f"Standard deviation: {np.std(cv_scores3):.2f}")

Method 1: Using cross_val_score
Validation RMSE scores: [ 96858.31520441  99507.11514268  98768.88184361 100897.33191843
  98848.18874065]
Average RMSE: 98975.97
Standard deviation: 1305.18

Method 2: Using cross_validate
Validation RMSE scores: [ 96858.31520441  99507.11514268  98768.88184361 100897.33191843
  98848.18874065]
Average RMSE: 98975.97
Standard deviation: 1305.18
Training RMSE scores: [81341.74682485 80863.35379045 80963.05805948 80652.65692684
 81396.244387  ]
Average training RMSE: 81043.41

Method 3: Using KFold directly
Validation RMSE scores: [100401.38667929795, 99096.70507388783, 97077.55435588026, 100687.21275815014, 97246.58050623185]
Average RMSE: 98901.89
Standard deviation: 1519.35


In [9]:
from sklearn.model_selection import cross_validate
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

# KNN with k=5 (default uniform weights)
knn_model = KNeighborsRegressor(n_neighbors=5)

# 5-fold cross-validation, keeping the score on the fitting folds as well as
# on the held-out fold so the two can be compared
cv_results = cross_validate(
    knn_model,
    training_features,
    training_labels,
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
)

# sklearn reports negative RMSE; flip the sign to get plain RMSE values
validation_scores = -cv_results['test_score']
training_scores = -cv_results['train_score']

print("Validation RMSE scores:", validation_scores, sep="\n")
print(f"Average validation RMSE: {validation_scores.mean():.2f}")
print(f"Standard deviation: {validation_scores.std():.2f}")

print("\nTraining RMSE scores:", training_scores, sep="\n")
print(f"Average training RMSE: {training_scores.mean():.2f}")
print(f"Standard deviation: {training_scores.std():.2f}")

# A large positive gap (validation worse than training) indicates overfitting
overfit_gap = validation_scores.mean() - training_scores.mean()
print(f"\nAverage difference (validation - training): {overfit_gap:.2f}")

Validation RMSE scores:
[ 96858.31520441  99507.11514268  98768.88184361 100897.33191843
  98848.18874065]
Average validation RMSE: 98975.97
Standard deviation: 1305.18

Training RMSE scores:
[81341.74682485 80863.35379045 80963.05805948 80652.65692684
 81396.244387  ]
Average training RMSE: 81043.41
Standard deviation: 284.63

Average difference (validation - training): 17932.55


In [10]:
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import numpy as np

# Cross-validation settings shared by both models so the comparison is like-for-like
cv_options = dict(cv=5, scoring='neg_root_mean_squared_error', return_train_score=True)

# Create a linear regression model
linear_model = LinearRegression()

# Perform cross-validation
cv_results = cross_validate(linear_model, training_features, training_labels, **cv_options)

# Extract and negate validation scores (sklearn reports negative RMSE)
validation_scores = -cv_results['test_score']
print("Linear Regression - Validation RMSE scores:")
print(validation_scores)
print(f"Average validation RMSE: {validation_scores.mean():.2f}")
print(f"Standard deviation: {validation_scores.std():.2f}")

# Extract and negate training scores
training_scores = -cv_results['train_score']
print("\nLinear Regression - Training RMSE scores:")
print(training_scores)
print(f"Average training RMSE: {training_scores.mean():.2f}")
print(f"Standard deviation: {training_scores.std():.2f}")

# Calculate the difference to quantify overfitting
print(f"\nAverage difference (validation - training): {validation_scores.mean() - training_scores.mean():.2f}")

# For comparison, re-run the same cross-validation for KNN (k=5) instead of
# pasting numbers printed by an earlier cell: pasted values silently go stale
# when the data or the split change. The results are stored under separate
# names so the linear-regression variables above are not overwritten.
knn_cv_results = cross_validate(
    KNeighborsRegressor(n_neighbors=5), training_features, training_labels, **cv_options
)
knn_validation_rmse = -knn_cv_results['test_score'].mean()
knn_training_rmse = -knn_cv_results['train_score'].mean()

print("\n--- For comparison, KNN (k=5) results ---")
print(f"KNN - Average validation RMSE: {knn_validation_rmse:.2f}")
print(f"KNN - Average training RMSE: {knn_training_rmse:.2f}")
print(f"KNN - Average difference: {knn_validation_rmse - knn_training_rmse:.2f}")

Linear Regression - Validation RMSE scores:
[65443.95073181 71183.26341402 68364.19624552 73884.67211694
 69323.87651571]
Average validation RMSE: 69639.99
Standard deviation: 2818.98

Linear Regression - Training RMSE scores:
[70384.45975827 68969.99268402 69681.52939126 68408.71942926
 69454.36922343]
Average training RMSE: 69379.81
Standard deviation: 665.87

Average difference (validation - training): 260.18

--- For comparison, KNN (k=5) results ---
KNN - Average validation RMSE: 98975.97
KNN - Average training RMSE: 81043.41
KNN - Average difference: 17932.55


In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor

# Base estimator whose hyperparameters are being tuned
knn = KNeighborsRegressor()

# Candidate values: 7 neighbourhood sizes x 2 weighting schemes = 14 combinations
param_grid = {
    'n_neighbors': [3, 5, 7, 10, 15, 20, 30],
    'weights': ['uniform', 'distance'],
}

# 5-fold cross-validated search scored by (negative) RMSE; training scores are
# kept so that overfitting can be inspected for each combination
grid_search = GridSearchCV(
    estimator=knn,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
)
grid_search.fit(training_features, training_labels)

# Winner of the search (scores are negative RMSE, hence the sign flip)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best validation RMSE: {-grid_search.best_score_:.2f}")

# One summary line per parameter combination
print("\nAll results:")
results = grid_search.cv_results_
per_candidate = zip(
    results['params'],
    results['mean_test_score'],
    results['mean_train_score'],
)
for params, mean_test_score, mean_train_score in per_candidate:
    val_rmse = -mean_test_score
    train_rmse = -mean_train_score
    rmse_gap = val_rmse - train_rmse
    print(f"n_neighbors={params['n_neighbors']}, weights={params['weights']}: "
          f"Validation RMSE={val_rmse:.2f}, Training RMSE={train_rmse:.2f}, "
          f"Difference={rmse_gap:.2f}")

Best parameters: {'n_neighbors': 20, 'weights': 'distance'}
Best validation RMSE: 94802.81

All results:
n_neighbors=3, weights=uniform: Validation RMSE=103487.18, Training RMSE=73277.29, Difference=30209.89
n_neighbors=3, weights=distance: Validation RMSE=103773.20, Training RMSE=-0.00, Difference=103773.20
n_neighbors=5, weights=uniform: Validation RMSE=98975.97, Training RMSE=81043.41, Difference=17932.55
n_neighbors=5, weights=distance: Validation RMSE=99076.80, Training RMSE=-0.00, Difference=99076.80
n_neighbors=7, weights=uniform: Validation RMSE=97363.31, Training RMSE=84524.73, Difference=12838.58
n_neighbors=7, weights=distance: Validation RMSE=97293.22, Training RMSE=-0.00, Difference=97293.22
n_neighbors=10, weights=uniform: Validation RMSE=96083.71, Training RMSE=87163.77, Difference=8919.94
n_neighbors=10, weights=distance: Validation RMSE=95896.06, Training RMSE=-0.00, Difference=95896.06
n_neighbors=15, weights=uniform: Validation RMSE=95311.83, Training RMSE=89453.22, 

In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_validate
import numpy as np

# Preprocessing + model in one estimator, so imputation and scaling are fitted
# on the training folds only during cross-validation.
# NOTE(review): n_neighbors=20 / weights='distance' are typed in by hand from
# the printed result of the earlier grid search on unscaled features.
pipeline_steps = [
    ('imputer', SimpleImputer(strategy='median')),   # missing values -> column median
    ('scaler', StandardScaler()),                    # standardize each feature
    ('knn', KNeighborsRegressor(n_neighbors=20, weights='distance')),
]
pipeline = Pipeline(steps=pipeline_steps)

# 5-fold cross-validation of the whole pipeline
cv_results = cross_validate(
    pipeline,
    training_features,
    training_labels,
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
)

# Scores are negative RMSE; flip the sign before reporting
validation_scores = -cv_results['test_score']
training_scores = -cv_results['train_score']

print("Validation RMSE scores:", validation_scores, sep="\n")
print(f"Average validation RMSE: {validation_scores.mean():.2f}")
print(f"Standard deviation: {validation_scores.std():.2f}")

print("\nTraining RMSE scores:", training_scores, sep="\n")
print(f"Average training RMSE: {training_scores.mean():.2f}")
print(f"Standard deviation: {training_scores.std():.2f}")

# Side-by-side with the earlier models.
# NOTE(review): the last two figures are literals copied from earlier cell
# outputs and will not update if those cells produce different numbers.
print("\nComparison with previous models:")
print(f"Pipeline KNN: {validation_scores.mean():.2f}")
print(f"Best Grid Search KNN: 94802.81")
print(f"Linear Regression: 69639.99")

Validation RMSE scores:
[59503.29441056 63955.54147515 59867.69549421 63154.43607273
 62325.77000544]
Average validation RMSE: 61761.35
Standard deviation: 1775.30

Training RMSE scores:
[0. 0. 0. 0. 0.]
Average training RMSE: 0.00
Standard deviation: 0.00

Comparison with previous models:
Pipeline KNN: 61761.35
Best Grid Search KNN: 94802.81
Linear Regression: 69639.99


In [13]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
import numpy as np

# Same preprocessing as before, but the KNN hyperparameters are left at their
# defaults because the grid search below sets them
pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor()),
])

# Parameters of a pipeline step are addressed as "<step name>__<parameter>"
param_grid = {
    'knn__n_neighbors': [3, 5, 7, 10, 15, 20, 30],
    'knn__weights': ['uniform', 'distance'],
}

# 5-fold cross-validated search over the whole pipeline, scored by (negative) RMSE
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    return_train_score=True,
)
grid_search.fit(training_features, training_labels)

# Winner of the search (scores are negative RMSE, hence the sign flip)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best validation RMSE: {-grid_search.best_score_:.2f}")

# One summary line per parameter combination
print("\nAll results:")
results = grid_search.cv_results_
per_candidate = zip(
    results['params'],
    results['mean_test_score'],
    results['mean_train_score'],
)
for params, mean_test_score, mean_train_score in per_candidate:
    val_rmse = -mean_test_score
    train_rmse = -mean_train_score
    rmse_gap = val_rmse - train_rmse
    print(f"n_neighbors={params['knn__n_neighbors']}, weights={params['knn__weights']}: "
          f"Validation RMSE={val_rmse:.2f}, Training RMSE={train_rmse:.2f}, "
          f"Difference={rmse_gap:.2f}")

Best parameters: {'knn__n_neighbors': 10, 'knn__weights': 'distance'}
Best validation RMSE: 61025.02

All results:
n_neighbors=3, weights=uniform: Validation RMSE=64410.07, Training RMSE=45363.46, Difference=19046.61
n_neighbors=3, weights=distance: Validation RMSE=64181.35, Training RMSE=-0.00, Difference=64181.35
n_neighbors=5, weights=uniform: Validation RMSE=62392.78, Training RMSE=50729.14, Difference=11663.64
n_neighbors=5, weights=distance: Validation RMSE=61997.80, Training RMSE=-0.00, Difference=61997.80
n_neighbors=7, weights=uniform: Validation RMSE=61731.45, Training RMSE=53501.45, Difference=8230.00
n_neighbors=7, weights=distance: Validation RMSE=61208.55, Training RMSE=-0.00, Difference=61208.55
n_neighbors=10, weights=uniform: Validation RMSE=61683.03, Training RMSE=55841.24, Difference=5841.79
n_neighbors=10, weights=distance: Validation RMSE=61025.02, Training RMSE=-0.00, Difference=61025.02
n_neighbors=15, weights=uniform: Validation RMSE=62179.48, Training RMSE=5803

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Take the winning pipeline straight from the grid search instead of re-typing
# its hyperparameters by hand. The search above uses the default refit
# behaviour, so best_estimator_ has already been refitted on the entire
# training set with the best parameter combination.
best_pipeline = grid_search.best_estimator_

# Cross-validated RMSE of that same configuration (sklearn stores it negated)
cv_rmse = -grid_search.best_score_

# Prepare the test features and labels exactly like the training data
test_features = test_set.drop(["median_house_value"], axis=1)
test_labels = test_set["median_house_value"].copy()

# This is the only place the test set is used: one final, unbiased estimate
test_predictions = best_pipeline.predict(test_features)
test_rmse = np.sqrt(mean_squared_error(test_labels, test_predictions))

# Compare the test error with the cross-validation estimate; a small gap means
# cross-validation was a trustworthy proxy for performance on unseen data
rmse_gap = abs(test_rmse - cv_rmse)
print(f"Test set RMSE: {test_rmse:.2f}")
print(f"Validation RMSE from cross-validation: {cv_rmse:.2f}")
print(f"Difference: {rmse_gap:.2f}")
print(f"Percentage difference: {rmse_gap / cv_rmse * 100:.2f}%")

Test set RMSE: 61803.74
Validation RMSE from cross-validation: 61025.02
Difference: 778.72
Percentage difference: 1.28%


Your code starts here...