# Table of Contents
* 1. Import and review data  
        * 1.1 Import packages  
        * 1.2 Import data  
* 2. K-Nearest Neighbors
* 3. Random Forest
* 4. Gradient Boosting Machines - LightGBM
* 5. Support Vector Regression - RBF

      

In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt


In [2]:
#confirm the disk location is correct for importing
os.chdir('C:/Users/steve/Documents/springboard/Capstone2/data')
cwd = os.getcwd()
print(cwd)

C:\Users\steve\Documents\springboard\Capstone2\data


In [3]:
%%time
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

CPU times: total: 22.4 s
Wall time: 44.6 s


In [4]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(5605173, 166)
(5605173, 1)
(1401294, 166)
(1401294, 1)


# K-Nearest Neighbors

In [None]:
%%time
from sklearn.neighbors import KNeighborsRegressor

knn=KNeighborsRegressor(n_neighbors=13)
knn.fit(X_train_sample,y_train_sample)

In [None]:
%%time
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV
kf=KFold(n_splits=5, shuffle=True, random_state=42)

param_grid = {'n_neighbors':np.arange(1,15)}
knn = KNeighborsRegressor()
knn_cv= RandomizedSearchCV(knn,param_grid,cv=kf)
knn_cv.fit(X_train_sample,y_train_sample)

In [None]:
print("Best Score:" + str(knn_cv.best_score_))
print("Best Parameters: " + str(knn_cv.best_params_))

In [None]:
%%time
print(knn.score(X_train_sample,y_train_sample))

In [None]:
%%time
print(knn.score(X_test_sample,y_test_sample))

In [None]:
%%time
y_pred_knn = knn.predict(X_test_sample)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Calculate evaluation metrics
mae = mean_absolute_error(y_test_sample, y_pred_knn)
mse = mean_squared_error(y_test_sample, y_pred_knn)
rmse = np.sqrt(mse)
r2 = r2_score(y_test_sample, y_pred_knn)

# Print evaluation metrics
print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Squared Error (MSE): {mse}')
print(f'Root Mean Squared Error (RMSE): {rmse}')
print(f'R-squared (R2): {r2}')


# Random Forest

In [5]:
# Split the training and test data into a smaller sample
from sklearn.model_selection import train_test_split

sample_fraction = 0.2

X_train_sample, _, y_train_sample, _ = train_test_split(X_train, y_train, test_size=1-sample_fraction, random_state=42)
X_test_sample, _, y_test_sample, _ = train_test_split(X_test, y_test, test_size=1-sample_fraction, random_state=42)

In [6]:
%%time
print(X_train_sample.shape)
print(y_train_sample.shape)
print(X_test_sample.shape)
print(y_test_sample.shape)

(1121034, 166)
(1121034, 1)
(280258, 166)
(280258, 1)
CPU times: total: 0 ns
Wall time: 0 ns


In [7]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import randint


In [8]:
rf = RandomForestRegressor(random_state=42)

# Defining the parameter grid
param_dist = {
    'n_estimators': randint(50, 200),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': randint(10, 50),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 10),
    'bootstrap': [True, False]}

In [9]:
# Setting up the RandomizedSearchCV
random_search_rf = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=5,  
    cv=5,  
    verbose=2,  
    random_state=42,
    n_jobs=-1)

In [10]:
%%time
# Fitting the model
random_search_rf.fit(X_train_sample, y_train_sample.values.ravel())


Fitting 5 folds for each of 5 candidates, totalling 25 fits
CPU times: total: 5min 6s
Wall time: 30min 39s


In [11]:
print(f'Best Parameters: {random_search_rf.best_params_}')

Best Parameters: {'bootstrap': False, 'max_depth': 39, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 9, 'n_estimators': 70}


In [12]:
%%time
# Making predictions with the best estimator
best_model_rf = random_search_rf.best_estimator_





CPU times: total: 0 ns
Wall time: 0 ns


In [13]:
%%time
y_pred_rf = best_model_rf.predict(X_test_sample)

CPU times: total: 6.31 s
Wall time: 6.98 s


In [23]:
%%time
# Evaluating the model
mae_rf = mean_absolute_error(y_test_sample, y_pred_rf)
mse_rf = mean_squared_error(y_test_sample, y_pred_rf)
rmse_rf = np.sqrt(mse_rf)
r2_rf = r2_score(y_test_sample, y_pred_rf)

# Print evaluation metrics
print(f'Mean Absolute Error (MAE): {mae_rf}')
print(f'Mean Squared Error (MSE): {mse_rf}')
print(f'Root Mean Squared Error (RMSE): {rmse_rf}')
print(f'R-squared (R2): {r2_rf}')


Mean Absolute Error (MAE): 0.6780843160333464
Mean Squared Error (MSE): 0.8558391544551969
Root Mean Squared Error (RMSE): 0.9251157519225348
R-squared (R2): 0.14121759948016466
CPU times: total: 0 ns
Wall time: 25.5 ms


In [15]:
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [48]:
# Define the objective function to optimize
def rf_eval(n_estimators, max_depth, min_samples_split, min_samples_leaf):
    """
    Trains a Random Forest with given hyperparameters and returns the cross-validated score.
    """
    # Convert float inputs to int (as hyperparameters must be integers)
    n_estimators = int(n_estimators)
    max_depth = int(max_depth)
    min_samples_split = int(min_samples_split)
    min_samples_leaf = int(min_samples_leaf)
    
    
    # Create Model
    rfbo = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1,
        
    )
    
    rfbo.fit(X_train_sample, y_train_sample.values.ravel())
    y_pred_rfbo = rfbo.predict(X_test_sample)
    mse = mean_squared_error(y_test_sample, y_pred_rfbo)
    return -mse  


In [49]:
# Define the parameter search space
pbounds = {
    "n_estimators": (50, 200),  
    "max_depth": (5, 50), 
    "min_samples_split": (2, 10), 
    "min_samples_leaf": (1, 10),
}

In [50]:
# Set up the optimizer
optimizer = BayesianOptimization(
    f=rf_eval,  
    pbounds=pbounds, 
    random_state=42,
    verbose=2
)

In [51]:
%%time
# Run optimization
optimizer.maximize(init_points=4, n_iter=8)

# Print best parameters
print("Best hyperparameters found:", optimizer.max)

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m-0.8693  [39m | [39m21.85    [39m | [39m9.556    [39m | [39m7.856    [39m | [39m139.8    [39m |
| [39m2        [39m | [39m-0.9     [39m | [39m12.02    [39m | [39m2.404    [39m | [39m2.465    [39m | [39m179.9    [39m |
| [35m3        [39m | [35m-0.8537  [39m | [35m32.05    [39m | [35m7.373    [39m | [35m2.165    [39m | [35m195.5    [39m |
| [39m4        [39m | [39m-0.8942  [39m | [39m42.46    [39m | [39m2.911    [39m | [39m3.455    [39m | [39m77.51    [39m |
| [39m5        [39m | [39m-0.8537  [39m | [39m32.8     [39m | [39m7.93     [39m | [39m2.832    [39m | [39m194.4    [39m |
| [39m6        [39m | [39m-0.952   [39m | [39m49.16    [39m | [39m2.0      [39m | [39m3.286    [39m | [39m198.5    [39m |
| [39m7        [39m | [39m-0.8648  [39m | [

In [55]:
%%time
# Extract the best parameters
best_params_rf = optimizer.max['params']
best_params_rf['n_estimators'] = int(best_params_rf['n_estimators'])
best_params_rf['max_depth'] = int(best_params_rf['max_depth'])
best_params_rf['min_samples_split'] = int(best_params_rf['min_samples_split'])
best_params_rf['min_samples_leaf'] = int(best_params_rf['min_samples_leaf'])

print(best_params_rf)

{'max_depth': 45, 'min_samples_leaf': 8, 'min_samples_split': 8, 'n_estimators': 150}
CPU times: total: 0 ns
Wall time: 1.12 ms


{'max_depth': 45, 'min_samples_leaf': 8.88475769057052, 'min_samples_split': 8.114762421844707, 'n_estimators': 150}
