## K nearest neighbours

In [11]:
import os
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
from joblib import load

In [4]:
# When the notebook is started from a sub-folder of "modeling" (i.e. the
# parent directory of the current working directory is named "modeling"),
# move the working directory two levels up before importing `problem`.
# Presumably that is the project root where `problem.py` lives -- confirm.
#
# The parent folder name is extracted with os.path instead of splitting the
# path on a literal backslash: on POSIX systems the path contains no "\\",
# so the split produced a single element and `[-2]` raised IndexError.
if os.path.basename(os.path.dirname(os.getcwd())) == "modeling":
    os.chdir(os.path.join(os.getcwd(), "..", ".."))

import problem

In [5]:
# Fetch the training split from the project's `problem` module and unpack it
# into features and target.
train_split = problem.get_train_data()
X_train, y_train = train_split

# Same for the held-out test split.
test_split = problem.get_test_data()
X_test, y_test = test_split

In [7]:
# ---------------------------------------------------------------------------
# Column groups consumed by the ColumnTransformer below.
# ---------------------------------------------------------------------------
# Column names produced by the date encoder, discovered by running it once on
# the training dates.
date_cols = problem._encode_dates(X_train[["date"]]).columns.tolist()
add_date_cols = ["season"]
num_cols = ["temp", "dwpt", "rhum", "prcp", "wspd", "pres"]
categorical_cols = ["counter_name", "site_name", "wdir"]

# ---------------------------------------------------------------------------
# Feature-engineering steps wrapped as transformers so they can live inside
# the pipeline.
# ---------------------------------------------------------------------------
merge_encoder = FunctionTransformer(problem._merge_external_data, validate=False)
# `drop_date=False` is forwarded to `_encode_dates`; presumably it keeps the
# raw "date" column for the next step -- confirm in problem.py.
date_encoder = FunctionTransformer(
    problem._encode_dates,
    kw_args={"drop_date": False},
)
add_date_encoder = FunctionTransformer(problem._additional_date_variables)

# ---------------------------------------------------------------------------
# Per-column encoding: one-hot for date-derived and categorical columns
# (categories unseen during fit are ignored), standard scaling for the
# numeric columns.
# ---------------------------------------------------------------------------
column_transformers = [
    ("date", OneHotEncoder(handle_unknown="ignore"), date_cols + add_date_cols),
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
]
preprocessor = ColumnTransformer(column_transformers)

# K-nearest-neighbours regressor with scikit-learn's default hyper-parameters.
regressor = KNeighborsRegressor()

# Full model: merge external data -> encode dates -> extra date variables ->
# column-wise preprocessing -> KNN regression.
pipe = make_pipeline(
    merge_encoder,
    date_encoder,
    add_date_encoder,
    preprocessor,
    regressor,
)
pipe.fit(X_train, y_train)

In [9]:
# Run the fitted pipeline on both splits, training data first and then the
# test data.
pipe_predict = pipe.predict
y_pred_train = pipe_predict(X_train)
y_pred_test = pipe_predict(X_test)

KeyboardInterrupt: 

In [None]:
# Report the root-mean-squared error on both splits.
#
# RMSE is computed as the square root of the MSE rather than through
# `mean_squared_error(..., squared=False)`: the `squared` argument was
# deprecated in scikit-learn 1.4 and removed in 1.6, where passing it raises
# a TypeError. For a single-output target both forms give the same value
# (assumes y is one-dimensional -- confirm against problem.get_train_data).
rmse_train = mean_squared_error(y_train, y_pred_train) ** 0.5
rmse_test = mean_squared_error(y_test, y_pred_test) ** 0.5

print(f"Train set, RMSE={rmse_train:.2f}")
print(f"Test set, RMSE={rmse_test:.2f}")

In [None]:
# Hand the test features, the test predictions, the true test targets and the
# model title to the project's weekly plotting helper.
model_title = "K nearest neighbours"
problem.week_plot(X_test, y_pred_test, y_test, model_title)

In [None]:
# Hand the test predictions, the true test targets and the model title to the
# project's error plotting helper.
model_title = "K nearest neighbours"
problem.error_plot(y_pred_test, y_test, model_title)