# knn-nearest-neighbor-classification

In [4]:
# This is the new observation to predict
new_seed <- tibble(area = 12.1,
                        perimeter = 14.2,
                        compactness = 0.9,
                        length = 4.9,
                        width = 2.8,
                        asymmetry_coefficient = 3.0, 
                        groove_length = 5.1)

# Read data and change the column type for our class label to "factor" using "mutate" and "as_factor"
seed_data <- read_table2("https://archive.ics.uci.edu/ml/machine-learning-databases/00236/seeds_dataset.txt",
          col_names = c("area",
                        "perimeter",
                        "compactness",
                        "length",
                        "width",
                        "asymmetry_coefficient",
                        "groove_length",
                        "wheat_variety")) |>
         mutate(wheat_variety = as_factor(wheat_variety))
seed_data

# Specifying what model we use for our analysis; we perform classification analysis with K-nearest neighbors
knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = 5) |>
    set_engine("kknn") |>
    set_mode("classification") 

# Perform all the data processing procedures (scaling, centring...)
seed_recipe <- recipe(wheat_variety ~ ., data = seed_data) |>
    step_scale(all_predictors()) |>
    step_center(all_predictors())

# You do NOT need this step, but what this is doing is that: 
# - prep() will actually go and calculate all the steps needed to preprocess the data 
#   (i.e., actually return the newly calculated centered + scaled values)
# - bake() will actually return a dataframe of your new data.
seed_data_scaled <- seed_recipe |>
    prep() |>
    bake(seed_data)

# Fit the model to the data perform prediction
seed_fit <- workflow () |>
    add_recipe(seed_recipe) |>
    add_model(knn_spec) |> 
    fit(data = seed_data)

# Predict the class label for the given new observation using the model you build 
seed_predict <- predict(seed_fit, new_seed)
seed_predict

ERROR: Error in tibble(area = 12.1, perimeter = 14.2, compactness = 0.9, length = 4.9, : could not find function "tibble"


# Evaluation and Tuning

In [5]:
# Read the data as usual and make sure to change the class label 
# column of our interest to factor type

# This is another way to scale
fruit_data_scaled <- fruit_data |>
    mutate(scaled_mass = scale(mass, center = TRUE),
           scaled_width = scale(width, center = TRUE),
           scaled_height = scale(height, center = TRUE),
           scaled_color_score = scale(color_score, center = TRUE))
fruit_data_scaled

# Splitting the data
fruit_split <- initial_split(fruit_data, prop = 0.75, strata = fruit_name)
fruit_train <- training(fruit_split)
fruit_test <- testing(fruit_split)

# Exactly the same procedures EXCEPT for the data we use is "training data."
fruit_recipe
knn_spec
fruit_fit

# Binding the columns to "test data"
fruit_test_predictions <- predict(fruit_fit, fruit_test) |>
    bind_cols(fruit_test)
fruit_test_predictions

# Metrics: to assess our classifier’s accuracy
fruit_prediction_accuracy <- fruit_test_predictions |>
    metrics(truth = fruit_name, estimate = .pred_class)
fruit_prediction_accuracy

# Confusion metrics: shows the table of predicted labels and correct 
# labels also to assess our classifier’s accuracy
fruit_mat <- fruit_test_predictions |>
    conf_mat(truth = fruit_name, estimate = .pred_class)
fruit_mat

# Cross-validation: to select which K is the most optimal for our data set for k-nn classification.
# We split our overall training data into  𝐶  evenly-sized chunks, 
# and then iteratively use 1 chunk as the validation set and combine 
# the remaining  𝐶−1  chunks as the training set.

# To split into chunk, we use v-fold
fruit_vfold <- vfold_cv(fruit_train, v = 5, strata = fruit_name)

# Resample: for cross-validation
fruit_resample_fit <- workflow() |>
      add_recipe(fruit_recipe) |>
      add_model(knn_spec) |>
      fit_resamples(resamples = fruit_vfold)

# Collect metrics: to aggregate the mean and standard error of the classifier’s 
# validation accuracy across the folds
fruit_metrics <- collect_metrics(fruit_resample_fit)
fruit_metrics

# Tuning to fit the model for each value in a range of parameter values
knn_tune　<- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
      set_engine("kknn") |>
      set_mode("classification")
knn_results <- workflow() |>
    add_recipe(fruit_recipe) |>
    add_model(knn_tune) |>
    tune_grid(resamples = fruit_vfold, grid = 10) |>
    collect_metrics()
knn_results

ERROR: Error in mutate(fruit_data, scaled_mass = scale(mass, center = TRUE), : could not find function "mutate"


# knn_regression

In [6]:
# Distance between the vertical line at x (= 100) to the 4 points closest to the line
# Take the average and show the numeric value
answer <- data_name |> 
 mutate(diff = abs(100 - x_name)) |> 
 arrange(diff) |> 
 slice(1:4) |>  
 summarise(predicted = mean(y_name)) |>
 pull()
answer

# All steps are similar to the classification 
credit_split <- initial_split(credit, prop = 0.6, strata = Balance)
credit_training <- training(credit_split)
credit_testing <- testing(credit_split)

# **** NOT the same ****
credit_knn_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = tune()) |>
    set_engine("kknn") |>
    set_mode("regression")

credit_knn_recipe <- recipe(Balance ~., data = credit_training) |>
    step_center(all_predictors()) |>
    step_scale(all_predictors())

credit_vfold <- vfold_cv(credit_training, v = 5, strata = Balance)

# **** NOT the same ****
credit_knn_workflow <- workflow() |>
    add_recipe(credit_knn_recipe) |>
    add_model(credit_knn_spec)

gridvals <- tibble(neighbors = seq(1,20))

credit_knn_results <- credit_knn_workflow |>
    tune_grid(resamples = credit_vfold, grid = gridvals) |>
    collect_metrics() 

# **** select the value of k resulting in best RMSE ****
kmin <- credit_knn_results |>
   filter(.metric == "rmse") |>
   arrange(mean) |> 
   slice(1) |>
   pull(neighbors)

# **** retrain the model using that final k, predict on held-out data ****
credit_spec <- nearest_neighbor(weight_func = "rectangular", neighbors = kmin) |>
    set_engine("kknn") |>
    set_mode("regression")

credit_fit <- workflow() |>
  add_recipe(credit_knn_recipe) |>
  add_model(credit_spec) |>
  fit(data = credit_training)

knn_rmspe <- credit_fit |>
  predict(credit_testing) |>
  bind_cols(credit_testing) |>
  metrics(truth = Balance, estimate = .pred)|>
  filter(.metric == 'rmse') |>
  pull(.estimate) 

# **** knn-regression plot ****
marathon_preds <- marathon_best_fit |>
                predict(marathon_training) |>
                bind_cols(marathon_training)

marathon_plot <- ggplot(marathon_preds, aes(x = max, y = time_hrs)) +
            geom_point(alpha = 0.4) +
            xlab("Maximum Distance Ran per \n Week During Training (mi)") +
            ylab("Race Time (hours)") + 
            geom_line(data = marathon_preds, aes(x = max, y = .pred), color = "blue") +
            ggtitle(paste0("K = ", k_min)) +
            theme(text = element_text(size = 20))

ERROR: Error in pull(summarise(slice(arrange(mutate(data_name, diff = abs(100 - : could not find function "pull"


In [7]:
# read data
marathon <- read_csv('data/marathon.csv')

# split data
marathon_split <- initial_split(marathon, prop = 0.75, strata = time_hrs)
marathon_training <- training(marathon_split)
marathon_testing <- testing(marathon_split)

# specify the model
lm_spec <- linear_reg() |>
            set_engine("lm") |>
            set_mode("regression")

# recipe and fit
lm_recipe <- recipe(time_hrs ~ max, data = marathon_training)

lm_fit <- workflow() |>
            add_recipe(lm_recipe) |>
            add_model(lm_spec) |>
            fit(data = marathon_training)

# predictions for RMSE
# ***********************************************

lm_test_results_rmse <- lm_fit |>
                    predict(marathon_training) |>
                    bind_cols(marathon_training) |>
                    metrics(truth = time_hrs, estimate = .pred)                 

# extracting the RMSPE (*** BECAUSE WE ARE USING TESTING DATA ***)
lm_rmse <- lm_test_results_rmse |>
                filter(.metric == 'rmse') |>
                select(.estimate) |>
                pull()

# predictions for RMSPE
# ***********************************************

lm_test_results_rmspe <- lm_fit |>
                    predict(marathon_testing) |>
                    bind_cols(marathon_testing) |>
                    metrics(truth = time_hrs, estimate = .pred)                 

# extracting the RMSPE (*** BECAUSE WE ARE USING TESTING DATA ***)
lm_rmspe <- lm_test_results_rmspe |>
                filter(.metric == 'rmse') |>
                select(.estimate) |>
                pull()

# ***********************************************

# **** Simple Linear Regression Plot ***
lm_predictions <- ggplot(marathon_training, aes(x=max, y=time_hrs)) +
    geom_point(alpha = 0.4) +
    xlab("Max)") +
    ylab("Time in Hours") +
    geom_smooth(method = "lm", se = FALSE) + 
    theme(text = element_text(size = 12))
lm_predictions

ERROR: Error in read_csv("data/marathon.csv"): could not find function "read_csv"
