In [8]:
library(repr)
library(tidyverse)
library(tidymodels)
library(dplyr)

In [9]:
#for reproducability
set.seed(1)

#demonstrate the dataset can be read from the web into R
caffeine <-read_csv("caffeine.csv")

#changing column names
colnames(caffeine) <- make.names(colnames(caffeine))

#creating the factor and selecting variables needed for prediction
caffeine_selected <- caffeine %>%
        mutate(type = as_factor(type)) %>%
        select(Calories, Caffeine..mg., type)

Parsed with column specification:
cols(
  drink = [31mcol_character()[39m,
  `Volume (ml)` = [32mcol_double()[39m,
  Calories = [32mcol_double()[39m,
  `Caffeine (mg)` = [32mcol_double()[39m,
  type = [31mcol_character()[39m
)



In [10]:
#returning each categroy in the column
caffiene_pull = caffeine_selected %>%
    pull(type) %>%
    levels()
caffiene_pull

#creates a chart that shows the variables with counts and percentages 
caffiene_obs = nrow(caffeine)
    type %>%
    group_by(type) %>% 
    summarize(
        count = n(),
        percentage = n() / caffiene_obs * 100 )

ERROR: Error in eval(lhs, parent, parent): object 'type' not found


In [None]:
#creating a training set and splitting the data into 75% training, and 25% testing
caffeine_split <- initial_split(caffeine_selected, prop = 0.75, strata = type)
caffeine_train <- training(caffeine_split)
caffeine_test <- testing(caffeine_split)

caffeine_test

#creating classification K nearest neighbours
knn_spec <- nearest_neighbor(weight_func= "rectangular", neighbors = tune()) %>%
    set_engine ("kknn") %>%
    set_mode("classification")

#standardizing data
caffeine_recipe <- recipe(type ~ ., data = data) %>%
  step_scale(all_predictors()) %>%
  step_center(all_predictors())

#creating folds within training set
caffeine_vfold <- vfold_cv(caffeine_train, v = 5, strata = type)

#creating workflow
knn_results <- workflow() %>%
  add_recipe(caffeine_recipe) %>%
  add_model(knn_spec) %>%
  tunegrid(resamples = caffeine_vfold, grid = 10) %>%
  collect_metrics()

#checking the accuracy from the workflow
accuracies <- knn_results %>%
  filter(.metric == "accuracy")

accuracies


In [None]:
#creating scatter plot for caffiene and volume
caffiene_plot = caffeine %>%
    ggplot(aes(x = Caffeine..mg., y = Calories, color = type)) +
    geom_point(alpha = 0.5) +
    labs(x = "Caffiene (mg)", y="Calories",color = "Type of Drink") +
    theme(text = element_text(size = 12))

caffiene_plot