# STEP 1 : Importing Libraries 

In [None]:
# Load the packages attached by this notebook.
library(tidyr)
library(tidyverse)
library(naniar)
library(caTools)
# Install kknn only when it is missing, so re-running this cell does not
# download the package again (or fail when there is no network access).
if (!requireNamespace("kknn", quietly = TRUE)) {
  install.packages("kknn")
}
library(kknn)
library(ggcorrplot)
library(ggplot2)

# STEP 2: Importing Data 

In [None]:
# Read the dataset into a data frame; fields in the file are separated by ";".
red_wine <- read.csv("red_wine.csv", sep = ";")

# Open the data frame in the data viewer for a visual check.
View(red_wine)

# STEP 3 : Checking the missing values in the data 

In [None]:
# Total number of NA cells across the whole data frame
sum(is.na(red_wine))

# Plot the missingness pattern with naniar
vis_miss(red_wine)

# Number of NA cells per column
sapply(red_wine, function(column) sum(is.na(column)))

# STEP 4 : Scaling the features, excluding the last column (quality) because it is the dependent variable

In [None]:


# Standardise columns 1-11 in place; scale() centres and scales each column by
# default. The remaining columns are left untouched.
red_wine[, 1:11]<- scale(red_wine[, 1:11])


# STEP 5 : Checking the correlation between variables

In [None]:


# Correlation matrix over all columns of red_wine. With use = "everything",
# an NA in a column makes the correlations involving that column NA.
r <- cor(red_wine, use = "everything")



# STEP 6 : Splitting data into train set and test set

In [None]:

# Fix the RNG seed so the random split (and any accuracy derived from it)
# can be reproduced from run to run.
set.seed(123)

# Logical mask over the rows of red_wine: TRUE marks rows assigned to the
# training set (SplitRatio = 0.75), FALSE marks test rows.
split_wine <- sample.split(red_wine$quality, SplitRatio = 0.75)

wine_train <- subset(red_wine, split_wine)
wine_test <- subset(red_wine, !split_wine)

# Sanity checks: row counts of both partitions and the column count.
nrow(wine_test)
nrow(wine_train)

ncol(wine_test)

# STEP 7 : Training the k-NN classifier on the training set and predicting the test set

In [None]:

library(class)

# k-nearest-neighbour classification with k = 2. Predictors are all columns
# except the 12th, which supplies the class labels. prob = TRUE attaches the
# proportion of votes for the winning class as the "prob" attribute.
y_pred <- knn(
  train = wine_train[, -12],
  test = wine_test[, -12],
  cl = wine_train[, 12],
  k = 2,
  prob = TRUE
)


# STEP 8 : Making confusion matrix 

In [None]:
# Confusion matrix: rows are the actual values of column 12 in the test set,
# columns are the k-NN predictions.
cm_wine <- table(wine_test[, 12], y_pred)
cm_wine

# Compute the accuracy rather than quoting it: the share of test rows whose
# predicted label equals the actual one. Labels are compared as text because
# knn() returns a factor.
accuracy_wine <- mean(as.character(y_pred) == as.character(wine_test[, 12]))
accuracy_wine

# With k = 2, an earlier run reported a model accuracy of about 64%.

# Method II : Defining an outlier function; this is very useful when there are a lot of outliers in the data, as in our case

In [None]:
# Defining outlier function : 

#' Flag values lying outside the IQR fences
#'
#' @param x A numeric vector.
#' @param coef Multiplier applied to the inter-quartile range when building
#'   the fences. The default of 1.5 reproduces the original hard-coded value.
#' @return A logical vector the same length as `x`: `TRUE` where the value is
#'   above `Q3 + coef * IQR` or below `Q1 - coef * IQR`. Missing values are
#'   never flagged (they yield `FALSE`).
outliers <- function(x, coef = 1.5) {
  # na.rm = TRUE: quantile() stops with an error on NA input otherwise.
  # names = FALSE: drop the "25%"/"75%" labels so they cannot leak into the
  # result.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence_width <- coef * (quartiles[2] - quartiles[1])
  upper_limit <- quartiles[2] + fence_width
  lower_limit <- quartiles[1] - fence_width

  # Guard with !is.na() so NA inputs give FALSE rather than NA; an NA inside a
  # logical row index would otherwise produce all-NA rows when subsetting.
  !is.na(x) & (x > upper_limit | x < lower_limit)
}

#' Drop rows that contain an outlier in any of the given columns
#'
#' Columns are processed one after another, so the outlier test for each
#' column is applied to the rows that survived the previous columns.
#'
#' @param red_wine A data frame.
#' @param cols Columns to screen, by default every column of `red_wine`.
#' @return `red_wine` without the rows flagged by `outliers()`; always a data
#'   frame, even when it has a single column.
remove_outliers <- function(red_wine, cols = names(red_wine)) {
  # A misspelled column name makes red_wine[[col]] NULL, which would silently
  # filter away every row, so fail loudly instead.
  if (is.character(cols)) {
    unknown <- setdiff(cols, names(red_wine))
    if (length(unknown) > 0) {
      stop("Unknown column(s): ", paste(unknown, collapse = ", "), call. = FALSE)
    }
  }

  for (col in cols) {
    # drop = FALSE keeps a one-column data frame from collapsing to a vector.
    red_wine <- red_wine[!outliers(red_wine[[col]]), , drop = FALSE]
  }
  red_wine
}



#Step 3: Apply outlier function to data frame.

#Lastly, let’s apply this function across multiple columns of the data frame to remove outliers:



# Screen the predictor columns for outliers; quality is not screened.
# Each column is listed exactly once: remove_outliers() filters column by
# column, so a repeated name (residual.sugar was listed twice) would filter
# that column a second time on the already-reduced data.
test_out <- remove_outliers(
  red_wine,
  c(
    "fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
    "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
    "pH", "sulphates", "alcohol"
  )
)

# Inspect the filtered data frame.
view(test_out)


# Splitting data into train set and test set 

In [None]:
# Fix the RNG seed so the random split (and any accuracy derived from it)
# can be reproduced from run to run.
set.seed(123)

# Logical mask over the rows of test_out: TRUE marks rows assigned to the
# training set (SplitRatio = 0.75), FALSE marks test rows.
split_wine_out <- sample.split(test_out$quality, SplitRatio = 0.75)

wine_train_out <- subset(test_out, split_wine_out)
wine_test_out <- subset(test_out, !split_wine_out)

# Sanity checks on the outlier-free partitions.
nrow(wine_test_out)
nrow(wine_train_out)

# Column count of the outlier-free test set (not wine_test from the first
# model).
ncol(wine_test_out)
library(class)

# k-nearest-neighbour classification (k = 2) on the outlier-free data.
# Predictors are all columns except the 12th, which supplies the class labels.
# prob = TRUE attaches the proportion of votes for the winning class as the
# "prob" attribute.
y_pred_out <- knn(
  train = wine_train_out[, -12],
  test = wine_test_out[, -12],
  cl = wine_train_out[, 12],
  k = 2,
  prob = TRUE
)

# Making the Confusion Matrix: rows are the actual values of column 12 in the
# outlier-free test set, columns are the k-NN predictions.
cm_wine_out <- table(wine_test_out[, 12], y_pred_out)

cm_wine_out
# View the outlier-free model's matrix (not cm_wine from the first model).
view(cm_wine_out)

# Compute the accuracy rather than quoting it: the share of test rows whose
# predicted label equals the actual one. Labels are compared as text because
# knn() returns a factor.
accuracy_wine_out <- mean(
  as.character(y_pred_out) == as.character(wine_test_out[, 12])
)
accuracy_wine_out

# An earlier run reported a model accuracy of 62.24%.

