<center><h1>ML project on KNN</h1> </center>

<center><h4> Project By: Swati Tripathi</h4> </center>

________________________________________________________________________________________________________________________________

# KNN Algorithm - Finding Nearest Neighbors 
## Dataset used: Red Wine Quality

________________________________________________________________________________________________________________________________

> This notebook is divided into two parts, A and B, which show two different ways of implementing KNN: <br>
<b>PART A</b>: Implementation from scratch<br>
<b>PART B</b>: Implementation using scikit-learn <br> 

In [None]:
import numpy as np 
import pandas as pd

### DETAILS ABOUT THE RED WINE QUALITY DATASET

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory into a DataFrame.
reddata = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [None]:
# Bare expression: the notebook shows the loaded DataFrame as this cell's output.
reddata

In [None]:
# Work on an independent copy: a plain `df = reddata` only binds a second name
# to the same object, so any later edit to `df` would also change `reddata`.
df = reddata.copy()
# Summary statistics, transposed so each dataset column becomes one row.
df.describe().T

In [None]:
# Count the missing values in each column.
df.isnull().sum()

In [None]:
# Preview the first rows of the dataset.
reddata.head()

In [None]:
# Group the wines by quality score and show the first entry of every column
# in each group. Ending the cell with the expression (rather than print())
# lets the notebook render the result as a formatted table.
gb = reddata.groupby('quality')
gb.first()

In [None]:
# Peek at the target column; the bare expression is displayed as the cell output.
reddata['quality'].head()

### Let us see the steps for implementing KNN
Steps given are: 
1. <b>Handle Data:</b> Open the dataset from CSV and split into test/train datasets.
2. <b>Similarity:</b> Calculate the distance between two data instances.
3. <b>Neighbors:</b> Locate k most similar data instances.
4. <b>Response:</b> Generate a response from a set of data instances.
5. <b>Accuracy:</b> Summarize the accuracy of predictions.
6. <b>Main:</b> Tie it all together.




## PART A: Implementation from scratch

>

In [None]:
import csv
import random
import math
import operator

In [None]:
## only in order to do feature scaling we are using scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Module-level scaler instances shared by main() and collectallaccuracy().
# sc1: min-max scaling; sc2: standardisation.
sc1 = MinMaxScaler()
sc2 = StandardScaler()

def loadDataset(filename, split, trainingSet=None, testSet=None):
    """Read a CSV file and randomly split its rows into train and test sets.

    The first line of the file is treated as a header and skipped. For every
    remaining row, the last column is binarised (1 if its value is > 6.5,
    otherwise 0) and all preceding columns are converted to ``float``.

    Args:
        filename: Path of the CSV file to read.
        split: Probability (0..1) that a row is placed in the training set;
            one ``random.random()`` draw is made per row.
        trainingSet: Optional list that training rows are appended to.
            A new list is created when omitted.
        testSet: Optional list that test rows are appended to.
            A new list is created when omitted.

    Returns:
        The ``(trainingSet, testSet)`` pair. When lists were passed in they
        are the same objects, mutated in place, so existing callers that
        ignore the return value keep working.
    """
    # Fresh lists per call: mutable default arguments would be shared between
    # calls and silently accumulate rows.
    if trainingSet is None:
        trainingSet = []
    if testSet is None:
        testSet = []

    # newline='' is what the csv module asks for when reading files.
    with open(filename, 'r', newline='') as csvfile:
        dataset = list(csv.reader(csvfile))

    # dataset[0] is the header. Iterate over *every* data row: the previous
    # range(1, len(dataset) - 1) dropped the last row of the file.
    for row in dataset[1:]:
        if not row:
            # csv.reader yields [] for blank lines; nothing to load.
            continue
        label = 1 if float(row[-1]) > 6.5 else 0
        record = [float(value) for value in row[:-1]] + [label]
        if random.random() < split:
            trainingSet.append(record)
        else:
            testSet.append(record)

    return trainingSet, testSet


def euclideanDistance(instance1, instance2, length):
    """Return the Euclidean (L2) distance between two instances.

    Only the first ``length`` components of each instance are compared;
    anything stored at index ``length`` or beyond is ignored.
    """
    squaredGaps = (pow(instance1[i] - instance2[i], 2) for i in range(length))
    return math.sqrt(sum(squaredGaps))

def chebyshevDistance(instance1, instance2, length):
    """Return the Chebyshev (L-infinity) distance between two instances.

    This is the largest absolute difference found among the first ``length``
    components; anything stored at index ``length`` or beyond is ignored.
    """
    return max(abs(instance1[i] - instance2[i]) for i in range(length))

def getNeighbors(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance`` (Euclidean).

    The last element of ``testInstance`` is excluded from the distance
    computation. Rows are returned nearest first.
    """
    featureCount = len(testInstance) - 1
    scored = [
        (candidate, euclideanDistance(testInstance, candidate, featureCount))
        for candidate in trainingSet
    ]
    # Stable sort on the distance, so equally distant rows keep training order.
    scored.sort(key=operator.itemgetter(1))
    return [scored[rank][0] for rank in range(k)]

def getNeighborsWithchebyshev(trainingSet, testInstance, k):
    """Return the ``k`` training rows closest to ``testInstance`` (Chebyshev).

    The last element of ``testInstance`` is excluded from the distance
    computation. Rows are returned nearest first.
    """
    featureCount = len(testInstance) - 1
    scored = [
        (candidate, chebyshevDistance(testInstance, candidate, featureCount))
        for candidate in trainingSet
    ]
    # Stable sort on the distance, so equally distant rows keep training order.
    scored.sort(key=operator.itemgetter(1))
    return [scored[rank][0] for rank in range(k)]

def getResponse(neighbors):
    """Return the majority class among ``neighbors``.

    Each neighbor's class is its last element. When several classes share
    the highest vote count, the one encountered first wins.
    """
    classVotes = {}
    for neighbor in neighbors:
        label = neighbor[-1]
        classVotes[label] = classVotes.get(label, 0) + 1
    # Descending stable sort: tied classes stay in first-seen order.
    ranking = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    winner, _count = ranking[0]
    return winner

def getAccuracy(testSet, predictions):
    """Return the percentage of rows in ``testSet`` that were predicted correctly.

    A prediction counts as correct when it equals the last element of the
    test row at the same position.
    """
    hits = sum(
        1 for idx, row in enumerate(testSet) if row[-1] == predictions[idx]
    )
    return (hits / float(len(testSet))) * 100.0
	
def _scaleFeatures(scaler, trainingSet, testSet):
    """Return scaled copies of both sets, scaling feature columns only.

    The scaler is fitted on the training features and then applied to the
    test features. The last element of every row (the class label) is
    re-attached unchanged instead of being passed through the scaler.
    """
    trainFeatures = scaler.fit_transform([row[:-1] for row in trainingSet])
    testFeatures = scaler.transform([row[:-1] for row in testSet])
    scaledTrain = [feat.tolist() + [row[-1]] for feat, row in zip(trainFeatures, trainingSet)]
    scaledTest = [feat.tolist() + [row[-1]] for feat, row in zip(testFeatures, testSet)]
    return scaledTrain, scaledTest


def _evaluateKnn(trainingSet, testSet, k, findNeighbors):
    """Classify every test row with ``findNeighbors`` and return the accuracy in percent."""
    predictions = [getResponse(findNeighbors(trainingSet, row, k)) for row in testSet]
    return getAccuracy(testSet, predictions)


def main(seed=42):
    """Compare KNN accuracy with and without feature scaling, and across distance metrics.

    The dataset is loaded and split exactly once, and every experiment runs
    on that same split, so differences between the printed accuracies come
    from the scaling or metric rather than from a different random split.

    Args:
        seed: Seed passed to ``random.seed`` before splitting so the run is
            reproducible. Pass ``None`` to leave the global RNG untouched.
    """
    datasetPath = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
    split = 0.67
    k = 3

    # prepare data
    if seed is not None:
        random.seed(seed)
    trainingSet = []
    testSet = []
    loadDataset(datasetPath, split, trainingSet, testSet)
    print('Train set: %d' % len(trainingSet))
    print('Test set: %d' % len(testSet))
    print("Here we have taken : K=%d  " % k)

    ## ACCURACY BEFORE FEATURE SCALING
    accuracy = _evaluateKnn(trainingSet, testSet, k, getNeighbors)
    print('Accuracy before feature scaling is done: ' + str(accuracy) + '%')

    ## ACCURACY AFTER MinMax way of scaling was done
    minmaxTrain, minmaxTest = _scaleFeatures(sc1, trainingSet, testSet)
    accuracy = _evaluateKnn(minmaxTrain, minmaxTest, k, getNeighbors)
    print('Accuracy after Min-Max scaler was used for feature scaling: ' + str(accuracy) + '%')

    ## ACCURACY AFTER StandardScaler was used for scaling
    standardTrain, standardTest = _scaleFeatures(sc2, trainingSet, testSet)
    accuracy = _evaluateKnn(standardTrain, standardTest, k, getNeighbors)
    print('Accuracy after StandardScaler was used for feature scaling: ' + str(accuracy) + '%')

    ## ACCURACY after chebyshev distance is used (on the standardised data)
    accuracy = _evaluateKnn(standardTrain, standardTest, k, getNeighborsWithchebyshev)
    print('Accuracy after we use chebyshev distance formula: ' + str(accuracy) + '%')


main()

In [None]:
def collectallaccuracy(k, seed=42):
    """Return the KNN accuracy (percent) for ``k`` neighbours.

    Uses Euclidean distance on data transformed with the shared
    StandardScaler ``sc2``.

    Args:
        k: Number of neighbours that vote on each prediction.
        seed: Seed used for the train/test split. Because the same seed is
            used for every call, accuracies for different ``k`` are measured
            on the same split and are directly comparable. The global RNG
            state is restored afterwards.

    Returns:
        Accuracy on the test split, as a percentage.
    """
    split = 0.67
    trainingSet = []
    testSet = []

    # Seed only for the duration of the split, then restore the caller's RNG state.
    rngState = random.getstate()
    random.seed(seed)
    try:
        loadDataset('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv', split, trainingSet, testSet)
    finally:
        random.setstate(rngState)

    # Whole rows go through the scaler, so the label column is transformed
    # too; train and test labels still compare equal because both pass
    # through the same fitted transform.
    trainingSet = sc2.fit_transform(trainingSet)
    testSet = sc2.transform(testSet)

    predictions = [
        getResponse(getNeighbors(trainingSet, row, k)) for row in testSet
    ]
    return getAccuracy(testSet, predictions)

In [None]:
# Sweep k = 1..20 and record the accuracy obtained for each value;
# Accuracies[0] therefore belongs to k = 1.
Accuracies = []
print('Let us see when standardScaler is used for feature scaling and euclidean distance is used \nthen for different k what is the respective accuracy we obtain'
      + '\n\n'
      + 'For different K its accuracy is : ')

for i in range(1, 21):
    val = collectallaccuracy(i)
    Accuracies.append(val)
    print('K = %s Accuracy = %s' % (i, val))

In [None]:
# Report the best accuracy of the sweep and the k that produced it
# (list position + 1, since the sweep starts at k = 1).
best_accuracy = max(Accuracies)
best_k = Accuracies.index(best_accuracy) + 1
print("Max Accuracy we get is = " + str(best_accuracy) + " at k = " + str(best_k))

>

>

>

>


>

>

>

In [None]:
import matplotlib.pyplot as plt

# Derive the x-axis from the data itself (k starts at 1) so the plot cannot
# fall out of sync with the sweep if its range is ever changed.
k_values = range(1, len(Accuracies) + 1)

plt.figure(figsize=(12, 6))
plt.plot(k_values, Accuracies, color='green', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Relationship between K and its respective accuracy')
plt.xlabel('K Value')
# Trailing semicolon keeps the Text(...) repr out of the cell output.
plt.ylabel('Accuracy');

___




## PART B: Implementation using scikit-learn

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np 
import pandas as pd

In [None]:
# Reload the raw CSV for Part B and keep it untouched in `redwine`;
# `df` is an independent copy that the following cells are free to modify.
redwine = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')
df = redwine.copy()

In [None]:
# Binary target: 1 when the original quality score is > 6.5, otherwise 0.
# It is derived from the untouched `redwine` frame so that re-running this
# cell is idempotent (recomputing it from the already-binarised `df` column
# would turn every label into 0).
df['quality'] = (redwine['quality'] > 6.5).astype(int)
y = df["quality"]
X = df.drop("quality", axis=1)  # already a DataFrame; no re-wrapping needed

# 80/20 split, stratified on the target so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=40, stratify=y)

In [None]:
# Baseline: KNN with the library's default settings, scored on the held-out set.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
y_pred = knn_model.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Grid search over n_neighbors = 1..49 with 10-fold cross-validation,
# fitted on the training split only.
knn_cv = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": np.arange(1,50)}, cv=10)
knn_cv.fit(X_train, y_train)

In [None]:
# Report the best cross-validation score and the parameters that achieved it.
print("Best score is:%s and Best params is: %s" % (knn_cv.best_score_, knn_cv.best_params_))

In [None]:
# Build the tuned model from whatever the grid search actually selected,
# rather than a hard-coded n_neighbors that goes stale as soon as the data,
# the split, or the search grid changes.
knn = KNeighborsClassifier(**knn_cv.best_params_)
knn_tuned = knn.fit(X_train, y_train)

In [None]:
# Score the tuned model on the held-out test split.
knn_tuned.score(X_test, y_test)