# K-Nearest Neighbors Classification

In [65]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [66]:
def euclidian_distance(a, b):
    return np.sqrt(np.sum((a-b)**2, axis=1))

# res = 0
# for j in range(D):
#     res += (a[j] - b[j])**2
# sqrt(res)

In [67]:
def score(X_test, y_test):
    y_pred = predict(X_test)
        
    return float(sum(y_pred == y_test))/ float(len(y_test))

In [68]:
class KNearestNeighbors():
    def __init__(self, X_train, y_train, n_neighbors=5, weights='uniform'):

        self.X_train = X_train
        self.y_train = y_train

        self.n_neighbors = n_neighbors
        self.weights = weights

        self.n_classes = 3

    def euclidian_distance(self, a, b):
        return np.sqrt(np.sum((a - b)**2, axis=1))

    def kneighbors(self, X_test, return_distance=False):

        dist = []
        neigh_ind = []

#         print(X_test, self.X_train. X_test, self.X_train)
        point_dist = []
        for x_test in X_test:
            point_dist += [ self.euclidian_distance(x_test, self.X_train) ]

        for row in point_dist:
            enum_neigh = enumerate(row)
            sorted_neigh = sorted(enum_neigh,
                                  key=lambda x: x[1])[:self.n_neighbors]

            ind_list = [tup[0] for tup in sorted_neigh]
            dist_list = [tup[1] for tup in sorted_neigh]

            dist.append(dist_list)
            neigh_ind.append(ind_list)

        if return_distance:
            return np.array(dist), np.array(neigh_ind)

        return np.array(neigh_ind)

    def predict(self, X_test):

        if self.weights == 'uniform':
            neighbors = self.kneighbors(X_test)
            y_pred = np.array([
                np.argmax(np.bincount(self.y_train[neighbor]))
                for neighbor in neighbors
            ])
            #print("Y_pred: ", y_pred)
            return y_pred

        if self.weights == 'distance':

            print("Realice una implementación de distancia")

            return y_pred

    def score(self, X_test, y_test):
        y_pred = self.predict(X_test)
        return float(sum(y_pred == y_test)) / float(len(y_test)), y_pred, y_test


In [69]:
def metrics(X_test, y_test, y_pred):    
    score = float(sum(y_pred == y_test))/ float(len(y_test))
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))
    recall = recall_score(y_test, y_pred, average='weighted', labels=np.unique(y_pred))

    metrics = [score, accuracy, precision, recall]
    
    return metrics

In [70]:
def compare(our_metrics, sklearn_metrics):
    
    res = pd.DataFrame([[our_metrics[0], sklearn_metrics[0]],
                  [our_metrics[1], sklearn_metrics[1]], 
                  [our_metrics[2], sklearn_metrics[2]],
                  [our_metrics[3], sklearn_metrics[3]]],
             ['Score','Accuracy','Precision','Recall'],    
             ['Our Implementation', 'Sklearn\'s Implementation'])
    return res

In [71]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from matplotlib import pyplot as plt

### Red Wine Quality

In [72]:
df = pd.read_csv("winequality-red.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


In [73]:
features=['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']
y = df.quality
Outcome = []

for i in y:
    if(y[i] <= 5):
        Outcome.append(0)
    else:
        Outcome.append(1)


df['Outcome'] = Outcome
X=df[features]
y=df.Outcome
y_name = df['Outcome'].name

In [74]:
y.value_counts()

0    1372
1     227
Name: Outcome, dtype: int64

In [75]:
#Para hacer histogramas de los features
#plt.hist(X['petal width (cm)'],10)
#plt.show()

In [1]:
mu = np.mean(X, 0)
sigma = np.std(X, 0)
X = (X - mu ) / sigma

NameError: name 'np' is not defined

In [77]:
y_name, features


('Outcome',
 ['fixed acidity',
  'volatile acidity',
  'citric acid',
  'residual sugar',
  'chlorides',
  'free sulfur dioxide',
  'total sulfur dioxide',
  'density',
  'pH',
  'sulphates',
  'alcohol'])

In [78]:
if isinstance(X, pd.DataFrame):
    X = X.to_numpy()
    y = y.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)


#Modelo con la clase KNearestNeighbors
our_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=30)
y_pred = our_classifier.predict(X_test)
our_metrics = metrics(X_test, y_test, y_pred)

#Modelo con Sklearn
sklearn_classifier = KNeighborsClassifier(n_neighbors=30).fit(X_train, y_train)
y_pred = sklearn_classifier.predict(X_test)
sklearn_metrics = metrics(X_test, y_test, y_pred)

print(y_pred[:20], y_test[:20])

compare(our_metrics, sklearn_metrics)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0] [0 1 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0]


Unnamed: 0,Our Implementation,Sklearn's Implementation
Score,0.877083,0.877083
Accuracy,0.877083,0.877083
Precision,0.848141,0.848141
Recall,0.877083,0.877083


### Notas del curso de Arquitectura de Computadores 1

In [79]:
df = pd.read_csv("grades.csv")
df.head()

Unnamed: 0,Proyecto1,Proyecto2,Examen1,Taller1,Tarea1,Final,Resultado
0,100.0,100.0,53.3,80.0,76.0,80.0,1
1,0.0,0.0,12.6,40.0,0.0,5.0,0
2,100.0,45.0,49.6,100.0,100.0,80.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0
4,100.0,100.0,61.6,100.0,66.7,80.0,1


In [80]:
#Feature Engineering
#Se seleccionaron las caracteristicas solicitadas en el enunciado

features=['Proyecto1','Proyecto2','Examen1','Tarea1']

X=df[features]
y=df['Resultado']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.50,random_state=100)

In [81]:
y.value_counts()

1    73
0    32
Name: Resultado, dtype: int64

In [82]:
'Resultado', features

('Resultado', ['Proyecto1', 'Proyecto2', 'Examen1', 'Tarea1'])

In [83]:
if isinstance(X, pd.DataFrame):
    X = X.to_numpy()
    y = y.to_numpy()

X_train, X_test, y_train, y_test = train_test_split(\
                X, y, test_size=0.3, random_state=45)


#Modelo con la clase KNearestNeighbors
our_classifier = KNearestNeighbors(X_train, y_train, n_neighbors=30)
y_pred = our_classifier.predict(X_test)
our_metrics = metrics(X_test, y_test, y_pred)

#Modelo con Sklearn
sklearn_classifier = KNeighborsClassifier(n_neighbors=30).fit(X_train, y_train)
y_pred = sklearn_classifier.predict(X_test)
sklearn_metrics = metrics(X_test, y_test, y_pred)

print(y_pred[:20], y_test[:20])

compare(our_metrics, sklearn_metrics)

[0 0 1 1 1 1 0 1 1 1 1 1 1 1 1 0 1 1 1 0] [0 0 1 1 1 1 0 1 1 1 1 1 1 0 1 0 1 1 1 0]


Unnamed: 0,Our Implementation,Sklearn's Implementation
Score,0.9375,0.9375
Accuracy,0.9375,0.9375
Precision,0.942308,0.942308
Recall,0.9375,0.9375
