# Naive KNN implementation.

In [2]:
class KNN:
    """
    A simple implementation of the KNN Algorithm.

    A sample is classified by majority vote among the ``k`` training rows
    with the smallest Euclidean distance to it.

    The code treats ``x_train`` as a pandas DataFrame of numeric features and
    ``y_train`` as a pandas Series of labels carrying the same index labels
    as ``x_train`` (neighbours are looked up in ``y_train`` by label).
    """

    def __init__(self, x_train, y_train, k=5):
        """
        Store the training data and the number of neighbours to vote.

        Raises:
            ValueError: if ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.x_train = x_train
        self.y_train = y_train
        self.k = k

    def single_pred(self, input_data):
        """
        Generate a prediction given a single row input.

        Args:
            input_data: one sample, either a pandas Series keyed by feature
                name or a single-row DataFrame with the training columns.

        Returns:
            The most frequent label among the ``k`` nearest training rows.
            ``mode()[0]`` is used, so the first of the sorted modes wins a tie.

        Raises:
            ValueError: if a DataFrame with more than one row is given, or if
                the number of features differs from the training data.
        """
        # A one-row DataFrame must be collapsed to a Series first. Otherwise
        # ``train_column - input_column`` is Series-minus-Series, which pandas
        # aligns on index labels and fills with NaN wherever labels differ.
        if getattr(input_data, "ndim", 1) == 2:
            if len(input_data) != 1:
                raise ValueError("single_pred expects exactly one row of input data")
            input_data = input_data.iloc[0]

        if self.x_train.shape[-1] != input_data.shape[-1]:
            raise ValueError("The training data and the input data have different number of columns")

        # Accumulate squared per-feature differences; each term is a Series
        # indexed like x_train, so the sum is one distance per training row.
        squared_distance = 0
        for column in self.x_train.columns:
            squared_distance += (self.x_train[column] - input_data[column]) ** 2
        distances = squared_distance ** 0.5

        # Index labels of the k closest training rows, used to fetch labels.
        nearest_labels = distances.nsmallest(n=self.k).index
        return self.y_train.loc[nearest_labels].mode()[0]

    def predict(self, input_data):
        """
        Generate predictions for one or more rows.

        Args:
            input_data: DataFrame whose rows are the samples to classify.

        Returns:
            A single label when ``input_data`` has exactly one row; otherwise
            a Series of labels, one per row, with a fresh 0..n-1 index.
        """
        if len(input_data) == 1:
            return self.single_pred(input_data=input_data)
        predictions = input_data.apply(self.single_pred, axis=1)
        return predictions.reset_index(drop=True)

    def accuracy(self, x_test, y_test):
        """
        Calculate the accuracy on the test data.

        Each row of ``x_test`` is classified and compared with the matching
        entry of ``y_test``.

        Returns:
            The percentage of correct predictions, rounded to 2 decimals.

        Raises:
            ValueError: if ``x_test`` has no rows.
        """
        if len(x_test) == 0:
            raise ValueError("Cannot compute accuracy on an empty test set")
        predictions = x_test.apply(self.single_pred, axis=1)
        matches = predictions == y_test
        # Scale to a percentage before rounding so no float artifacts
        # (e.g. 56.669999999999995) leak into the result.
        return round(matches.mean() * 100, 2)

## Testing the Algorithm

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [4]:
# Load the dataset and encode the "yes"/"no" target column as 1/0.
df = pd.read_csv("./subscription_prediction.csv").assign(
    y=lambda frame: frame["y"].map({"yes": 1, "no": 0})
)
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
1,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
2,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
3,57,housemaid,divorced,basic.4y,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0
4,39,management,single,basic.9y,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0


In [5]:
# We'll use just two columns as our features to keep the run fast.
df = df.loc[:, ["age", "nr.employed", "y"]]
X = df.loc[:, ["age", "nr.employed"]]
y = df.loc[:, "y"]

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
)

In [7]:
# Build the classifier on the training split, voting among 7 neighbours.
knn = KNN(x_train=X_train, y_train=y_train, k=7)

In [8]:
# Score the classifier on the held-out split.
accuracy = knn.accuracy(x_test=X_test, y_test=y_test)
accuracy

70.62

In [9]:
# Build a one-row frame with the two training features and classify it.
row = pd.DataFrame([{"age": 37.0, "nr.employed": 4963.6}])
pred = knn.predict(input_data=row)
print(f"Prediction: {pred}")

Prediction: 1
