# K-Neighbors Model

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Import the necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
# Read the dataset CSV straight from its raw GitHub URL.
x = pd.read_csv(
    'https://raw.githubusercontent.com/rivera-squared/Educacion_Machine_Learning/main/escuelas_ml.csv'
)

# Target vector: the 'consolidada' column.
y = x['consolidada']

# Feature matrix: every remaining column except 'escuela', which is
# dropped together with the target.
X = x.drop(['consolidada', 'escuela'], axis=1)

In [4]:
# Hold out 25% of the rows as a test set; the fixed seed keeps the
# split identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [5]:
# Candidate values of k: every even number from 2 through 50.
n_neighbors = np.arange(2, 52, 2)
param_grid = dict(n_neighbors=n_neighbors)

# Base k-nearest-neighbors classifier whose n_neighbors will be tuned: knn
knn = KNeighborsClassifier()

# Grid search over the candidate k values with 5-fold cross-validation: knn_cv
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)

# Run the search on the training split only
knn_cv.fit(X_train, y_train)

# Report the selected k and its cross-validated score
print("Tuned K-Neighbor Parameters: {}".format(knn_cv.best_params_))
print("Best score is {}".format(knn_cv.best_score_))

Tuned K-Neighbor Parameters: {'n_neighbors': 30}
Best score is 0.81375


In [6]:
# Create the final k-NN classifier using the k selected by the grid search.
# The value is read from knn_cv instead of being hard-coded so that it
# cannot drift out of sync with the tuning results when the search is re-run.
best_k = knn_cv.best_params_['n_neighbors']
knn = KNeighborsClassifier(n_neighbors=best_k)

# Fit the classifier to the training split
knn.fit(X_train, y_train)

# Predict the labels for the held-out test split
y_pred = knn.predict(X_test)

# Compute and print the confusion matrix and classification report
# for the test-set predictions
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[188  24]
 [ 25  30]]
              precision    recall  f1-score   support

           0       0.88      0.89      0.88       212
           1       0.56      0.55      0.55        55

    accuracy                           0.82       267
   macro avg       0.72      0.72      0.72       267
weighted avg       0.82      0.82      0.82       267



In [9]:
# Per-row class probabilities on the test split; keep only the column at
# index 1 (the second class): y_pred_prob
class_probs = knn.predict_proba(X_test)
y_pred_prob = class_probs[:, 1]

In [10]:
# ROC AUC on the test split, computed from the predicted probabilities
test_auc = roc_auc_score(y_test, y_pred_prob)
print("AUC: {}".format(test_auc))

AUC: 0.8437821612349914


In [8]:
# ROC AUC of the k-NN model on each of 5 cross-validation folds of the
# training split: cv_auc
cv_auc = cross_val_score(
    estimator=knn,
    X=X_train,
    y=y_train,
    scoring='roc_auc',
    cv=5,
)

# Show the per-fold AUC scores
print("AUC scores computed using 5-fold cross-validation: {}".format(cv_auc))

AUC scores computed using 5-fold cross-validation: [0.8531586  0.86659946 0.8515905  0.89146505 0.79469086]
