# **KNN Model to group killers based on birthdays and numbers of victims**

In [None]:
# Initial imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingRegressor

# Import the serial killer data.
# In Colab, files.upload() opens a file picker; the returned mapping is indexed
# below by uploaded filename to obtain that file's raw bytes.
# (To load from a local path instead, use: killers = pd.read_csv(path, header=None).)
import io

from google.colab import files

uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; cannot build the killers DataFrame.")

# Build the `killers` DataFrame that every later cell relies on.
# header=None because column names are assigned explicitly in a later cell.
# NOTE(review): if the CSV already has a header row, drop header=None.
uploaded_name = next(iter(uploaded))
killers = pd.read_csv(io.BytesIO(uploaded[uploaded_name]), header=None)

In [None]:
# Confirm data has been imported correctly by previewing the first rows
killers.head()

In [None]:
# Add the column names
# NOTE(review): the list below is an empty placeholder. pandas expects one name
# per existing column, so fill it in before running this cell. Later cells
# look up a column called "Target Variable".
killers.columns = []

In [None]:
# Remove unnecessary columns
# List the labels of the columns to discard. DataFrame.drop() raises a
# ValueError when called with no labels at all, so an explicit (possibly
# empty) list is passed; an empty list leaves the frame unchanged.
columns_to_drop = []
killers = killers.drop(columns=columns_to_drop)

In [None]:
# Generate a histogram to review the data
# killers["Target Variable"].hist(bins=12)

In [None]:
# Find strong correlations
# Pairwise correlation between the columns of killers.
# NOTE(review): recent pandas versions raise on non-numeric columns unless
# numeric_only=True is passed -- confirm all remaining columns are numeric.
correlation_matrix = killers.corr()
#correlation_matrix["Target Variable"]

In [None]:
# "Nearest" is measured with the Euclidean distance: treat each data point as
# a vector and take the norm of the difference between two of them.
a, b = np.array([2, 2]), np.array([4, 4])
difference = a - b
np.linalg.norm(difference)

In [None]:
# Find the k Nearest Neighbors
# Split the frame into the feature matrix X (everything except the target
# column) and the target vector y, both as plain NumPy arrays.
X = killers.drop("Target Variable", axis=1)
X = X.values
y = killers["Target Variable"]
y = y.values

# Create a NumPy array for the new data point to find neighbors for.
# TODO: the array is an empty placeholder -- fill in one value per feature
# column of X, otherwise the subtraction below cannot line up with X's rows.
new_data_point = np.array([])

# Compute the distances between the new data point and each of the data points in the dataset
# (axis=1 yields one Euclidean norm per row of X).
distances = np.linalg.norm(X - new_data_point, axis=1)

# Sort the array from lowest to highest and take the first k elements to obtain the indices of the knn
k = 3
nearest_neighbor_ids = distances.argsort()[:k]
nearest_neighbor_ids

In [None]:
# Voting or Averaging of multiple neighbors
# Find the ground truth for neighbors: index the target array y with the
# neighbor indices to get each neighbor's target value
nearest_neighbor_kills = y[nearest_neighbor_ids]
nearest_neighbor_kills 

In [None]:
# Average for Regression: the prediction is the mean of the neighbors' target values
prediction = nearest_neighbor_kills.mean()

In [None]:
# Compute the mode (majority vote, as used for classification)
# scipy.stats.mode no longer accepts non-numeric arrays in recent SciPy
# releases, so the most frequent label is found with np.unique instead.
class_neighbors = np.array(["A", "B", "B", "C"]) #change letters to signs
labels, label_counts = np.unique(class_neighbors, return_counts=True)

# np.unique returns sorted labels and argmax picks the first maximum, so ties
# resolve to the smallest label.
winner = label_counts.argmax()
class_mode = labels[winner]
class_mode_count = label_counts[winner]
class_mode, class_mode_count

In [None]:
# Split the data into Training and Testing sets for Model Evaluation
# 20% of the rows are held out for testing; random_state is fixed so the
# split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=12345
)

In [None]:
# Fitting a kNN Regression in scikit-learn to the dataset
# The regressor is configured with n_neighbors=3
knn_model = KNeighborsRegressor(n_neighbors=3)

# Fit the model on the training dataset
knn_model.fit(X_train, y_train)

In [None]:
# Inspect Model Fit
# A training error far below the test error suggests the model is overfitting.

# Compute the RMSE on the training data
train_preds = knn_model.predict(X_train)
mse = mean_squared_error(y_train, train_preds)
rmse = sqrt(mse)
# Printed explicitly: a bare expression in the middle of a cell is not
# displayed, and `rmse` is reassigned below.
print(f"Train RMSE: {rmse}")

# Compute the RMSE on the test data
test_preds = knn_model.predict(X_test)
mse = mean_squared_error(y_test, test_preds)
rmse = sqrt(mse)
print(f"Test RMSE: {rmse}")

In [None]:
# Plot the fit of the model: scatter the first two feature columns of the
# test set, colored by the model's predicted values.
cmap = sns.cubehelix_palette(as_cmap=True)
f, ax = plt.subplots()
points = ax.scatter(
    X_test[:, 0], X_test[:, 1], c=test_preds, s=50, cmap=cmap
)
f.colorbar(points)
# plt.show must be called; referencing it without parentheses does nothing.
plt.show()

In [None]:
# Confirm whether the trend exists on the actual data: same scatter of the
# first two feature columns of the test set, colored by the true target values
cmap = sns.cubehelix_palette(as_cmap=True)
f, ax = plt.subplots()
points = ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, s=50, cmap=cmap)
f.colorbar(points)
plt.show()

In [None]:
# Find the best value for k using GridSearchCV
# Candidate values of n_neighbors are 1 through 49 (range excludes its upper bound)
parameters = {"n_neighbors": range(1, 50)}
# No cv or scoring argument is passed, so GridSearchCV's defaults apply
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)

In [None]:
# Retain the best performing value of k
# Show the parameter setting that the grid search selected as best
# NOTE(review): the original comment said "lowest score"; scikit-learn treats
# higher scores as better when selecting best_params_ -- confirm the wording.
gridsearch.best_params_

In [None]:
# See how the k value chosen by the grid search affects train and test performance
train_preds_grid = gridsearch.predict(X_train)
train_mse = mean_squared_error(y_train, train_preds_grid)
train_rmse = sqrt(train_mse)

test_preds_grid = gridsearch.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds_grid)
test_rmse = sqrt(test_mse)

# Printed explicitly: only the last bare expression of a cell is displayed,
# so the training RMSE would otherwise never be shown.
print(f"Train RMSE: {train_rmse}")
print(f"Test RMSE: {test_rmse}")


In [None]:
# Test whether the performance of the model will be any better using Weighted Average of neighbors based on distance
# The grid now also varies `weights`: "uniform" or "distance".
parameters = {
    "n_neighbors": range(1, 50),
    "weights": ["uniform", "distance"],
}
gridsearch = GridSearchCV(KNeighborsRegressor(), parameters)
gridsearch.fit(X_train, y_train)
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print(f"Best parameters: {gridsearch.best_params_}")

test_preds_grid = gridsearch.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds_grid)
test_rmse = sqrt(test_mse)
print(f"Test RMSE: {test_rmse}")

In [None]:
# Use bagging to improve kNN

# Create the KNeighborsRegressor with the best hyperparameters found by the grid search
best_k = gridsearch.best_params_["n_neighbors"]
best_weights = gridsearch.best_params_["weights"]
bagged_knn = KNeighborsRegressor(n_neighbors=best_k, weights=best_weights)

# Wrap the kNN regressor in a BaggingRegressor with n_estimators=100
bagging_model = BaggingRegressor(bagged_knn, n_estimators=100)

# The ensemble must be fitted before it can predict; without this step
# predict() raises NotFittedError.
bagging_model.fit(X_train, y_train)

# Make a prediction and calculate the RMSE to see if improved
test_preds_grid = bagging_model.predict(X_test)
test_mse = mean_squared_error(y_test, test_preds_grid)
test_rmse = sqrt(test_mse)
test_rmse