In [13]:
import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets

In [6]:
# Read the dataset; the two timestamp columns are parsed into datetimes on load
datetime_columns = ['Intake-DateTime', 'Outcome-DateTime']
data = pd.read_csv('data.csv', parse_dates=datetime_columns)

# Show each column's dtype to check how the file was parsed
data.dtypes

AnimalID                         object
ID                               object
Breed                            object
Color                            object
Gender                           object
Name                             object
Intake-DateTime          datetime64[ns]
Intake-Type                      object
Intake-Condition                 object
Intake-Age(days)                  int64
Date-Of-Birth                    object
Outcome-DateTime         datetime64[ns]
Outcome-Type                     object
Outcome-Subtype                  object
Outcome-Age(days)                 int64
Age                              object
IsAdopted                          bool
Category                         object
Intelligence-Ranking              int64
Intelligence-Category            object
Size-Category                    object
Longevity                       float64
Total-Cost($)                     int64
Purchase-Cost($)                  int64
Food-Cost($)                      int64


In [14]:
# Split the frame into the target (`IsAdopted`) and the feature matrix.
# No encoding happens in this cell; it only selects columns.
y = data['IsAdopted']

# Columns excluded from the features: the target itself plus the identifier,
# name/breed/colour, timestamp, intake and outcome columns listed below.
columns_to_drop = [
    "IsAdopted",
    "AnimalID",
    "ID",
    "Name",
    "Breed",
    "Color",
    "Category",
    "Date-Of-Birth",
    "Intelligence-Category",
    "Cost-Category",
    "Intake-DateTime",
    "Intake-Type",
    "Intake-Condition",
    "Intake-Age(days)",
    "Outcome-DateTime",
    "Outcome-Type",
    "Outcome-Subtype",
    "Outcome-Age(days)",
]

# `drop` already returns a new frame, so no preliminary slice of `data` is needed.
X = data.drop(columns=columns_to_drop)

# NOTE(review): the rendered output's first header reads "Unnamed: 0". If that
# is a real column (a row index saved into the CSV) rather than the frame's
# index, it is still part of X and will be used as a feature. Confirm.
X

Unnamed: 0,Gender,Age,Intelligence-Ranking,Size-Category,Longevity,Total-Cost($),Purchase-Cost($),Food-Cost($)
0,Male,Adult,27,Large,9.00,18062,1725,5679
1,Male,Senior,27,Large,9.00,18062,1725,5679
2,Male,Senior,27,Large,9.00,18062,1725,5679
3,Male,Senior,6,Small,12.53,17469,465,3698
4,Female,Senior,7,Medium,12.04,18422,810,4819
...,...,...,...,...,...,...,...,...
85791,Female,Senior,67,Small,16.50,22640,588,4594
85792,Female,Senior,67,Small,16.50,22640,588,4594
85793,Female,Senior,59,Small,12.25,16073,650,2410
85794,Female,Baby,7,Medium,12.04,18422,810,4819


In [15]:
# Replace the descriptive (string) labels with numerical labels.

# Per-category row counts, kept for inspecting which labels occur in each column.
gender = X.groupby('Gender').size()
SizeCategory = X.groupby('Size-Category').size()
Age = X.groupby('Age').size()

# Label -> integer code for each column that is encoded here.
label_codes = {
    'Gender': {'Female': 0, 'Male': 1},
    'Size-Category': {'Large': 3, 'Medium': 2, 'Small': 1},
    'Age': {'Baby': 1, 'Young': 2, 'Adult': 3, 'Senior': 4},
}

# Assign the result back rather than calling `X[col].replace(..., inplace=True)`.
# That form mutates a temporary Series: pandas 2.2 warns about it, and under
# Copy-on-Write it leaves X unchanged, so the strings would never be encoded.
# Labels missing from `label_codes` are left as they are. `infer_objects`
# turns fully replaced columns into integer dtype, so the later
# `pd.get_dummies(X)` step does not one-hot encode them.
X = X.replace(label_codes).infer_objects()
X

Unnamed: 0,Gender,Age,Intelligence-Ranking,Size-Category,Longevity,Total-Cost($),Purchase-Cost($),Food-Cost($)
0,1,3,27,3,9.00,18062,1725,5679
1,1,4,27,3,9.00,18062,1725,5679
2,1,4,27,3,9.00,18062,1725,5679
3,1,4,6,1,12.53,17469,465,3698
4,0,4,7,2,12.04,18422,810,4819
...,...,...,...,...,...,...,...,...
85791,0,4,67,1,16.50,22640,588,4594
85792,0,4,67,1,16.50,22640,588,4594
85793,0,4,59,1,12.25,16073,650,2410
85794,0,1,7,2,12.04,18422,810,4819


In [16]:
# One-hot encode any categorical columns that are still present in X
X_encoded = pd.get_dummies(data=X)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    random_state=42,
    test_size=0.2,
)

In [17]:
# kNN is sensitive to feature scale, so the features are standardized first.
# The scaler is fitted on the training split only, then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [18]:
# Number of neighbours used by the classifier (worth trying other values)
n_neighbors = 3

# Build the kNN classifier and train it on the scaled training data; `fit`
# returns the estimator itself, so the chained call binds the fitted model.
knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train_scaled, y_train)
knn

KNeighborsClassifier(n_neighbors=3)

In [19]:
# Predict the `IsAdopted` label for every row of the scaled test set
y_pred = knn.predict(X_test_scaled)


In [20]:
# Overall accuracy of the predictions on the test set
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision, recall, F1 and support
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Accuracy: 0.5501748251748252
Classification Report:
               precision    recall  f1-score   support

       False       0.56      0.54      0.55      8742
        True       0.54      0.56      0.55      8418

    accuracy                           0.55     17160
   macro avg       0.55      0.55      0.55     17160
weighted avg       0.55      0.55      0.55     17160

