# K-Nearest Neighbors (KNN) Algorithm Implementation

This script demonstrates a custom KNN implementation with detailed preprocessing,
hyperparameter tuning, and model evaluation.

## Key Features
- Custom distance calculation (Minkowski distance)
- Manual KNN implementation
- Hyperparameter tuning
- Preprocessing and feature engineering

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_openml
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score


In [2]:
titanic = fetch_openml("titanic", version=1, as_frame=True)
df = titanic.frame
df

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0000,0,0,24160,211.3375,B5,S,2,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.5500,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0000,1,2,113781,151.5500,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0000,1,2,113781,151.5500,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,"Zabour, Miss. Hileni",female,14.5000,1,0,2665,14.4542,,C,,328.0,
1305,3,0,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1306,3,0,"Zakarian, Mr. Mapriededer",male,26.5000,0,0,2656,7.2250,,C,,304.0,
1307,3,0,"Zakarian, Mr. Ortin",male,27.0000,0,0,2670,7.2250,,C,,,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1309 entries, 0 to 1308
Data columns (total 14 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   pclass     1309 non-null   int64   
 1   survived   1309 non-null   category
 2   name       1309 non-null   object  
 3   sex        1309 non-null   category
 4   age        1046 non-null   float64 
 5   sibsp      1309 non-null   int64   
 6   parch      1309 non-null   int64   
 7   ticket     1309 non-null   object  
 8   fare       1308 non-null   float64 
 9   cabin      295 non-null    object  
 10  embarked   1307 non-null   category
 11  boat       486 non-null    object  
 12  body       121 non-null    float64 
 13  home.dest  745 non-null    object  
dtypes: category(3), float64(3), int64(3), object(5)
memory usage: 116.8+ KB


Encoding and manupulating the df ,droping unnecessary columns

In [4]:
df.describe()

Unnamed: 0,pclass,age,sibsp,parch,fare,body
count,1309.0,1046.0,1309.0,1309.0,1308.0,121.0
mean,2.294882,29.881135,0.498854,0.385027,33.295479,160.809917
std,0.837836,14.4135,1.041658,0.86556,51.758668,97.696922
min,1.0,0.1667,0.0,0.0,0.0,1.0
25%,2.0,21.0,0.0,0.0,7.8958,72.0
50%,3.0,28.0,0.0,0.0,14.4542,155.0
75%,3.0,39.0,1.0,0.0,31.275,256.0
max,3.0,80.0,8.0,9.0,512.3292,328.0


In [5]:
df.drop(['name','ticket','cabin','boat','body','home.dest'],axis=1,inplace=True)

Imputing for age 

In [6]:
# Group by Pclass and Sex, then fill missing Age with group median
df["age"] = df.groupby(["pclass", "sex"])["age"].transform(
    lambda x: x.fillna(x.median())
)

  df["age"] = df.groupby(["pclass", "sex"])["age"].transform(


In [7]:
df["age_missing"] = df["age"].isnull().astype(int)

In [8]:
df=df.dropna()

In [12]:
df['survived']=df['survived'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['survived']=df['survived'].astype(int)


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1306 entries, 0 to 1308
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   pclass       1306 non-null   int64   
 1   survived     1306 non-null   int32   
 2   sex          1306 non-null   category
 3   age          1306 non-null   float64 
 4   sibsp        1306 non-null   int64   
 5   parch        1306 non-null   int64   
 6   fare         1306 non-null   float64 
 7   embarked     1306 non-null   category
 8   age_missing  1306 non-null   int32   
dtypes: category(2), float64(2), int32(2), int64(3)
memory usage: 74.2 KB


In [14]:
df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,embarked,age_missing
0,1,1,female,29.0000,0,0,211.3375,S,0
1,1,1,male,0.9167,1,2,151.5500,S,0
2,1,0,female,2.0000,1,2,151.5500,S,0
3,1,0,male,30.0000,1,2,151.5500,S,0
4,1,0,female,25.0000,1,2,151.5500,S,0
...,...,...,...,...,...,...,...,...,...
1304,3,0,female,14.5000,1,0,14.4542,C,0
1305,3,0,female,22.0000,1,0,14.4542,C,0
1306,3,0,male,26.5000,0,0,7.2250,C,0
1307,3,0,male,27.0000,0,0,7.2250,C,0


In [15]:
df = pd.get_dummies(df, columns=['embarked'], prefix='Embarked',dtype=int)


In [16]:
df['sex'] = df['sex'].map({'male':1,'female':0})

In [17]:
df

Unnamed: 0,pclass,survived,sex,age,sibsp,parch,fare,age_missing,Embarked_C,Embarked_Q,Embarked_S
0,1,1,0,29.0000,0,0,211.3375,0,0,0,1
1,1,1,1,0.9167,1,2,151.5500,0,0,0,1
2,1,0,0,2.0000,1,2,151.5500,0,0,0,1
3,1,0,1,30.0000,1,2,151.5500,0,0,0,1
4,1,0,0,25.0000,1,2,151.5500,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
1304,3,0,0,14.5000,1,0,14.4542,0,1,0,0
1305,3,0,0,22.0000,1,0,14.4542,0,1,0,0
1306,3,0,1,26.5000,0,0,7.2250,0,1,0,0
1307,3,0,1,27.0000,0,0,7.2250,0,1,0,0


## Minkowski Distance Calculation

Calculates the generalized distance between two feature vectors using the Minkowski distance metric.

---

### Mathematicamula

$$\

d(a, b) = \left(\sum_{i=1}^n |a_i - b_i|^p\right)^{\frac{1}{p}
\
$$\]}
\]p}

\]

---

### Parameters  
- **`a` (numpy.ndarray)**: First feature vector  
- **`b` (numpy.ndarray)**: Second feature vector  
- **`p` (float)**: Distance parameter  
  - \(p=1\): Manhattan distance  
  - \(p=2\): Euclidean distance  
  - \(p>2\): Generalized Minkowski distance  

---

### Returns  
- Calculated distance between vectors (**float**)  

---

### Notes  
- Supports various distance metrics through the **p** parameter  
- Generalizes distance calculation for different use cases  
ses
    """
 different use cases

In [18]:
def distance(a,b,p):#using minkowski distance
    n = a.shape[0]
    d=0
    for i in range(n):
        d+=abs(a[i]-b[i])**p
    return d**(1/p)

## K-Nearest Neighbors Prediction

Performs classification using the K-Nearest Neighbors algorithm with custom distance calculation.

---

### Algorithm Steps
1. Calculate distances between test and training points  
2. Find \(k\) nearest neighbors  
3. Perform majority voting to determine the class  

---

### Parameters  
- **`X_test` (numpy.ndarray)**: Test feature vectors  
- **`X_train` (numpy.ndarray)**: Training feature vectors  
- **`y_train` (numpy.ndarray)**: Training labels  
- **`distance_func` (callable)**: Distance calculation function  
- **`k` (int)**: Number of nearest neighbors  
- **`p` (float)**: Distance parameter  

---

### Returns  
- List of predicted labels for the test set  

---

### Key Characteristics  
- Uses majority voting for classification  
- Supports different distance metrics  
- Flexible neighbor count  


In [19]:
def predict(X_test,X_train,y_train,distance,k,p):
    preds=[]
    y_train = np.array(y_train)
    for i in X_test:
        distances=[]
        for j in X_train:
            distances.append(distance(i,j,p))
        k_indices = np.argsort(distances)[:k]
        y_label = y_train[k_indices]
        preds.append(np.argmax(np.bincount(y_label)))
    return preds

In [20]:
X=df.drop('survived',axis=1)
y=df['survived']

In [21]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_eval, X_test, y_eval, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

y_train = y_train.values
y_eval = y_eval.values
y_test = y_test.values

In [22]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_eval = scaler.transform(X_eval)
X_test = scaler.transform(X_test)

In [23]:
k_values = [1, 3, 5, 7, 9, 11, 13]  # Test odd k to avoid ties in binary classification
p_values = [1, 2, 3, 4, 5]         # p=1 (Manhattan), p=2 (Euclidean), etc.

In [24]:

best_accuracy = 0
best_k = None
best_p = None

for k in k_values:
    for p in p_values:
        # Predict on eval set
        y_pred = predict(X_eval, X_train, y_train, distance, k, p)
        
        # Compute accuracy
        acc = accuracy_score(y_eval, y_pred)
        print(f"k={k}, p={p}: Accuracy = {acc:.4f}")
        
        # Track best parameters
        if acc > best_accuracy:
            best_accuracy = acc
            best_k = k
            best_p = p

print(f"\nBest Parameters: k={best_k}, p={best_p} | Eval Accuracy: {best_accuracy:.4f}")

k=1, p=1: Accuracy = 0.7165
k=1, p=2: Accuracy = 0.6897
k=1, p=3: Accuracy = 0.7050
k=1, p=4: Accuracy = 0.6973
k=1, p=5: Accuracy = 0.7088
k=3, p=1: Accuracy = 0.7586
k=3, p=2: Accuracy = 0.7739
k=3, p=3: Accuracy = 0.7701
k=3, p=4: Accuracy = 0.7701
k=3, p=5: Accuracy = 0.7701
k=5, p=1: Accuracy = 0.7816
k=5, p=2: Accuracy = 0.7931
k=5, p=3: Accuracy = 0.7893
k=5, p=4: Accuracy = 0.7854
k=5, p=5: Accuracy = 0.7854
k=7, p=1: Accuracy = 0.7893
k=7, p=2: Accuracy = 0.8008
k=7, p=3: Accuracy = 0.8084
k=7, p=4: Accuracy = 0.7969
k=7, p=5: Accuracy = 0.7931
k=9, p=1: Accuracy = 0.7778
k=9, p=2: Accuracy = 0.7969
k=9, p=3: Accuracy = 0.8046
k=9, p=4: Accuracy = 0.8008
k=9, p=5: Accuracy = 0.7969
k=11, p=1: Accuracy = 0.7893
k=11, p=2: Accuracy = 0.7893
k=11, p=3: Accuracy = 0.7931
k=11, p=4: Accuracy = 0.7931
k=11, p=5: Accuracy = 0.8008
k=13, p=1: Accuracy = 0.7816
k=13, p=2: Accuracy = 0.7778
k=13, p=3: Accuracy = 0.7931
k=13, p=4: Accuracy = 0.7854
k=13, p=5: Accuracy = 0.7854

Best Para

In [26]:

y_pred_test = predict(X_test, X_train, y_train, distance, best_k, best_p)


test_acc = accuracy_score(y_test, y_pred_test)
print(f"Test Accuracy with k={best_k}, p={best_p}: {test_acc:.4f}")

Test Accuracy with k=7, p=3: 0.7863
