<a href="https://colab.research.google.com/github/sathwikkompalli1/ML-lab/blob/main/AP23110011582_Lab7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split   # splits a dataset into train and test subsets (called later with test_size=0.2)
from sklearn.preprocessing import LabelEncoder   # converts text labels to integers (not used in the cells visible here)
# The shell command below downloads the dataset from Google Drive using its file ID
# and saves it as groceries.csv in the working directory.
!gdown 1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2 -O groceries.csv
data = pd.read_csv("groceries.csv")   # load the downloaded CSV file into a DataFrame called data
# Quick sanity check: overall dimensions and the first few rows
print("Dataset shape:",data.shape)
print("First 5 rows:\n",data.head())

Downloading...
From: https://drive.google.com/uc?id=1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2
To: /content/groceries.csv
  0% 0.00/14.6k [00:00<?, ?B/s]100% 14.6k/14.6k [00:00<00:00, 37.1MB/s]
Dataset shape: (440, 8)
First 5 rows:
    Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen  class
0       3  12669  9656     7561     214              2674        1338      2
1       3   7057  9810     9568    1762              3293        1776      2
2       3   6353  8808     7684    2405              3516        7844      2
3       3  13265  1196     4221    6404               507        1788      1
4       3  22615  5410     7198    3915              1777        5185      1


In [None]:
# Preprocessing: separate the label column from the predictor columns
y = data["class"]                  # target variable
X = data.drop(columns=["class"])   # every remaining column is a feature


In [None]:
# Missing values: report how many null entries each column contains
print(data.isna().sum())


Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
class               0
dtype: int64


In [None]:
# Replace any missing feature value with the mean of its column
X = X.fillna(value=X.mean())


In [None]:
# Class imbalance: oversample with SMOTE so every class has the same number of rows
from imblearn.over_sampling import SMOTE

# NOTE(review): resampling happens on the full dataset here, i.e. before the
# train/test split performed later, so synthetic rows may be derived from rows
# that end up in the test split. Confirm whether that is intended.
smote = SMOTE(random_state=42)   # fixed seed keeps the synthetic samples repeatable
X_resampled, y_resampled = smote.fit_resample(X=X, y=y)

# Show the class counts on either side of the resampling step
print("Before SMOTE:", y.value_counts())
print("After SMOTE:", y_resampled.value_counts())


Before SMOTE: class
2    180
3    173
1     87
Name: count, dtype: int64
After SMOTE: class
2    180
1    180
3    180
Name: count, dtype: int64


In [None]:
# Feature scaling: standardise every feature column of the resampled data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_resampled)                    # learn per-column statistics
X_scaled = scaler.transform(X_resampled)   # apply them to the same rows


In [None]:
# Train/test split: hold out 20% of the scaled, resampled rows for evaluation.
# The fixed random_state makes the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled,
    y_resampled,
    test_size=0.2,
    random_state=42,
)


In [None]:
#manhattan
import numpy as np
from collections import Counter

def manhattan_distance(x1, x2):
    """Return the Manhattan (L1) distance between two points.

    The distance is the sum of the absolute element-wise differences
    between ``x1`` and ``x2``.
    """
    difference = x1 - x2
    return np.sum(np.absolute(difference))

class KNN_Scratch_Manhattan:
    """k-nearest-neighbours classifier using the Manhattan (L1) distance.

    ``fit`` only memorises the training data. ``predict`` labels each query
    row with the most frequent label among its ``k`` closest training rows.
    When several labels are equally frequent, the label of the closest
    neighbour among them wins.
    """

    def __init__(self, k=3):   # default k=3
        """Store the number of neighbours to consult.

        Raises:
            ValueError: if ``k`` is not a positive integer. A non-positive k
                would leave the majority vote with no neighbours to count.
        """
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = int(k)

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``.

        Raises:
            ValueError: if the training set is empty or ``X`` and ``y``
                contain a different number of entries.
        """
        features = np.array(X)
        labels = np.array(y)
        if len(features) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(features) != len(labels):
            raise ValueError(
                f"X and y have different lengths: {len(features)} != {len(labels)}"
            )
        self.X_train = features
        self.y_train = labels

    def predict(self, X):
        """Return a list with one predicted label per row of ``X``."""
        # np.asarray makes row-wise iteration work for DataFrames as well;
        # iterating a DataFrame directly would yield its column names.
        return [self._predict(x) for x in np.asarray(X)]

    def _predict(self, x):
        """Predict the label of a single sample ``x``."""
        n_train = len(self.X_train)

        # 1. Manhattan distance to every training row in one vectorised step.
        #    The reshape keeps this working when samples are plain scalars.
        distances = np.abs(self.X_train - x).reshape(n_train, -1).sum(axis=1)

        # 2. Indices of the k nearest neighbours. A stable sort keeps equally
        #    distant rows in training order, so results are deterministic.
        k_idx = np.argsort(distances, kind="stable")[:self.k]

        # 3. Majority vote over their labels (already ordered nearest first)
        most_common = Counter(self.y_train[k_idx]).most_common(1)
        return most_common[0][0]

# ---------------------------------------------
# Run the scratch classifier with a k chosen by the user
# ---------------------------------------------

# Read the neighbourhood size from the keyboard
k_value = int(input("Enter the value of k: "))

# Build the classifier with that k and let it memorise the training split
knn_scratch_manhattan = KNN_Scratch_Manhattan(k_value)
knn_scratch_manhattan.fit(X_train, y_train)

# Label every held-out sample
predictions = knn_scratch_manhattan.predict(X_test)

# Accuracy = fraction of held-out samples whose predicted label matches the true one
acc = np.mean(predictions == y_test)
print(f"KNN Scratch (Manhattan, k={k_value}) Accuracy:", acc)


Enter the value of k: 10
KNN Scratch (Manhattan, k=10) Accuracy: 0.8796296296296297


In [None]:
# Step-by-step KNN walkthrough for a single test sample.
from collections import Counter

# Take the first test sample
x_test = X_test[0]

# Manhattan distance from the sample to every training row, computed in one
# vectorised step instead of a Python-level loop
distances = np.abs(np.asarray(X_train) - x_test).sum(axis=1)

# Pair each distance with the label of the corresponding training row
dist_label = list(zip(distances, y_train))

# Sort in ascending order of distance (stable, so ties keep training order)
dist_label_sorted = sorted(dist_label, key=lambda pair: pair[0])

print("Distances with labels (sorted ascending):")
for d, label in dist_label_sorted[:10]:  # print top 10 closest
    print(f"Distance: {d:.2f}, Label: {label}")

# Ask user for k
k = int(input("Enter the value of k: "))
# Without this check, k == 0 leaves the vote empty (IndexError below) and a
# negative k makes the slice silently drop rows from the far end instead.
if k < 1:
    raise ValueError(f"k must be a positive integer, got {k}")

# Take k nearest neighbors
k_neighbors = dist_label_sorted[:k]
neighbor_labels = [label for _, label in k_neighbors]

# Predict class by majority vote
pred_class = Counter(neighbor_labels).most_common(1)[0][0]

print(f"Predicted class for the test sample: {pred_class}")


Distances with labels (sorted ascending):
Distance: 0.89, Label: 3
Distance: 1.06, Label: 3
Distance: 1.13, Label: 3
Distance: 1.27, Label: 3
Distance: 1.28, Label: 3
Distance: 1.45, Label: 3
Distance: 1.52, Label: 3
Distance: 1.69, Label: 3
Distance: 1.69, Label: 3
Distance: 1.78, Label: 3
Enter the value of k: 3
Predicted class for the test sample: 3


In [None]:
# Same definition as the manhattan_distance in the earlier KNN cell, repeated here.
def manhattan_distance(x1, x2):
    """Sum of absolute element-wise differences between x1 and x2 (L1 distance)."""
    return np.abs(x1 - x2).sum()

class KNN_Scratch_Auto:
    """Manhattan-distance KNN classifier that can choose k automatically.

    When ``k`` is left as ``None`` (or 0), the number of neighbours defaults
    to ``int(sqrt(n_train))``, where ``n_train`` is the number of training rows.
    """

    def __init__(self, k=None):
        """Store the requested number of neighbours (``None`` means automatic).

        Raises:
            ValueError: if ``k`` is negative. A negative k would make the
                neighbour slice silently drop rows from the far end.
        """
        if k is not None and k < 0:
            raise ValueError(f"k must be None or a positive integer, got {k!r}")
        self.k = k

    def fit(self, X, y):
        """Memorise the training samples ``X`` and their labels ``y``.

        Raises:
            ValueError: if the training set is empty or ``X`` and ``y``
                contain a different number of entries.
        """
        features = np.array(X)
        labels = np.array(y)
        if len(features) == 0:
            raise ValueError("cannot fit on an empty training set")
        if len(features) != len(labels):
            raise ValueError(
                f"X and y have different lengths: {len(features)} != {len(labels)}"
            )
        self.X_train = features
        self.y_train = labels

    def predict(self, X):
        """Return a NumPy array with one predicted label per row of ``X``."""
        n_train = len(self.X_train)

        # k does not depend on the query sample, so resolve it once.
        # If k not provided, take sqrt of total samples as default.
        k_val = self.k or int(np.sqrt(n_train))

        predictions = []
        # np.asarray makes row-wise iteration work for DataFrames as well;
        # iterating a DataFrame directly would yield its column names.
        for x in np.asarray(X):
            # Manhattan distance to every training row in one vectorised step.
            # The reshape keeps this working when samples are plain scalars.
            distances = np.abs(self.X_train - x).reshape(n_train, -1).sum(axis=1)

            # Stable sort: equally distant rows stay in training order, which
            # matches the ordering of a stable sort on (distance, label) pairs.
            nearest = np.argsort(distances, kind="stable")[:k_val]

            # Majority vote over the neighbour labels, nearest first
            pred_class = Counter(self.y_train[nearest]).most_common(1)[0][0]
            predictions.append(pred_class)
        return np.array(predictions)

# ---------------------------------------------
# Run the scratch classifier with an automatically chosen k
# ---------------------------------------------

# k=None lets the classifier fall back to sqrt(number of training rows)
knn_auto = KNN_Scratch_Auto(k=None)
knn_auto.fit(X_train, y_train)

# Label every held-out sample
predictions = knn_auto.predict(X_test)

# Accuracy = fraction of held-out samples whose predicted label matches the true one
accuracy = np.mean(predictions == y_test)
print("KNN Scratch (Manhattan) Accuracy:", accuracy)

KNN Scratch (Manhattan) Accuracy: 0.8888888888888888


In [None]:
# -------------------------------
# Step 3: KNN with Scikit-learn
# -------------------------------

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

if not data.empty:
    print("\n--- Step 3: KNN with Scikit-learn ---")

    # --- Preprocessing ---
    # Handle missing values
    X = X.fillna(X.mean())

    # Train/Test split FIRST, on the original rows. Resampling or fitting the
    # scaler before the split lets information from the held-out rows leak
    # into training (synthetic neighbours, scaling statistics) and inflates
    # the reported scores.
    X_train_raw, X_test_raw, y_train_raw, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Handle imbalance with SMOTE on the training portion only, so the test
    # set contains real samples only
    smote = SMOTE(random_state=42)
    X_res, y_res = smote.fit_resample(X_train_raw, y_train_raw)

    # Feature scaling: learn the statistics from the training rows and reuse
    # them unchanged for the test rows
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_res)
    X_test = scaler.transform(X_test_raw)
    y_train = y_res

    # --- Initialize KNN model ---
    knn_sklearn = KNeighborsClassifier(n_neighbors=5, metric='manhattan')  # or 'euclidean'

    # Fit the model
    knn_sklearn.fit(X_train, y_train)

    # Make predictions
    predictions_sklearn = knn_sklearn.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, predictions_sklearn)
    print(f"SKlearn Model Accuracy: {accuracy * 100:.2f}%")

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, predictions_sklearn, zero_division=0))

    # Confusion matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, predictions_sklearn))



--- Step 3: KNN with Scikit-learn ---
SKlearn Model Accuracy: 88.89%

Classification Report:
              precision    recall  f1-score   support

           1       0.79      0.97      0.87        34
           2       0.96      0.73      0.83        37
           3       0.95      0.97      0.96        37

    accuracy                           0.89       108
   macro avg       0.90      0.89      0.89       108
weighted avg       0.90      0.89      0.89       108


Confusion Matrix:
[[33  0  1]
 [ 9 27  1]
 [ 0  1 36]]
