In [58]:
# Loading the dataset
import gdown
import pandas as pd

# Download dataset
url = "https://drive.google.com/uc?id=1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2"
output = "groceries.csv"
gdown.download(url, output, quiet=False)

# Load dataset
df = pd.read_csv(output)
print("✅ Loaded:", df.shape)
df.head()

Downloading...
From: https://drive.google.com/uc?id=1ZdhRqYv-JizWV6DxO6C4R_k1kxPhmlF2
To: /content/groceries.csv
100%|██████████| 14.6k/14.6k [00:00<00:00, 17.6MB/s]

✅ Loaded: (440, 8)





Unnamed: 0,Region,Fresh,Milk,Grocery,Frozen,Detergents_Paper,Delicassen,class
0,3,12669,9656,7561,214,2674,1338,2
1,3,7057,9810,9568,1762,3293,1776,2
2,3,6353,8808,7684,2405,3516,7844,2
3,3,13265,1196,4221,6404,507,1788,1
4,3,22615,5410,7198,3915,1777,5185,1


In [59]:
# Printing it's shape, first 5 rows, column's info(name of the colomns, dtypes and no.of non-null rows)
print("Shape of dataset:", df.shape)
print("\nFirst 5 rows:\n", df.head())
print("\nColumn info:\n", df.info())

Shape of dataset: (440, 8)

First 5 rows:
    Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen  class
0       3  12669  9656     7561     214              2674        1338      2
1       3   7057  9810     9568    1762              3293        1776      2
2       3   6353  8808     7684    2405              3516        7844      2
3       3  13265  1196     4221    6404               507        1788      1
4       3  22615  5410     7198    3915              1777        5185      1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 440 entries, 0 to 439
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype
---  ------            --------------  -----
 0   Region            440 non-null    int64
 1   Fresh             440 non-null    int64
 2   Milk              440 non-null    int64
 3   Grocery           440 non-null    int64
 4   Frozen            440 non-null    int64
 5   Detergents_Paper  440 non-null    int64
 6   Delicassen        440 non-

In [60]:
# prints no. of missing values / null values in each column
print("\nMissing values per column:\n", df.isna().sum())


Missing values per column:
 Region              0
Fresh               0
Milk                0
Grocery             0
Frozen              0
Detergents_Paper    0
Delicassen          0
class               0
dtype: int64


In [61]:
df = df.dropna()

In [62]:
# Simple cleaning: drop duplicates
before = df.shape[0]
df = df.drop_duplicates().reset_index(drop=True)
print("\nDropped duplicates:", before - df.shape[0])


Dropped duplicates: 0


In [63]:
print(df.shape)

(440, 8)


In [64]:
# Encoding (categorical -> numerical)
# Apply one-hot encoding to all categorical columns

# Step 1: Separate features and target first
TARGET_COL = df.columns[-1]   # assume last column is the target
X = df.drop(columns=[TARGET_COL])   # features only
y = df[TARGET_COL]                   # target column

# Step 2: One-hot encode ONLY features
X_encoded = pd.get_dummies(X, drop_first=True)
print("✅ Encoded features shape:", X_encoded.shape)
print(X_encoded.head())

# Step 3: Encode target separately if it's categorical
from sklearn.preprocessing import LabelEncoder
import numpy as np

if y.dtype == 'object' or str(y.dtype).startswith('category'):
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    print("\nTarget classes:", le.classes_)
    print("Encoded y:", np.unique(y_encoded))
else:
    y_encoded = y.values

✅ Encoded features shape: (440, 7)
   Region  Fresh  Milk  Grocery  Frozen  Detergents_Paper  Delicassen
0       3  12669  9656     7561     214              2674        1338
1       3   7057  9810     9568    1762              3293        1776
2       3   6353  8808     7684    2405              3516        7844
3       3  13265  1196     4221    6404               507        1788
4       3  22615  5410     7198    3915              1777        5185


In [65]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded  # keeps class distribution same
)
print("Train shape:", X_train.shape, " Test shape:", X_test.shape)

Train shape: (352, 7)  Test shape: (88, 7)


In [66]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit only on training set
X_train_scaled = scaler.fit_transform(X_train)

# Apply same transformation to test set
X_test_scaled = scaler.transform(X_test)

print("Scaled train shape:", X_train_scaled.shape)
print("Scaled test shape:", X_test_scaled.shape)

Scaled train shape: (352, 7)
Scaled test shape: (88, 7)


In [67]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)

print("Before SMOTE:", X_train.shape, "Class distribution:", dict(pd.Series(y_train).value_counts()))
print("After SMOTE:", X_train_resampled.shape, "Class distribution:", dict(pd.Series(y_train_resampled).value_counts()))

Before SMOTE: (352, 7) Class distribution: {2: np.int64(144), 3: np.int64(138), 1: np.int64(70)}
After SMOTE: (432, 7) Class distribution: {2: np.int64(144), 3: np.int64(144), 1: np.int64(144)}


In [68]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))

In [69]:
class KNN_Scratch:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X, y):
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        predictions = [self._predict(x) for x in X]
        return np.array(predictions)

    def _predict(self, x):
        # Compute distances to all training points
        distances = [euclidean_distance(x, x_train) for x_train in self.X_train]

        # Get k nearest neighbors
        k_indices = np.argsort(distances)[:self.k]
        k_nearest_labels = [self.y_train[i] for i in k_indices]

        # Majority vote
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

In [70]:
# Initialize custom KNN
knn_scratch = KNN_Scratch(k=5)

# Train on resampled data
knn_scratch.fit(X_train_resampled, y_train_resampled)

# Predict on test set
y_pred_scratch = knn_scratch.predict(X_test_scaled)

In [71]:
from sklearn.metrics import accuracy_score
print("Scratch KNN Results")
print("Accuracy:", accuracy_score(y_test, y_pred_scratch))

Scratch KNN Results
Accuracy: 0.9318181818181818


In [72]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

knn_sklearn = KNeighborsClassifier(n_neighbors=5)
knn_sklearn.fit(X_train, y_train)
y_pred_sklearn = knn_sklearn.predict(X_test)
print("Sklearn KNN Accuracy:", accuracy_score(y_test, y_pred_sklearn))

Sklearn KNN Accuracy: 0.9545454545454546
