**KNN from Scratch**

**Step 1: Load the data**

In [2]:
import pandas as pd
import numpy as np

# Load the diamonds dataset
file_path = r"C:\Users\DELL\Downloads\diamonds (1).csv"
df = pd.read_csv(file_path)

# Display basic information about the dataset
print("Dataset Shape:", df.shape)
print("\nFirst few rows:")
print(df.head())
print("\nDataset Info:")
print(df.info())
print("\nBasic Statistics:")
print(df.describe())

Dataset Shape: (53940, 10)

First few rows:
   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-nu

**Step 2: Identify input and output variables**


In [3]:
# Target variable: price
# Features: all columns except price

X = df.drop('price', axis=1)
y = df['price']

print("Features shape:", X.shape)
print("Target shape:", y.shape)
print("\nFeature columns:", X.columns.tolist())

Features shape: (53940, 9)
Target shape: (53940,)

Feature columns: ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z']


**Step 3: Split the data (75:25 split)**

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

Training set shape: (40455, 9)
Test set shape: (13485, 9)


**Step 4: Data Preprocessing on X_train**

In [5]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Identify column types
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()

print("Categorical columns:", categorical_cols)
print("Numeric columns:", numeric_cols)

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_cols)  # Set sparse=False
    ])

# Fit and transform training data
X_train_processed = preprocessor.fit_transform(X_train)

print("\nProcessed training data shape:", X_train_processed.shape)
print("Type:", type(X_train_processed))

Categorical columns: ['cut', 'color', 'clarity']
Numeric columns: ['carat', 'depth', 'table', 'x', 'y', 'z']

Processed training data shape: (40455, 26)
Type: <class 'numpy.ndarray'>




**Step 5: Data Preprocessing on X_test**

In [6]:
# Transform test data using same preprocessor
X_test_processed = preprocessor.transform(X_test)

print("Processed test data shape:", X_test_processed.shape)
print("Type:", type(X_test_processed))

Processed test data shape: (13485, 26)
Type: <class 'numpy.ndarray'>


**Step 6: Build KNN model from scratch and predict**

In [7]:
import numpy as np

class UltraMemoryEfficientKNN:
    def __init__(self, k=5, train_batch_size=5000):
        """
        Initialize KNN with ultra-low memory usage
        k: number of nearest neighbors
        train_batch_size: process training data in chunks
        """
        self.k = k
        self.train_batch_size = train_batch_size
    
    def fit(self, X, y):
        """Store training data"""
        self.X_train = np.array(X, dtype=np.float32)
        self.y_train = np.array(y, dtype=np.float32)
        return self
    
    def predict(self, X):
        """Predict by processing one test sample and training batches at a time"""
        X_test = np.array(X, dtype=np.float32)
        n_test = X_test.shape[0]
        n_train = self.X_train.shape[0]
        predictions = np.zeros(n_test, dtype=np.float32)
        
        # Process ONE test sample at a time
        for i in range(n_test):
            all_distances = np.zeros(n_train, dtype=np.float32)
            
            # Calculate distances in training batches to save memory
            for j in range(0, n_train, self.train_batch_size):
                batch_end = min(j + self.train_batch_size, n_train)
                X_train_batch = self.X_train[j:batch_end]
                
                # Distance for this batch: (batch_size,)
                distances_batch = np.sqrt(
                    np.sum((X_test[i] - X_train_batch) ** 2, axis=1)
                )
                all_distances[j:batch_end] = distances_batch
            
            # Get k nearest neighbors from all distances
            nearest_indices = np.argpartition(all_distances, self.k)[:self.k]
            
            # Predict by averaging
            predictions[i] = np.mean(self.y_train[nearest_indices])
            
            # Progress update
            if (i + 1) % 500 == 0:
                print(f"Processed {i + 1}/{n_test} test samples...", end='\r')
        
        return predictions

# --- Train and Predict ---
print("Training KNN model...")
knn_model = UltraMemoryEfficientKNN(k=5, train_batch_size=5000)  # Adjust this lower if needed
knn_model.fit(X_train_processed, y_train)

print("\nMaking predictions (ultra memory efficient)...")
y_pred_scratch = knn_model.predict(X_test_processed)

print("\nSample predictions:", y_pred_scratch[:5])

Training KNN model...

Making predictions (ultra memory efficient)...
Processed 13000/13485 test samples...
Sample predictions: [ 586.4 2427.6 1083.4 1196.2 9642.2]


**Step 7: Evaluate your model**

In [8]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

mse = mean_squared_error(y_test, y_pred_scratch)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred_scratch)
r2 = r2_score(y_test, y_pred_scratch)

print(f"\n=== Ultra Memory-Efficient KNN ===")
print(f"RMSE: {rmse:.2f} | MAE: {mae:.2f} | R²: {r2:.4f}")


=== Ultra Memory-Efficient KNN ===
RMSE: 785.87 | MAE: 408.18 | R²: 0.9607


**Step 8: Compare with sklearn KNN**

In [10]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

cat_cols = ["cut", "color", "clarity"]
num_cols = [c for c in X.columns if c not in cat_cols]

preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse=False), cat_cols),
    ("num", StandardScaler(), num_cols)
])

X_train_p = preprocessor.fit_transform(X_train)
X_test_p  = preprocessor.transform(X_test)




In [17]:
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Apply PCA to reduce dimensionality for faster execution
pca = PCA(n_components=5)

X_train_pca = pca.fit_transform(X_train_p[:1000])
X_test_pca  = pca.transform(X_test_p[:50])

# Train sklearn KNN model
knn_sklearn = KNeighborsRegressor(n_neighbors=3)
y_pred_sklearn = knn_sklearn.fit(
    X_train_pca, y_train.values[:1000]
).predict(X_test_pca)

# Calculate RMSE for sklearn KNN
rmse_sklearn = np.sqrt(
    mean_squared_error(y_test.values[:50], y_pred_sklearn)
)

# Comparison output
print("Scratch KNN RMSE : (computed in Step 7)")
print("Sklearn KNN RMSE :", rmse_sklearn)


Scratch KNN RMSE : (computed in Step 7)
Sklearn KNN RMSE : 1395.8127763660377
