<a href="https://colab.research.google.com/github/sudipta12344/Assign-ment/blob/main/Untitled11.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Step 1: Import Required Libraries
import numpy as np
import pandas as pd
from collections import Counter

# For dataset and preprocessing
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# For comparison with scikit-learn's KNN and metrics
from sklearn.neighbors import KNeighborsClassifier as SklearnKNN
from sklearn.metrics import classification_report


In [5]:
# Step 2: Custom Evaluation Metrics

def accuracy_score(y_true, y_pred):
    """
    Calculate the accuracy: (correct predictions / total predictions).

    Parameters:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels, same shape as y_true.

    Returns:
        The fraction of positions where y_true and y_pred are equal
        (the NumPy mean of the element-wise comparison).

    Raises:
        ValueError: if y_true and y_pred do not have the same shape.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Without this check NumPy would broadcast e.g. 3 labels against 1
    # prediction and return a plausible-looking but meaningless score.
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    return np.mean(y_true == y_pred)

def confusion_matrix(y_true, y_pred):
    """
    Generate a confusion matrix as a pandas DataFrame.

    Rows correspond to actual labels and columns to predicted labels; both
    axes use the sorted union of the labels found in y_true and y_pred.

    Parameters:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels, same shape as y_true.

    Returns:
        pd.DataFrame of integer counts where entry [a, p] is the number of
        samples whose actual label is a and whose predicted label is p.

    Raises:
        ValueError: if y_true and y_pred do not have the same shape.
    """
    # Convert up front: comparing a plain Python list with a scalar yields a
    # single False rather than an element-wise mask, which would silently
    # produce an all-zero matrix.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, "
            f"got {y_true.shape} and {y_pred.shape}"
        )

    labels = np.unique(np.concatenate((y_true, y_pred)))
    matrix = np.zeros((len(labels), len(labels)), dtype=int)

    for i, actual in enumerate(labels):
        # The row mask does not depend on the column, so build it once per row
        actual_mask = y_true == actual
        for j, predicted in enumerate(labels):
            matrix[i, j] = np.sum(actual_mask & (y_pred == predicted))

    return pd.DataFrame(matrix, index=labels, columns=labels)

def precision_score(y_true, y_pred):
    """
    Calculate macro-averaged precision.

    Per-class precision is tp / (tp + fp), i.e. the diagonal of the confusion
    matrix divided by its column sums; the result is the unweighted mean over
    all classes.

    Parameters:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels.

    Returns:
        The mean of the per-class precisions. A class that was never
        predicted (empty column) contributes a precision of 0.
    """
    cm = confusion_matrix(np.asarray(y_true), np.asarray(y_pred)).values

    true_pos = np.diag(cm).astype(float)
    predicted_totals = cm.sum(axis=0)  # tp + fp for each class

    # Exact division instead of adding a small epsilon to the denominator:
    # the epsilon skews every class slightly (a perfect classifier would score
    # just under 1.0). Classes with an empty column keep the 0 from `out`.
    per_class = np.divide(
        true_pos,
        predicted_totals,
        out=np.zeros_like(true_pos),
        where=predicted_totals > 0,
    )
    return np.mean(per_class)

def recall_score(y_true, y_pred):
    """
    Calculate macro-averaged recall.

    Per-class recall is tp / (tp + fn), i.e. the diagonal of the confusion
    matrix divided by its row sums; the result is the unweighted mean over
    all classes.

    Parameters:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels.

    Returns:
        The mean of the per-class recalls. A class with no actual samples
        (empty row) contributes a recall of 0.
    """
    cm = confusion_matrix(np.asarray(y_true), np.asarray(y_pred)).values

    true_pos = np.diag(cm).astype(float)
    actual_totals = cm.sum(axis=1)  # tp + fn for each class

    # Exact division instead of adding a small epsilon to the denominator:
    # the epsilon skews every class slightly (a perfect classifier would score
    # just under 1.0). Classes with an empty row keep the 0 from `out`.
    per_class = np.divide(
        true_pos,
        actual_totals,
        out=np.zeros_like(true_pos),
        where=actual_totals > 0,
    )
    return np.mean(per_class)

def f1_score(y_true, y_pred):
    """
    Calculate macro-averaged F1-score.

    The F1 of each class is computed from the confusion matrix as
    2*tp / (2*tp + fp + fn) and the per-class values are then averaged with
    equal weight. This differs from taking the harmonic mean of the
    macro-averaged precision and recall, which is a different quantity and
    does not match the "macro avg" row of scikit-learn's classification
    report.

    Parameters:
        y_true: array-like of ground-truth labels.
        y_pred: array-like of predicted labels.

    Returns:
        The unweighted mean of the per-class F1 scores.
    """
    cm = confusion_matrix(np.asarray(y_true), np.asarray(y_pred)).values

    true_pos = np.diag(cm).astype(float)
    # Row sum is tp + fn, column sum is tp + fp, so together: 2*tp + fp + fn
    denominators = cm.sum(axis=1) + cm.sum(axis=0)

    per_class = np.divide(
        2.0 * true_pos,
        denominators,
        out=np.zeros_like(true_pos),
        where=denominators > 0,
    )
    return np.mean(per_class)


In [6]:
# Step 3: KNN Classifier Implementation from Scratch

class KNNClassifier:
    """
    k-nearest-neighbours classifier using Euclidean distance and a
    majority vote among the k closest training samples.
    """

    def __init__(self, k=3):
        """
        Initialize the classifier with the number of neighbors (k).

        Raises:
            ValueError: if k is not a positive integer. Rejecting it here
                avoids an opaque IndexError from the vote at predict time.
        """
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = int(k)

    def fit(self, X, y):
        """
        Store the training data.

        Parameters:
            X: 2-D array-like, one row per training sample.
            y: array-like of labels, one per row of X.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if X is not 2-D, is empty, or X and y differ in length.
        """
        X = np.array(X)
        y = np.array(y)

        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (samples x features), got ndim={X.ndim}")
        if X.shape[0] == 0:
            raise ValueError("Cannot fit on an empty training set")
        if y.shape[0] != X.shape[0]:
            raise ValueError(
                f"X and y must contain the same number of samples, "
                f"got {X.shape[0]} and {y.shape[0]}"
            )

        self.X_train = X
        self.y_train = y
        return self

    def predict(self, X):
        """
        Predict the class labels for the given test data.

        Parameters:
            X: array-like, iterated row by row; each row is classified
               independently.

        Returns:
            np.ndarray with one predicted label per row of X.
        """
        X = np.array(X)
        return np.array([self._predict_single(x) for x in X])

    def _predict_single(self, x):
        """
        Predict the class label for a single test instance.
        """
        # Compute Euclidean distances from x to all training points
        distances = np.linalg.norm(self.X_train - x, axis=1)

        # Indices of the k nearest neighbors. A stable sort keeps equidistant
        # points in training order, so the result is deterministic.
        # If k exceeds the training-set size the slice uses every point.
        k_indices = np.argsort(distances, kind="stable")[:self.k]

        # Get the labels of the k nearest neighbors (nearest first)
        k_nearest_labels = self.y_train[k_indices]

        # Return the most common label; Counter orders equal counts by first
        # occurrence, so a tied vote goes to the label seen nearest to x.
        most_common = Counter(k_nearest_labels).most_common(1)[0][0]
        return most_common


In [7]:
# Step 4: Load and Prepare the Iris Dataset

# Fetch the Iris dataset bundled with scikit-learn
iris = load_iris()

# Materialise the feature matrix and the target vector as NumPy arrays
X_iris, y_iris = np.array(iris.data), np.array(iris.target)

# (Optional) Label encoding is skipped here because the labels are already numeric.
# For a dataset with string labels, the LabelEncoder imported in Step 1 can be used:
# y_iris = LabelEncoder().fit_transform(iris.target)


In [12]:
# STEP 5: Load Your Own News Dataset (Upload CSV)
# google.colab only exists inside Colab, so it is imported in this cell
# rather than with the general imports in Step 1.
from google.colab import files
from sklearn.feature_extraction.text import CountVectorizer

uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded: stop here with a clear message instead of an
    # IndexError on the filename lookup below.
    raise RuntimeError("No file was uploaded; re-run this cell and choose a CSV file.")

# Suppose your file is "news.csv" with 'text' and 'label' columns
news_filename = next(iter(uploaded))
news_raw = pd.read_csv(news_filename)

missing_columns = {"text", "label"} - set(news_raw.columns)
if missing_columns:
    raise ValueError(
        f"Uploaded CSV is missing required column(s): {sorted(missing_columns)}"
    )

# Drop rows containing missing values, then encode the labels as integers.
# New frames are built instead of mutating in place, so the raw data stays
# available in `news_raw`.
news_clean = news_raw.dropna()
if news_clean.empty:
    raise ValueError("No rows left after dropping rows with missing values.")
news_df = news_clean.assign(label=LabelEncoder().fit_transform(news_clean["label"]))

# For simplicity, use bag-of-words term counts (can replace with TF-IDF).
# The matrix is densified with .toarray() before it is handed to the KNN code.
vec = CountVectorizer(max_features=1000)
X_news = vec.fit_transform(news_df["text"]).toarray()
y_news = news_df["label"].values


KeyboardInterrupt: 

In [None]:
# STEP 6: Function to evaluate a dataset
def evaluate_knn(X, y, dataset_name="Dataset", max_k=10):
    """
    Search over k and the test-set fraction for the custom KNN classifier,
    then print metrics for the best combination and compare with scikit-learn.

    Parameters:
        X: feature matrix passed to train_test_split.
        y: labels passed to train_test_split.
        dataset_name: name used in the printed header.
        max_k: largest k to try; every k from 1 to max_k inclusive is tested.

    Returns:
        None. All results are printed.

    Note:
        The best (k, split) pair is chosen by accuracy on the test portion,
        and the final metrics are reported on that same test portion, so the
        printed scores are optimistic estimates. The first combination that
        reaches the highest accuracy wins (later ties do not replace it).
    """
    print(f"\nEvaluating on {dataset_name}")
    best_acc = 0
    best_k = 1
    best_split = 0.2
    for split in [0.2, 0.3, 0.4]:
        # With a fixed random_state the partition depends only on `split`,
        # so it is computed once per test size rather than once per k.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=42)
        for k in range(1, max_k+1):
            knn = KNNClassifier(k=k)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_test)

            acc = accuracy_score(y_test, y_pred)
            if acc > best_acc:
                best_acc = acc
                best_k = k
                best_split = split

    print(f"\nBest Accuracy = {best_acc:.4f} with k={best_k}, split={best_split}")
    # Final evaluation: rebuild the winning split (same seed, same partition)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=best_split, random_state=42)
    knn = KNNClassifier(k=best_k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)

    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
    print(f"Precision: {precision_score(y_test, y_pred):.4f}")
    print(f"Recall: {recall_score(y_test, y_pred):.4f}")
    print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")

    # Compare with sklearn's KNN on the same split and the same k
    print("\n--- Sklearn KNN ---")
    sk_knn = SklearnKNN(n_neighbors=best_k)
    sk_knn.fit(X_train, y_train)
    sk_pred = sk_knn.predict(X_test)
    print(classification_report(y_test, sk_pred))


In [None]:
# STEP 7: Run Evaluation
# Requires X_iris / y_iris from Step 4 and X_news / y_news from Step 5.
evaluate_knn(
    X_iris,
    y_iris,
    dataset_name="Iris Dataset",
    max_k=10,
)
evaluate_knn(
    X_news,
    y_news,
    dataset_name="News Dataset",
    max_k=10,
)
