In [7]:
!pip install numpy pandas scikit-learn matplotlib joblib




In [8]:
from google.colab import files
uploaded = files.upload()   # use the upload dialog


Saving Iris.csv to Iris.csv


In [9]:
!python knn_task6.py --dataset /content/your.csv --save_model --output_dir outputs


python3: can't open file '/content/knn_task6.py': [Errno 2] No such file or directory


In [11]:
%%bash
cat > knn_task6.py <<'PY'
#!/usr/bin/env python3
"""
knn_task6.py

A self-contained K-Nearest Neighbors script:
- loads Iris (default) or your CSV (last column = label)
- uses StandardScaler + CV to pick best k
- trains final model, evaluates test set (accuracy, confusion matrix, classification report)
- creates plots (accuracy vs k, confusion matrix, PCA decision boundary)
- saves outputs to outputs/
"""
import argparse
import os
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

def load_dataset(path=None):
    if path:
        df = pd.read_csv(path)
        if df.shape[1] < 2:
            raise ValueError("CSV must have at least one feature column and one label column.")
        X = df.iloc[:, :-1].values
        y = df.iloc[:, -1].values
        feature_names = list(df.columns[:-1])
        return X, y, feature_names
    else:
        data = datasets.load_iris()
        return data.data, data.target, data.feature_names

def evaluate_k_values(X, y, k_values, cv=5):
    results = {}
    for k in k_values:
        pipe = Pipeline([('scaler', StandardScaler()), ('knn', KNeighborsClassifier(n_neighbors=k))])
        scores = cross_val_score(pipe, X, y, cv=cv, scoring='accuracy', n_jobs=-1)
        results[k] = float(np.mean(scores))
    return results

def train_and_evaluate(X, y, k, test_size=0.2, random_state=42):
    # stratify when multiple classes exist
    strat = y if len(np.unique(y)) > 1 else None
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state, stratify=strat)

    scaler = StandardScaler().fit(X_train)
    X_train_s = scaler.transform(X_train)
    X_test_s = scaler.transform(X_test)

    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(X_train_s, y_train)
    y_pred = clf.predict(X_test_s)

    acc = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return {
        'model': clf,
        'scaler': scaler,
        'accuracy': acc,
        'confusion_matrix': cm,
        'report': report,
        'X_train_s': X_train_s,
        'y_train': y_train,
        'X_test': X_test,
        'y_test': y_test,
        'y_pred': y_pred
    }

def plot_accuracy_vs_k(results, out_path):
    ks = sorted(results.keys())
    scores = [results[k] for k in ks]
    plt.figure(figsize=(6,4))
    plt.plot(ks, scores, marker='o')
    plt.title('CV Accuracy vs K')
    plt.xlabel('K')
    plt.ylabel('Mean CV Accuracy')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

def plot_confusion_matrix(cm, class_names, out_path):
    plt.figure(figsize=(6,5))
    plt.imshow(cm, interpolation='nearest')
    plt.title('Confusion Matrix')
    plt.colorbar()
    tick_marks = np.arange(len(class_names))
    plt.xticks(tick_marks, class_names, rotation=45)
    plt.yticks(tick_marks, class_names)

    thresh = cm.max() / 2. if cm.size else 0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

def plot_2d_decision_boundary(X_train_s, y_train, k, out_path):
    """
    Fit PCA(2) on scaled training data, fit a KNN on the PCA projection,
    and plot decision regions in the 2D PCA space.
    """
    pca = PCA(n_components=2)
    Xp = pca.fit_transform(X_train_s)

    knn2d = KNeighborsClassifier(n_neighbors=k)
    knn2d.fit(Xp, y_train)

    x_min, x_max = Xp[:, 0].min() - 1, Xp[:, 0].max() + 1
    y_min, y_max = Xp[:, 1].min() - 1, Xp[:, 1].max() + 1
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 200), np.linspace(y_min, y_max, 200))
    grid = np.c_[xx.ravel(), yy.ravel()]
    Z = knn2d.predict(grid).reshape(xx.shape)

    plt.figure(figsize=(7,5))
    plt.contourf(xx, yy, Z, alpha=0.3)
    plt.scatter(Xp[:, 0], Xp[:, 1], c=y_train)
    plt.title('Decision regions (PCA projection, training data)')
    plt.xlabel('PC1')
    plt.ylabel('PC2')
    plt.tight_layout()
    plt.savefig(out_path)
    plt.close()

def ensure_dir(d):
    if not os.path.exists(d):
        os.makedirs(d)

def parse_k_values(s):
    return [int(x) for x in s.split(',') if x.strip()]

def main():
    parser = argparse.ArgumentParser(description='KNN Task 6 — K-Nearest Neighbors classification')
    parser.add_argument('--dataset', type=str, default=None, help='Path to CSV dataset. Last column must be label. If omitted, uses Iris dataset.')
    parser.add_argument('--k_values', type=str, default='1,3,5,7,9', help='Comma-separated k values to evaluate')
    parser.add_argument('--cv', type=int, default=5, help='Cross-validation folds for k selection')
    parser.add_argument('--test_size', type=float, default=0.2, help='Test split fraction')
    parser.add_argument('--random_state', type=int, default=42)
    parser.add_argument('--output_dir', type=str, default='outputs')
    parser.add_argument('--save_model', action='store_true', help='Save the final model to outputs/')
    args = parser.parse_args()

    X, y, feature_names = load_dataset(args.dataset)
    ensure_dir(args.output_dir)

    k_list = parse_k_values(args.k_values)
    print(f"Evaluating K values: {k_list} with {args.cv}-fold CV...")
    results = evaluate_k_values(X, y, k_list, cv=args.cv)

    best_k = max(results, key=lambda k: results[k])
    print('CV results (k: mean_accuracy):')
    for k in sorted(results.keys()):
        print(f"  k={k}: {results[k]:.4f}")
    print(f"Best k by CV: {best_k}")

    plot_accuracy_vs_k(results, os.path.join(args.output_dir, 'accuracy_vs_k.png'))

    res = train_and_evaluate(X, y, best_k, test_size=args.test_size, random_state=args.random_state)
    print(f"Test accuracy with k={best_k}: {res['accuracy']:.4f}")
    print('Classification report:\n', res['report'])

    class_names = [str(c) for c in np.unique(y)]
    plot_confusion_matrix(res['confusion_matrix'], class_names, os.path.join(args.output_dir, 'confusion_matrix.png'))

    try:
        plot_2d_decision_boundary(res['X_train_s'], res['y_train'], best_k, os.path.join(args.output_dir, 'decision_boundary.png'))
    except Exception as e:
        print('Warning: decision boundary plot failed:', e)

    if args.save_model:
        model_path = os.path.join(args.output_dir, 'knn_model.joblib')
        joblib.dump({'model': res['model'], 'scaler': res['scaler']}, model_path)
        print('Saved model to', model_path)

    out_df = pd.DataFrame({
        'y_true': res['y_test'],
        'y_pred': res['y_pred']
         })
    out_df.to_csv(os.path.join(args.output_dir, 'predictions.csv'), index=False)
    print('Saved predictions to', os.path.join(args.output_dir, 'predictions.csv'))

if __name__ == '__main__':
    main()
PY


In [12]:
!python knn_task6.py --dataset mydata.csv --output_dir outputs --save_model --k_values 1,3,5,7,9 --cv 5


Traceback (most recent call last):
  File "/content/knn_task6.py", line 198, in <module>
    main()
  File "/content/knn_task6.py", line 158, in main
    X, y, feature_names = load_dataset(args.dataset)
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/content/knn_task6.py", line 29, in load_dataset
    df = pd.read_csv(path)
         ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/pandas/io/parsers/readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/pyth

In [13]:
from IPython.display import Image, display
import os, pandas as pd

for fname in ['outputs/accuracy_vs_k.png', 'outputs/confusion_matrix.png', 'outputs/decision_boundary.png']:
    if os.path.exists(fname):
        display(Image(fname))
    else:
        print(fname, "not found")

# preview predictions
if os.path.exists('outputs/predictions.csv'):
    display(pd.read_csv('outputs/predictions.csv').head())
else:
    print("outputs/predictions.csv not found")


outputs/accuracy_vs_k.png not found
outputs/confusion_matrix.png not found
outputs/decision_boundary.png not found
outputs/predictions.csv not found
