In [None]:
# Fetch the HW1 data archive. -nc (no-clobber) skips the download when
# hw1-data.tgz already exists, so re-running this cell no longer leaves
# "hw1-data.tgz.1" copies that the tar step never reads.
! wget -nc https://classes.engr.oregonstate.edu/eecs/fall2023/ai534-400/unit1/hw1/hw1-data.tgz

--2023-10-22 00:23:47--  https://classes.engr.oregonstate.edu/eecs/fall2023/ai534-400/unit1/hw1/hw1-data.tgz
Resolving classes.engr.oregonstate.edu (classes.engr.oregonstate.edu)... 128.193.40.20
Connecting to classes.engr.oregonstate.edu (classes.engr.oregonstate.edu)|128.193.40.20|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 54557 (53K) [application/x-gzip]
Saving to: ‘hw1-data.tgz.1’


2023-10-22 00:23:48 (172 KB/s) - ‘hw1-data.tgz.1’ saved [54557/54557]



In [None]:
# Unpack the downloaded archive (x = extract, z = gunzip, v = list each file,
# f = archive name); the files land under ./hw1-data/.
! tar -xzvf hw1-data.tgz

hw1-data/random_output.py
hw1-data/validate.py
hw1-data/toy.txt
hw1-data/income.train.txt.5k
hw1-data/income.dev.txt
hw1-data/income.test.blind


In [None]:
import numpy as np

class KNN:
    """Brute-force k-nearest-neighbours classifier.

    Every query is compared against all stored training rows; the predicted
    label is the most frequent label among the ``k`` closest rows.

    Parameters
    ----------
    k : int, default 3
        Number of neighbours that vote. If ``k`` is larger than the number of
        training rows, all training rows vote.
    distance_metric : {'euclidean', 'manhattan'}, default 'euclidean'
        Distance used to rank training rows. The value is checked when a
        distance is actually computed, not at construction time.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        self.k = k
        self.distance_metric = distance_metric

    def fit(self, X, y):
        """Memorise the training set.

        Both inputs are stored as NumPy arrays so that later neighbour lookups
        are purely positional. Keeping a pandas Series for ``y`` would make
        ``y[k_indices]`` a label-based lookup, which fails (or silently picks
        the wrong rows) whenever the Series index is not 0..n-1.
        """
        self.X_train = np.asarray(X)
        self.y_train = np.asarray(y)

    def predict(self, X):
        """Return an array with one predicted label per row of ``X``."""
        return np.array([self._predict(x) for x in np.asarray(X)])

    def _distances(self, x):
        """Distance from the single query ``x`` to every training row.

        Raises
        ------
        ValueError
            If ``distance_metric`` is neither 'euclidean' nor 'manhattan'.
        """
        if self.distance_metric == 'euclidean':
            return np.linalg.norm(self.X_train - x, axis=1)
        if self.distance_metric == 'manhattan':
            return np.sum(np.abs(self.X_train - x), axis=1)
        raise ValueError("Invalid distance metric")

    def _k_nearest_indices(self, distances):
        """Indices of the ``k`` smallest entries of ``distances`` (unordered)."""
        n_train = distances.shape[0]
        if n_train == 0:
            raise ValueError("KNN has no training samples; call fit() with non-empty data")
        # np.argpartition requires kth < n_train. Clamping keeps the original
        # selection for k < n_train and lets k >= n_train fall back to
        # "every training row" instead of raising.
        kth = min(self.k, n_train - 1)
        return np.argpartition(distances, kth)[:self.k]

    def _predict(self, x):
        """Majority label among the nearest neighbours of one query ``x``."""
        k_indices = self._k_nearest_indices(self._distances(x))

        # np.unique returns labels in sorted order and argmax returns the first
        # maximum, so a tied vote goes to the smallest label.
        labels, counts = np.unique(self.y_train[k_indices], return_counts=True)
        return labels[np.argmax(counts)]

    def kneighbors(self, X, return_distance=True):
        """Find the nearest training rows for every query row in ``X``.

        Returns
        -------
        list
            One entry per query. With ``return_distance=True`` each entry is a
            ``(distances, indices)`` tuple, otherwise just ``indices``. The
            neighbours come from a partition, so they are NOT sorted by
            distance.
        """
        neighbors = []

        for x in np.asarray(X):
            distances = self._distances(x)
            k_indices = self._k_nearest_indices(distances)

            if return_distance:
                neighbors.append((distances[k_indices], k_indices))
            else:
                neighbors.append(k_indices)

        return neighbors


In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.compose import ColumnTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder

In [None]:
# Column names for the header-less income files; "target" holds the class label.
columns = ["age", "sector", "edu", "marriage", "occupation", "race", "sex", "hours", "country", "target"]
numeric_columns = ["age", "hours"]
categorical_columns = ["sector", "edu", "marriage", "occupation", "race", "sex", "country"]
# Positive-class label exactly as it is compared below (note the leading space).
positive_label = " >50K"

dev_data = pd.read_csv("hw1-data/income.dev.txt", sep=",", names=columns)
train_data = pd.read_csv("hw1-data/income.train.txt.5k", sep=",", names=columns)
dev_data_features = dev_data.drop('target', axis=1)
train_data_features = train_data.drop('target', axis=1)
dev_data_label = dev_data['target']
train_data_label = train_data['target']

# Numeric columns are scaled to [0, 2]; categorical columns are one-hot encoded.
# NOTE(review): the (0, 2) range presumably matches the distance of 2 between two
# different one-hot vectors under the Manhattan metric -- confirm.
num_processor = MinMaxScaler(feature_range=(0, 2))
cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('num', num_processor, numeric_columns),
    ('cat', cat_processor, categorical_columns),
])
# Fit on the training split only, then apply the same transform to both splits.
preprocessor.fit(train_data_features)
binary_train_data_features = preprocessor.transform(train_data_features)
binary_dev_data_features = preprocessor.transform(dev_data_features)

best_dev_error_rate = float('inf')
best_k = None
# Sweep odd k only, so a two-class vote cannot tie.
for k in range(1, 99, 2):
    knn = KNN(k=k, distance_metric='manhattan')
    # Hand KNN a plain array so its neighbour lookup is positional rather than
    # going through the Series index.
    knn.fit(binary_train_data_features, train_data_label.to_numpy())

    train_pred = knn.predict(binary_train_data_features)
    dev_pred = knn.predict(binary_dev_data_features)

    # Vectorised count of positive predictions (the builtin sum() walks the
    # NumPy array one element at a time in Python).
    train_positive_count = (train_pred == positive_label).sum()
    train_positive_rate = train_positive_count / len(train_pred) * 100

    dev_positive_count = (dev_pred == positive_label).sum()
    dev_positive_rate = dev_positive_count / len(dev_pred) * 100

    train_err = (1 - accuracy_score(train_data_label, train_pred)) * 100
    dev_err = (1 - accuracy_score(dev_data_label, dev_pred)) * 100

    # Strict "<" keeps the smallest k when several k share the best dev error.
    if dev_err < best_dev_error_rate:
        best_dev_error_rate = dev_err
        best_k = k

    print(f"k={k} train_err: {train_err:.2f}% (+: {train_positive_rate:.2f}%) dev_err: {dev_err:.2f}% (+: {dev_positive_rate:.2f}%)")
print(f"\nBest development error rate: {best_dev_error_rate:.2f}% for k={best_k}")

k=1 train_err: 1.52% (+: 25.06%) dev_err: 24.00% (+: 27.20%)
k=3 train_err: 11.64% (+: 23.94%) dev_err: 19.70% (+: 26.10%)
k=5 train_err: 14.00% (+: 23.86%) dev_err: 17.60% (+: 25.00%)
k=7 train_err: 14.54% (+: 23.96%) dev_err: 16.60% (+: 24.00%)
k=9 train_err: 15.46% (+: 23.60%) dev_err: 16.20% (+: 22.20%)
k=11 train_err: 16.16% (+: 23.42%) dev_err: 16.30% (+: 21.90%)
k=13 train_err: 16.48% (+: 23.58%) dev_err: 16.50% (+: 22.30%)
k=15 train_err: 16.82% (+: 22.80%) dev_err: 16.30% (+: 21.70%)
k=17 train_err: 16.86% (+: 22.84%) dev_err: 15.50% (+: 21.10%)
k=19 train_err: 16.96% (+: 22.46%) dev_err: 16.00% (+: 21.00%)
k=21 train_err: 16.96% (+: 22.22%) dev_err: 16.60% (+: 21.20%)
k=23 train_err: 17.08% (+: 22.26%) dev_err: 16.20% (+: 21.60%)
k=25 train_err: 17.04% (+: 22.10%) dev_err: 15.70% (+: 21.10%)
k=27 train_err: 16.90% (+: 21.84%) dev_err: 15.80% (+: 20.60%)
k=29 train_err: 16.76% (+: 21.34%) dev_err: 16.00% (+: 20.60%)
k=31 train_err: 16.98% (+: 21.24%) dev_err: 15.90% (+: 20.50%

In [None]:
# Number of columns produced by the fitted preprocessor (scaled numeric columns
# plus one one-hot column per category value seen during fit).
len(preprocessor.get_feature_names_out())

92

In [None]:
# Show the generated output column names of the fitted preprocessor.
preprocessor.get_feature_names_out()

Distance verification: repeat the k sweep using Euclidean distance (the previous sweep used Manhattan distance).

In [None]:
# Column names for the header-less income files; "target" holds the class label.
columns = ["age", "sector", "edu", "marriage", "occupation", "race", "sex", "hours", "country", "target"]
numeric_columns = ["age", "hours"]
categorical_columns = ["sector", "edu", "marriage", "occupation", "race", "sex", "country"]
# Positive-class label exactly as it is compared below (note the leading space).
positive_label = " >50K"

dev_data = pd.read_csv("hw1-data/income.dev.txt", sep=",", names=columns)
train_data = pd.read_csv("hw1-data/income.train.txt.5k", sep=",", names=columns)
dev_data_features = dev_data.drop('target', axis=1)
train_data_features = train_data.drop('target', axis=1)
dev_data_label = dev_data['target']
train_data_label = train_data['target']

# Numeric columns are scaled to [0, 2]; categorical columns are one-hot encoded.
num_processor = MinMaxScaler(feature_range=(0, 2))
cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
preprocessor = ColumnTransformer([
    ('num', num_processor, numeric_columns),
    ('cat', cat_processor, categorical_columns),
])
# Fit on the training split only, then apply the same transform to both splits.
preprocessor.fit(train_data_features)
binary_train_data_features = preprocessor.transform(train_data_features)
binary_dev_data_features = preprocessor.transform(dev_data_features)

best_dev_error_rate = float('inf')
best_k = None
# Sweep odd k only, so a two-class vote cannot tie.
for k in range(1, 99, 2):
    knn = KNN(k=k, distance_metric='euclidean')
    # Hand KNN a plain array so its neighbour lookup is positional rather than
    # going through the Series index.
    knn.fit(binary_train_data_features, train_data_label.to_numpy())

    train_pred = knn.predict(binary_train_data_features)
    dev_pred = knn.predict(binary_dev_data_features)

    # Vectorised count of positive predictions (the builtin sum() walks the
    # NumPy array one element at a time in Python).
    train_positive_count = (train_pred == positive_label).sum()
    train_positive_rate = train_positive_count / len(train_pred) * 100

    dev_positive_count = (dev_pred == positive_label).sum()
    dev_positive_rate = dev_positive_count / len(dev_pred) * 100

    train_err = (1 - accuracy_score(train_data_label, train_pred)) * 100
    dev_err = (1 - accuracy_score(dev_data_label, dev_pred)) * 100

    # Strict "<" keeps the smallest k when several k share the best dev error.
    if dev_err < best_dev_error_rate:
        best_dev_error_rate = dev_err
        best_k = k

    print(f"k={k} train_err: {train_err:.2f}% (+: {train_positive_rate:.2f}%) dev_err: {dev_err:.2f}% (+: {dev_positive_rate:.2f}%)")
print(f"\nBest development error rate: {best_dev_error_rate:.2f}% for k={best_k}")

k=1 train_err: 1.52% (+: 25.06%) dev_err: 23.90% (+: 27.30%)
k=3 train_err: 11.40% (+: 23.90%) dev_err: 19.60% (+: 25.40%)
k=5 train_err: 13.74% (+: 23.64%) dev_err: 17.80% (+: 24.40%)
k=7 train_err: 14.32% (+: 23.82%) dev_err: 16.60% (+: 24.00%)
k=9 train_err: 15.42% (+: 23.56%) dev_err: 15.30% (+: 22.50%)
k=11 train_err: 16.50% (+: 23.60%) dev_err: 16.40% (+: 21.20%)
k=13 train_err: 16.48% (+: 23.50%) dev_err: 16.70% (+: 21.90%)
k=15 train_err: 16.38% (+: 23.12%) dev_err: 15.90% (+: 21.90%)
k=17 train_err: 16.66% (+: 22.80%) dev_err: 15.80% (+: 21.60%)
k=19 train_err: 16.70% (+: 22.48%) dev_err: 16.40% (+: 20.80%)
k=21 train_err: 16.94% (+: 22.20%) dev_err: 16.20% (+: 21.00%)
k=23 train_err: 17.04% (+: 22.26%) dev_err: 15.50% (+: 21.70%)
k=25 train_err: 16.90% (+: 22.40%) dev_err: 15.60% (+: 21.40%)
k=27 train_err: 16.94% (+: 22.24%) dev_err: 15.60% (+: 20.80%)
k=29 train_err: 17.04% (+: 21.82%) dev_err: 15.10% (+: 21.10%)
k=31 train_err: 16.92% (+: 21.86%) dev_err: 15.20% (+: 21.20%