In [1]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [2]:
import pandas as pd

In [3]:
# Numeric columns are forwarded unchanged ('passthrough' is ColumnTransformer's
# marker for "no transformation").
num_processor = 'passthrough'

# One-hot encode categorical columns into a dense array; handle_unknown='ignore'
# keeps transform() from raising on categories that were not seen during fit.
# The dense-output keyword was renamed from ``sparse`` to ``sparse_output`` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# spelling first and fall back for older installations.
try:
    cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 only understands ``sparse``
    cat_processor = OneHotEncoder(sparse=False, handle_unknown='ignore')


In [4]:
# Column labels passed to pd.read_csv for the income files, in column order;
# the last entry is the label column.
names = [
    "age",
    "sector",
    "education",
    "marital-status",
    "occupation",
    "race",
    "sex",
    "hours-per-week",
    "country-of-origin",
    "target",
]

In [5]:
# Load the training split. Column names come from `names`, so the first line
# of the file is read as data (header=None is what pandas already does when
# `names` is supplied; it is spelled out here for clarity).
data = pd.read_csv(
    "income.train.txt.5k",
    header=None,
    names=names,
)

In [14]:
# Build one transformer step per column group. The two numeric columns share
# the 'num' step; every categorical column gets its own named one-hot step,
# and that step name becomes the prefix of the generated feature names.
numeric_columns = ['age', 'hours-per-week']
categorical_steps = [
    # (step name, source column)
    ('sector', 'sector'),
    ('education', 'education'),
    ('marital-status', 'marital-status'),
    ('occupation', 'occupation'),
    ('race', 'race'),
    ('sex', 'sex'),
    ('country', 'country-of-origin'),
]
preprocessor = ColumnTransformer(
    [('num', num_processor, numeric_columns)]
    + [(step, cat_processor, [column]) for step, column in categorical_steps]
)

# fit() returns the transformer itself, so fitting and encoding the training
# frame can be chained.
processed_data = preprocessor.fit(data).transform(data)



In [15]:
# Inspect the encoded training matrix (NumPy elides the middle rows/columns).
processed_data

array([[50., 13.,  0., ...,  1.,  0.,  0.],
       [38., 40.,  0., ...,  1.,  0.,  0.],
       [53., 40.,  0., ...,  1.,  0.,  0.],
       ...,
       [61.,  5.,  0., ...,  1.,  0.,  0.],
       [42., 40.,  0., ...,  1.,  0.,  0.],
       [21., 40.,  0., ...,  1.,  0.,  0.]])

In [18]:
# Once the ColumnTransformer has been fitted, get_feature_names_out() returns
# the name of every output column. Each name is prefixed with the step that
# produced it, e.g. 'num__age' or 'sector__sector_ Federal-gov'.
feature_names = preprocessor.get_feature_names_out()
feature_names

array(['num__age', 'num__hours-per-week', 'sector__sector_ Federal-gov',
       'sector__sector_ Local-gov', 'sector__sector_ Private',
       'sector__sector_ Self-emp-inc', 'sector__sector_ Self-emp-not-inc',
       'sector__sector_ State-gov', 'sector__sector_ Without-pay',
       'education__education_ 10th', 'education__education_ 11th',
       'education__education_ 12th', 'education__education_ 1st-4th',
       'education__education_ 5th-6th', 'education__education_ 7th-8th',
       'education__education_ 9th', 'education__education_ Assoc-acdm',
       'education__education_ Assoc-voc',
       'education__education_ Bachelors',
       'education__education_ Doctorate', 'education__education_ HS-grad',
       'education__education_ Masters', 'education__education_ Preschool',
       'education__education_ Prof-school',
       'education__education_ Some-college',
       'marital-status__marital-status_ Divorced',
       'marital-status__marital-status_ Married-AF-spouse',
      

In [17]:
# Number of output columns produced by the fitted preprocessor
# (feature_names is a 1-D array, so its first dimension is its length).
count = feature_names.shape[0]
count

92

In [33]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [34]:
# Load the dev split with the same column labels as the training split
# (header=None is what pandas already does when `names` is supplied).
dev_data = pd.read_csv(
    "income.dev.txt",
    header=None,
    names=names,
)

In [55]:
# Encode the dev split with the already-fitted preprocessor (no re-fitting).
X_dev = processed_dev_data = preprocessor.transform(dev_data)

# Binary labels: 1 where the whitespace-stripped target equals '>50K'
# (treated as the positive class), 0 otherwise.
y_dev = dev_data["target"].str.strip().eq('>50K').astype(int).values

In [56]:
# Training features are the encoded training matrix; labels are 1 where the
# whitespace-stripped target equals '>50K' and 0 otherwise.
X_train = processed_data
y_train = data["target"].str.strip().eq('>50K').astype(int).values

In [57]:
# Fresh, independent accumulators for the k-NN sweep: one pair of lists for
# error rates and one pair for positive rates (training split, dev split).
training_error_rates, dev_error_rates = [], []
training_positive_rates, dev_positive_rates = [], []

In [58]:
# Spot check: look at one encoded training label (fourth from the end).
y_train[-4]

1

In [59]:
# Reset the accumulators in this cell so that re-running it does not append a
# second copy of every result to lists created in an earlier cell.
training_error_rates = []
dev_error_rates = []
training_positive_rates = []
dev_positive_rates = []

# Sweep the odd values k = 1, 3, ..., 99 (odd k presumably chosen to avoid tied
# votes between the two classes).
for k in range(1, 101, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    train_preds = knn.predict(X_train)
    dev_preds = knn.predict(X_dev)

    # Error rate is the complement of accuracy.
    train_error_rate = 1 - accuracy_score(y_train, train_preds)
    dev_error_rate = 1 - accuracy_score(y_dev, dev_preds)

    training_error_rates.append(train_error_rate)
    dev_error_rates.append(dev_error_rate)

    # Labels are encoded as 0/1, so the mean prediction is the fraction of
    # examples predicted positive. These lists were previously never filled.
    training_positive_rates.append(train_preds.mean())
    dev_positive_rates.append(dev_preds.mean())

    print(f"k={k}: Training Error Rate = {train_error_rate:.4f}, Dev Error Rate = {dev_error_rate:.4f}")

k=1: Training Error Rate = 0.0152, Dev Error Rate = 0.2690
k=3: Training Error Rate = 0.1300, Dev Error Rate = 0.2400
k=5: Training Error Rate = 0.1562, Dev Error Rate = 0.2340
k=7: Training Error Rate = 0.1684, Dev Error Rate = 0.2330
k=9: Training Error Rate = 0.1828, Dev Error Rate = 0.2220
k=11: Training Error Rate = 0.1882, Dev Error Rate = 0.2190
k=13: Training Error Rate = 0.1898, Dev Error Rate = 0.2170
k=15: Training Error Rate = 0.1924, Dev Error Rate = 0.2150
k=17: Training Error Rate = 0.1948, Dev Error Rate = 0.2210
k=19: Training Error Rate = 0.1966, Dev Error Rate = 0.2140
k=21: Training Error Rate = 0.1978, Dev Error Rate = 0.2170
k=23: Training Error Rate = 0.2028, Dev Error Rate = 0.2230
k=25: Training Error Rate = 0.2010, Dev Error Rate = 0.2210
k=27: Training Error Rate = 0.2078, Dev Error Rate = 0.2200
k=29: Training Error Rate = 0.2146, Dev Error Rate = 0.2130
k=31: Training Error Rate = 0.2094, Dev Error Rate = 0.2140
k=33: Training Error Rate = 0.2136, Dev Error

In [61]:
# Question - 2
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder


In [62]:
# Rescale the numeric columns into the range [0, 2] instead of passing them
# through raw.
# NOTE(review): the upper bound of 2 presumably matches the Manhattan distance
# between two different one-hot categories; confirm the intent.
num_processor = MinMaxScaler(feature_range=(0, 2))

# One-hot encode categorical columns into a dense array; handle_unknown='ignore'
# keeps transform() from raising on categories that were not seen during fit.
# The dense-output keyword was renamed from ``sparse`` to ``sparse_output`` in
# scikit-learn 1.2 and the old name was removed in 1.4, so try the current
# spelling first and fall back for older installations.
try:
    cat_processor = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 only understands ``sparse``
    cat_processor = OneHotEncoder(sparse=False, handle_unknown='ignore')

In [63]:
# Rebuild the column transformer with the current num_processor/cat_processor.
# The two numeric columns share the 'num' step; every categorical column gets
# its own named one-hot step, and that name prefixes its output features.
numeric_columns = ['age', 'hours-per-week']
categorical_steps = [
    # (step name, source column)
    ('sector', 'sector'),
    ('education', 'education'),
    ('marital-status', 'marital-status'),
    ('occupation', 'occupation'),
    ('race', 'race'),
    ('sex', 'sex'),
    ('country', 'country-of-origin'),
]
preprocessor = ColumnTransformer(
    [('num', num_processor, numeric_columns)]
    + [(step, cat_processor, [column]) for step, column in categorical_steps]
)

# fit() returns the transformer itself, so fitting and encoding the training
# frame can be chained.
processed_data = preprocessor.fit(data).transform(data)



In [64]:
# Training features are the re-encoded training matrix; labels are 1 where the
# whitespace-stripped target equals '>50K' and 0 otherwise.
X_train = processed_data
y_train = data["target"].str.strip().eq('>50K').astype(int).values

In [65]:
# Encode the dev split with the current (already-fitted) preprocessor and
# build its 0/1 labels ('>50K' is the positive class).
processed_dev_data = preprocessor.transform(dev_data)
X_dev = processed_dev_data
y_dev = (dev_data["target"].str.strip() == '>50K').astype(int).values

# Per-k accumulators, reset on every run of this cell.
training_error_rates = []
dev_error_rates = []
training_positive_rates = []
dev_positive_rates = []

# Sweep the odd values k = 1, 3, ..., 99 (odd k presumably chosen to avoid tied
# votes between the two classes).
for k in range(1, 101, 2):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)

    train_preds = knn.predict(X_train)
    dev_preds = knn.predict(X_dev)

    # Error rate is the complement of accuracy.
    train_error_rate = 1 - accuracy_score(y_train, train_preds)
    dev_error_rate = 1 - accuracy_score(y_dev, dev_preds)

    training_error_rates.append(train_error_rate)
    dev_error_rates.append(dev_error_rate)

    # Labels are encoded as 0/1, so the mean prediction is the fraction of
    # examples predicted positive. These lists were previously never filled.
    training_positive_rates.append(train_preds.mean())
    dev_positive_rates.append(dev_preds.mean())

    print(f"k={k}: Training Error Rate = {train_error_rate:.4f}, Dev Error Rate = {dev_error_rate:.4f}")

k=1: Training Error Rate = 0.0152, Dev Error Rate = 0.2370
k=3: Training Error Rate = 0.1152, Dev Error Rate = 0.1920
k=5: Training Error Rate = 0.1376, Dev Error Rate = 0.1830
k=7: Training Error Rate = 0.1410, Dev Error Rate = 0.1680
k=9: Training Error Rate = 0.1552, Dev Error Rate = 0.1590
k=11: Training Error Rate = 0.1634, Dev Error Rate = 0.1630
k=13: Training Error Rate = 0.1634, Dev Error Rate = 0.1640
k=15: Training Error Rate = 0.1642, Dev Error Rate = 0.1570
k=17: Training Error Rate = 0.1660, Dev Error Rate = 0.1610
k=19: Training Error Rate = 0.1690, Dev Error Rate = 0.1640
k=21: Training Error Rate = 0.1702, Dev Error Rate = 0.1590
k=23: Training Error Rate = 0.1708, Dev Error Rate = 0.1540
k=25: Training Error Rate = 0.1696, Dev Error Rate = 0.1500
k=27: Training Error Rate = 0.1692, Dev Error Rate = 0.1560
k=29: Training Error Rate = 0.1700, Dev Error Rate = 0.1520
k=31: Training Error Rate = 0.1706, Dev Error Rate = 0.1520
k=33: Training Error Rate = 0.1712, Dev Error

In [66]:
# top-3 version: repeat the k sweep for two distance metrics and keep the
# distances from every dev example to its three nearest training points.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Preprocessing: encode the dev split with the already-fitted preprocessor and
# build its 0/1 labels ('>50K' is the positive class).
processed_dev_data = preprocessor.transform(dev_data)
X_dev = processed_dev_data
y_dev = (dev_data["target"].str.strip() == '>50K').astype(int).values

# Flat accumulators, kept for compatibility. They hold all euclidean results
# first, followed by all manhattan results, in sweep order.
training_error_rates = []
dev_error_rates = []
training_positive_rates = []
dev_positive_rates = []

# The same numbers grouped by metric, so the two sweeps can be told apart
# without slicing the flat lists by position.
error_rates_by_metric = {}

# Distances to the three nearest training points, one entry per metric.
top_3_distances = {
    'euclidean': None,
    'manhattan': None
}

for metric in ['euclidean', 'manhattan']:
    per_metric = {'k': [], 'train_error': [], 'dev_error': []}
    error_rates_by_metric[metric] = per_metric

    for k in range(1, 101, 2):
        knn = KNeighborsClassifier(n_neighbors=k, metric=metric)
        knn.fit(X_train, y_train)

        train_preds = knn.predict(X_train)
        dev_preds = knn.predict(X_dev)

        # Error rate is the complement of accuracy.
        train_error_rate = 1 - accuracy_score(y_train, train_preds)
        dev_error_rate = 1 - accuracy_score(y_dev, dev_preds)

        training_error_rates.append(train_error_rate)
        dev_error_rates.append(dev_error_rate)

        # Labels are encoded as 0/1, so the mean prediction is the fraction of
        # examples predicted positive. These lists were previously never filled.
        training_positive_rates.append(train_preds.mean())
        dev_positive_rates.append(dev_preds.mean())

        per_metric['k'].append(k)
        per_metric['train_error'].append(train_error_rate)
        per_metric['dev_error'].append(dev_error_rate)

        print(f"Metric={metric}, k={k}: Training Error Rate = {train_error_rate:.4f}, Dev Error Rate = {dev_error_rate:.4f}")

        # With the k=3 model fitted, record the distances from each dev
        # example to its three nearest training points under this metric.
        if k == 3:
            distances, _ = knn.kneighbors(X_dev, n_neighbors=3)
            top_3_distances[metric] = distances

# Convenience aliases for the per-metric distance arrays.
euclidean_distances_top_3 = top_3_distances['euclidean']
manhattan_distances_top_3 = top_3_distances['manhattan']


Metric=euclidean, k=1: Training Error Rate = 0.0152, Dev Error Rate = 0.2370
Metric=euclidean, k=3: Training Error Rate = 0.1152, Dev Error Rate = 0.1920
Metric=euclidean, k=5: Training Error Rate = 0.1376, Dev Error Rate = 0.1830
Metric=euclidean, k=7: Training Error Rate = 0.1410, Dev Error Rate = 0.1680
Metric=euclidean, k=9: Training Error Rate = 0.1552, Dev Error Rate = 0.1590
Metric=euclidean, k=11: Training Error Rate = 0.1634, Dev Error Rate = 0.1630
Metric=euclidean, k=13: Training Error Rate = 0.1634, Dev Error Rate = 0.1640
Metric=euclidean, k=15: Training Error Rate = 0.1642, Dev Error Rate = 0.1570
Metric=euclidean, k=17: Training Error Rate = 0.1660, Dev Error Rate = 0.1610
Metric=euclidean, k=19: Training Error Rate = 0.1690, Dev Error Rate = 0.1640
Metric=euclidean, k=21: Training Error Rate = 0.1702, Dev Error Rate = 0.1590
Metric=euclidean, k=23: Training Error Rate = 0.1708, Dev Error Rate = 0.1540
Metric=euclidean, k=25: Training Error Rate = 0.1696, Dev Error Rate 

In [67]:
# Euclidean distances from each dev example (one per row) to its three nearest
# training points.
euclidean_distances_top_3

array([[0.33441929, 1.41527469, 1.41674697],
       [2.48462326, 2.4886983 , 2.49291722],
       [0.08219178, 0.16438356, 0.16554809],
       ...,
       [0.        , 0.02739726, 0.04915875],
       [1.41894847, 1.43473041, 1.4625771 ],
       [0.08219178, 0.21917808, 0.26530612]])

In [68]:
# Manhattan distances from each dev example (one per row) to its three nearest
# training points.
manhattan_distances_top_3

array([[0.38999161, 2.05479452, 2.10204082],
       [6.49035505, 6.5649986 , 6.57254683],
       [0.08219178, 0.16438356, 0.19066257],
       ...,
       [0.        , 0.02739726, 0.06821359],
       [2.15683534, 2.3212189 , 2.41515236],
       [0.08219178, 0.21917808, 0.26530612]])