In [1]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

In [2]:
# Read the data
names = ["age", "sector", "education", "marital-status", "occupation", "race", "sex", "hours-per-week", "country-of-origin", "target"]
data = pd.read_csv("income.train.txt.5k", names = names)
dev_data = pd.read_csv("income.dev.txt", names = names)

In [3]:
num_processor = MinMaxScaler(feature_range=(0, 2))
cat_processor = OneHotEncoder(sparse=False, handle_unknown='ignore')

preprocessor = ColumnTransformer([
           ('num', num_processor, ['age', 'hours-per-week']),
           ('sector', cat_processor, ['sector']),
           ('education', cat_processor, ['education']),
           ('marital-status', cat_processor, ['marital-status']),
           ('occupation', cat_processor, ['occupation']),
           ('race', cat_processor, ['race']),
           ('sex', cat_processor, ['sex']),
           ('country', cat_processor, ['country-of-origin']),
       ])
preprocessor.fit(data)
processed_data = preprocessor.transform(data)



In [4]:
X_train = processed_data
y_train = (data["target"].str.strip() == '>50K').astype(int).values

In [5]:
processed_dev_data = preprocessor.transform(dev_data)
X_dev = processed_dev_data
y_dev = (dev_data["target"].str.strip() == '>50K').astype(int).values

In [7]:
X_train

array([[0.90410959, 0.24489796, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.57534247, 0.79591837, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.98630137, 0.79591837, 0.        , ..., 1.        , 0.        ,
        0.        ],
       ...,
       [1.20547945, 0.08163265, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.68493151, 0.79591837, 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.10958904, 0.79591837, 0.        , ..., 1.        , 0.        ,
        0.        ]])

In [6]:
import numpy as np
from sklearn.metrics import accuracy_score
from collections import Counter

In [22]:
def euclidean_distance(X, p):
    #diff = X - p
    #return np.sqrt(np.sum(diff ** 2, axis = 1))
    return np.linalg.norm(X - p, axis=1)

In [23]:
def manhattan_distance(X, p):
    #return np.sum(np.abs(X - p), axis = 1)
     return np.linalg.norm(X - p, ord=1, axis=1)

In [1]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
knn.fit(X_train, y_train)
distances, indices = knn.kneighbors(X_dev[0])
# Print results
print("Top 3 closest individuals' indices:", indices [0])
print("Their Euclidean distances:", distances [0])
closest_people = X_train[indices[0]]
print(closest_people)

NameError: name 'X_train' is not defined

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=3, metric='euclidean' )
knn.fit(X_train, y_train)
euclidean_distances, _= knn.kneighbors (IX_dev[011)
knn_manhattan = KNeighborsClassifier(n_neighbors=3, metric='manhattan')
knn_manhattan. fit (X_train, y_train)
manhattan_distances, _ = knn_manhattan.kneighbors(X_dev[0])
print ("Sklearn Euclidean distances:",euclidean_distances [0])
print ("Sklearn Manhattan distances:",manhattan_distances [0])

In [27]:
def my_knn(X_train, y_train, x, k, distance_func):
    distances = distance_func(X_train, x) # compute the distances between X_train and the data point ehich we want to predict the label
    k_indices = np.argsort(distances)[:k] # Get the top k nearest neighbors
    k_nearest_labels = y_train[k_indices] # Get their corresponding labels from y_train.
    most_common = Counter(k_nearest_labels).most_common(1) # Majority voting
    return most_common[0][0]


In [36]:
import time
start_time = time.time()
for k in range(1, 101, 2):
    train_preds = []
    for i in range(X_train.shape[0]):
        pred = my_knn(X_train, y_train, X_train[i], k, manhattan_distance)
        train_preds.append(pred)
    dev_preds = [my_knn(X_train, y_train, X_dev[i], k, manhattan_distance) for i in range(X_dev.shape[0])]
    
    train_error_rate = 1 - accuracy_score(y_train, train_preds)
    dev_error_rate = 1 - accuracy_score(y_dev, dev_preds)
    
    print(f"k={k}: Training Error Rate = {train_error_rate:.4f}, Dev Error Rate = {dev_error_rate:.4f}")

# Calculate positive rates
    # train_positive_rate = sum(train_preds) / len(train_preds)
    # dev_positive_rate = sum(dev_preds) / len(dev_preds)
    
    # print(f"k={k}: Training Error Rate = {train_error_rate:.4f}, Dev Error Rate = {dev_error_rate:.4f}, Train Positive Rate = {train_positive_rate:.4f}, Dev Positive Rate = {dev_positive_rate:.4f}")
end_time = time.time()
elapsed_time = end_time - start_time

print(f"Elapsed time: {elapsed_time:.2f} seconds")


k=41: Training Error Rate = 0.1728, Dev Error Rate = 0.1410
k=41: Training Error Rate = 0.1728, Dev Error Rate = 0.1410, Train Positive Rate = 0.2110, Dev Positive Rate = 0.2070
k=43: Training Error Rate = 0.1752, Dev Error Rate = 0.1440
k=43: Training Error Rate = 0.1752, Dev Error Rate = 0.1440, Train Positive Rate = 0.2074, Dev Positive Rate = 0.2020


KeyboardInterrupt: 

In [None]:
# 5 deployment

In [32]:
# 1. Read the `income.test.blind` dataset
# Note: Since the 'target' column doesn't exist in the blind dataset, we'll exclude it from the names list when reading the data.
blind_names = names[:-1]
blind_data = pd.read_csv("income.test.blind", names=blind_names)

# 2. Preprocess the `income.test.blind` dataset
processed_blind_data = preprocessor.transform(blind_data)
X_blind = processed_blind_data

# 3. Predict the outcomes using my kNN model.
best_k = 41
blind_preds = [my_knn(X_train, y_train, X_blind[i], best_k, euclidean_distance) for i in range(X_blind.shape[0])]

# Convert integer predictions back to string format
blind_predictions = [">50K" if pred == 1 else "<=50K" for pred in blind_preds]

# 4. Write the predictions to the `income.test.predicted` file
with open("income.test.predicted", "w") as f:
    for idx, pred in enumerate(blind_predictions):
        # Combine the original data with the prediction
        original_data = ", ".join(map(str, blind_data.iloc[idx].values))
        f.write(original_data + ", " + pred + "\n")


In [37]:
# Positve analysis
# Overall
true_positives = sum((y_dev == 1) & (np.array(dev_preds) == 1))
actual_positives = sum(y_dev == 1)
predicted_positives = sum(dev_preds)

true_positive_percentage = true_positives / actual_positives
predicted_positive_percentage = predicted_positives / len(dev_preds)

print(f"Overall True Positive %: {true_positive_percentage:.4f}")
print(f"Overall Predicted Positive %: {predicted_positive_percentage:.4f}")


Overall True Positive %: 0.6229
Overall Predicted Positive %: 0.2020


In [38]:
# Given Gender
gender = 'Female'
female_indices = dev_data['sex'].str.strip() == gender

female_true_labels = y_dev[female_indices]
female_predictions = np.array(dev_preds)[female_indices]

true_positives_female = sum((female_true_labels == 1) & (female_predictions == 1))
actual_positives_female = sum(female_true_labels == 1)
predicted_positives_female = sum(female_predictions)

true_positive_percentage_female = true_positives_female / actual_positives_female
predicted_positive_percentage_female = predicted_positives_female / len(female_predictions)

print(f"For {gender} - True Positive %: {true_positive_percentage_female:.4f}")
print(f"For {gender} - Predicted Positive %: {predicted_positive_percentage_female:.4f}")


For Female - True Positive %: 0.4500
For Female - Predicted Positive %: 0.0667


In [39]:
race = 'White'  # for example
race_indices = dev_data['race'].str.strip() == race

race_true_labels = y_dev[race_indices]
race_predictions = np.array(dev_preds)[race_indices]

true_positives_race = sum((race_true_labels == 1) & (race_predictions == 1))
actual_positives_race = sum(race_true_labels == 1)
predicted_positives_race = sum(race_predictions)

true_positive_percentage_race = true_positives_race / actual_positives_race
predicted_positive_percentage_race = predicted_positives_race / len(race_predictions)

print(f"For {race} - True Positive %: {true_positive_percentage_race:.4f}")
print(f"For {race} - Predicted Positive %: {predicted_positive_percentage_race:.4f}")


For White - True Positive %: 0.6324
For White - Predicted Positive %: 0.2180
