# COMP 551 Assignment 1 : Getting Started With Machine Learning

### K- Nearest Neighbors Experiments

#### Group 1: Rudi Kischer, Ben Hepditch

# Setup

- make sure to install the requirements.txt file, and to use the correct virtual environment with juptyer notebook

In [31]:


from ucimlrepo import fetch_ucirepo 
import pandas as pd
import numpy as np

pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.precision', 3)

# Data

Dataset 1: NHANES age prediction.csv (National Health and Nutrition Health Sur- vey 2013-2014 (NHANES) Age Prediction Subset): https://archive.ics.uci.edu/dataset/887/national+health+and+nutrition+health+survey+2013-2014+(nhanes)+age+prediction+subset

Dataset 2: Breast Cancer Wisconsin (Original) dataset: https://archive.ics.uci.edu/dataset/15/breast+cancer+wisconsin+original

### Load Data

In [1]:


# # DATASET 1: NHANES age prediction.csv
national_health_and_nutrition_health_survey_2013_2014_nhanes_age_prediction_subset = fetch_ucirepo(id=887) 
dataset_1 = national_health_and_nutrition_health_survey_2013_2014_nhanes_age_prediction_subset.data
X_1 = dataset_1.features 
y_1 = dataset_1.targets 

# # DATASET 2: Breast Cancer Wisconsin
breast_cancer_wisconsin_original = fetch_ucirepo(id=15) 
dataset_2 = breast_cancer_wisconsin_original.data

### Clean Data

- We want to remove all rows from our data sets which have null values in the targets or in the features.

In [2]:
# Define Cleaning Function
def clean(dataset):
  X = dataset.features
  Y = dataset.targets
  missing_rows_features = X.isnull().any(axis=1)
  missing_rows_targets = Y.isnull().any(axis=1)
  missing_rows = missing_rows_features | missing_rows_targets
  
  print(f"features_missing: {missing_rows_features.sum()}")
  print(f"targets_missing: {missing_rows_targets.sum()}")

  X_clean = X[-missing_rows]
  Y_clean = Y[-missing_rows]
  print(f'{missing_rows.sum()} rows deleted')
  dataset.features = X_clean
  dataset.targets = Y_clean

  return dataset


In [3]:
# Clean the DataSets
dataset_1 = clean(dataset_1)
dataset_2 = clean(dataset_2)

features_missing: 0
targets_missing: 0
0 rows deleted
features_missing: 16
targets_missing: 0
16 rows deleted


### Target Statistics

- We want to get some statistics about our target values. We want to know the mean and the squared difference.

In [19]:
# Define mean
def grouped_target_means(dataset):
    # grouped by the target
    X = dataset.features
    Y = dataset.targets

    XY = pd.concat([X,Y], axis=1)
    XY_grouped = XY.groupby(Y.columns[0])
    XY_mean = XY_grouped.mean()
    return XY_mean

# Define Feature Distance
def grouped_feature_distance(dataset):
    XY_mean = grouped_target_means(dataset)

    sqr_diff = (XY_mean.iloc[0] - XY_mean.iloc[1]) ** 2
    df_sqr_diff = pd.DataFrame([sqr_diff], index=['squarred_diff'])
    return df_sqr_diff

# Print Col Ranking
def feature_ranking(dataset):
    df_sqr_diff = grouped_feature_distance(dataset)
    row = df_sqr_diff.iloc[0]
    sorted_row = row.sort_values(ascending=False)

    ranking_df = pd.DataFrame({
      'Feature': sorted_row.index,
      'Value': sorted_row.values,
      'Rank': range(1, len(sorted_row) + 1)
    })

    return ranking_df


##### Feature Means

In [17]:
# Get grouped means
print(f'Dataset 1 Feature Means:')
XY_1_bar = grouped_target_means(dataset_1)
print(XY_1_bar)

print(f'Dataset 2 Feature Means: ')
XY_2_bar = grouped_target_means(dataset_2)
print(XY_2_bar)


Dataset 1 Feature Means:
           RIAGENDR  PAQ605  BMXBMI   LBXGLU  DIQ010   LBXGLT   LBXIN
age_group                                                            
Adult         1.512   1.806  27.968   98.645   2.014  109.991  12.107
Senior        1.508   1.909  27.886  104.330   2.027  141.209  10.405
Dataset 2 Feature Means: 
       Clump_thickness  Uniformity_of_cell_size  Uniformity_of_cell_shape  Marginal_adhesion  Single_epithelial_cell_size  Bare_nuclei  Bland_chromatin  Normal_nucleoli  Mitoses
Class                                                                                                                                                                            
2                2.964                    1.306                     1.414              1.347                        2.108        1.347            2.083            1.261    1.065
4                7.188                    6.577                     6.561              5.586                        5.326        7.628 

##### Group Feature Distance

In [18]:
print('Dataset 1:')
XY_1_fd = grouped_feature_distance(dataset_1)
print(XY_1_fd)

print('Dataset 2:')
XY_2_fd = grouped_feature_distance(dataset_2)
print(XY_2_fd)

Dataset 1:
                RIAGENDR  PAQ605  BMXBMI  LBXGLU     DIQ010   LBXGLT  LBXIN
squarred_diff  1.425e-05   0.011   0.007  32.319  1.786e-04  974.576  2.895
Dataset 2:
               Clump_thickness  Uniformity_of_cell_size  Uniformity_of_cell_shape  Marginal_adhesion  Single_epithelial_cell_size  Bare_nuclei  Bland_chromatin  Normal_nucleoli  Mitoses
squarred_diff           17.845                   27.784                    26.484             17.969                       10.357       39.448           15.144           21.128    2.363


##### Features Ranked By Squared Difference

In [20]:

print("Dataset 1 Feature Ranking")
d1_feature_ranking = feature_ranking(dataset_1)
print(d1_feature_ranking)

print("Dataset 2 Feature Ranking")
d2_feature_ranking = feature_ranking(dataset_2)
print(d2_feature_ranking)


Dataset 1 Feature Ranking
    Feature      Value  Rank
0    LBXGLT  9.746e+02     1
1    LBXGLU  3.232e+01     2
2     LBXIN  2.895e+00     3
3    PAQ605  1.065e-02     4
4    BMXBMI  6.728e-03     5
5    DIQ010  1.786e-04     6
6  RIAGENDR  1.425e-05     7
Dataset 1 Feature Ranking
                       Feature   Value  Rank
0                  Bare_nuclei  39.448     1
1      Uniformity_of_cell_size  27.784     2
2     Uniformity_of_cell_shape  26.484     3
3              Normal_nucleoli  21.128     4
4            Marginal_adhesion  17.969     5
5              Clump_thickness  17.845     6
6              Bland_chromatin  15.144     7
7  Single_epithelial_cell_size  10.357     8
8                      Mitoses   2.363     9


-  TODO: *Description goes here analyzing if the features that are strongly different are associated with the target*

- Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.

# Models

### K-Nearest Neighbors

- k nearest neighbour uses a data set to predict the classification of a new data point. The algorithm works by finding the k nearest neighbours to the new data point and classifying the new data point as the most common classification of the k nearest neighbours. The algorithm can be used for both classification and regression problems. For classification problems, the algorithm uses the most common classification of the k nearest neighbours. For regression problems, the algorithm uses the average of the k nearest neighbours.

##### Distance function

In [24]:
 

#x1 : series of a df
#x2 : series of a df

def euclidean(x1, x2):
  return np.linalg.norm(x1 - x2)

def manhattan(x1,x2):
  return (x1 - x2).abs().sum()

def chebyshev(x1, x2):
  return (x1 - x2).abs().max()

def hamming(x1, x2):
  return (~(x1 == x2)).sum()

def cosineSim(x1, x2):
  prod = x1.dot(x2)
  mag = np.linalg.norm(x1) * np.linalg.norm(x2)
  return prod / mag

### KNN Model

In [145]:

# K nearest Neighbour
# Mostly following the design from :
# https://github.com/yueliyl/comp551-notebooks/blob/master/KNN.ipynb

class KNN():

  def __init__(self, k=1, similiarity_fn=None):
    if not similiarity_fn:
      similiarity_fn = euclidean
    self.similiarity_fn = similiarity_fn
    self.k = k
  
  def standardize_features(self):
    x_d_bar = self.X.mean()
    x_d_sd = self.X.std(ddof=0)
    self.X = (self.X - x_d_bar) / x_d_sd

  def fit(self, X, Y, k):
    self.X = X
    self.Y = Y
    self.k = k
    # self.C = self.Y[:, 0].nunique()
    # self.standardize_features()

  def predict_sample(self, x_i):
    similiarity_scores = self.X.apply(lambda x_j: self.similiarity_fn(x_i,x_j), axis=1)
    top_k_idx = similiarity_scores.nsmallest(self.k).index 
    top_k_neighbour_labels =  self.Y.loc[top_k_idx][self.Y.columns[0]]
    labels = self.Y.iloc[:, 0].unique()
    label_probs = top_k_neighbour_labels.value_counts(normalize=True).reindex(labels, fill_value=0)
    return pd.Series(label_probs, index=labels)
  
  def predict(self, x):
    prob_df = x.apply(self.predict_sample, axis=1)
    y = prob_df.idxmax(axis=1)
    predicted_probabilities = prob_df
    y_df = pd.DataFrame(y, columns=self.Y.columns)
    return y_df, predicted_probabilities



In [67]:

def train_test_split(dataset, split, shuffle=True, seed=1):
    X = dataset.features
    Y = dataset.targets
    XY = pd.concat([X, Y], axis=1)
    XY_shuffled = XY.sample(frac=1, random_state=seed).reset_index(drop=True)
    split_idx = int(len(XY) * split)
    
    y_size = -Y.shape[1]

    # Train Data Set
    train_set = XY_shuffled[split_idx:]
    X_train = train_set.iloc[:, :y_size]
    Y_train = train_set.iloc[:, y_size:]

    # Test Data Set
    test_set = XY_shuffled[:split_idx]
    X_test = test_set.iloc[:, :y_size]
    Y_test = test_set.iloc[:, y_size:]

    return X_train, Y_train, X_test, Y_test

In [156]:
test_size = 0.1

X1_train, Y1_train, X1_test, Y1_test = train_test_split(dataset_1, test_size)

X2_train, Y2_train, X2_test, Y2_test = train_test_split(dataset_1, test_size)



In [157]:
knn = KNN()

knn.fit(X1_train, Y1_train, k=3)


In [158]:
prediction, label_probs = knn.predict(X1_test)

In [164]:
print(prediction.columns)
print('Prediction')
print(prediction)

print('Correct Labels')
print(Y1_test)

# print the number of correct predictions
print('Number of Correct Predictions')

# align operands
# prediction, Y1_test = prediction.align(Y1_test, join='inner')

print(len(prediction))
print(len(Y1_test))

correct = (prediction == Y1_test)[Y1_test.columns[0]].sum()
# percentage of correct predictions
print(f'Percentage of Correct Predictions: {correct / len(prediction) * 100}%')

Index(['age_group'], dtype='object')
Prediction
    age_group
0       Adult
1       Adult
2       Adult
3       Adult
4       Adult
..        ...
222     Adult
223     Adult
224     Adult
225     Adult
226    Senior

[227 rows x 1 columns]
Correct Labels
    age_group
0       Adult
1       Adult
2       Adult
3      Senior
4       Adult
..        ...
222     Adult
223     Adult
224     Adult
225     Adult
226     Adult

[227 rows x 1 columns]
Number of Correct Predictions
227
227
Percentage of Correct Predictions: 85.90308370044053%
