In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.cluster import KMeans  # For unsupervised task
import matplotlib.pyplot as plt

In [4]:
#Load the dataset for all 70 patients
data = []
for i in range(1, 71):
    filename = f"data-{i:02d}"
    data.append(pd.read_csv(filename, sep="\t", header=None, usecols=[2, 3]))
data = pd.concat(data)  # Combine into a single DataFrame

In [5]:
# One-hot encoding for the third column
encoder = OneHotEncoder(handle_unknown='ignore')  # Handle potential unseen values
encoded_data = encoder.fit_transform(data.iloc[:, 0].values.reshape(-1, 1)).toarray()

In [6]:
# Exploratory Data Analysis (EDA)
print(data.info())  # Check data types and missing values
print(data.describe())  # Summary statistics
print(data.iloc[:, 0].value_counts())  # Frequency of unique features (column 3)
print(data.iloc[:, 1].value_counts())  # Frequency of unique labels (column 4)

<class 'pandas.core.frame.DataFrame'>
Index: 29330 entries, 0 to 340
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   2       29330 non-null  int64 
 1   3       29297 non-null  object
dtypes: int64(1), object(1)
memory usage: 687.4+ KB
None
                  2
count  29330.000000
mean      46.428606
std       13.453219
min        0.000000
25%       33.000000
50%       48.000000
75%       60.000000
max       72.000000
2
33    9518
34    3830
58    3518
62    3160
60    2771
48    1883
35    1053
57     990
64     904
65     331
67     326
63     219
66     154
70     139
56     119
71      98
72      94
69      68
61      66
68      34
0       33
59      20
4        1
36       1
Name: count, dtype: int64
3
6        1522
0        1223
3        1183
2        1049
4         951
         ... 
327         1
364         1
258         1
276         1
378.0       1
Name: count, Length: 744, dtype: int64


In [7]:
# Data Splitting function
def split_data(data, train_size=0.6, val_size=0.1, test_size=0.3, random_state=42):
    np.random.seed(random_state)
    indices = np.random.permutation(len(data))
    train_idx = indices[:int(train_size * len(data))]
    val_idx = indices[int(train_size * len(data)):int((train_size + val_size) * len(data))]
    test_idx = indices[int((train_size + val_size) * len(data)):]
    train_data = data.iloc[train_idx].reset_index(drop=True)
    val_data = data.iloc[val_idx].reset_index(drop=True)
    test_data = data.iloc[test_idx].reset_index(drop=True)
    return train_data, val_data, test_data

In [8]:
# KNN implementation from scratch
def knn(train_data, test_point, k):
    distances = []
    for train_point in train_data:
        # Calculate Euclidean distance between test_point and train_point
        distance = np.sqrt(np.sum((test_point - train_point) ** 2))
        distances.append((train_point, distance))
    distances.sort(key=lambda x: x[0])  # Sort by distance
    nearest_neighbors = distances[:k]  # Select k nearest neighbors
    # Get most frequent label among nearest neighbors
    neighbor_labels = [neighbor[1] for neighbor in nearest_neighbors]
    predicted_label = max(set(neighbor_labels), key=neighbor_labels.count)
    return predicted_label

In [9]:
# Classification and Evaluation loop
k_values = range(1, 11)
accuracies = []
for k in k_values:
    accuracies_per_split = []
    precisions_per_split = []
    recalls_per_split = []
    f1_scores_per_split = []
    roc_auc_scores_per_split = []
    for i in range(5):
        train_data, val_data, test_data = split_data(data)
        # Train (no explicit training needed for KNN)
        # Make predictions on validation set
        predictions = [knn(train_data.iloc[:,:-1], test_point, k) for test_point in val_data.iloc[:, :-1].values]# Use .values for NumPy array
        print(val_data.iloc[:, -1].isnull().sum())  # Check for NaNs in labels
        print(predictions)  # Check for NaNs in predictions
        print(len(val_data.iloc[:, -1]))
        print(len(predictions))
        print(val_data.iloc[:, -1].isnull().sum())
        print(np.isnan(predictions).sum())
        val_data.iloc[:, -1] = val_data.iloc[:, -1].fillna(val_data.iloc[:, -1].mode().iloc[0])
        # Print unique values in the target labels column
        print(val_data.iloc[:, -1].unique())
        # Remove rows with non-numeric labels
        val_data = val_data[pd.to_numeric(val_data.iloc[:, -1], errors='coerce').notnull()]
        # Ensure the target labels are of integer type
        val_data.iloc[:, -1] = val_data.iloc[:, -1].astype(int)
        val_data = val_data.dropna()
        
        # Evaluate performance
        accuracy = accuracy_score(val_data.iloc[:, -1], predictions)
        precision = precision_score(val_data.iloc[:, -1], predictions, average='weighted')
        recall = recall_score(val_data.iloc[:, -1], predictions, average='weighted')
        f1 = f1_score(val_data.iloc[:, -1], predictions, average='weighted')
        roc_auc = roc_auc_score(val_data.iloc[:, -1], predictions)
        accuracies_per_split.append(accuracy)
        precisions_per_split.append(precision)
        recalls_per_split.append(recall)
        f1_scores_per_split.append(f1)
        roc_auc_scores_per_split.append(roc_auc)
    accuracy_mean = np.mean(accuracies_per_split)
    accuracy_std = np.std(accuracies_per_split)
    accuracies.append((k, accuracy_mean, accuracy_std))
    # Similarly, calculate mean and std for other metrics

4
[56.0, 33.0, 65.0, 60.0, 56.0, 60.0, 56.0, 31.0, 31.0, 56.0, 31.0, 60.0, 31.0, 58.0, 31.0, 60.0, 60.0, 31.0, 33.0, 58.0, 31.0, 55.0, 60.0, 60.0, 31.0, 60.0, 32.0, 58.0, 58.0, 31.0, 31.0, 58.0, 62.0, 63.0, 31.0, 58.0, 55.0, 31.0, 31.0, 31.0, 31.0, 31.0, 58.0, 58.0, 60.0, 31.0, 31.0, 56.0, 31.0, 32.0, 64.0, 65.0, 58.0, 32.0, 65.0, 56.0, 32.0, 69.0, 60.0, 60.0, 56.0, 56.0, 63.0, 32.0, 62.0, 56.0, 31.0, 32.0, 31.0, 31.0, 31.0, 31.0, 32.0, 60.0, 31.0, 60.0, 62.0, 31.0, 31.0, 46.0, 60.0, 31.0, 31.0, 31.0, 60.0, 31.0, 46.0, 31.0, 58.0, 31.0, 55.0, 56.0, 31.0, 31.0, 31.0, 32.0, 31.0, 60.0, 58.0, 56.0, 46.0, 56.0, 56.0, 31.0, 63.0, 31.0, 58.0, 31.0, 31.0, 31.0, 62.0, 56.0, 62.0, 31.0, 31.0, 32.0, 61.0, 56.0, 46.0, 56.0, 56.0, 32.0, 46.0, 60.0, 56.0, 62.0, 69.0, 60.0, 32.0, 56.0, 32.0, 56.0, 31.0, 56.0, 56.0, 60.0, 58.0, 32.0, 46.0, 31.0, 46.0, 31.0, 31.0, 31.0, 60.0, 32.0, 58.0, 46.0, 58.0, 56.0, 31.0, 46.0, 60.0, 60.0, 32.0, 46.0, 58.0, 31.0, 60.0, 56.0, 56.0, 32.0, 31.0, 31.0, 31.0, 31.0, 5

ValueError: Found input variables with inconsistent numbers of samples: [2932, 2933]

In [10]:
# Visualization
ks, mean_accuracies, std_accuracies = zip(*accuracies)
plt.plot(ks, mean_accuracies)
plt.errorbar(ks, mean_accuracies, yerr=std_accuracies)
plt.xlabel("k")
plt.ylabel("Accuracy")
plt.title("Accuracy vs. k for KNN")
plt.show()

ValueError: not enough values to unpack (expected 3, got 0)