# Prac W5 - Performance

In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

### Q1

In [5]:
w3classif = pd.read_csv('w3classif.csv')

In [7]:
def create_train_test_data(w3classif, test_size=0.3):
    # For storing data
    trains, tests = [], []
    
    for i in range(10):
        # TODO: Shuffle the dataset
        shuffled_data = w3classif.sample(frac=1, random_state=i)
        
        # TODO: Split the dataset
        train_data, test_data = train_test_split(shuffled_data, test_size=test_size, random_state=i)
        
        # Store data
        trains.append(train_data)
        tests.append(test_data)
        
    return trains, tests

### Q2

In [14]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

def repeat_knn(w3classif, trials=10, test_size=0.3, n_neighbors=5):
    # For storing
    train_losses, test_losses = [], []
    
    # Create train and test datasets (calling the function you created previously)
    trains, tests = create_train_test_data(w3classif, test_size)
    
    for i in range(trials):
        # TODO: Split train data into features and target
        X_train = trains[i].iloc[:, :-1]  # Assuming the target is the last column
        y_train = trains[i].iloc[:, -1]
        
        # TODO: Split test data into features and target
        X_test = tests[i].iloc[:, :-1]  # Assuming the target is the last column
        y_test = tests[i].iloc[:, -1]
        
        # TODO: Initialize the k-NN classifier
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        
        # TODO: Train the classifier on the training data
        knn.fit(X_train, y_train)
        
        # TODO: Make predictions on the training and test data
        y_train_pred = knn.predict(X_train)
        y_test_pred = knn.predict(X_test)
        
        # TODO: Calculate training and test accuracy
        train_accuracy = accuracy_score(y_train, y_train_pred)
        test_accuracy = accuracy_score(y_test, y_test_pred)
        
        # TODO: Calculate training and test loss (misclassification rate)
        train_loss = 1 - train_accuracy
        test_loss = 1 - test_accuracy
        
        # TODO: Store train and test losses
        train_losses.append(train_loss)
        test_losses.append(test_loss)
    
    return train_losses, test_losses

In [16]:
train_losses, test_losses = repeat_knn(w3classif, trials=10, test_size=0.3)
print(f'Avg Training Loss (Misclassification Rate): {np.array(train_losses).mean() * 100:.2f}%')
print(f'Avg Test Loss (Misclassification Rate): {np.array(test_losses).mean() * 100:.2f}%')

Avg Training Loss (Misclassification Rate): 3.30%
Avg Test Loss (Misclassification Rate): 5.25%


### Q3

In [18]:
test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]

for ts in test_sizes:
    # TODO: Repeat Q1 and Q2 and print the average loss for 10 trials for each test set size
    trains, tests = create_train_test_data(w3classif, test_size=ts)
    train_losses, test_losses = repeat_knn(w3classif, trials=10, test_size=ts)
    
    print(f"\nTest size: {ts}")
    print(f'Avg Training Loss (Misclassification Rate): {np.array(train_losses).mean() * 100:.2f}%')
    print(f'Avg Test Loss (Misclassification Rate): {np.array(test_losses).mean() * 100:.2f}%')


Test size: 0.1
Avg Training Loss (Misclassification Rate): 3.09%
Avg Test Loss (Misclassification Rate): 5.25%

Test size: 0.2
Avg Training Loss (Misclassification Rate): 3.13%
Avg Test Loss (Misclassification Rate): 5.00%

Test size: 0.3
Avg Training Loss (Misclassification Rate): 3.30%
Avg Test Loss (Misclassification Rate): 5.25%

Test size: 0.4
Avg Training Loss (Misclassification Rate): 3.26%
Avg Test Loss (Misclassification Rate): 5.12%

Test size: 0.5
Avg Training Loss (Misclassification Rate): 3.32%
Avg Test Loss (Misclassification Rate): 4.65%


### Q4

In [22]:
test_sizes = [0.1, 0.2, 0.3, 0.4, 0.5]

for ts in test_sizes:
    # TODO: Calculate the sample standard deviation of your training and test set error values over the 10 trials from Q2 and Q3
    trains, tests = create_train_test_data(w3classif, test_size=ts)
    train_losses, test_losses = repeat_knn(w3classif, trials=10, test_size=ts)
    
    # Calculate mean losses
    mean_train_loss = np.array(train_losses).mean() * 100
    mean_test_loss = np.array(test_losses).mean() * 100
    
    # Calculate standard deviations
    std_train_loss = np.array(train_losses).std() * 100
    std_test_loss = np.array(test_losses).std() * 100
    
    print(f"\nTest size: {ts}")
    print(f'Avg Training Loss: {mean_train_loss:.2f}% ± {std_train_loss:.2f}%')
    print(f'Avg Test Loss: {mean_test_loss:.2f}% ± {std_test_loss:.2f}%')


Test size: 0.1
Avg Training Loss: 3.09% ± 0.36%
Avg Test Loss: 5.25% ± 2.36%

Test size: 0.2
Avg Training Loss: 3.13% ± 0.40%
Avg Test Loss: 5.00% ± 1.37%

Test size: 0.3
Avg Training Loss: 3.30% ± 0.78%
Avg Test Loss: 5.25% ± 1.83%

Test size: 0.4
Avg Training Loss: 3.26% ± 0.81%
Avg Test Loss: 5.12% ± 1.04%

Test size: 0.5
Avg Training Loss: 3.32% ± 0.85%
Avg Test Loss: 4.65% ± 0.67%


### Q5

In [24]:
from sklearn.model_selection import cross_val_score

# TODO: Shuffle the original dataset
shuffled_data = w3classif.sample(frac=1, random_state=42)

# TODO: Split the dataset into features and target
X = shuffled_data.iloc[:, :-1]  # Assuming target is in the last column
y = shuffled_data.iloc[:, -1]

# TODO: Initialize the k-NN classifier
knn = KNeighborsClassifier(n_neighbors=5)

# TODO: Define the number of folds for cross-validation
num_folds = 10

# TODO: Perform cross-validation
cv_scores = cross_val_score(knn, X, y, cv=num_folds)

# TODO: Calculate mean and standard deviation of cross-validation error
cv_errors = 1 - cv_scores
mean_cv_error = cv_errors.mean() * 100
std_cv_error = cv_errors.std() * 100

# TODO: Print the errors
print(f'Cross-Validation Error: {mean_cv_error:.2f}% ± {std_cv_error:.2f}%')
print(f'Individual fold errors: {[f"{err*100:.2f}%" for err in cv_errors]}')

Cross-Validation Error: 4.26% ± 2.51%
Individual fold errors: ['7.50%', '2.50%', '7.50%', '2.50%', '2.50%', '5.00%', '7.50%', '0.00%', '5.00%', '2.56%']
