<a href="https://colab.research.google.com/github/salman-uq2024/Pracs-notebook/blob/main/PracW5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Mount Google Drive at /content/drive/ so the dataset can be read from it.
# NOTE: `google.colab` is only importable inside Google Colab; comment out the
# next two lines when running this notebook anywhere else.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


# Q1


In [22]:
# Read the header-less CSV from the mounted Drive and keep it as a NumPy array.
# Later cells treat the last column as the class label and the rest as features.
w3classif_path = '/content/drive/My Drive/Colab Notebooks//w3classif.csv'
w3classif_frame = pd.read_csv(w3classif_path, header=None)
w3classif = w3classif_frame.to_numpy()

In [23]:
def create_train_test_data(data, test_size=0.3, n_splits=10):
    """
    Create several different shuffled train/test set pairs from the given dataset.

    Split number ``i`` is produced with ``random_state=i``, so the splits differ
    from each other but are reproducible from run to run.

    Parameters:
        data (numpy.ndarray): The dataset to be split.
        test_size (float): Proportion of the dataset to include in the test split.
        n_splits (int): Number of train/test pairs to generate. Defaults to 10,
            which reproduces the original fixed behaviour.

    Returns:
        trains (list): List of training sets, one per split.
        tests (list): List of test sets, one per split (same order as ``trains``).

    Raises:
        ValueError: If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be at least 1, got {n_splits}")

    trains, tests = [], []

    for seed in range(n_splits):
        # train_test_split shuffles by default; a distinct seed per iteration
        # yields a different (but repeatable) partition each time.
        train_data, test_data = train_test_split(
            data, test_size=test_size, random_state=seed, shuffle=True
        )
        trains.append(train_data)
        tests.append(test_data)

    return trains, tests

# Build the Q1 train/test pairs (30% of the rows held out in each split)
# and confirm how many pairs were produced.
trains, tests = create_train_test_data(data=w3classif, test_size=0.3)
num_splits_created = len(trains)
print("Number of splits created:", num_splits_created)

Number of splits created: 10


# Q2

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import numpy as np

def repeat_knn(trials=10, test_size=0.3):
    """
    Repeat k-NN classification for multiple trials using different train/test splits.

    The splits are generated from the module-level ``w3classif`` array via
    ``create_train_test_data``; the last column is treated as the class label
    and all preceding columns as features. A 3-nearest-neighbour classifier is
    fitted on each training set.

    Parameters:
        trials (int): Number of trials to run. Must not exceed the number of
            splits returned by ``create_train_test_data``.
        test_size (float): Proportion of the dataset to use as the test split.

    Returns:
        train_losses (list): Training misclassification rate for each trial.
        test_losses (list): Test misclassification rate for each trial.

    Raises:
        ValueError: If ``trials`` is larger than the number of available splits.
    """
    # Create train and test datasets (using the function defined in W5 Q1)
    trains, tests = create_train_test_data(w3classif, test_size=test_size)

    # Fail early with a clear message instead of an IndexError halfway through
    # the loop when more trials are requested than splits exist.
    available = len(trains)
    if trials > available:
        raise ValueError(
            f"trials={trials} exceeds the {available} train/test splits available"
        )

    # For storing losses
    train_losses, test_losses = [], []

    for i in range(trials):
        # Get the i-th train/test split
        train_data = trains[i]
        test_data = tests[i]

        # Features are every column except the last; the last column is the label.
        X_train, y_train = train_data[:, :-1], train_data[:, -1]
        X_test, y_test = test_data[:, :-1], test_data[:, -1]

        # Fit a fresh 3-NN classifier on this trial's training data
        knn = KNeighborsClassifier(n_neighbors=3)
        knn.fit(X_train, y_train)

        # Loss is the misclassification rate, i.e. 1 - accuracy
        train_loss = 1 - accuracy_score(y_train, knn.predict(X_train))
        test_loss = 1 - accuracy_score(y_test, knn.predict(X_test))

        # Store losses
        train_losses.append(train_loss)
        test_losses.append(test_loss)

    return train_losses, test_losses


In [17]:
# Run the 10-trial 70/30 experiment and report the mean misclassification
# rates as percentages.
train_losses, test_losses = repeat_knn(trials=10, test_size=0.3)
avg_train_pct = np.array(train_losses).mean() * 100
avg_test_pct = np.array(test_losses).mean() * 100
print(f'Avg Training Loss (Misclassification Rate): {avg_train_pct:.2f}%')
print(f'Avg Test Loss (Misclassification Rate): {avg_test_pct:.2f}%')

Avg Training Loss (Misclassification Rate): 2.46%
Avg Test Loss (Misclassification Rate): 4.58%


In [24]:
# 70/30 split: mean and sample standard deviation (ddof=1) of the losses
# collected over 10 trials.
train_losses, test_losses = repeat_knn(trials=10, test_size=0.3)
avg_train_loss_70, avg_test_loss_70 = np.mean(train_losses), np.mean(test_losses)
std_train_70 = np.std(train_losses, ddof=1)
std_test_70 = np.std(test_losses, ddof=1)

print("70/30 Split")
for template, value in (
    ("Avg Training Loss: {:.3f}", avg_train_loss_70),
    ("Avg Test Loss: {:.3f}", avg_test_loss_70),
    ("Std Dev Training Loss: {:.3f}", std_train_70),
    ("Std Dev Test Loss: {:.3f}", std_test_70),
):
    print(template.format(value))

70/30 Split
Avg Training Loss: 0.025
Avg Test Loss: 0.046
Std Dev Training Loss: 0.007
Std Dev Test Loss: 0.016


# Q3

In [25]:
# Test-set proportions for the extra experiments (50/50 and 90/10 splits)
test_sizes = [0.5, 0.1]

# For each proportion, rerun the 10-trial k-NN experiment and report mean losses
for ts in test_sizes:
    train_losses, test_losses = repeat_knn(trials=10, test_size=ts)
    mean_train_loss = np.mean(train_losses)
    mean_test_loss = np.mean(test_losses)

    report_lines = [
        "Test Set Size: {}".format(ts),
        "Average Training Loss: {:.3f}".format(mean_train_loss),
        "Average Test Loss: {:.3f}".format(mean_test_loss),
        "-" * 40,
    ]
    print("\n".join(report_lines))

Test Set Size: 0.5
Average Training Loss: 0.029
Average Test Loss: 0.046
----------------------------------------
Test Set Size: 0.1
Average Training Loss: 0.027
Average Test Loss: 0.048
----------------------------------------


In [26]:
# 50/50 and 90/10 splits: mean training/test loss over 10 trials each
test_sizes = [0.5, 0.1]
for ts in test_sizes:
    train_losses, test_losses = repeat_knn(trials=10, test_size=ts)
    avg_train_loss, avg_test_loss = map(np.mean, (train_losses, test_losses))

    print(f"Split for test_size={ts}")
    for template, value in (
        ("Avg Training Loss: {:.3f}", avg_train_loss),
        ("Avg Test Loss: {:.3f}", avg_test_loss),
    ):
        print(template.format(value))
    print("-" * 40)

Split for test_size=0.5
Avg Training Loss: 0.029
Avg Test Loss: 0.046
----------------------------------------
Split for test_size=0.1
Avg Training Loss: 0.027
Avg Test Loss: 0.048
----------------------------------------


# Q4

In [27]:
# Test-set proportions to compare: 70/30, 50/50 and 90/10 splits
test_sizes = [0.3, 0.5, 0.1]

for ts in test_sizes:
    # Losses from 10 independent trials at this test-set proportion
    train_losses, test_losses = repeat_knn(trials=10, test_size=ts)

    # Sample standard deviation (ddof=1) of the per-trial losses
    std_train = np.asarray(train_losses).std(ddof=1)
    std_test = np.asarray(test_losses).std(ddof=1)

    print(
        f"Test set size: {ts}",
        f"Standard Deviation of Training Losses: {std_train:.3f}",
        f"Standard Deviation of Test Losses: {std_test:.3f}",
        "-" * 40,
        sep="\n",
    )

Test set size: 0.3
Standard Deviation of Training Losses: 0.007
Standard Deviation of Test Losses: 0.016
----------------------------------------
Test set size: 0.5
Standard Deviation of Training Losses: 0.010
Standard Deviation of Test Losses: 0.008
----------------------------------------
Test set size: 0.1
Standard Deviation of Training Losses: 0.005
Standard Deviation of Test Losses: 0.043
----------------------------------------


In [29]:
# 70/30, 50/50 and 90/10 splits: spread of the per-trial losses
test_sizes = [0.3, 0.5, 0.1]
for ts in test_sizes:
    train_losses, test_losses = repeat_knn(trials=10, test_size=ts)

    # Sample standard deviation (ddof=1) across the 10 trials
    std_train, std_test = (
        np.std(losses, ddof=1) for losses in (train_losses, test_losses)
    )

    print(f"Test set size: {ts}")
    for template, value in (
        ("Std Dev Training Loss: {:.3f}", std_train),
        ("Std Dev Test Loss: {:.3f}", std_test),
    ):
        print(template.format(value))
    print("-" * 40)

Test set size: 0.3
Std Dev Training Loss: 0.007
Std Dev Test Loss: 0.016
----------------------------------------
Test set size: 0.5
Std Dev Training Loss: 0.010
Std Dev Test Loss: 0.008
----------------------------------------
Test set size: 0.1
Std Dev Training Loss: 0.005
Std Dev Test Loss: 0.043
----------------------------------------


# Q5

In [30]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
import pandas as pd

# Shuffle the rows once with a fixed seed so the folds are reproducible.
np.random.seed(42)
shuffled_row_order = np.random.permutation(len(w3classif))
w3classif_shuffled = w3classif[shuffled_row_order]

# Features are every column but the last; the last column is taken as the target.
X, y = w3classif_shuffled[:, :-1], w3classif_shuffled[:, -1]

# 3-nearest-neighbour classifier evaluated with 10-fold cross-validation
knn = KNeighborsClassifier(n_neighbors=3)
num_folds = 10
cv_scores = cross_val_score(knn, X, y, cv=num_folds, scoring='accuracy')

# Per-fold misclassification rate is the complement of per-fold accuracy
cv_errors = 1 - cv_scores

# Mean and sample standard deviation (ddof=1) across the folds
mean_error = cv_errors.mean()
std_error = cv_errors.std(ddof=1)

# Report the results
print(
    "10-Fold Cross Validation:",
    "Mean Error: {:.3f}".format(mean_error),
    "Standard Deviation of Error: {:.3f}".format(std_error),
    sep="\n",
)

10-Fold Cross Validation:
Mean Error: 0.050
Standard Deviation of Error: 0.049


In [31]:
from sklearn.model_selection import cross_val_score
# Reproducible row shuffle before splitting into folds
np.random.seed(42)
row_order = np.random.permutation(w3classif.shape[0])
w3classif_shuffled = w3classif[row_order]

# Last column is used as the label, the remaining columns as features
X, y = w3classif_shuffled[:, :-1], w3classif_shuffled[:, -1]

# 10-fold cross-validated accuracy of a 3-NN classifier, converted to error rates
knn = KNeighborsClassifier(n_neighbors=3)
num_folds = 10
cv_scores = cross_val_score(knn, X, y, cv=num_folds, scoring='accuracy')
cv_errors = 1 - cv_scores

# Mean and sample standard deviation (ddof=1) of the fold errors
mean_cv_error = cv_errors.mean()
std_cv_error = cv_errors.std(ddof=1)

print("10-Fold Cross Validation")
for template, value in (
    ("Mean Error: {:.3f}", mean_cv_error),
    ("Std Dev of Error: {:.3f}", std_cv_error),
):
    print(template.format(value))

10-Fold Cross Validation
Mean Error: 0.050
Std Dev of Error: 0.049
