# ------------------------- CSE-6363-001 ML Project 2 -------------------------

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [6]:
def read_data(filename='./data/data.txt'):
    """
    Load the dataset for question 1 provided with the homework.
    
    Parameters
    ----------
    filename - string
    
    Returns
    -------
    data - numpy array of floats
    labels - numpy array of integers
    """
    features = []
    labels = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip().replace('\n','').split(',')
            features.append([float(i) for i in line[:4]])
            labels.append(line[-1])
    return np.array(features), np.array(labels)

In [11]:
features, labels = read_data()
print(features.shape, labels.shape)

(120, 4) (120,)


In [31]:
print(f'\nlabel categories are - {[i for i in np.unique(labels)]}\n')


label categories are - ['Ceramic', 'Metal', 'Plastic']



In [33]:
## encode labels to integers
labels = np.array([0 if i=='Plastic' else 1 if i=='Metal' else 2 for i in labels])

In [39]:
def train_test_split(features, labels):
    """
    split the dataset into train-test as specified in the problem statement
    considering the first 6 values of each material type to make the testing dataset
    rest of the data is training dataset
    
    Parameters
    ----------
    features - numpy array of floats
    labels - numpy array of strings
    
    Returns
    -------
    train_features - numpy array of floats
    test_features - numpy array of floats
    train_labels - numpy array of strings
    test_labels - numpy array of strings
    
    """
    # Shuffle the indices
    np.random.seed(21)
    shuffled_indices = np.random.permutation(len(features))
    
    # Split the shuffled indices into train and test sets
    train_indices = shuffled_indices[:int(len(features) * 0.75)]
    test_indices = shuffled_indices[int(len(features) * 0.75):]
    
    # Use the train and test indices to split the features and labels
    train_features = features[train_indices]
    train_labels = labels[train_indices]
    test_features = features[test_indices]
    test_labels = labels[test_indices]
    
    return train_features, train_labels, test_features, test_labels

In [40]:
train_features, train_labels, test_features, test_labels = train_test_split(features, labels)

In [41]:
print(train_features.shape, train_labels.shape, test_features.shape, test_labels.shape)

(90, 4) (90,) (30, 4) (30,)


In [53]:
np.zeros((train_features.shape[1], 3))

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [50]:
def softmax(z):
    e_z = np.exp(z - np.max(z, axis=1, keepdims=True))
    return e_z / e_z.sum(axis=1, keepdims=True)

def initialize_params(dim):
    W = np.zeros((dim, 3))
    b = np.zeros((1, 3))
    return W, b

def propagate(X, Y, W, b):
    m = X.shape[0]
    A = softmax(np.dot(X, W) + b)
    cost = -np.mean(np.sum(Y * np.log(A), axis=1))
    dZ = A - Y
    dW = (1 / m) * np.dot(X.T, dZ)
    db = (1 / m) * np.sum(dZ, axis=0)
    return dW, db, cost

def optimize(X, Y, W, b, num_iter, lr):
    costs = []
    for i in range(num_iter):
        dW, db, cost = propagate(X, Y, W, b)
        W -= lr * dW
        b -= lr * db
        if i % 100 == 0:
            costs.append(cost)
    return W, b, costs

def predict(X, W, b):
    A = softmax(np.dot(X, W) + b)
    return np.argmax(A, axis=1)

def bagging(X_train, y_train, X_test, y_test, num_bagging):
    bagging_pred = np.zeros((y_test.shape[0], num_bagging))
    for i in range(num_bagging):
        idx = np.random.choice(X_train.shape[0], X_train.shape[0])
        X_bag, y_bag = X_train[idx], y_train[idx]
        W, b = initialize_params(X_train.shape[1])
        W, b, _ = optimize(X_bag, y_bag, W, b, num_iter=100, lr=0.1)
        bagging_pred[:, i] = predict(X_test, W, b)
    pred = np.argmax(np.apply_along_axis(lambda x: np.bincount(x, minlength=3), axis=1, arr=bagging_pred), axis=1)
    accuracy = np.mean(pred == y_test)
    return accuracy

# Example usage
# Assuming that X_train, y_train, X_test, y_test are already defined
print("Single Classifier Accuracy: ", bagging(train_features, train_labels, test_features, test_features, num_bagging=1))
print("Bagging 10 Accuracy: ", bagging(train_features, train_labels, test_features, test_features, num_bagging=10))
print("Bagging 50 Accuracy: ", bagging(train_features, train_labels, test_features, test_features, num_bagging=50))
print("Bagging 100 Accuracy: ", bagging(train_features, train_labels, test_features, test_features, num_bagging=100))

ValueError: cannot reshape array of size 360 into shape (30,3)