# ------------------------- CSE-6363-001 ML Project 2 -------------------------

In [5]:
import numpy as np
import matplotlib.pyplot as plt

In [6]:
def read_data(filename='./data/data.txt'):
    """
    Load the dataset for question 1 provided with the homework.

    Every non-blank line is split on commas; the first four fields are
    parsed as floats and the last field is kept as the class label.

    Parameters
    ----------
    filename - string
        path of the comma-separated text file to read

    Returns
    -------
    data - numpy array of floats (one row of 4 values per sample)
    labels - numpy array of strings (one class label per sample)
    """
    features = []
    labels = []
    with open(filename, 'r') as f:
        for line in f:
            line = line.strip()
            # Skip blank lines (e.g. a trailing empty line at the end of the
            # file); float('') would otherwise raise ValueError.
            if not line:
                continue
            fields = line.split(',')
            features.append([float(value) for value in fields[:4]])
            # Strip padding so "..., Plastic" still compares equal to 'Plastic'.
            labels.append(fields[-1].strip())
    return np.array(features), np.array(labels)

In [34]:
# Load the dataset from the default path and print the shapes of both arrays.
features, labels = read_data()
print(features.shape, labels.shape)

(120, 4) (120,)


In [51]:
def one_hot_encoding(y, classes=('Plastic', 'Metal', 'Ceramic')):
    """
    Convert class labels into one-hot rows.

    The column order is fixed by `classes`, so the result always has
    len(classes) columns even when `y` contains only some of the classes
    (e.g. a small subset of the data).

    Parameters
    ----------
    y - sequence or numpy array of labels
    classes - sequence of the known labels; position gives the column index

    Returns
    -------
    y_encoded - numpy array of floats with shape (len(y), len(classes))

    Raises
    ------
    ValueError - if `y` contains a label that is not listed in `classes`
    """
    column_of = {label: column for column, label in enumerate(classes)}
    y_encoded = np.zeros((len(y), len(classes)))
    for row, label in enumerate(y):
        if label not in column_of:
            # An all-zero row would silently corrupt the cross-entropy cost.
            raise ValueError(
                f"unknown label {label!r}; expected one of {list(classes)}"
            )
        y_encoded[row, column_of[label]] = 1
    return y_encoded

In [52]:
# One-hot encode the string labels loaded above.
labels_enc = one_hot_encoding(labels)

In [53]:
# Display the encoded labels (one row per sample).
labels_enc

array([[0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1

In [8]:
# List the distinct label values; np.unique returns them sorted.
print(f'\nlabel categories are - {[i for i in np.unique(labels)]}\n')


label categories are - ['Ceramic', 'Metal', 'Plastic']



In [9]:
## encode labels to integers
# labels = np.array([0 if i=='Plastic' else 1 if i=='Metal' else 2 for i in labels])

In [54]:
def train_test_split(features, labels, train_fraction=0.75, seed=21):
    """
    Randomly split the dataset into a training part and a testing part.

    The sample indices are shuffled with a fixed seed; the first
    int(len(features) * train_fraction) shuffled samples form the training
    set and the remaining samples form the testing set.

    Note: this seeds NumPy's global random generator (np.random.seed), so
    later np.random calls in the same session are affected by it.

    Parameters
    ----------
    features - numpy array of floats, one row per sample
    labels - numpy array with one entry (or one row) per sample
    train_fraction - float in [0, 1], share of samples used for training
    seed - integer seed for the shuffle

    Returns
    -------
    train_features - numpy array of floats
    train_labels - numpy array, rows of `labels` matching train_features
    test_features - numpy array of floats
    test_labels - numpy array, rows of `labels` matching test_features

    Raises
    ------
    ValueError - if features and labels differ in length, or if
                 train_fraction is outside [0, 1]
    """
    features = np.asarray(features)
    labels = np.asarray(labels)
    if len(features) != len(labels):
        raise ValueError(
            f"features and labels must have the same length, "
            f"got {len(features)} and {len(labels)}"
        )
    if not 0.0 <= train_fraction <= 1.0:
        raise ValueError(f"train_fraction must be in [0, 1], got {train_fraction}")

    # Shuffle the indices
    np.random.seed(seed)
    shuffled_indices = np.random.permutation(len(features))

    # Split the shuffled indices into train and test sets
    split_at = int(len(features) * train_fraction)
    train_indices = shuffled_indices[:split_at]
    test_indices = shuffled_indices[split_at:]

    # Use the train and test indices to split the features and labels
    train_features = features[train_indices]
    train_labels = labels[train_indices]
    test_features = features[test_indices]
    test_labels = labels[test_indices]

    return train_features, train_labels, test_features, test_labels

In [55]:
# Split using the one-hot labels, so the returned label arrays are one-hot as well.
train_features, train_labels, test_features, test_labels = train_test_split(features, labels_enc)

In [56]:
# Print the shapes of the four arrays produced by the split.
print(train_features.shape, train_labels.shape, test_features.shape, test_labels.shape)

(90, 4) (90, 3) (30, 4) (30, 3)


In [61]:
# Display the one-hot labels of the held-out samples.
test_labels

array([[1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [103]:
def softmax(z):
    """
    Row-wise softmax of a 2-D array of scores.

    The largest score of each row is subtracted before exponentiating so
    that np.exp never receives a large positive argument.

    Parameters
    ----------
    z - numpy array of floats, one row of class scores per sample

    Returns
    -------
    numpy array of the same shape as z; every row sums to 1
    """
    row_max = np.max(z, axis=1, keepdims=True)
    exponentials = np.exp(z - row_max)
    row_totals = exponentials.sum(axis=1, keepdims=True)
    return exponentials / row_totals

def initialize_params(dim, num_classes=3):
    """
    Create zero-initialised parameters for the softmax classifier.

    Parameters
    ----------
    dim - integer, number of input features
    num_classes - integer, number of output classes (defaults to 3)

    Returns
    -------
    W - numpy array of zeros with shape (dim, num_classes)
    b - numpy array of zeros with shape (1, num_classes)
    """
    W = np.zeros((dim, num_classes))
    b = np.zeros((1, num_classes))
    return W, b

def propagate(X, Y, W, b):
    """
    Run one forward/backward pass of the softmax classifier.

    Parameters
    ----------
    X - numpy array of floats, one row of features per sample
    Y - numpy array of one-hot labels, one row per sample
    W - numpy array of weights, shape (features, classes)
    b - numpy array of biases, broadcast over the rows of X @ W

    Returns
    -------
    dW - gradient of the cost with respect to W (same shape as W)
    db - gradient of the cost with respect to b (one value per class)
    cost - mean cross-entropy between Y and the predicted probabilities
    """
    m = X.shape[0]
    A = softmax(np.dot(X, W) + b)
    # A probability can underflow to exactly 0; log(0) = -inf would turn the
    # cost into inf, or nan via 0 * -inf. Clip to the smallest positive normal
    # float so the cost stays finite. The gradients below still use the
    # unclipped A and are unchanged.
    log_A = np.log(np.clip(A, np.finfo(A.dtype).tiny, None))
    cost = -np.mean(np.sum(Y * log_A, axis=1))
    dZ = A - Y
    dW = (1 / m) * np.dot(X.T, dZ)
    db = (1 / m) * np.sum(dZ, axis=0)
    return dW, db, cost

def optimize(X, Y, W, b, num_iter, lr):
    """
    Fit the softmax classifier with batch gradient descent.

    The caller's W and b are not modified; the updates are applied to
    float copies, which are returned.

    Parameters
    ----------
    X - numpy array of floats, one row of features per sample
    Y - numpy array of one-hot labels, one row per sample
    W - numpy array with the initial weights
    b - numpy array with the initial biases
    num_iter - integer, number of gradient-descent steps
    lr - float, learning rate

    Returns
    -------
    W - numpy array with the updated weights
    b - numpy array with the updated biases
    costs - list with the cost recorded every 100th iteration (0, 100, ...)
    """
    # Copy so the in-place updates below never leak into the caller's
    # arrays, and so integer inputs do not fail on a float in-place update.
    W = np.array(W, dtype=float)
    b = np.array(b, dtype=float)
    costs = []
    for i in range(num_iter):
        dW, db, cost = propagate(X, Y, W, b)
        W -= lr * dW
        b -= lr * db
        if i % 100 == 0:
            costs.append(cost)
    return W, b, costs

def predict(X, W, b):
    """
    Predict the class index of every sample.

    Parameters
    ----------
    X - numpy array of floats, one row of features per sample
    W - numpy array of weights
    b - numpy array of biases

    Returns
    -------
    numpy array of integers: per row, the index of the class with the
    highest softmax probability
    """
    scores = np.dot(X, W) + b
    probabilities = softmax(scores)
    return np.argmax(probabilities, axis=1)

def bagging(train_features, train_labels, test_features, test_labels, num_bagging,
            num_iter=10000, lr=0.1):
    """
    Train `num_bagging` softmax classifiers on bootstrap samples of the
    training data and combine their test predictions by majority vote.

    Parameters
    ----------
    train_features - numpy array of floats, one row per training sample
    train_labels - numpy array of one-hot labels, one row per training sample
    test_features - numpy array of floats, one row per test sample
    test_labels - unused; kept so existing calls keep working (the number of
                  test samples is taken from test_features)
    num_bagging - integer >= 1, number of classifiers in the ensemble
    num_iter - integer, gradient-descent steps per classifier
    lr - float, learning rate per classifier

    Returns
    -------
    pred - numpy array of integers, the majority-vote class index per test
           sample (ties go to the lowest class index)

    Raises
    ------
    ValueError - if num_bagging is smaller than 1
    """
    if num_bagging < 1:
        raise ValueError(f"num_bagging must be at least 1, got {num_bagging}")

    num_train = train_features.shape[0]
    num_test = test_features.shape[0]
    # Integer storage: the entries are class indices, not measurements.
    bagging_pred = np.zeros((num_test, num_bagging), dtype=np.int64)
    for i in range(num_bagging):
        # Bootstrap sample: draw num_train indices with replacement.
        idx = np.random.choice(num_train, num_train)
        X_bag, y_bag = train_features[idx], train_labels[idx]
        W, b = initialize_params(train_features.shape[1])
        W, b, _ = optimize(X_bag, y_bag, W, b, num_iter=num_iter, lr=lr)
        bagging_pred[:, i] = predict(test_features, W, b)

    # Count the votes per class for every test sample. The weight matrix has
    # one column per class the model can predict.
    num_classes = W.shape[1]
    votes = np.stack(
        [(bagging_pred == cls).sum(axis=1) for cls in range(num_classes)],
        axis=1,
    )
    # argmax returns the first maximum, so ties resolve to the lowest index.
    pred = np.argmax(votes, axis=1)
    return pred

In [104]:
# Train ensembles of increasing size and predict the held-out samples.
# The fourth argument is the test labels (test_features was passed here before).
pred_1 = bagging(train_features, train_labels, test_features, test_labels, num_bagging=1)
pred_10 = bagging(train_features, train_labels, test_features, test_labels, num_bagging=10)
pred_50 = bagging(train_features, train_labels, test_features, test_labels, num_bagging=50)
pred_100 = bagging(train_features, train_labels, test_features, test_labels, num_bagging=100)

# Turn the one-hot test labels back into class indices for comparison.
pred_actual = np.argmax(test_labels, axis=1)

In [105]:
# Report the percentage of test samples each ensemble classified correctly.
for title, predicted in (
    ("Single Classifier", pred_1),
    ("Bagging 10", pred_10),
    ("Bagging 50", pred_50),
    ("Bagging 100", pred_100),
):
    print(f"{title} Accuracy: {round(100*np.mean(predicted==pred_actual),2)}")

Single Classifier Accuracy: 80.0
Bagging 10 Accuracy: 73.33
Bagging 50 Accuracy: 80.0
Bagging 100 Accuracy: 80.0
