<a href="https://colab.research.google.com/github/showrin20/Machine-Learning-Learning-Path/blob/main/Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score

def read_data(filename):
    """Load a CSV, shuffle its rows, and segment it into train/validation/test sets.

    Column 0 of the CSV is used as the output (y) and all remaining columns
    as the inputs (x). Rows are shuffled with NumPy's global random state
    (call ``np.random.seed`` beforehand for a reproducible split). The first
    tenth of the shuffled rows becomes the test set, the second tenth the
    validation set, and everything else the training set.

    Args:
        filename: CSV source handed straight to ``pandas.read_csv``.

    Returns:
        A 7-tuple ``(x_train, y_train, x_val, y_val, x_test, y_test, columns)``.
        ``columns`` holds every CSV column name (the output column included,
        at index 0), which may be useful for identifying heavily weighted
        features.
    """
    df = pd.read_csv(filename)
    # Ask for an explicit copy: to_numpy() may return a view of the frame's
    # data (read-only when pandas Copy-on-Write is active), and the shuffle
    # below mutates the array in place.
    data = df.to_numpy(copy=True)
    np.random.shuffle(data)

    test_size = data.shape[0] // 10
    data_test = data[:test_size]
    data_val = data[test_size:2 * test_size]
    data_train = data[2 * test_size:]

    x_train, y_train = data_train[:, 1:], data_train[:, 0]
    x_val, y_val = data_val[:, 1:], data_val[:, 0]
    x_test, y_test = data_test[:, 1:], data_test[:, 0]
    return x_train, y_train, x_val, y_val, x_test, y_test, df.columns.values


def add_ones(x):
    """Return a copy of ``x`` with a column of 1s inserted as the first column.

    Useful for logistic regression, where the feature x_0 is always 1.
    ``x`` is an array of feature vectors, one per row.
    """
    n_rows = x.shape[0]
    bias_column = np.ones(n_rows)
    return np.insert(x, 0, bias_column, axis=1)

def compute_hypothesis(x, weights):
    """Evaluate the logistic (sigmoid) hypothesis for data ``x`` under ``weights``.

    A single feature vector yields a scalar; a matrix of feature vectors
    yields a vector with one hypothesis value per row.
    """
    logits = x.dot(weights)
    denominator = 1 + np.exp(-logits)
    return 1 / denominator

def rank_features(weights, feats):
    """Order feature names from most negatively to most positively weighted.

    ``feats`` MUST be a numpy array with the same length as ``weights``, and
    ``feats[i]`` must be the name belonging to ``weights[i]``; otherwise the
    ranking is meaningless.
    """
    ascending_order = np.argsort(weights)
    ranked = feats[ascending_order]
    return ranked
def perceptron(x_train, y_train, x_val, y_val, learning_rate=0.01, epochs=1000):
    """Train a perceptron with per-sample updates and validation early stopping.

    Predictions are +1 when the linear output is positive and -1 otherwise,
    so the labels in ``y_train`` / ``y_val`` are compared against -1 / +1.
    After every epoch the validation error is printed; training stops at the
    first epoch whose validation accuracy does not beat the best one so far.

    Args:
        x_train: Training feature matrix, one sample per row.
        y_train: Training labels, one per row of ``x_train``.
        x_val: Validation feature matrix.
        y_val: Validation labels.
        learning_rate: Step size applied to each misclassification update.
        epochs: Maximum number of passes over the training data.

    Returns:
        ``(weights, bias)`` from the epoch with the best validation accuracy.
        If no epoch scores above the starting accuracy of 0, the initial
        all-zero parameters are returned.
    """
    weights = np.zeros(x_train.shape[1])
    bias = 0
    best_validation_accuracy = 0
    # Snapshot of the best parameters seen so far. ``weights`` is updated in
    # place below, so an explicit copy is required.
    best_weights, best_bias = weights.copy(), bias

    for epoch in range(epochs):
        for x, y in zip(x_train, y_train):
            linear_output = np.dot(x, weights) + bias
            predicted_result = 1 if linear_output > 0 else -1
            if y != predicted_result:
                # (y - predicted_result) is +/-2 for -1/+1 labels, which nudges
                # the boundary toward the misclassified sample.
                update = learning_rate * (y - predicted_result)
                weights += update * x
                bias += update

        val_predicted_results = np.dot(x_val, weights) + bias
        val_predicted_results = np.where(val_predicted_results > 0, 1, -1)
        validation_accuracy = accuracy_score(y_val, val_predicted_results)

        validation_error = 1 - validation_accuracy
        print(f"Epoch {epoch+1}: Validation Error: {validation_error:.4f}")

        if validation_accuracy <= best_validation_accuracy:
            print(f"Early stopping at epoch {epoch+1} as no improvement in validation accuracy.")
            break
        best_validation_accuracy = validation_accuracy
        best_weights, best_bias = weights.copy(), bias

    # Return the best-scoring parameters rather than those of the epoch that
    # triggered the stop, which are no better on the validation set.
    return best_weights, best_bias


def evaluate_performance(x, y, weights, bias):
    """Score a linear -1/+1 classifier on ``x`` against the true labels ``y``.

    A sample is predicted as +1 when ``x . weights + bias`` is positive and
    as -1 otherwise. Returns ``(accuracy, precision, recall)``, with precision
    and recall computed for the +1 class.
    """
    scores = np.dot(x, weights) + bias
    labels = np.where(scores > 0, 1, -1)
    return (
        accuracy_score(y, labels),
        precision_score(y, labels, pos_label=1),
        recall_score(y, labels, pos_label=1),
    )

# Load the perceptron dataset and split it into train/validation/test sets.
filename = 'mushrooms_perceptron.csv'
x_train, y_train, x_val, y_val, x_test, y_test, feature_names = read_data(filename)

# Train with the default learning rate and epoch cap; the validation split
# is what drives early stopping inside perceptron().
weights, bias = perceptron(x_train, y_train, x_val, y_val)

# Score the trained model on each of the three splits.
training_accuracy, training_precision, train_recall = evaluate_performance(x_train, y_train, weights, bias)
validation_accuracy, validation_precision, validation_recall = evaluate_performance(x_val, y_val, weights, bias)
test_accuracy, test_precision, test_recall = evaluate_performance(x_test, y_test, weights, bias)

print(f"Training Accuracy: {training_accuracy}, Precision: {training_precision}, Recall: {train_recall}")
print(f"Validation Accuracy: {validation_accuracy}, Precision: {validation_precision}, Recall: {validation_recall}")
print(f"Test Accuracy: {test_accuracy}, Precision: {test_precision}, Recall: {test_recall}")


# feature_names[0] is the label column (read_data uses column 0 as y), so it
# is dropped to keep the names aligned with the weight vector.
ranked_features = rank_features(weights, feature_names[1:])
# rank_features sorts by ascending weight: the tail holds the most positive
# weights and the head the most negative ones.
# NOTE(review): assumes label +1 encodes "poisonous" -- confirm against the dataset.
print(f"Most indicative of poisonous: {ranked_features[-3:]}")
print(f"Most indicative of edible: {ranked_features[:3]}")


Epoch 1: Validation Error: 0.0062
Epoch 2: Validation Error: 0.0062
Early stopping at epoch 2 as no improvement in validation accuracy.
Training Accuracy: 0.9996923076923077, Precision: 1.0, Recall: 0.9993612264452252
Validation Accuracy: 0.9938423645320197, Precision: 1.0, Recall: 0.9874055415617129
Test Accuracy: 0.9987684729064039, Precision: 1.0, Recall: 0.9974226804123711
Most indicative of poisonous: ['odor_foul' 'odor_creosote' 'spore-print-color_green']
Most indicative of edible: ['odor_none' 'odor_almond' 'odor_anise']


In [None]:
def logistic_regression_sgd(x_train, y_train, learning_rate=0.01, epochs=1000):
    """Fit logistic-regression weights with per-sample gradient updates.

    A column of 1s is prepended to ``x_train``, so the first entry of the
    returned weight vector is the bias term. Every one of the ``epochs``
    passes visits the samples in their given order and moves the weights by
    ``learning_rate * (y - hypothesis) * x``.

    Returns:
        The weight vector, of length ``x_train.shape[1] + 1``.
    """
    design = add_ones(x_train)
    weights = np.zeros(design.shape[1])
    for _ in range(epochs):
        for sample, target in zip(design, y_train):
            residual = target - compute_hypothesis(sample, weights)
            weights += learning_rate * residual * sample
    return weights


def evaluate_performance_logistic(x, y, weights):
    """Score logistic-regression ``weights`` on ``x`` against the true labels ``y``.

    A column of 1s is prepended to ``x`` to match the bias weight. A sample
    is predicted as 1 when its hypothesis value is at least 0.5 and as 0
    otherwise. Returns ``(accuracy, precision, recall)``.
    """
    probabilities = compute_hypothesis(add_ones(x), weights)
    labels = np.where(probabilities >= 0.5, 1, 0)
    return (
        accuracy_score(y, labels),
        precision_score(y, labels),
        recall_score(y, labels),
    )

# Load the logistic-regression dataset and split it into train/validation/test
# sets. This rebinds the x_*/y_*/feature_names globals.
filename1 = 'mushrooms_logistic.csv'
x_train, y_train, x_val, y_val, x_test, y_test, feature_names = read_data(filename1)

# Train with the default learning rate and epoch count; the validation split
# is not used during training, only for reporting below.
weights = logistic_regression_sgd(x_train, y_train)

# Each *_metrics value is an (accuracy, precision, recall) tuple.
train_metrics = evaluate_performance_logistic(x_train, y_train, weights)
val_metrics = evaluate_performance_logistic(x_val, y_val, weights)
test_metrics = evaluate_performance_logistic(x_test, y_test, weights)

print("Training Metrics (Accuracy, Precision, Recall):", train_metrics)
print("Validation Metrics (Accuracy, Precision, Recall):", val_metrics)
print("Test Metrics (Accuracy, Precision, Recall):", test_metrics)

# weights[0] is the bias term added by add_ones and feature_names[0] is the
# label column, so both are dropped to keep weights and names aligned.
# NOTE(review): assumes label 1 encodes "poisonous" -- confirm against the dataset.
ranked_features = rank_features(weights[1:], feature_names[1:])
print("Features most indicative of being poisonous:", ranked_features[-3:])
print("Features most indicative of being edible:", ranked_features[:3])

Training Metrics (Accuracy, Precision, Recall): (1.0, 1.0, 1.0)
Validation Metrics (Accuracy, Precision, Recall): (1.0, 1.0, 1.0)
Test Metrics (Accuracy, Precision, Recall): (1.0, 1.0, 1.0)
Features most indicative of being poisonous: ['odor_foul' 'odor_creosote' 'spore-print-color_green']
Features most indicative of being edible: ['odor_none' 'odor_almond' 'odor_anise']
