# <center> UFOs and the Weather: A Technical Report </center>

## <center> Sebastion Matthews, Ethan France </center>

### <center> CPSC 322, Fall 2024 </center>

In [None]:
import openpyxl
import math
import random
from collections import Counter, defaultdict
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np

# Import classifiers
from MyKNNClassifier import MyKNNClassifier
from MyNaiveBayesClassifier import MyNaiveBayesClassifier
from MyRandomForestClassifier import MyRandomForestClassifier

# Define utility functions
def read_excel(file_path):
    """Load the active worksheet of an Excel workbook as a list of rows.

    Args:
        file_path: Path handed to ``openpyxl.load_workbook``.

    Returns:
        A list of lists, one per worksheet row, holding cell values only.
        The first row is treated as the header and is not included.
    """
    active_sheet = openpyxl.load_workbook(file_path).active
    rows = [list(cells) for cells in active_sheet.iter_rows(values_only=True)]
    # Everything after the first (header) row is data.
    return rows[1:]

def normalize_units(row, indices):
    """Pick the given columns out of a row, scaling columns 13-15 by 1/100.

    Args:
        row: Sequence of cell values for one record.
        indices: Column positions to extract, in output order.

    Returns:
        A new list with one entry per index. Values taken from columns 13,
        14 and 15 are divided by 100 (``None`` stays ``None``); every other
        value is copied through unchanged.
    """
    scaled_columns = (13, 14, 15)

    def convert(column):
        cell = row[column]
        if column not in scaled_columns:
            return cell
        if cell is None:
            return None
        return cell / 100

    return [convert(column) for column in indices]

def load_filtered_dataset(file_path):
    """Read the workbook and build feature/label lists from its usable rows.

    Columns 10 through 25 are used as features and the last column as the
    label. A row is dropped when any of those cells is ``None``, when a
    string label is neither "yes" nor "no" (compared after stripping and
    lower-casing), or when any feature cell is not an ``int``/``float``.
    String labels become 1 ("yes") or 0 ("no"); non-string labels are kept
    as they are.

    Args:
        file_path: Path of the Excel file, forwarded to ``read_excel``.

    Returns:
        A ``(features, labels)`` pair of parallel lists, where each feature
        row has been passed through ``normalize_units``.

    Raises:
        ValueError: If no row survives the filtering.
    """
    feature_columns = list(range(10, 26))
    required_columns = feature_columns + [-1]
    label_codes = {"yes": 1, "no": 0}

    features = []
    labels = []
    for record in read_excel(file_path):
        # Every feature cell and the label cell must be populated.
        if any(record[col] is None for col in required_columns):
            continue

        target = record[-1]
        if isinstance(target, str):
            # Unrecognised text maps to None and is rejected just below.
            target = label_codes.get(target.strip().lower())
        if target is None:
            continue

        if any(not isinstance(record[col], (int, float)) for col in feature_columns):
            continue

        features.append(normalize_units(record, feature_columns))
        labels.append(target)

    if not features:
        raise ValueError("No valid rows found in the dataset. Please check the data and column indices.")

    return features, labels

def split_data(X, y, test_ratio=0.2):
    """Shuffle the samples and split them into train and test portions.

    Args:
        X: Feature rows.
        y: Labels, paired positionally with ``X``.
        test_ratio: Fraction of the shuffled pairs that goes to the test set.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` of four lists.
    """
    pairs = list(zip(X, y))
    random.shuffle(pairs)
    cut = int(len(pairs) * (1 - test_ratio))

    def unzip(chunk):
        samples = [sample for sample, _ in chunk]
        targets = [target for _, target in chunk]
        return samples, targets

    X_train, y_train = unzip(pairs[:cut])
    X_test, y_test = unzip(pairs[cut:])
    return X_train, y_train, X_test, y_test

def calculate_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Label 1 is the positive class. Each ratio falls back to 0 when its
    denominator is zero, and empty input yields all zeros rather than
    raising ``ZeroDivisionError``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, paired positionally with ``y_true``.

    Returns:
        A tuple ``(accuracy, precision, recall, f1_score)``.
    """
    total = len(y_true)
    if total == 0:
        # Nothing to score; mirror the zero fallback used by the other ratios.
        return 0, 0, 0, 0

    true_positive = false_positive = false_negative = correct = 0
    # One pass gathers every count the four metrics need.
    for true, pred in zip(y_true, y_pred):
        if true == pred:
            correct += 1
            if pred == 1:
                true_positive += 1
        elif true == 0 and pred == 1:
            false_positive += 1
        elif true == 1 and pred == 0:
            false_negative += 1

    predicted_positive = true_positive + false_positive
    actual_positive = true_positive + false_negative

    precision = true_positive / predicted_positive if predicted_positive > 0 else 0
    recall = true_positive / actual_positive if actual_positive > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    accuracy = correct / total
    return accuracy, precision, recall, f1_score

# Main logic for the notebook
def run_classifiers(file_path):
    """Train and evaluate the KNN, Naive Bayes and Random Forest classifiers.

    The dataset is loaded with ``load_filtered_dataset`` and split with
    ``split_data``; each classifier is then fitted on the training portion
    and scored on the test portion with ``calculate_metrics``.

    Args:
        file_path: Path of the Excel dataset.

    Returns:
        A dict mapping "KNN", "Naive Bayes" and "Random Forest" to the
        ``(accuracy, precision, recall, f1_score)`` tuple for that model.
    """
    features, labels = load_filtered_dataset(file_path)
    X_train, y_train, X_test, y_test = split_data(features, labels)

    # (display name, imported object, class name, constructor arguments)
    classifier_specs = [
        ("KNN", MyKNNClassifier, "MyKNNClassifier", {"k": 5}),
        ("Naive Bayes", MyNaiveBayesClassifier, "MyNaiveBayesClassifier", {}),
        (
            "Random Forest",
            MyRandomForestClassifier,
            "MyRandomForestClassifier",
            {"n_trees": 10, "m_trees": 3, "f_features": 2},
        ),
    ]

    metrics = {}
    for display_name, imported, class_name, kwargs in classifier_specs:
        # The imported name may be a module that wraps the class of the same
        # name; calling it directly fails with "'module' object is not callable".
        classifier_cls = _resolve_classifier(imported, class_name)
        classifier = classifier_cls(**kwargs)
        classifier.fit(X_train, y_train)
        predictions = classifier.predict(X_test)
        metrics[display_name] = calculate_metrics(y_test, predictions)

    return metrics


def _resolve_classifier(candidate, class_name):
    """Return the classifier class, unwrapping a same-named module if needed."""
    import types

    if isinstance(candidate, types.ModuleType):
        return getattr(candidate, class_name)
    return candidate

def visualize_metrics(metrics):
    """Draw two bar charts comparing the classifiers' scores.

    The first figure groups accuracy, precision and recall side by side for
    each classifier; the second shows the F1 score alone.

    Args:
        metrics: Dict mapping a classifier name to its
            ``(accuracy, precision, recall, f1_score)`` tuple.
    """
    classifiers = list(metrics)

    def column(position):
        return [metrics[name][position] for name in classifiers]

    accuracy, precision, recall, f1_scores = (column(i) for i in range(4))

    x = np.arange(len(classifiers))
    width = 0.2

    # One entry per bar series: positions, heights, legend label, colour.
    grouped_bars = (
        (x - width, accuracy, "Accuracy", "skyblue"),
        (x, precision, "Precision", "orange"),
        (x + width, recall, "Recall", "green"),
    )

    plt.figure(figsize=(12, 6))
    for positions, scores, legend_label, bar_color in grouped_bars:
        plt.bar(positions, scores, width, label=legend_label, color=bar_color)
    plt.xticks(x, classifiers)
    plt.ylabel("Score")
    plt.title("Classifier Metrics Comparison")
    plt.legend()
    plt.show()

    # F1 gets its own, smaller figure.
    plt.figure(figsize=(8, 6))
    plt.bar(classifiers, f1_scores, color="purple")
    plt.ylabel("F1 Score")
    plt.title("F1 Score Comparison")
    plt.show()

if __name__ == "__main__":
    file_path = "merged_weather_ufo.xlsx"
    try:
        # Score all three classifiers on the dataset, then chart the results.
        metrics = run_classifiers(file_path)
        visualize_metrics(metrics)
    except Exception as exc:
        # Any failure is reported as a single line rather than a traceback.
        print(f"Error: {exc}")


Error: 'module' object is not callable. Did you mean: 'MyKNNClassifier.MyKNNClassifier(...)'?
