In [3]:
import pandas as pd
import math
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from collections import defaultdict

# Initialize encoders
encoders = defaultdict(LabelEncoder)

# Load and preprocess data
data = pd.read_csv('exp4DataSet.csv')
data = data.apply(lambda x: encoders[x.name].fit_transform(x))

output_column = data.columns[-1]
output_values = data[output_column].unique()

# Gaussian probability function
def calculateGaussianProbability(x, mean, stdev):
    try:
        exponent = math.exp(-(math.pow(float(x) - mean, 2) / (2 * math.pow(stdev, 2))))
        return (1 / (math.sqrt(2 * math.pi) * stdev)) * exponent
    except ZeroDivisionError:
        return 0

# Prediction function
def predict(train, inputdata):
    probabilities = {}
    for x in output_values:
        probabilities[x] = 1
        sample = train[train[output_column] == x]
        means = sample.mean()[:-1]
        stds = sample.std()[:-1]
        
        for feature in sample.columns[:-1]:
            probabilities[x] *= calculateGaussianProbability(inputdata[feature], means[feature], stds[feature])
    
    prediction = max(probabilities, key=probabilities.get)
    print("Predicted class:", encoders[output_column].inverse_transform([prediction])[0])

# Accuracy calculation function
def calculateAccuracy(train, test):
    correct = 0
    means = {}
    stds = {}
    y_pred, y_test = [], []

    # Calculate means and standard deviations for each class
    for x in output_values:
        means[x] = train[train[output_column] == x].mean()[:-1]
        stds[x] = train[train[output_column] == x].std()[:-1]

    # Predict for each test sample
    for _, row in test.iterrows():
        y_test.append(row[output_column])
        probabilities = {}

        for x in output_values:
            probabilities[x] = 1
            for feature in train.columns[:-1]:  # iterate over features
                probabilities[x] *= calculateGaussianProbability(row[feature], means[x][feature], stds[x][feature])
        
        # Make prediction and track accuracy
        predicted = max(probabilities, key=probabilities.get)
        y_pred.append(predicted)
        if predicted == row[output_column]:
            correct += 1

    # Print evaluation metrics
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)
    print(classification_report(y_test, y_pred))
    print("Precision:", cm[0][0] / (cm[0][0] + cm[1][0]) if (cm[0][0] + cm[1][0]) > 0 else 0)
    return correct / len(test) * 100.0

# Split data into training and testing sets
data = data.sample(n=min(1000, len(data)), random_state=1)
test = data.sample(frac=0.33, random_state=1)
train = data.drop(test.index)

# Calculate accuracy
print("Test data size:", test.shape[0], "Train data size:", train.shape[0])
print("Accuracy:", calculateAccuracy(train, test), "%")

# Input prompt for prediction
print("Training completed, enter data to predict:")

Test data size: 4 Train data size: 8
Confusion Matrix:
 [[1 0 0]
 [2 0 0]
 [1 0 0]]
              precision    recall  f1-score   support

           0       0.25      1.00      0.40         1
           1       0.00      0.00      0.00         2
           2       0.00      0.00      0.00         1

    accuracy                           0.25         4
   macro avg       0.08      0.33      0.13         4
weighted avg       0.06      0.25      0.10         4

Precision: 0.3333333333333333
Accuracy: 25.0 %
Training completed, enter data to predict:


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
