### **Disclaimer**
This program was developed as part of the Programming Assignment for the "Introduction to Artificial Intelligence" course by:  
- Filza Rahma Muflihah (1301201261)  
- Ummu Husnul Khatimah (1301204120)  

<br>

---

### **Instructions**
1. **Running on Google Colab**:  
   - Upload the `traintest.xlsx` file before executing the code.  
   - Use the "Files" tab on the left-hand side to upload the file.  

2. **Running Locally**:  
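   - Install Python and the required libraries (`pandas`, plus `openpyxl` for reading `.xlsx` files) on your machine.  
   - Ensure that the `traintest.xlsx` file is in the directory you run the code from (the path is resolved relative to the current working directory).  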
   - Execute the script using a Python IDE or the command line.  

<br>

---

### **Code Implementation**
Below is the refactored Python code with clear structure and comments to explain each section's purpose.

In [1]:
# Import necessary libraries
import pandas as pd

### STEP 1: Load Training and Testing Data

In [2]:
# Read the Excel workbook. The context manager closes the underlying file
# handle as soon as both sheets are loaded instead of leaving it open for the
# rest of the session.
with pd.ExcelFile('traintest.xlsx') as excel_file:
    # Load the training and testing datasets from their respective sheets
    train_data = pd.read_excel(excel_file, sheet_name='train')
    test_data = pd.read_excel(excel_file, sheet_name='test')

# Display the training data
print("---- Training Data ----")
print(train_data)

# Convert training data to a list of per-row dictionaries for processing
train_dict = train_data.to_dict('records')

---- Training Data ----
      id  x1  x2  x3  y
0      1  60  64   0  1
1      2  54  60  11  0
2      3  65  62  22  0
3      4  34  60   0  1
4      5  38  69  21  0
..   ...  ..  ..  .. ..
291  292  59  64   1  1
292  293  65  67   0  1
293  294  53  65  12  0
294  295  57  64   1  0
295  296  54  59   7  1

[296 rows x 5 columns]


### STEP 2: Data Preprocessing and Statistics Calculation

In [3]:
# Per-class sample tallies, keyed by the class label as a string
class_counts = dict.fromkeys(('0', '1'), 0)

# One tally table per feature; each starts empty and is filled while counting
feature_x1_stats, feature_x2_stats, feature_x3_stats = {}, {}, {}

# Helper that tallies a single (feature value, class label) observation
def count_feature_occurrences(value, label, feature_dict):
    """Increment the count of ``label`` for ``value`` in ``feature_dict``.

    A value seen for the first time gets a fresh ``{'0': 0, '1': 0}`` entry
    before the increment. ``feature_dict`` is updated in place and is also
    returned.
    """
    per_class = feature_dict.setdefault(value, {'0': 0, '1': 0})
    per_class[label] += 1
    return feature_dict

# Tally class frequencies and per-value feature frequencies over the
# training rows
for row in train_dict:
    row_label = str(row['y'])
    x1_value, x2_value, x3_value = row['x1'], row['x2'], row['x3']

    class_counts[row_label] += 1
    # count_feature_occurrences updates the table it is given in place
    for observed, table in (
        (x1_value, feature_x1_stats),
        (x2_value, feature_x2_stats),
        (x3_value, feature_x3_stats),
    ):
        count_feature_occurrences(observed, row_label, table)

# Rebuild each table in ascending order of feature value for consistent display
feature_x1_stats = {v: feature_x1_stats[v] for v in sorted(feature_x1_stats)}
feature_x2_stats = {v: feature_x2_stats[v] for v in sorted(feature_x2_stats)}
feature_x3_stats = {v: feature_x3_stats[v] for v in sorted(feature_x3_stats)}

# Report how many training rows fall into each class
print("\n---- Class Statistics ----")
print("Class Counts:")
for class_label, class_total in class_counts.items():
    print(f"  Class {class_label}: {class_total}")

# Helper that prints the per-class counts of one feature as a text table
def display_feature_statistics(feature_name, stats):
    """Print a three-column table of per-class counts for one feature.

    Args:
        feature_name: Label printed in the "Feature: ..." heading.
        stats: Mapping of feature value to a dict holding the counts under
            the keys ``'0'`` and ``'1'``. Rows are printed in the mapping's
            iteration order.
    """
    # The border mirrors the row layout: a 5-character value column and two
    # 16-character count columns, each padded by one space on either side.
    border = "+-------+------------------+------------------+"

    print(f"\nFeature: {feature_name}")
    print(border)
    print("| Value |   Count (y=0)    |   Count (y=1)    |")
    print(border)
    for value, counts in stats.items():
        print(f"| {value:<5} | {counts['0']:<16} | {counts['1']:<16} |")
    print(border)

# Print the count table of every feature, in column order
for feature_label, feature_table in (
    ("x1", feature_x1_stats),
    ("x2", feature_x2_stats),
    ("x3", feature_x3_stats),
):
    display_feature_statistics(feature_label, feature_table)


---- Class Statistics ----
Class Counts:
  Class 0: 78
  Class 1: 218

Feature: x1
+-------+-------------------+
| Value |   Count (y=0)    |   Count (y=1)    |
+-------+-------------------+
| 30    | 0                | 3                |
| 31    | 0                | 2                |
| 33    | 0                | 2                |
| 34    | 2                | 5                |
| 35    | 0                | 2                |
| 36    | 0                | 2                |
| 37    | 0                | 6                |
| 38    | 1                | 9                |
| 39    | 1                | 5                |
| 40    | 0                | 3                |
| 41    | 3                | 7                |
| 42    | 1                | 7                |
| 43    | 3                | 7                |
| 44    | 3                | 4                |
| 45    | 3                | 5                |
| 46    | 4                | 3                |
| 47    | 3                | 8          

### STEP 3: Apply Laplace Smoothing

In [4]:
def apply_laplace_smoothing(data_dict, classes):
    """Add one to every listed class count of every value in ``data_dict``.

    Args:
        data_dict: Mapping of feature value to a dict of per-class counts.
        classes: Class keys whose counts are incremented.

    Returns:
        The same ``data_dict`` object, modified in place.
    """
    for per_class_counts in data_dict.values():
        for cls in classes:
            per_class_counts[cls] += 1
    return data_dict

# Add one to each class total, then one to every per-value feature count.
# NOTE(review): textbook Laplace smoothing adds the number of distinct feature
# values (not 1) to the class total used as the likelihood denominator -
# confirm this simplified variant is the intended one.
for smoothed_label in ('0', '1'):
    class_counts[smoothed_label] += 1

feature_x1_stats, feature_x2_stats, feature_x3_stats = (
    apply_laplace_smoothing(table, ['0', '1'])
    for table in (feature_x1_stats, feature_x2_stats, feature_x3_stats)
)

### STEP 4: Train Naive Bayes Model

In [5]:
# Prior of each class = its (smoothed) count divided by the sum of both counts
total_samples = class_counts['0'] + class_counts['1']
prior_probabilities = {
    label: class_counts[label] / total_samples for label in ('0', '1')
}

# Turn per-value counts into per-class ratios
def calculate_likelihoods(feature_stats, class_counts):
    """Divide each per-value count by the matching class total.

    Args:
        feature_stats: Mapping of feature value to a dict with the counts
            under the keys ``'0'`` and ``'1'``.
        class_counts: Dict with the class totals under ``'0'`` and ``'1'``.

    Returns:
        A new dict mapping each feature value to
        ``{'0': count0 / total0, '1': count1 / total1}``.
    """
    return {
        value: {cls: tally[cls] / class_counts[cls] for cls in ('0', '1')}
        for value, tally in feature_stats.items()
    }

# Build one likelihood table per feature from its count table
likelihood_x1, likelihood_x2, likelihood_x3 = (
    calculate_likelihoods(table, class_counts)
    for table in (feature_x1_stats, feature_x2_stats, feature_x3_stats)
)

### STEP 5: Test the Model

In [6]:
# Define Naive Bayes prediction function
def naive_bayes_predict(record, priors, likelihoods_x1, likelihoods_x2, likelihoods_x3):
    """Classify one record as 0 or 1 with the Naive Bayes decision rule.

    Each class score is the class prior multiplied by the likelihood of the
    record's ``x1``, ``x2`` and ``x3`` values under that class.

    Args:
        record: Mapping that provides the keys ``'x1'``, ``'x2'``, ``'x3'``.
        priors: Mapping with the class priors under ``'0'`` and ``'1'``.
        likelihoods_x1: Mapping of ``x1`` value to ``{'0': p, '1': p}``.
        likelihoods_x2: Same as above, for ``x2``.
        likelihoods_x3: Same as above, for ``x3``.

    Returns:
        ``0`` if the class-0 score is strictly greater than the class-1
        score, otherwise ``1`` (ties go to class 1).

    Raises:
        KeyError: If ``record`` lacks one of the feature keys.
    """
    scores = {'0': priors['0'], '1': priors['1']}
    feature_tables = (
        ('x1', likelihoods_x1),
        ('x2', likelihoods_x2),
        ('x3', likelihoods_x3),
    )
    for feature, table in feature_tables:
        value_likelihoods = table.get(record[feature])
        if value_likelihoods is None:
            # The value never occurred in the data the table was built from.
            # It carries no evidence for either class, so the feature is
            # skipped instead of aborting the prediction with a KeyError.
            continue
        for cls in scores:
            scores[cls] *= value_likelihoods[cls]
    return 0 if scores['0'] > scores['1'] else 1

# Classify every test row; the comprehension keeps the rows in DataFrame order
predictions = [
    naive_bayes_predict(
        row, prior_probabilities, likelihood_x1, likelihood_x2, likelihood_x3
    )
    for row in test_data.to_dict('records')
]

# Store the predicted labels in the 'y' column of the test_data DataFrame
test_data['y'] = predictions

# Display predictions
print("\n---- Predictions ----")
print(test_data)



---- Predictions ----
    id  x1  x2  x3  y
0  297  43  59   2  1
1  298  67  66   0  1
2  299  58  60   3  1
3  300  49  63   3  1
4  301  45  60   0  1
5  302  54  58   1  1
6  303  56  66   3  1
7  304  42  69   1  1
8  305  50  59   2  1
9  306  59  60   0  1


### STEP 6: Evaluate Model Accuracy

In [7]:
def calculate_accuracy(dataset, priors, likelihoods_x1, likelihoods_x2, likelihoods_x3):
    """Print and return the percentage of records whose label is predicted correctly.

    Args:
        dataset: Sized collection of records; each record provides the
            feature keys used by ``naive_bayes_predict`` and the true label
            under ``'y'``.
        priors: Class priors passed through to ``naive_bayes_predict``.
        likelihoods_x1: Likelihood table for ``x1``, passed through.
        likelihoods_x2: Likelihood table for ``x2``, passed through.
        likelihoods_x3: Likelihood table for ``x3``, passed through.

    Returns:
        The accuracy as a percentage in the range 0-100, or ``0.0`` when
        ``dataset`` is empty.
    """
    total = len(dataset)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        print("\nAccuracy: N/A (empty dataset)")
        return 0.0

    correct_predictions = sum(
        1
        for record in dataset
        if naive_bayes_predict(
            record, priors, likelihoods_x1, likelihoods_x2, likelihoods_x3
        ) == record['y']
    )
    accuracy = correct_predictions / total * 100
    print(f"\nAccuracy: {accuracy:.2f}%")
    return accuracy

# Report how well the model reproduces the labels of the training rows
print("\n---- Model Accuracy ----")
calculate_accuracy(
    dataset=train_dict,
    priors=prior_probabilities,
    likelihoods_x1=likelihood_x1,
    likelihoods_x2=likelihood_x2,
    likelihoods_x3=likelihood_x3,
)

# Write the labelled test set to an Excel workbook, leaving out the index
output_filename = "predictions.xlsx"
test_data.to_excel(output_filename, index=False)
print(f"\nPredictions saved to {output_filename}")


---- Model Accuracy ----

Accuracy: 82.77%

Predictions saved to predictions.xlsx
