In [3]:
import numpy as np
import pandas as pd
import cv2 as cv
import os
import sklearn.model_selection as sms
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

# Directory containing the image files.
# os.listdir returns entries in arbitrary order, so sort them: the row order
# of the dataset (and therefore the seeded train/test split) must not depend
# on the file system.
images = sorted(os.listdir("./image"))

def extract_colors(files, directory="./image"):
    """
    Extracts the average RGB color values from each image file and returns a DataFrame.

    The label of an image is the first character of its file name. Files that
    OpenCV cannot decode (non-image files, sub-directories, corrupt images)
    are skipped with a warning instead of aborting the whole extraction.

    Parameters:
        files (list of str): List of image file names.
        directory (str): Directory the file names are relative to.
            Defaults to "./image".

    Returns:
        pd.DataFrame: One row per readable image with the columns
            'label', 'R', 'G', 'B' (channel means rounded to 3 decimals)
            and 'name'. Empty (but with these columns) when no image is read.
    """
    import warnings

    columns = ['label', 'R', 'G', 'B', 'name']
    rows = []

    for file_name in files:
        path = os.path.join(directory, file_name)
        img = cv.imread(path)
        if img is None:
            # cv.imread reports failure by returning None rather than raising.
            warnings.warn(f"Skipping unreadable image: {path}", stacklevel=2)
            continue

        # OpenCV stores the channels in B, G, R order.
        avgB, avgG, avgR = (round(np.mean(img[:, :, channel]), 3) for channel in range(3))
        rows.append([file_name[0], avgR, avgG, avgB, file_name])

    # Build the frame once instead of enlarging it row by row.
    return pd.DataFrame(rows, columns=columns)

# Build the feature table (label, average R/G/B, file name) for the listed images
raw_dataset = extract_colors(images)

# Report, in this order: the dtype of every column, the number of missing
# values per column, and finally the dataset itself
for report in (raw_dataset.dtypes, raw_dataset.isnull().sum(), raw_dataset):
    print(report)

# Convert qualitative labels to numerical ones
def label_to_numeric(column):
    """
    Converts a categorical column to numeric codes.

    Text-like columns (object, pandas string or categorical dtype) are encoded
    with pd.factorize: every distinct value receives an integer code in order
    of first appearance, and missing values receive -1. Columns of any other
    dtype are returned unchanged.

    Parameters:
        column (pd.Series): A column of categorical data.

    Returns:
        pd.Series: Column with numeric codes, keeping the original index and
            name; or the input column itself when it is not text-like.
    """
    dtype = column.dtype
    # Checking only for 'object' would let pandas string / categorical columns
    # slip through unencoded.
    is_text_like = (
        pd.api.types.is_object_dtype(dtype)
        or pd.api.types.is_string_dtype(dtype)
        or isinstance(dtype, pd.CategoricalDtype)
    )
    if not is_text_like:
        return column

    codes, _ = pd.factorize(column)
    return pd.Series(codes, index=column.index, name=column.name)

# Apply the label conversion function
dataset = raw_dataset.apply(label_to_numeric)

# Understanding which number is attributed to which label
label_map = dict(zip(raw_dataset['label'], dataset['label']))
print(label_map)

name_map = dict(zip(raw_dataset['name'], dataset['name']))
print(name_map)

# Define features and labels.
# Keep the DataFrame's own column order: a set difference would produce the
# feature columns in an order that can change from one run to the next.
features = [col for col in dataset.columns if col not in ('label', 'name')]
label = dataset['label']
data = dataset[features]

# Split the dataset into training and test sets
data_trainset, data_testset, label_trainset, label_testset = sms.train_test_split(
    data, label, test_size=0.2, random_state=42
)

# Train a Gaussian Naive Bayes model
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(data_trainset, label_trainset)

# Make predictions on the test set
prediction = naive_bayes_model.predict(data_testset)

# Print accuracy score
accuracy = accuracy_score(label_testset, prediction)
print("Accuracy:", accuracy, '\n\n')

# Create a DataFrame for comparison (true label next to predicted label,
# re-indexed from 0 in test-set order)
Comparison = pd.DataFrame({
    'label': label_testset.to_numpy(),
    'classifier': prediction,
})
print(Comparison)

# Numeric codes of the two categories. Look them up instead of assuming the
# factorisation order; fall back to the historical 0 / 1 if a label is absent.
j_code = label_map.get('j', 0)
s_code = label_map.get('s', 1)


def _confusion_frame(actual, predicted, positive, negative):
    """Return the 2x2 confusion matrix with rows = predicted, columns = actual."""
    # sklearn puts the true labels on the rows and the predictions on the
    # columns; transpose so the counts match the row/column captions below.
    # Passing labels= keeps the matrix 2x2 even if a class is missing.
    counts = confusion_matrix(actual, predicted, labels=[positive, negative]).T
    return pd.DataFrame(
        counts,
        columns=['Actual Positive', 'Actual Negative'],
        index=['Predicted Positive', 'Predicted Negative'],
    )


def _ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float('nan')


actual_j = Comparison['label'] == j_code
actual_s = Comparison['label'] == s_code
predicted_j = Comparison['classifier'] == j_code
predicted_s = Comparison['classifier'] == s_code

# Calculate confusion matrix for category 'j'
j_TP = (actual_j & predicted_j).sum()
j_FN = (actual_j & predicted_s).sum()
j_TN = (actual_s & predicted_s).sum()
j_FP = (actual_s & predicted_j).sum()

j_confusion_matrix = _confusion_frame(label_testset, prediction, j_code, s_code)

# Calculate confusion matrix for category 's'
# (the mirror image of category 'j': positives and negatives swap roles)
s_TP, s_FN, s_TN, s_FP = j_TN, j_FP, j_TP, j_FN

s_confusion_matrix = _confusion_frame(label_testset, prediction, s_code, j_code)

# Print confusion matrices
print('Category j\n', j_confusion_matrix, '\n\n', 'Category s\n', s_confusion_matrix, '\n\n')

# Calculate and print precision and recall for category 'j'
j_Precision = _ratio(j_TP, j_TP + j_FP)
j_Recall = _ratio(j_TP, j_TP + j_FN)
print('Category j', 'Precision =', j_Precision, 'Recall =', j_Recall)

# Calculate and print precision and recall for category 's'
s_Precision = _ratio(s_TP, s_TP + s_FP)
s_Recall = _ratio(s_TP, s_TP + s_FN)
print('Category s', 'Precision =', s_Precision, 'Recall =', s_Recall)

# Find misclassified images: label_testset still carries the original row
# index of every test sample, so it maps straight back to raw_dataset.
misclassified = (Comparison['label'] != Comparison['classifier']).to_numpy()

# Get list of misclassified row indices and the matching image names
result_list = label_testset.index[misclassified].tolist()
Incorrect_isolation = raw_dataset.loc[result_list, ['name']]

# Print misclassified images
print('Incorrect isolation is \n', Incorrect_isolation)


label     object
R        float64
G        float64
B        float64
name      object
dtype: object
label    0
R        0
G        0
B        0
name     0
dtype: int64
   label        R        G        B     name
0      j   71.228   98.647   52.458   j1.jpg
1      j   77.510   86.576   64.981  j10.jpg
2      j   76.126   97.610   45.073  j11.jpg
3      j  102.759  124.683   54.944  j12.jpg
4      j   79.914   78.027   26.893  j13.jpg
..   ...      ...      ...      ...      ...
77     s  103.521  165.871  205.652   s5.jpg
78     s   78.385   98.208  119.339   s6.jpg
79     s   49.863   81.472  100.099   s7.jpg
80     s  118.600  153.979  167.570   s8.jpg
81     s  111.104  141.390  174.625   s9.jpg

[82 rows x 5 columns]
{'j': 0, 's': 1}
{'j1.jpg': 0, 'j10.jpg': 1, 'j11.jpg': 2, 'j12.jpg': 3, 'j13.jpg': 4, 'j14.jpg': 5, 'j15.jpg': 6, 'j16.jpg': 7, 'j17.jpg': 8, 'j18.jpg': 9, 'j19.jpg': 10, 'j2.jpg': 11, 'j20.jpg': 12, 'j21.jpg': 13, 'j22.jpg': 14, 'j23.jpg': 15, 'j24.jpg': 16, 'j25.jpg'