# In[1]:
#!/usr/bin/env python
# coding: utf-8

# In[6]:


import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Dataset index files (expected to sit in the same directory as the notebook)
train_file = "train.txt"
val_file = "val.txt"

# Output locations: evaluation artefacts live under results_dir,
# serialized models under models_dir
results_dir = "results/decision_tree"
models_dir = "models"
classification_reports_dir = os.path.join(results_dir, "classification_reports")
confusion_matrices_dir = os.path.join(results_dir, "confusion_matrices")

# Create every output directory up front (no error if it already exists)
for _output_dir in (
    results_dir,
    classification_reports_dir,
    confusion_matrices_dir,
    models_dir,
):
    os.makedirs(_output_dir, exist_ok=True)

# Load Data
def load_data(file_path):
    """Load image paths and labels from a dataset file.

    Every non-blank line of ``file_path`` is treated as one image path.
    Paths that do not exist on disk are left out of the result; a single
    ``UserWarning`` reports how many were dropped so that a broken listing
    does not shrink the dataset unnoticed.

    The label is the second '/'-separated component of the path
    (``<root>/<label>/...``); adjust the index if your layout differs.

    Args:
        file_path: Text file containing one image path per line.

    Returns:
        pandas.DataFrame with the columns ``image`` (the path) and
        ``label`` (the category taken from the path).

    Raises:
        IndexError: If an existing path contains no '/' separator.
    """
    import warnings  # local import keeps this function self-contained

    paths, labels = [], []
    missing = 0
    with open(file_path, 'r') as f:
        for line in f:
            img_path = line.strip()
            if not img_path:
                # Blank line: nothing to load, and not a missing image
                continue
            if not os.path.exists(img_path):
                missing += 1
                continue
            category = img_path.split('/')[1]  # Adjust based on your file structure
            paths.append(img_path)
            labels.append(category)

    if missing:
        warnings.warn(
            f"{missing} image path(s) listed in {file_path} do not exist and were skipped",
            stacklevel=2,
        )
    return pd.DataFrame({'image': paths, 'label': labels})

# Build the training and validation tables from their index files
train_df, val_df = load_data(train_file), load_data(val_file)

# Feature extraction (example: histogram features)
def compute_histogram_features(img_path):
    """Extract per-channel intensity histogram features from an image.

    The image is read with OpenCV, resized to 64x64, and a 256-bin
    histogram over the value range [0, 256) is computed for each of
    channels 0, 1 and 2. The three histograms are concatenated in channel
    order.

    Args:
        img_path: Path of the image file to read.

    Returns:
        1-D numpy array holding the three flattened 256-bin histograms
        back to back (768 values).

    Raises:
        OSError: If the file cannot be read or decoded as an image.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than inside cv2.resize.
        raise OSError(f"Could not read image: {img_path}")

    img = cv2.resize(img, (64, 64))  # Resize for consistency
    hist_features = [
        cv2.calcHist([img], [channel], None, [256], [0, 256]).flatten()
        for channel in range(3)
    ]
    return np.concatenate(hist_features)

# Build the feature matrices: one histogram vector per listed image
X_train = np.array(list(map(compute_histogram_features, train_df['image'])))
X_val = np.array(list(map(compute_histogram_features, val_df['image'])))

# Turn the string labels into integer ids; the encoder is fitted on the
# training labels and then reused unchanged for the validation labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'])
y_val = label_encoder.transform(val_df['label'])

# Hyperparameter grid to compare
depth_values = [5, 10, 20]  # max_depth candidates, shallow to deep
split_values = [2, 10, 20]  # min_samples_split candidates

# One entry per (depth, split, criterion) combination, with depth varying
# slowest and criterion fastest. min_samples_leaf and class_weight are held
# constant across all entries.
HYPERPARAM_CONFIGS = [
    {
        'name': f"{criterion}_d{depth}_s{split}",
        'criterion': criterion,
        'max_depth': depth,
        'min_samples_leaf': 5,
        'min_samples_split': split,
        'class_weight': 'balanced',
        'description': f"{criterion.capitalize()}, max_depth={depth}, min_samples_split={split}",
    }
    for depth in depth_values
    for split in split_values
    for criterion in ("gini", "entropy")
]

# Function to evaluate and save results
def evaluate_and_save(model, X_val, y_val, config_name, description, results_dir):
    """Evaluate a fitted model on the validation data and save the results.

    Three artefacts named after ``config_name`` are written:
      * ``<results_dir>/classification_reports/<config_name>_report.csv``
      * ``<results_dir>/confusion_matrices/<config_name>_confusion_matrix.png``
      * ``<models_dir>/<config_name>_model.pkl``, where ``models_dir`` is the
        module-level model directory

    Args:
        model: Fitted estimator exposing ``predict``.
        X_val: Validation features passed to ``model.predict``.
        y_val: True validation labels.
        config_name: Identifier used as the prefix of every output file.
        description: Human-readable text shown in the plot title.
        results_dir: Root directory for the report and confusion-matrix
            sub-folders. The sub-folders are created if they do not exist.

    Returns:
        The validation accuracy computed by ``accuracy_score``.
    """
    # Derive the output folders from the results_dir argument so the
    # parameter actually controls where the artefacts are written.
    reports_dir = os.path.join(results_dir, "classification_reports")
    matrices_dir = os.path.join(results_dir, "confusion_matrices")
    os.makedirs(reports_dir, exist_ok=True)
    os.makedirs(matrices_dir, exist_ok=True)

    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    report = classification_report(y_val, y_pred, output_dict=True)
    cm = confusion_matrix(y_val, y_pred)

    # Save classification report (one row per class/aggregate after transposing)
    report_df = pd.DataFrame(report).transpose()
    report_df.to_csv(os.path.join(reports_dir, f"{config_name}_report.csv"))

    # Save confusion matrix plot; close the figure even if saving fails so
    # repeated calls do not accumulate open figures
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax)
        ax.set_title(f'Confusion Matrix - {description}')
        ax.set_ylabel('True Label')
        ax.set_xlabel('Predicted Label')
        fig.savefig(os.path.join(matrices_dir, f"{config_name}_confusion_matrix.png"))
    finally:
        plt.close(fig)

    # Save model
    joblib.dump(model, os.path.join(models_dir, f"{config_name}_model.pkl"))

    return accuracy

# Bookkeeping filled in by the training loop: the fitted estimators (so the
# best one can be looked up afterwards) and one accuracy row per configuration
models_list = []
all_metrics = []

# Fit and evaluate one decision tree per hyperparameter configuration
for config in HYPERPARAM_CONFIGS:
    banner = '=' * 40
    print(f"\n{banner}")
    print(f"Training: {config['description']}")
    print(banner)

    # Constructor arguments taken from the config; the seed is fixed
    tree_params = {
        key: config[key]
        for key in (
            'criterion',
            'max_depth',
            'min_samples_leaf',
            'min_samples_split',
            'class_weight',
        )
    }
    model = DecisionTreeClassifier(random_state=42, **tree_params)
    model.fit(X_train, y_train)

    # Score on the validation set and write the per-config artefacts
    accuracy = evaluate_and_save(
        model,
        X_val,
        y_val,
        config['name'],
        config['description'],
        results_dir,
    )

    # Record the accuracy row and keep the fitted estimator
    all_metrics.append(
        {
            'config_name': config['name'],
            'description': config['description'],
            'accuracy': accuracy,
        }
    )
    models_list.append({'config_name': config['name'], 'model': model})

# Write the comparison table across all configurations
comparative_df = pd.DataFrame(all_metrics)
comparison_csv = os.path.join(results_dir, "model_comparison.csv")
comparative_df.to_csv(comparison_csv, index=False)

# # Plot accuracy comparisons
# plt.figure(figsize=(12, 6))
# sns.barplot(x='config_name', y='accuracy', data=comparative_df)
# plt.title('Accuracy Comparison of Decision Tree Configurations')
# plt.xlabel('Configuration')
# plt.ylabel('Accuracy')
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.savefig(os.path.join(results_dir, "accuracy_comparison.png"))
# plt.show()

# Pick the configuration with the highest validation accuracy
best_model = comparative_df.loc[comparative_df['accuracy'].idxmax()]
print(f"\nBest Model: {best_model['config_name']} with accuracy: {best_model['accuracy']:.4f}")
print(f"Description: {best_model['description']}")

# Save best model characteristics
with open(os.path.join(results_dir, "best_model_summary.txt"), "w") as f:
    f.write(f"Best Model: {best_model['config_name']}\n")
    f.write(f"Accuracy: {best_model['accuracy']:.4f}\n")
    f.write(f"Description: {best_model['description']}\n")

# Look up the fitted estimator that belongs to the best configuration
best_model_info = next(model_info for model_info in models_list if model_info['config_name'] == best_model['config_name'])

# Predict once and reuse the result for both the confusion matrix and the
# classification report instead of running inference twice
y_pred_best = best_model_info['model'].predict(X_val)
cm = confusion_matrix(y_val, y_pred_best)

# Print confusion matrix
print(f"\nConfusion Matrix for Best Model ({best_model['config_name']}):")
print(cm)

# Print classification report
report = classification_report(y_val, y_pred_best)
print(f"\nClassification Report for Best Model ({best_model['config_name']}):")
print(report)

print("\nDecision Tree training complete. All results saved successfully!")


# In[ ]:







Training: Gini, max_depth=5, min_samples_split=2

Training: Entropy, max_depth=5, min_samples_split=2

Training: Gini, max_depth=5, min_samples_split=10

Training: Entropy, max_depth=5, min_samples_split=10

Training: Gini, max_depth=5, min_samples_split=20

Training: Entropy, max_depth=5, min_samples_split=20

Training: Gini, max_depth=10, min_samples_split=2

Training: Entropy, max_depth=10, min_samples_split=2

Training: Gini, max_depth=10, min_samples_split=10

Training: Entropy, max_depth=10, min_samples_split=10

Training: Gini, max_depth=10, min_samples_split=20

Training: Entropy, max_depth=10, min_samples_split=20

Training: Gini, max_depth=20, min_samples_split=2

Training: Entropy, max_depth=20, min_samples_split=2

Training: Gini, max_depth=20, min_samples_split=10

Training: Entropy, max_depth=20, min_samples_split=10

Training: Gini, max_depth=20, min_samples_split=20

Training: Entropy, max_depth=20, min_samples_split=20

Best Model: gini_d10_s20 with accuracy: 0.7400
D