### **1. Set up libraries and load data**

#### 1.1 Import libraries

In [None]:
import seaborn as sns              
import matplotlib.pyplot as plt    
from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import OneHotEncoder       
from sklearn.compose import ColumnTransformer        
from sklearn.pipeline import Pipeline                 
from sklearn.tree import DecisionTreeClassifier, plot_tree  
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score  
from pathlib import Path     
from IPython.display import Image, display
from collections import Counter
import os                        

#### 1.2 Load the dataset

In [None]:
import pandas as pd
# Load the penguins dataset through seaborn and discard every row that has at
# least one missing value, then render the cleaned table in the notebook.
penguins = sns.load_dataset("penguins").dropna()
display(penguins)

### **2. Preparing the datasets**

#### 2.1 Identify features & labels

In [None]:
# Predictor columns: the island label, four body measurements, and sex.
features = [
    'island',
    'bill_length_mm',
    'bill_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
    'sex',
]

# Feature matrix and target column (species: Adelie, Chinstrap, Gentoo).
y = penguins['species']
X = penguins[features]

#### 2.2 One-hot encode the categorical variables `sex` and `island`

In [None]:
# Columns that hold category labels and must be one-hot encoded.
categorical = ['sex', 'island']

# Every remaining feature is numeric. Keep them in the order they appear in
# `features`: a set difference has no stable order across interpreter runs
# (string hashing is randomized), which would make the column order handed to
# the tree -- and the feature names shown in the plots -- vary between runs.
numeric     = [col for col in features if col not in categorical]

# One-hot encode the categorical columns and pass the numeric ones through
# unchanged; the encoded columns come first in the transformed output.
preprocess  = ColumnTransformer(
      [("cat", OneHotEncoder(), categorical),
       ("num", "passthrough", numeric)]
)

#### 2.3 Create 4 shuffled, stratified train/test pairs with ratios 40/60, 60/40, 80/20, 90/10

In [None]:
def stratified_split(X, y, train_size, random_state=42):
    """
    Split the data into shuffled, stratified training and test sets.

    Parameters:
    - X: Input data (features)
    - y: Labels; also used as the stratification key, so both subsets keep
      the class proportions of `y`
    - train_size: Share of the data assigned to the training set (the callers
      in this notebook pass a fraction such as 0.8); the rest becomes the
      test set
    - random_state: Seed that makes the shuffle reproducible

    Returns:
    - X_train, X_test, y_train, y_test: The data after splitting
    """
    # Hand train_size straight to scikit-learn instead of deriving
    # test_size = 1 - train_size. The subtraction carries floating-point error
    # (1 - 0.7 == 0.30000000000000004) and the test-set count is rounded up,
    # so that error could push one extra sample into the test set.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        train_size=train_size,
        shuffle=True,
        stratify=y,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test

#### 2.4 Display class distribution

In [None]:
def _draw_class_counts(ax, classes, labels, title):
    """Draw one annotated bar per class on `ax` showing its count in `labels`."""
    tally = Counter(labels)  # count once, then look each class up
    counts = [tally[cls] for cls in classes]
    ax.bar(classes, counts)
    ax.set_title(title)
    for position, count in enumerate(counts):
        # Print the exact count just above the top of its bar.
        ax.text(position, count + 1, str(count), ha='center', va='bottom', fontsize=15)


def visualize_train_test_distribution(y, y_train, y_test, train_size):
    """
    Plot the class counts of the full, training and test labels side by side.

    The figure is saved to ./output/split/train_test_<train%>_<test%>.png
    (the folder is created when missing), shown, and then closed.

    Parameters:
    - y: Labels of the whole dataset; its distinct values define the bars
    - y_train: Labels of the training subset
    - y_test: Labels of the test subset
    - train_size: Fraction of the data used for training; only used for the
      panel titles and the output file name
    """
    # round() rather than int(): train_size * 100 can land just below the
    # intended integer (0.57 * 100 == 56.99999999999999) and int() would
    # truncate it to the wrong percentage.
    train_percent = round(train_size * 100)
    test_percent = 100 - train_percent

    # Take the class list from the data so the plot is not tied to one dataset;
    # sorting keeps the bar order stable.
    classes = sorted(set(y))

    # One panel per label set; the shared y-axis makes bar heights comparable.
    fig, axes = plt.subplots(1, 3, figsize=(15, 7), sharey=True)

    _draw_class_counts(axes[0], classes, y, 'Original Dataset')
    axes[0].set_ylabel('Count')
    _draw_class_counts(axes[1], classes, y_train, f'Training Set ({train_percent}%)')
    _draw_class_counts(axes[2], classes, y_test, f'Test Set ({test_percent}%)')

    plt.tight_layout()
    os.makedirs('./output/split', exist_ok=True)
    plt.savefig(
        f'./output/split/train_test_{train_percent}_{test_percent}.png',
        format='png',
        bbox_inches='tight',
    )
    plt.show()
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)

In [None]:
# Maps a "train/test" percentage label (e.g. "80/20") to the tuple
# (feature_train, feature_test, label_train, label_test) of that split.
splits = {}
proportions = [0.4, 0.6, 0.8, 0.9]
for train_size in proportions:
    feature_train, feature_test, label_train, label_test = stratified_split(X, y, train_size, random_state=42)
    visualize_train_test_distribution(y, label_train, label_test, train_size)
    # round() rather than int(): train_size * 100 can land just below the
    # intended integer (0.57 * 100 == 56.99999999999999), and a truncated
    # percentage would produce a wrong dictionary key.
    train_percent = round(train_size * 100)
    test_percent = 100 - train_percent
    name = f"{train_percent}/{test_percent}"
    splits[name] = (feature_train, feature_test, label_train, label_test)


### **3. Training & Visualizing Decision Tree (Building the classifiers)**

In [None]:
from sklearn.base import clone

Path("figures").mkdir(exist_ok=True)    # create root folder

# Fit one decision tree per train/test split and save a drawing of each tree.
for name, (X_tr, X_te, y_tr, y_te) in splits.items():
    clf = Pipeline([
        # A Pipeline fits the step objects it is given in place, so every
        # pipeline gets its own unfitted copy of the shared preprocessor.
        ("prep", clone(preprocess)),
        # Same criterion as the evaluation and depth-study cells, so the trees
        # drawn here are the ones whose scores are reported there.
        ("dt",  DecisionTreeClassifier(criterion="entropy", random_state=42))
    ]).fit(X_tr, y_tr)
    fitted_tree = clf.named_steps["dt"]

    plt.figure(figsize=(16, 10))
    plot_tree(
        fitted_tree,
        # Plain lists: some scikit-learn releases reject arrays for these.
        feature_names=list(clf.named_steps["prep"].get_feature_names_out()),
        class_names=list(fitted_tree.classes_),
        filled=True, rounded=True, proportion=True
    )
    plt.tight_layout()

    # --- save image ---
    safe_name = str(name).replace("/", "-")              # avoid path error
    image_path = f"figures/tree_penguins_{safe_name}.png"
    plt.savefig(image_path, dpi=300)
    print(f"\n--- Train/Test Split: {safe_name} ---")
    # Show the saved PNG rather than the live figure, then free the figure.
    display(Image(image_path))
    plt.close()

### **4. Model Evaluation (Evaluating the classifiers)**

In [None]:
from sklearn.base import clone
from sklearn.metrics import ConfusionMatrixDisplay

import matplotlib.pyplot as plt

# Train and store models for each split
models = {}
for name, (X_tr, X_te, y_tr, y_te) in splits.items():
    clf = Pipeline([
        # A Pipeline fits the step objects it is given in place. Without a
        # copy, every stored model would share one preprocessor that each
        # later fit overwrites.
        ("prep", clone(preprocess)),
        ("dt", DecisionTreeClassifier(criterion="entropy", random_state=42))
    ]).fit(X_tr, y_tr)
    models[name] = (clf, X_te, y_te)

for name, (clf, X_te, y_te) in models.items():
    # 1) Print the classification report
    y_pred = clf.predict(X_te)
    print(f"\n### {name} split ###")
    print(classification_report(y_te, y_pred, digits=3))

    # 2) Compute the confusion matrix (counts). Passing the label order
    #    explicitly keeps rows/columns aligned with the display labels even if
    #    a class is absent from this test split.
    cm = confusion_matrix(y_te, y_pred, labels=clf.classes_)

    # 3) Draw the confusion matrix as a heat map
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                                  display_labels=clf.classes_)
    fig, ax = plt.subplots(figsize=(5,5))
    disp.plot(ax=ax, cmap="Blues", values_format="d")
    ax.set_title(f"Confusion Matrix ({name})")
    ax.set_xlabel("Predicted Label")
    ax.set_ylabel("True Label")
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

### **5. Effect of Tree Depth (Depth vs Accuracy – 80/20)**

In [None]:
from sklearn.base import clone

# ❶ Get the correct 80/20 split
X_tr, X_te, y_tr, y_te = splits["80/20"]

# ❷ Try different max_depth values (None = no depth limit)
depths = [None, 2, 3, 4, 5, 6, 7]
acc = []  # test accuracy for each entry of `depths`, in the same order

# ❸ Create a folder to save images
Path("fig_depth").mkdir(exist_ok=True)

for d in depths:
    clf = Pipeline([
        # A Pipeline fits the step objects it is given in place; cloning keeps
        # this loop from refitting the preprocessor shared with other models.
        ("prep", clone(preprocess)),
        ("dt",  DecisionTreeClassifier(
                    criterion="entropy",
                    max_depth=d,
                    random_state=42))
    ]).fit(X_tr, y_tr)
    fitted_tree = clf.named_steps["dt"]

    # Accuracy on the held-out 20%
    y_pred = clf.predict(X_te)
    acc.append(accuracy_score(y_te, y_pred))

    # Plot the tree using matplotlib
    plt.figure(figsize=(16, 10))
    plot_tree(
        fitted_tree,
        # Plain lists: some scikit-learn releases reject arrays for these.
        feature_names=list(clf.named_steps["prep"].get_feature_names_out()),
        class_names=list(fitted_tree.classes_),
        filled=True, rounded=True, proportion=True
    )
    plt.tight_layout()

    depth_tag = d if d is not None else "None"
    image_path = f"fig_depth/tree_depth_{depth_tag}.png"
    plt.savefig(image_path, dpi=300)
    print(f"--- tree depth: {depth_tag} ---")
    # Show the saved PNG rather than the live figure, then free the figure.
    display(Image(image_path))
    plt.close()

print(acc)

# ❹ Plot depth vs Accuracy chart
# max_depth=None has no numeric position, so it is drawn at x = 0; the ticks
# reuse the same mapping instead of assuming None is the first entry.
x_positions = [0 if d is None else d for d in depths]
plt.figure(figsize=(6,4))
plt.plot(x_positions, acc, marker="o")
plt.xticks(x_positions)
plt.xlabel("max_depth (0 ≜ None)")
plt.ylabel("Accuracy")
plt.title("Effect of Decision Tree Depth – 80/20 split")
plt.grid(True)
plt.savefig("fig_depth/depth_vs_accuracy.png", dpi=300)
plt.show()