In [5]:
from pathlib import Path

import pandas as pd

from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [6]:
# Breeds kept in every experiment: each dataset is filtered down to rows whose
# `breed` value appears in this list before any model is trained.
breeds_to_compare = [
    "saint_bernard",
    "siamese",
]

In [7]:
# Hold-out evaluation: for every dataset file, train decision trees with
# max_depth 1..10 on a stratified 70/30 split and record the test accuracy.
# The result is one row per dataset, written to ../accuracies/decision_tree_70_30.csv.
files: Path = Path("../datasets").resolve()

md_list: list[int] = list(range(1, 11))

# Column-oriented results: one list per output column, one entry per dataset.
accuracy_data: dict[str, list] = {"dataset": [], "train": []}
accuracy_data.update({f"md={md}": [] for md in md_list})

# iterdir() yields entries in arbitrary order and also returns directories and
# hidden files (e.g. .ipynb_checkpoints, .DS_Store) that pd.read_csv cannot
# parse. Sort for a reproducible row order and keep only regular, visible files.
dataset_paths: list[Path] = sorted(
    path
    for path in files.iterdir()
    if path.is_file() and not path.name.startswith(".")
)

for file in dataset_paths:
    df: pd.DataFrame = pd.read_csv(file)
    df = df[df["breed"].isin(breeds_to_compare)]

    # Features start at the fourth column; the first three columns are skipped
    # (presumably metadata such as the breed/animal labels -- TODO confirm).
    features_df: pd.DataFrame = df.iloc[:, 3:]
    animal_series: pd.Series = df["animal"]

    x_train, x_test, y_train, y_test = train_test_split(
        features_df,
        animal_series,
        test_size=0.3,
        random_state=42,
        stratify=animal_series,
    )

    # Fit the scaler on the training rows only, then apply the same transform
    # to the test rows. Fitting on the full dataset before splitting would let
    # test-set statistics leak into preprocessing.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    for md in md_list:
        decision_tree: DecisionTreeClassifier = DecisionTreeClassifier(
            max_depth=md, random_state=42
        )
        decision_tree.fit(x_train, y_train)

        y_predict = decision_tree.predict(x_test)
        accuracy: float = accuracy_score(y_test, y_predict)

        accuracy_data[f"md={md}"].append(accuracy)

    accuracy_data["dataset"].append(file.name)
    accuracy_data["train"].append("70/30")

accuracy_df: pd.DataFrame = pd.DataFrame.from_dict(accuracy_data)

output_file: Path = Path("../accuracies/decision_tree_70_30.csv").resolve()
output_file.parent.mkdir(parents=True, exist_ok=True)

accuracy_df.to_csv(output_file, index=False)

In [8]:
# Cross-validated evaluation: for every dataset file, score decision trees with
# max_depth 1..10 using shuffled 10-fold CV and record the mean accuracy.
# The result is one row per dataset, written to
# ../accuracies/decision_tree_10_fold_cv.csv.
files: Path = Path("../datasets").resolve()

md_list: list[int] = list(range(1, 11))

# Column-oriented results: one list per output column, one entry per dataset.
accuracy_data: dict[str, list] = {"dataset": [], "train": []}
accuracy_data.update({f"md={md}": [] for md in md_list})

# iterdir() yields entries in arbitrary order and also returns directories and
# hidden files (e.g. .ipynb_checkpoints, .DS_Store) that pd.read_csv cannot
# parse. Sort for a reproducible row order and keep only regular, visible files.
dataset_paths: list[Path] = sorted(
    path
    for path in files.iterdir()
    if path.is_file() and not path.name.startswith(".")
)

# The splitter configuration does not depend on the dataset, so build it once.
# With an integer random_state every split() call produces the same shuffling.
kfold: KFold = KFold(n_splits=10, random_state=42, shuffle=True)

for file in dataset_paths:
    df: pd.DataFrame = pd.read_csv(file)
    df = df[df["breed"].isin(breeds_to_compare)]

    # Features start at the fourth column; the first three columns are skipped
    # (presumably metadata such as the breed/animal labels -- TODO confirm).
    features_df: pd.DataFrame = df.iloc[:, 3:]
    animal_series: pd.Series = df["animal"]

    for md in md_list:
        decision_tree: DecisionTreeClassifier = DecisionTreeClassifier(
            max_depth=md, random_state=42
        )

        # Scaling lives inside the pipeline so that cross_val_score re-fits the
        # scaler on each fold's training portion only. Scaling the whole
        # dataset up front would leak validation-fold statistics into training.
        model = make_pipeline(StandardScaler(), decision_tree)

        scores = cross_val_score(
            model, features_df, animal_series, cv=kfold, scoring="accuracy"
        )
        accuracy: float = scores.mean()

        accuracy_data[f"md={md}"].append(accuracy)

    accuracy_data["dataset"].append(file.name)
    accuracy_data["train"].append("10-fold CV")

accuracy_df: pd.DataFrame = pd.DataFrame.from_dict(accuracy_data)

output_file: Path = Path("../accuracies/decision_tree_10_fold_cv.csv").resolve()
output_file.parent.mkdir(parents=True, exist_ok=True)

accuracy_df.to_csv(output_file, index=False)