## Классификация

In [96]:
import pandas as pd


# Load the possum measurements and discard the `case` column in one chained
# step, so re-running the cell always starts again from the CSV on disk.
df = pd.read_csv("data/possum.csv").drop(columns=["case"])

# Declared kind ("categorical" or "numeric") of every column kept in `df`.
# Later cells look columns up here to build the `feature_types` list that is
# passed alongside the feature matrix.
_categorical_columns = ("site", "Pop", "sex")
_numeric_columns = (
    "age",
    "hdlngth",
    "skullw",
    "totlngth",
    "taill",
    "footlgth",
    "earconch",
    "eye",
    "chest",
    "belly",
)

feature_type_by_column = {
    **dict.fromkeys(_categorical_columns, "categorical"),
    **dict.fromkeys(_numeric_columns, "numeric"),
}

# Preview the first rows as a sanity check on the load (the `case` column
# should no longer be present).
df.head()

Unnamed: 0,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [97]:
# Inspect dtypes and non-null counts. In the recorded output `age` (102 of 104)
# and `footlgth` (103 of 104) contain missing values; all other columns are
# complete.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   site      104 non-null    int64  
 1   Pop       104 non-null    object 
 2   sex       104 non-null    object 
 3   age       102 non-null    float64
 4   hdlngth   104 non-null    float64
 5   skullw    104 non-null    float64
 6   totlngth  104 non-null    float64
 7   taill     104 non-null    float64
 8   footlgth  103 non-null    float64
 9   earconch  104 non-null    float64
 10  eye       104 non-null    float64
 11  chest     104 non-null    float64
 12  belly     104 non-null    float64
dtypes: float64(10), int64(1), object(2)
memory usage: 10.7+ KB


In [98]:
import numpy as np


# Classification task: `sex` is the target, every other column is a feature.
sex_features = df.drop(columns=["sex"])
sex_codes = {"m": 1, "f": 0}  # label encoding: "m" -> 1, "f" -> 0

X = sex_features.to_numpy()
y = df["sex"].map(sex_codes).astype(np.int8).to_numpy()
# One declared type per feature, in the same order as the columns of X.
feature_types = [feature_type_by_column[name] for name in sex_features.columns]
X.shape, y.shape

((104, 12), (104,))

In [99]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# identical across re-runs.
classification_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = classification_split

In [100]:
from desicion_tree_classifier_id3 import DecisionTreeClassifierID3
from rich.syntax import Syntax
from rich.console import Console


# Fit the custom ID3 classifier on the training split. `feature_types` is
# passed along with the data, presumably so the tree can treat categorical and
# numeric columns differently — confirm against the implementation.
tree = DecisionTreeClassifierID3(criterion="entropy", max_depth=5)
tree.fit(X_train, y_train, feature_types)

# Show the fitted tree: its string form is rendered through rich with Python
# syntax highlighting.
console = Console()
console.print(Syntax(str(tree), "python", theme="light"))

In [101]:
# Prune the fitted tree, then print it again so it can be compared with the
# unpruned tree shown by the previous cell.
# NOTE(review): pruning is driven by the test split, which is also the split
# used for the reported metrics later in this notebook. That leaks test data
# into model selection; a separate validation split would avoid it.
tree.prune(X_test, y_test)
tree_str = str(tree)
syntax = Syntax(tree_str, "python", theme="light")
console.print(syntax)


In [102]:
from evaluate import evaluate_classifier

# Compare test-set metrics of the ID3 classifier before and after pruning.
#
# NOTE(review): the tree is not fitted in this cell before it is scored, so
# evaluate_classifier presumably fits it on (X_train, y_train) itself. If so,
# the second call refits the tree and silently discards the pruning, which
# would explain the identical columns in the recorded output. Confirm against
# evaluate.py.
# NOTE(review): pruning uses the same test split the metrics are computed on.
tree = DecisionTreeClassifierID3(criterion="entropy", max_depth=5)
evaluation_args = (X_train, y_train, X_test, y_test, feature_types)

before_pruning_scores = evaluate_classifier(tree, *evaluation_args)

tree.prune(X_test, y_test)
after_pruning_scores = evaluate_classifier(tree, *evaluation_args)

# Emit the comparison as a markdown table, one row per metric.
print("|Metric|Before Pruning|After Pruning|")
print("|------|--------------|-------------|")
for label, attribute in (
    ("Accuracy", "accuracy"),
    ("Precision", "precision"),
    ("Recall", "recall"),
    ("F1", "f1"),
):
    before_value = getattr(before_pruning_scores, attribute)
    after_value = getattr(after_pruning_scores, attribute)
    print(f"|{label}|{before_value:.4f}|{after_value:.4f}|")



|Metric|Before Pruning|After Pruning|
|------|--------------|-------------|
|Accuracy|0.7143|0.7143|
|Precision|0.7143|0.7143|
|Recall|0.7143|0.7143|
|F1|0.7143|0.7143|


In [103]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Compare the custom ID3 classifier with scikit-learn's decision tree on the
# same train/test split.

# Boolean masks over the columns of X; ColumnTransformer uses them to route
# numeric and categorical features to different transformers.
categorical_mask = [ft == "categorical" for ft in feature_types]
numeric_mask = [ft == "numeric" for ft in feature_types]

# Preprocessing is applied to the sklearn model's input only; the custom tree
# is given the raw arrays.
# handle_unknown="ignore" makes a category that occurs only in the test split
# encode as all zeros instead of raising in transform().
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_mask),
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            categorical_mask,
        ),
    ]
)

# Fit the transformers on the training split only, then reuse them on the
# test split so no test statistics reach the preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

custom_tree = DecisionTreeClassifierID3(max_depth=5, criterion="entropy")
custom_tree.fit(X_train, y_train, feature_types)
# NOTE(review): pruning uses the test split that is scored just below, which
# leaks test data into model selection. It is also unclear whether
# evaluate_classifier refits the model (and so discards this pruning) —
# confirm against evaluate.py.
custom_tree.prune(X_test, y_test)

custom_scores = evaluate_classifier(
    custom_tree, X_train, y_train, X_test, y_test, feature_types
)

# random_state pins sklearn's random feature permutation / tie-breaking so the
# reported numbers are reproducible across re-runs.
# NOTE(review): sklearn defaults to the gini criterion while the custom tree
# uses entropy; pass criterion="entropy" here for a like-for-like comparison.
clf = DecisionTreeClassifier(max_depth=5, random_state=42)
sklearn_scores = evaluate_classifier(
    clf, X_train_processed, y_train, X_test_processed, y_test, feature_types
)

# Emit the comparison as a markdown table.
print("|Metric|Custom|sklearn|")
print("|------|--------------|-------------|")
print(f"|Accuracy|{custom_scores.accuracy:.4f}|{sklearn_scores.accuracy:.4f}|")
print(f"|Precision|{custom_scores.precision:.4f}|{sklearn_scores.precision:.4f}|")
print(f"|Recall|{custom_scores.recall:.4f}|{sklearn_scores.recall:.4f}|")
print(f"|F1|{custom_scores.f1:.4f}|{sklearn_scores.f1:.4f}|")

|Metric|Custom|sklearn|
|------|--------------|-------------|
|Accuracy|0.7143|0.6667|
|Precision|0.7143|0.6599|
|Recall|0.7143|0.6667|
|F1|0.7143|0.6617|


In [104]:
%%time
# Time one fit of sklearn's classifier on the preprocessed training data, as a
# baseline for the custom implementation timed in the next cell.
# NOTE(review): a single sub-millisecond run is noisy; %%timeit would average
# over repeated runs.
DecisionTreeClassifier(max_depth=5).fit(X_train_processed, y_train)

CPU times: user 937 μs, sys: 558 μs, total: 1.49 ms
Wall time: 1.01 ms


In [105]:
%%time
# Time one fit of the custom ID3 classifier on the raw (unprocessed) training
# data, with the same max_depth as the sklearn baseline.
DecisionTreeClassifierID3(max_depth=5, criterion="entropy").fit(X_train, y_train, feature_types)


CPU times: user 61 ms, sys: 3.4 ms, total: 64.4 ms
Wall time: 62.8 ms



## Регрессия


In [106]:
# Re-display the frame before building the regression dataset from it.
df.head()

Unnamed: 0,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,1,Vic,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,1,Vic,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,1,Vic,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,1,Vic,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,1,Vic,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [107]:
# Regression task: `footlgth` is the target, every other column (including
# `sex`) is a feature. This rebinds X, y and feature_types from the
# classification section.
footlgth_features = df.drop(columns=["footlgth"])

X = footlgth_features.to_numpy()
# `footlgth` is 103 of 104 non-null in the df.info() output, so y holds a NaN.
y = df["footlgth"].to_numpy()
# One declared type per feature, in the same order as the columns of X.
feature_types = [feature_type_by_column[name] for name in footlgth_features.columns]
X.shape, y.shape

((104, 12), (104,))

In [108]:
# Same 80/20 split and seed as in the classification section, now applied to
# the regression X and y.
regression_split = train_test_split(X, y, random_state=42, test_size=0.2)
X_train, X_test, y_train, y_test = regression_split

In [109]:
from desicion_tree_regressor_id3 import DecisionTreeRegressorID3
# Fit the custom ID3 regressor on the training split and print the fitted
# tree; the recorded output shows it rendered as nested if/else source.
custom_tree = DecisionTreeRegressorID3(max_depth=5)
custom_tree.fit(X_train, y_train, feature_types)
# The target column has a missing value, so y_train may contain NaN.
# prune_from_nans() presumably removes the parts of the tree affected by it —
# TODO confirm against the implementation. The numpy mean warnings in the
# recorded output are likely caused by the same NaN.
custom_tree.prune_from_nans()
print(custom_tree)

def explicit_predict(feature):
	if feature[1] == 'Vic':
		if feature[6] <= 80.5:
			if feature[7] <= 32.0:
				return 62.7
			else:
				if feature[3] <= 7.0:
					if feature[2] == 'm':
						return 68.4
					else:
						return 68.7
				else:
					return 70.3
		else:
			if feature[7] <= 37.5:
				if feature[4] <= 92.1:
					return 70.86666666666666
				else:
					if feature[11] <= 32.0:
						return 73.92857142857143
					else:
						return 72.24545454545455
			else:
				if feature[6] <= 91.0:
					if feature[2] == 'm':
						return 73.5
					else:
						return 73.8
				else:
					if feature[3] <= 3.0:
						return 77.9
					else:
						return 75.75
	else:
		if feature[4] <= 98.0:
			if feature[5] <= 54.0:
				if feature[2] == 'm':
					if feature[4] <= 82.5:
						return 65.7
					else:
						return 62.94
				else:
					if feature[4] <= 88.2:
						return 60.5
					else:
						return 61.75
			else:
				if feature[11] <= 31.0:
					if feature[5] <= 55.2:
						return 63.0
					else:
			

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [110]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from evaluate import evaluate_regressor

# Compare the custom ID3 regressor with scikit-learn's decision tree on the
# same train/test split.

# Boolean masks over the columns of X; ColumnTransformer uses them to route
# numeric and categorical features to different transformers.
categorical_mask = [ft == "categorical" for ft in feature_types]
numeric_mask = [ft == "numeric" for ft in feature_types]

# handle_unknown="ignore" makes a category that occurs only in the test split
# encode as all zeros instead of raising in transform().
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), numeric_mask),
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            categorical_mask,
        ),
    ]
)

# sklearn's regressor is trained only on rows whose target is known. The mask
# is computed once so features and targets are filtered by the same rows.
target_is_known = ~np.isnan(y_train)
X_train_clean = X_train[target_is_known]
y_train_clean = y_train[target_is_known]

# Fit the transformers on the cleaned training rows only, then reuse them on
# the test split.
X_train_processed = preprocessor.fit_transform(X_train_clean)
X_test_processed = preprocessor.transform(X_test)

# The custom tree is scored with the unfiltered training targets.
# NOTE(review): if evaluate_regressor refits the model, the earlier
# prune_from_nans() call is lost and the NaN target is seen again — this may
# be the source of the numpy mean warnings. Confirm against evaluate.py.
custom_scores = evaluate_regressor(
    custom_tree, X_train, y_train, X_test, y_test, feature_types
)

# random_state pins sklearn's random feature permutation / tie-breaking so the
# reported numbers are reproducible across re-runs.
clf = DecisionTreeRegressor(max_depth=5, random_state=42)
sklearn_scores = evaluate_regressor(
    clf, X_train_processed, y_train_clean, X_test_processed, y_test, feature_types
)

# Emit the comparison as a markdown table.
print("|Metric|Custom|sklearn|")
print("|------|--------------|-------------|")
print(f"|MSE|{custom_scores.mse:.4f}|{sklearn_scores.mse:.4f}|")
print(f"|MAE|{custom_scores.mae:.4f}|{sklearn_scores.mae:.4f}|")
print(f"|R2|{custom_scores.r2:.4f}|{sklearn_scores.r2:.4f}|")

|Metric|Custom|sklearn|
|------|--------------|-------------|
|MSE|6.7308|7.9485|
|MAE|1.9895|2.1833|
|R2|0.6155|0.5280|


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [111]:
%%time
# Time one fit of sklearn's regressor on the preprocessed, NaN-free training
# data, as a baseline for the custom implementation timed in the next cell.
# NOTE(review): a single sub-millisecond run is noisy; %%timeit would average
# over repeated runs.
DecisionTreeRegressor(max_depth=5).fit(X_train_processed, y_train_clean)


CPU times: user 627 μs, sys: 157 μs, total: 784 μs
Wall time: 533 μs


In [112]:
%%time
# Time one fit of the custom ID3 regressor on the raw training data. y_train
# is not filtered for NaN here, unlike the sklearn baseline, so the two timed
# fits do not see exactly the same rows.
DecisionTreeRegressorID3(max_depth=5).fit(X_train, y_train, feature_types)

CPU times: user 61.5 ms, sys: 1.43 ms, total: 63 ms
Wall time: 62.6 ms


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
