In [31]:
import pandas as pd


# Column labels for the headerless data file; the last column holds the class label.
columns_name = [
    'pelvic incidence',
    'pelvic tilt',
    'lumbar lordosis angle',
    'sacral slope',
    'pelvic radius',
    'grade of spondylolisthesis',
    'class',
]

# The file is space-separated and has no header row, so the names are supplied explicitly.
df = pd.read_csv(
    './vertebracolumn/column_2C.dat',
    sep=' ',
    header=None,
    names=columns_name,
)

# Recode the string labels in place: 'NO' becomes 0 and 'AB' becomes 1.
for raw_label, numeric_code in (('NO', 0), ('AB', 1)):
    df.loc[df['class'] == raw_label, ['class']] = numeric_code

# Show the first rows as a sanity check of the parsed frame.
df.head()



Unnamed: 0,pelvic incidence,pelvic tilt,lumbar lordosis angle,sacral slope,pelvic radius,grade of spondylolisthesis,class
0,63.03,22.55,39.61,40.48,98.67,-0.25,1
1,39.06,10.06,25.02,29.0,114.41,4.56,1
2,68.83,22.22,50.09,46.61,105.99,-3.53,1
3,69.3,24.65,44.31,44.64,101.87,11.21,1
4,49.71,9.65,28.32,40.06,108.17,7.92,1


In [32]:
from sklearn.model_selection import train_test_split

# 'class' is the target column: it becomes the integer label vector y,
# and every remaining column is kept as a feature in X.
y = df['class'].astype(int)
X = df.drop(columns=['class'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [33]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics reach the training data.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [34]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Baseline: an unconstrained decision tree evaluated on the held-out split.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split (even with splitter="best"); without a seed the
# fitted tree, and therefore the metrics printed below, can change between runs.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train_scaled, y_train)
y_pred = clf.predict(X_test_scaled)

# Overall accuracy followed by the per-class precision / recall / F1 report.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.7580645161290323
              precision    recall  f1-score   support

           0       0.58      0.61      0.59        18
           1       0.84      0.82      0.83        44

    accuracy                           0.76        62
   macro avg       0.71      0.71      0.71        62
weighted avg       0.76      0.76      0.76        62



In [35]:
# Both sweeps below are forms of PRE-pruning: they constrain the tree while it
# is being grown. (Post-pruning in scikit-learn is cost-complexity pruning,
# controlled by `ccp_alpha`, and is not used here.)
#
# Every tree is built with the same random_state so that differences in
# accuracy come from the hyperparameter being varied rather than from the
# random feature permutation scikit-learn performs at each split.
#
# NOTE(review): these accuracies are measured on the test split, so picking a
# value from this table would tune the model on test data.

# Pre-pruning by limiting the maximum depth of the tree.
for depth in [2, 4, 6, 8, 10]:
    clf = DecisionTreeClassifier(max_depth=depth, random_state=42)
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    print(f"Accuracy (max_depth={depth}): {accuracy_score(y_test, y_pred)}")

# Pre-pruning by requiring a minimum number of samples in every leaf.
for min_samples in [2, 4, 6, 8, 10]:
    clf = DecisionTreeClassifier(min_samples_leaf=min_samples, random_state=42)
    clf.fit(X_train_scaled, y_train)
    y_pred = clf.predict(X_test_scaled)
    print(f"Accuracy (min_samples_leaf={min_samples}): {accuracy_score(y_test, y_pred)}")


Accuracy (max_depth=2): 0.8548387096774194
Accuracy (max_depth=4): 0.8548387096774194
Accuracy (max_depth=6): 0.7580645161290323
Accuracy (max_depth=8): 0.7741935483870968
Accuracy (max_depth=10): 0.7903225806451613
Accuracy (min_samples_leaf=2): 0.7580645161290323
Accuracy (min_samples_leaf=4): 0.8064516129032258
Accuracy (min_samples_leaf=6): 0.8064516129032258
Accuracy (min_samples_leaf=8): 0.8225806451612904
Accuracy (min_samples_leaf=10): 0.8548387096774194


In [36]:
from sklearn.model_selection import GridSearchCV

# Exhaustive hyperparameter search with GridSearchCV.
# Every combination of the values below is evaluated with 5-fold
# cross-validation on the training split, scored by accuracy.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 4, 6, 8, 10, 12],
    'min_samples_split': [2, 3, 4, 5],
    'min_samples_leaf': [1, 2, 3, 4, 5]
}

# The estimator is seeded so that every candidate is fitted deterministically;
# an unseeded tree permutes features randomly at each split, which makes
# best_params_ / best_score_ vary from one run of this cell to the next.
grid_search = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_train_scaled, y_train)

# Best hyperparameters and their mean cross-validated accuracy.
print("Mejores hiperparámetros (GridSearchCV):", grid_search.best_params_)
print("Mejor accuracy (GridSearchCV):", grid_search.best_score_)

Mejores hiperparámetros (GridSearchCV): {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2}
Mejor accuracy (GridSearchCV): 0.8306938775510204


In [37]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

# Randomized hyperparameter search with RandomizedSearchCV.
# 100 parameter settings are sampled from the space below and each one is
# evaluated with 5-fold cross-validation on the training split.
param_dist = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 2, 4, 6, 8, 10, 12],
    # scipy's randint(low, high) excludes `high`, so these draw from
    # 2..5 and 1..5 respectively.
    'min_samples_split': randint(2, 6),
    'min_samples_leaf': randint(1, 6)
}

# Two sources of randomness are seeded here:
#   * the search's random_state fixes which parameter settings get sampled;
#   * the estimator's random_state fixes the per-split feature permutation.
# Without both, the reported best parameters and score change on every run.
random_search = RandomizedSearchCV(
    DecisionTreeClassifier(random_state=42),
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    scoring='accuracy',
    random_state=42,
)
random_search.fit(X_train_scaled, y_train)

# Best hyperparameters and their mean cross-validated accuracy.
print("Mejores hiperparámetros (RandomizedSearchCV):", random_search.best_params_)
print("Mejor accuracy (RandomizedSearchCV):", random_search.best_score_)


Mejores hiperparámetros (RandomizedSearchCV): {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 5, 'min_samples_split': 2}
Mejor accuracy (RandomizedSearchCV): 0.8346122448979593
