In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

In [5]:
# Sample Dataset
np.random.seed(42)
X = np.random.rand(100, 3)  # 100 samples, 3 features
y = np.random.choice([0, 1], size=100)  # Binary classification

# Convert to DataFrame for feature name consistency
feature_names = ["Feature_1", "Feature_2", "Feature_3"]
X_df = pd.DataFrame(X, columns=feature_names)

# Preprocessing - Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

# Standardization (Keep as DataFrame)
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=feature_names, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=feature_names, index=X_test.index)

# Define Decision Tree Model
dt_model = DecisionTreeClassifier(criterion='gini', random_state=42)

# Train Model
dt_model.fit(X_train_scaled, y_train)

# Compute Predictions
y_pred = dt_model.predict(X_test_scaled)

# Validate Hyperparameters using GridSearchCV
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 3, 5]
}
grid_search = GridSearchCV(DecisionTreeClassifier(criterion='gini', random_state=42),
                           param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Get best model from Grid Search
best_model = grid_search.best_estimator_

# Evaluate Model
print(f"Best Model Parameters: {grid_search.best_params_}")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred, zero_division=1))

# Predict New Data
new_data = pd.DataFrame([[0.6, 0.8, 0.4]], columns=feature_names)  # Keep DataFrame structure
new_data_scaled = pd.DataFrame(scaler.transform(new_data), columns=feature_names)  # Ensure feature names
new_prediction = best_model.predict(new_data_scaled)
new_probability = best_model.predict_proba(new_data_scaled)[:, 1]

print(f"Prediction for New Data: Class = {new_prediction[0]}, Probability = {new_probability[0]:.4f}")

Best Model Parameters: {'max_depth': 5, 'min_samples_leaf': 3, 'min_samples_split': 2}
Accuracy: 0.5500
              precision    recall  f1-score   support

           0       0.56      0.50      0.53        10
           1       0.55      0.60      0.57        10

    accuracy                           0.55        20
   macro avg       0.55      0.55      0.55        20
weighted avg       0.55      0.55      0.55        20

Prediction for New Data: Class = 1, Probability = 0.6667
