In [3]:
# Import necessary libraries
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report

# Load Iris and assemble a labelled DataFrame: four measurement columns plus
# a human-readable species name in 'class'.
iris = load_iris()
species_by_code = {0: 'Iris Setosa', 1: 'Iris Versicolour', 2: 'Iris Virginica'}
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['class'] = pd.Series(iris.target, index=iris_df.index).map(species_by_code)

# Preview the data (heading, first rows, then a blank separator line)
print("First few rows of the Iris dataset:", iris_df.head(), "", sep="\n")

# Target is the species name; features are the measurement columns
y = iris_df['class']
X = iris_df[list(iris.feature_names)]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Metrics that are macro-averaged over the three species, in print order
macro_metrics = (
    ("Recall", recall_score),
    ("Precision", precision_score),
    ("F1 Score", f1_score),
)

# Fit one tree per depth limit (1 through 5) and report its test-set scores
for depth in range(1, 6):
    clf = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
    )
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    report_lines = [
        f"max_depth={depth}:",
        f"  Accuracy: {accuracy_score(y_test, y_pred):.4f}",
    ]
    for label, scorer in macro_metrics:
        report_lines.append(f"  {label}: {scorer(y_test, y_pred, average='macro'):.4f}")
    report_lines.append(classification_report(y_test, y_pred))

    print("\n".join(report_lines))
    print()

First few rows of the Iris dataset:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

         class  
0  Iris Setosa  
1  Iris Setosa  
2  Iris Setosa  
3  Iris Setosa  
4  Iris Setosa  

max_depth=1:
  Accuracy: 0.6333
  Recall: 0.6667
  Precision: 0.4833
  F1 Score: 0.5402
                  precision    recall  f1-score   support

     Iris Setosa       1.00      1.00      1.00        10
Iris Versicolour       0.45      1.00      0.62         9
  Iris Virginica       0.00      0.00      0.00        11

     avg / total       0.47      0.63      0.52        30


max_

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  target = np.empty((n_samples,), dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  target[i] = np.asarray(ir[-1], dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  target[i] = np.asarray(ir[-1], dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  target[i] = np.asarray(ir[-1], dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  target[i] = np.asarray(ir[-1], dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  target[i] = np.asarray(ir[-1],

In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset
# The file has no header row, so column names are supplied here. Missing values
# are written as '?'; declaring that at read time lets pandas parse every
# column (Bare_nuclei in particular) as numeric instead of as digit strings.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"
columns = ['Sample_code_number', 'Clump_thickness', 'Uniformity_of_cell_size', 'Uniformity_of_cell_shape', 'Marginal_adhesion', 'Single_epithelial_cell_size', 'Bare_nuclei', 'Bland_chromatin', 'Normal_nucleoli', 'Mitoses', 'Class']
data = pd.read_csv(url, names=columns, na_values='?')

# Data preprocessing
# Drop rows with any missing value; the explicit copy makes the column
# assignment below operate on an independent frame.
data = data.dropna().copy()

# Convert the Class column (2 = benign, 4 = malignant) to 0 and 1
label_encoder = LabelEncoder()
data['Class'] = label_encoder.fit_transform(data['Class'])

# Separate features and labels
# Sample_code_number is a record identifier, not a measurement, so it must not
# be offered to the tree as a feature.
X = data.drop(['Sample_code_number', 'Class'], axis=1)
y = data['Class']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Build a binary decision tree
clf = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2, random_state=42)
clf.fit(X_train, y_train)

# Get information about the first split (node 0 is the root of the fitted tree)
first_split_feature_index = clf.tree_.feature[0]
first_split_feature = X.columns[first_split_feature_index]
first_split_threshold = clf.tree_.threshold[0]

# Impurity measures for the root node
# The root holds the whole training set, so its class distribution is simply
# the class distribution of y_train; compute it once and reuse it.
class_probs = y_train.value_counts(normalize=True)
# Impurity stored by the fitted tree for the root (Gini, the classifier's default criterion)
root_gini = clf.tree_.impurity[0]
# Shannon entropy in bits; value_counts only lists classes that occur, so log2 is defined
root_entropy = -(class_probs * np.log2(class_probs)).sum()
# Error made by always predicting the majority class
root_misclassification_error = 1 - class_probs.max()

# Print the results
print(f"Feature for the first split: {first_split_feature}")
print(f"Threshold for the first split: {first_split_threshold}")
print(f"Gini impurity of the root node: {root_gini}")
print(f"Entropy of the root node: {root_entropy}")
print(f"Misclassification error of the root node: {root_misclassification_error}")

# Calculate information gain
# NOTE: the gain reported here is the decrease in the tree's stored impurity
# (Gini) from the root to its two children, not an entropy-based gain.
left_child = clf.tree_.children_left[0]
right_child = clf.tree_.children_right[0]
left_samples = clf.tree_.n_node_samples[left_child]
right_samples = clf.tree_.n_node_samples[right_child]
left_gini = clf.tree_.impurity[left_child]
right_gini = clf.tree_.impurity[right_child]
# Children are weighted by the number of training samples that reach them
weighted_gini = (left_samples * left_gini + right_samples * right_gini) / (left_samples + right_samples)
information_gain = root_gini - weighted_gini
print(f"Information gain: {information_gain}")

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Calculate performance metrics (class 1 = malignant is the positive class)
accuracy = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print performance metrics
print(f"Accuracy: {accuracy}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 score: {f1}")

Feature for the first split: Uniformity_of_cell_size
Threshold for the first split: 3.5
Gini impurity of the root node: 0.44321673442552567
Entropy of the root node: 0.9164534336173732
Misclassification error of the root node: 0.33150183150183155
Information gain: 0.3361199070906031
Accuracy: 0.9416058394160584
Recall: 0.9137931034482759
Precision: 0.9464285714285714
F1 score: 0.9298245614035087


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)


In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the WDBC data. The file has no header row, so names are supplied:
# an ID, the diagnosis, then the same ten measurements repeated in three
# groups (suffixes 1, 2 and 3).
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
measurements = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                'compactness', 'concavity', 'concave_points', 'symmetry',
                'fractal_dimension']
columns = ['ID', 'Diagnosis'] + [
    f"{name}{group}" for group in (1, 2, 3) for name in measurements
]
data = pd.read_csv(url, names=columns)

# Encode the diagnosis letters numerically: malignant -> 1, benign -> 0
diagnosis_codes = {'M': 1, 'B': 0}
data['Diagnosis'] = data['Diagnosis'].map(diagnosis_codes)

# Labels are the encoded diagnosis; features are every column after ID and Diagnosis
y = data['Diagnosis']
X = data[columns[2:]]

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardize using statistics learned from the training split only,
# then apply the same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Function to train and evaluate the model with PCA
def train_and_evaluate_pca(n_components, X_train_scaled, X_test_scaled, y_train, y_test):
    """Project the data with PCA, fit a depth-2 decision tree, and score it.

    PCA is fitted on the training features only and then applied to the test
    features, so no information from the test split reaches the projection.

    Parameters
    ----------
    n_components : value forwarded to ``PCA(n_components=...)``.
    X_train_scaled, X_test_scaled : feature matrices for the two splits
        (named for the standardized arrays the notebook passes in).
    y_train, y_test : binary labels encoded as 0 (negative) and 1 (positive).

    Returns
    -------
    tuple
        ``(precision, recall, f1, fp, tp, fpr, tpr)`` measured on the test
        split. ``fpr`` / ``tpr`` are NaN when the test split contains no
        negatives / no positives respectively.
    """
    # Apply PCA; the seed only matters for randomized solvers but keeps the
    # function deterministic regardless of which solver is selected
    pca = PCA(n_components=n_components, random_state=42)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    # Build a binary decision tree
    clf = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2, random_state=42)
    clf.fit(X_train_pca, y_train)

    # Make predictions on the test set
    y_pred = clf.predict(X_test_pca)

    # Calculate performance metrics
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Pin the label order so the matrix is always 2x2 (tn, fp, fn, tp), even
    # if one class is missing from both y_test and y_pred
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
    tn, fp, fn, tp = conf_matrix.ravel()

    # Calculate FPR and TPR, guarding against an empty class in the test split
    negatives = fp + tn
    positives = tp + fn
    fpr = fp / negatives if negatives else float('nan')
    tpr = tp / positives if positives else float('nan')

    return precision, recall, f1, fp, tp, fpr, tpr

# Baseline: the same depth-2 tree trained directly on the standardized
# features, without any PCA projection
clf_original = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2, random_state=42)
clf_original.fit(X_train_scaled, y_train)
y_pred_original = clf_original.predict(X_test_scaled)

# Calculate performance metrics for original data
precision_original = precision_score(y_test, y_pred_original)
recall_original = recall_score(y_test, y_pred_original)
f1_original = f1_score(y_test, y_pred_original)

# Confusion matrix for original data; labels are pinned so the matrix is
# always 2x2 and unpacks as (tn, fp, fn, tp)
conf_matrix_original = confusion_matrix(y_test, y_pred_original, labels=[0, 1])
tn_original, fp_original, fn_original, tp_original = conf_matrix_original.ravel()

# Calculate FPR and TPR for original data
fpr_original = fp_original / (fp_original + tn_original)
tpr_original = tp_original / (tp_original + fn_original)

# Evaluate the model with PCA using the first principal component
precision_pca1, recall_pca1, f1_pca1, fp_pca1, tp_pca1, fpr_pca1, tpr_pca1 = train_and_evaluate_pca(1, X_train_scaled, X_test_scaled, y_train, y_test)

# Evaluate the model with PCA using the first and second principal components
precision_pca2, recall_pca2, f1_pca2, fp_pca2, tp_pca2, fpr_pca2, tpr_pca2 = train_and_evaluate_pca(2, X_train_scaled, X_test_scaled, y_train, y_test)


def _print_results(title, precision, recall, f1, fp, tp, fpr, tpr):
    """Print one titled block of test-set metrics followed by a blank line."""
    print(title)
    print(f"  Precision: {precision}")
    print(f"  Recall: {recall}")
    print(f"  F1 Score: {f1}")
    print(f"  FP: {fp}")
    print(f"  TP: {tp}")
    print(f"  FPR: {fpr}")
    print(f"  TPR: {tpr}")
    print()


# Print the results, one block per feature representation
_print_results("Original Continuous Data:", precision_original, recall_original, f1_original,
               fp_original, tp_original, fpr_original, tpr_original)
_print_results("PCA with 1 Component:", precision_pca1, recall_pca1, f1_pca1,
               fp_pca1, tp_pca1, fpr_pca1, tpr_pca1)
_print_results("PCA with 2 Components:", precision_pca2, recall_pca2, f1_pca2,
               fp_pca2, tp_pca2, fpr_pca2, tpr_pca2)


Original Continuous Data:
  Precision: 0.9487179487179487
  Recall: 0.8604651162790697
  F1 Score: 0.9024390243902439
  FP: 2
  TP: 37
  FPR: 0.028169014084507043
  TPR: 0.8604651162790697

PCA with 1 Component:
  Precision: 0.975609756097561
  Recall: 0.9302325581395349
  F1 Score: 0.9523809523809524
  FP: 1
  TP: 40
  FPR: 0.014084507042253521
  TPR: 0.9302325581395349

PCA with 2 Components:
  Precision: 0.9743589743589743
  Recall: 0.8837209302325582
  F1 Score: 0.9268292682926831
  FP: 1
  TP: 38
  FPR: 0.014084507042253521
  TPR: 0.8837209302325582



Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  y_encoded = np.zeros(y.shape, dtype=np.int)
