In [14]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, f1_score

# Load the Iris dataset
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['target'] = iris.target

# Split the dataset into training and test sets
X = df.drop('target', axis=1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Decision trees with different depths
depths = range(1, 6)
for depth in depths:
    clf = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=depth)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    recall = recall_score(y_test, y_pred, average='weighted')
    # Set the zero_division parameter
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"Depth: {depth}, Recall: {recall}, Precision: {precision}, F1-score: {f1}")

# Differences among micro-average, macro-average, and weighted-average
print("Micro-average: Calculate the true positives, false positives, and false negatives for all classes and then compute the overall precision, recall, and F1-score.")
print("Macro-average: Calculate the precision, recall, and F1-score for each class separately and then take the average.")
print("Weighted-average: Calculate the precision, recall, and F1-score for each class separately and then perform a weighted average based on the number of samples in each class.")

Depth: 1, Recall: 0.7111111111111111, Precision: 0.8555555555555555, F1-score: 0.6148148148148148
Depth: 2, Recall: 1.0, Precision: 1.0, F1-score: 1.0
Depth: 3, Recall: 1.0, Precision: 1.0, F1-score: 1.0
Depth: 4, Recall: 1.0, Precision: 1.0, F1-score: 1.0
Depth: 5, Recall: 1.0, Precision: 1.0, F1-score: 1.0
Micro-average: Calculate the true positives, false positives, and false negatives for all classes and then compute the overall precision, recall, and F1-score.
Macro-average: Calculate the precision, recall, and F1-score for each class separately and then take the average.
Weighted-average: Calculate the precision, recall, and F1-score for each class separately and then perform a weighted average based on the number of samples in each class.


In [15]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.tree import export_graphviz
import graphviz
from collections import Counter
import math

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
column_names = ['id', 'clump_thickness', 'uniformity_of_cell_size', 'uniformity_of_cell_shape',
                'marginal_adhesion','single_epithelial_cell_size', 'bare_nuclei',
                'bland_chromatin', 'normal_nucleoli','mitoses', 'class']
df = pd.read_csv(url, names=column_names)

# Handle missing values
df = df.replace('?', np.nan)
df = df.dropna()

# Encode categorical variables
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])

# Split features and target variable
X = df.drop(['id', 'class'], axis=1)
y = df['class']

# Convert X to numpy.ndarray type and change the data type to np.float32
X_array = X.values.astype(np.float32)

# Build a decision tree
clf = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2)
clf.fit(X_array, y)

# Calculate the entropy, Gini index, and misclassification error of the first split
dot_data = export_graphviz(clf, out_file=None, feature_names=X.columns, class_names=['benign','malignant'],
                           filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)

# Get the node information of the first split
left_child_index = clf.tree_.children_left[0]
right_child_index = clf.tree_.children_right[0]

# Calculate the total number of samples
total_samples = len(y)

# Calculate the number of samples in the left and right child nodes
left_samples = clf.tree_.n_node_samples[left_child_index]
right_samples = clf.tree_.n_node_samples[right_child_index]

# Calculate the class distribution of the left and right child nodes
left_class_counts = Counter(y[clf.tree_.apply(X_array) == left_child_index])
right_class_counts = Counter(y[clf.tree_.apply(X_array) == right_child_index])

# Calculate the entropy of the parent node
parent_entropy = 0
parent_class_counts = Counter(y)
for count in parent_class_counts.values():
    prob = count / total_samples
    parent_entropy -= prob * math.log2(prob)

# Calculate the entropy of the left child node
left_entropy = 0
if left_class_counts:
    for count in left_class_counts.values():
        prob = count / left_samples
        left_entropy -= prob * math.log2(prob)

# Calculate the entropy of the right child node
right_entropy = 0
if right_class_counts:
    for count in right_class_counts.values():
        prob = count / right_samples
        right_entropy -= prob * math.log2(prob)

# Calculate the entropy after the first split
split_entropy = (left_samples / total_samples) * left_entropy + (right_samples / total_samples) * right_entropy

# Calculate the information gain
information_gain = parent_entropy - split_entropy

# Calculate the Gini index
parent_gini = 1
for count in parent_class_counts.values():
    prob = count / total_samples
    parent_gini -= prob ** 2

left_gini = 1
if left_class_counts:
    for count in left_class_counts.values():
        prob = count / left_samples
        left_gini -= prob ** 2
else:
    left_gini = 0

right_gini = 1
if right_class_counts:
    for count in right_class_counts.values():
        prob = count / right_samples
        right_gini -= prob ** 2
else:
    right_gini = 0

split_gini = (left_samples / total_samples) * left_gini + (right_samples / total_samples) * right_gini

# Calculate the misclassification error
parent_misclassification = 1 - max([count / total_samples for count in parent_class_counts.values()])
left_misclassification = 0
if left_class_counts:
    left_misclassification = 1 - max([count / left_samples for count in left_class_counts.values()])

right_misclassification = 0
if right_class_counts:
    right_misclassification = 1 - max([count / right_samples for count in right_class_counts.values()])

split_misclassification = (left_samples / total_samples) * left_misclassification + (right_samples / total_samples) * right_misclassification

print(f"Entropy of the first split: {split_entropy}")
print(f"Gini index of the first split: {split_gini}")
print(f"Misclassification error of the first split: {split_misclassification}")
print(f"Information gain: {information_gain}")
print(f"Feature selected for the first split: {X.columns[clf.tree_.feature[0]]}")
print(f"Value of the decision boundary: {clf.tree_.threshold[0]}")

Entropy of the first split: 0.0
Gini index of the first split: 0.0
Misclassification error of the first split: 0.0
Information gain: 0.9340026588217948
Feature selected for the first split: uniformity_of_cell_size
Value of the decision boundary: 2.5


In [16]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

# Load the dataset
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'
column_names = ['id', 'diagnosis'] + [f'feature_{i}' for i in range(30)]
df = pd.read_csv(url, names=column_names)

# Encode the categorical variable
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

# Split features and target variable
X = df.drop(['id', 'diagnosis'], axis=1)
y = df['diagnosis']

# Split the training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Data standardization
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Dimensionality reduction with PCA
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Use the first principal component
X_train_pca_1 = X_train_pca[:, :1]
X_test_pca_1 = X_test_pca[:, :1]

clf_pca_1 = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2)
clf_pca_1.fit(X_train_pca_1, y_train)
y_pred_pca_1 = clf_pca_1.predict(X_test_pca_1)

f1_pca_1 = f1_score(y_test, y_pred_pca_1)
precision_pca_1 = precision_score(y_test, y_pred_pca_1)
recall_pca_1 = recall_score(y_test, y_pred_pca_1)

# Use the first two principal components
X_train_pca_2 = X_train_pca[:, :2]
X_test_pca_2 = X_test_pca[:, :2]

clf_pca_2 = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2)
clf_pca_2.fit(X_train_pca_2, y_train)
y_pred_pca_2 = clf_pca_2.predict(X_test_pca_2)

f1_pca_2 = f1_score(y_test, y_pred_pca_2)
precision_pca_2 = precision_score(y_test, y_pred_pca_2)
recall_pca_2 = recall_score(y_test, y_pred_pca_2)

# Use the original data
clf_original = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2)
clf_original.fit(X_train_scaled, y_train)
y_pred_original = clf_original.predict(X_test_scaled)

f1_original = f1_score(y_test, y_pred_original)
precision_original = precision_score(y_test, y_pred_original)
recall_original = recall_score(y_test, y_pred_original)

# Confusion matrix
cm_pca_1 = confusion_matrix(y_test, y_pred_pca_1)
cm_pca_2 = confusion_matrix(y_test, y_pred_pca_2)
cm_original = confusion_matrix(y_test, y_pred_original)

# False positive rate and true positive rate
FP_pca_1 = cm_pca_1[0, 1]
TP_pca_1 = cm_pca_1[1, 1]
FPR_pca_1 = FP_pca_1 / (cm_pca_1[0, 0] + FP_pca_1)
TPR_pca_1 = TP_pca_1 / (cm_pca_1[1, 0] + TP_pca_1)

FP_pca_2 = cm_pca_2[0, 1]
TP_pca_2 = cm_pca_2[1, 1]
FPR_pca_2 = FP_pca_2 / (cm_pca_2[0, 0] + FP_pca_2)
TPR_pca_2 = TP_pca_2 / (cm_pca_2[1, 0] + TP_pca_2)

FP_original = cm_original[0, 1]
TP_original = cm_original[1, 1]
FPR_original = FP_original / (cm_original[0, 0] + FP_original)
TPR_original = TP_original / (cm_original[1, 0] + TP_original)

print("Using the first principal component:")
print(f"F1 score: {f1_pca_1}, Precision: {precision_pca_1}, Recall: {recall_pca_1}")
print(f"False positives: {FP_pca_1}, True positives: {TP_pca_1}, False positive rate: {FPR_pca_1}, True positive rate: {TPR_pca_1}")

print("Using the first two principal components:")
print(f"F1 score: {f1_pca_2}, Precision: {precision_pca_2}, Recall: {recall_pca_2}")
print(f"False positives: {FP_pca_2}, True positives: {TP_pca_2}, False positive rate: {FPR_pca_2}, True positive rate: {TPR_pca_2}")

print("Using the original data:")
print(f"F1 score: {f1_original}, Precision: {precision_original}, Recall: {recall_original}")
print(f"False positives: {FP_original}, True positives: {TP_original}, False positive rate: {FPR_original}, True positive rate: {TPR_original}")

# Discuss the benefits of using continuous data
print("The benefits of using continuous data may include retaining more information, thereby improving the model's performance. However, it may also increase computational complexity and the risk of overfitting. A trade-off needs to be made based on the specific situation.")    

Using the first principal component:
F1 score: 0.8992248062015504, Precision: 0.8787878787878788, Recall: 0.9206349206349206
False positives: 8, True positives: 58, False positive rate: 0.07407407407407407, True positive rate: 0.9206349206349206
Using the first two principal components:
F1 score: 0.8852459016393442, Precision: 0.9152542372881356, Recall: 0.8571428571428571
False positives: 5, True positives: 54, False positive rate: 0.046296296296296294, True positive rate: 0.8571428571428571
Using the original data:
F1 score: 0.9047619047619048, Precision: 0.9047619047619048, Recall: 0.9047619047619048
False positives: 6, True positives: 57, False positive rate: 0.05555555555555555, True positive rate: 0.9047619047619048
The benefits of using continuous data may include retaining more information, thereby improving the model's performance. However, it may also increase computational complexity and the risk of overfitting. A trade-off needs to be made based on the specific situation.
