# Install and Import Libraries

In [None]:
!pip install ucimlrepo --quiet

In [None]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from ucimlrepo import fetch_ucirepo

# Set Random Seed

Make sure that you pass `RANDOM_SEED` as the `random_state` argument on every single call to the `train_test_split` function.

In [None]:
# Seed to pass as `random_state` to every `train_test_split` call in this notebook
RANDOM_SEED = 123_456_789

# Fetch and One-Hot Encode Datasets

In [None]:
# Pull the Bank Marketing dataset (UCI repository id 222)
bank_marketing = fetch_ucirepo(id=222)

# Targets and features as exposed by the fetched dataset object, plus a
# dummy-encoded copy of the features built with pd.get_dummies
bank_marketing_labels = bank_marketing.data.targets
bank_marketing_features = bank_marketing.data.features
bank_marketing_features_onehot = pd.get_dummies(bank_marketing_features)

# Show the dataset metadata first, then the variable information
print(bank_marketing.metadata, bank_marketing.variables, sep="\n")

{'uci_id': 222, 'name': 'Bank Marketing', 'repository_url': 'https://archive.ics.uci.edu/dataset/222/bank+marketing', 'data_url': 'https://archive.ics.uci.edu/static/public/222/data.csv', 'abstract': 'The data is related with direct marketing campaigns (phone calls) of a Portuguese banking institution. The classification goal is to predict if the client will subscribe a term deposit (variable y).', 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 45211, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Occupation', 'Marital Status', 'Education Level'], 'target_col': ['y'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2014, 'last_updated': 'Fri Aug 18 2023', 'dataset_doi': '10.24432/C5K306', 'creators': ['S. Moro', 'P. Rita', 'P. Cortez'], 'intro_paper': {'ID': 277, 'type': 'NATIVE', 'title': 'A data-driven approach to predict the s

In [None]:
# Pull the Default of Credit Card Clients dataset (UCI repository id 350)
cc_default = fetch_ucirepo(id=350)

# Targets and features as exposed by the fetched dataset object
cc_default_labels = cc_default.data.targets
cc_default_features = cc_default.data.features

# Dummy-encode only the explicitly listed columns; all other columns are
# passed through unchanged by pd.get_dummies
cc_default_features_onehot = pd.get_dummies(
    cc_default_features,
    columns=['X2', 'X3', 'X4', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11'],
)

# Show the dataset metadata first, then the variable information
print(cc_default.metadata, cc_default.variables, sep="\n")

{'uci_id': 350, 'name': 'Default of Credit Card Clients', 'repository_url': 'https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients', 'data_url': 'https://archive.ics.uci.edu/static/public/350/data.csv', 'abstract': "This research aimed at the case of customers' default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods.", 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 30000, 'num_features': 23, 'feature_types': ['Integer', 'Real'], 'demographics': ['Sex', 'Education Level', 'Marital Status', 'Age'], 'target_col': ['Y'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Fri Mar 29 2024', 'dataset_doi': '10.24432/C55S3H', 'creators': ['I-Cheng Yeh'], 'intro_paper': {'ID': 365, 'type': 'NATIVE', 'title': 'The comparisons of data mining techniques for the predictive accuracy of 

# Q1

## Construct Default Decision Tree for Bank Marketing

In [None]:
# Q1: fit a decision tree with default hyper-parameters on the one-hot encoded
# bank marketing data and report accuracy on both the training and test sets.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 80/20 train/test split. The notebook-wide RANDOM_SEED (instead of an ad-hoc
# literal) is passed as random_state, as the notebook requires for every
# train_test_split call.
(
    bank_marketing_features_train,
    bank_marketing_features_test,
    bank_marketing_labels_train,
    bank_marketing_labels_test,
) = train_test_split(
    bank_marketing_features_onehot,  # one-hot encoded features
    bank_marketing_labels,           # target labels
    test_size=0.2,                   # 20% held out for testing
    random_state=RANDOM_SEED,        # shared seed for reproducibility
)

# Default hyper-parameters; the seed keeps the fitted tree reproducible across runs
bank_marketing_dt_default = DecisionTreeClassifier(random_state=RANDOM_SEED)
bank_marketing_dt_default.fit(bank_marketing_features_train, bank_marketing_labels_train)

# Predict on the data the tree was fitted on and on the held-out data
bank_marketing_train_predictions = bank_marketing_dt_default.predict(bank_marketing_features_train)
bank_marketing_test_predictions = bank_marketing_dt_default.predict(bank_marketing_features_test)

# Accuracy on each split
train_accuracy = accuracy_score(bank_marketing_labels_train, bank_marketing_train_predictions)
test_accuracy = accuracy_score(bank_marketing_labels_test, bank_marketing_test_predictions)

# Report both accuracies as percentages
print(f"Training Accuracy: {train_accuracy*100:.2f}%")
print(f"Testing Accuracy: {test_accuracy*100:.2f}%")


Training Accuracy: 100.00%
Testing Accuracy: 87.67%


# Q2

In [None]:
# Q2: compare training vs. testing accuracy on the bank marketing data for
# decision trees limited to a range of maximum depths.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 80/20 train/test split. The notebook-wide RANDOM_SEED (instead of an ad-hoc
# literal) is passed as random_state, as the notebook requires for every
# train_test_split call.
(
    bank_marketing_features_train,
    bank_marketing_features_test,
    bank_marketing_labels_train,
    bank_marketing_labels_test,
) = train_test_split(
    bank_marketing_features_onehot,  # one-hot encoded features
    bank_marketing_labels,           # target labels
    test_size=0.2,                   # 20% held out for testing
    random_state=RANDOM_SEED,        # shared seed for reproducibility
)

# Depth limits to compare
max_depths = [1, 2, 3, 5, 7, 10, 15, 20]

# One record per depth; converted to a DataFrame once, after the loop
results = []

for depth in max_depths:
    # Fit a tree capped at the current depth
    dt = DecisionTreeClassifier(max_depth=depth, random_state=RANDOM_SEED)
    dt.fit(bank_marketing_features_train, bank_marketing_labels_train)

    # Predict on the fitted data and on the held-out data
    train_predictions = dt.predict(bank_marketing_features_train)
    test_predictions = dt.predict(bank_marketing_features_test)

    # Accuracy on each split
    train_accuracy = accuracy_score(bank_marketing_labels_train, train_predictions)
    test_accuracy = accuracy_score(bank_marketing_labels_test, test_predictions)

    results.append({"Depth": depth, "Training Accuracy": train_accuracy, "Testing Accuracy": test_accuracy})

# Tabulate the sweep
results_df = pd.DataFrame(results)
print(results_df)

# Optional: persist the table next to the notebook
results_df.to_csv("decision_tree_depth_analysis.csv", index=False)


   Depth  Training Accuracy  Testing Accuracy
0      1           0.883931          0.879354
1      2           0.896925          0.893177
2      3           0.902372          0.896495
3      5           0.906741          0.897158
4      7           0.912077          0.897822
5     10           0.925265          0.897269
6     15           0.958665          0.894062
7     20           0.985291          0.884552


## Finish Q2 like Q1, but limiting maximum depth

In [None]:
# Q2 (continued): depth-limited decision trees on the bank marketing data,
# reporting training and testing accuracy for each maximum depth.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 80/20 train/test split. The notebook-wide RANDOM_SEED (instead of an ad-hoc
# literal) is passed as random_state, as the notebook requires for every
# train_test_split call.
(
    bank_marketing_features_train,
    bank_marketing_features_test,
    bank_marketing_labels_train,
    bank_marketing_labels_test,
) = train_test_split(
    bank_marketing_features_onehot,  # one-hot encoded features
    bank_marketing_labels,           # target labels
    test_size=0.2,                   # 80% train, 20% test
    random_state=RANDOM_SEED,        # shared seed for reproducibility
)

# Depth limits to evaluate
max_depths = [1, 2, 3, 5, 7, 10, 15, 20]

# One record per depth; converted to a DataFrame once, after the loop
results = []

for depth in max_depths:
    # Fit a tree capped at the current depth
    dt = DecisionTreeClassifier(max_depth=depth, random_state=RANDOM_SEED)
    dt.fit(bank_marketing_features_train, bank_marketing_labels_train)

    # Accuracy on the fitted data and on the held-out data
    train_accuracy = accuracy_score(bank_marketing_labels_train, dt.predict(bank_marketing_features_train))
    test_accuracy = accuracy_score(bank_marketing_labels_test, dt.predict(bank_marketing_features_test))

    results.append({"Depth": depth, "Training Accuracy": train_accuracy, "Testing Accuracy": test_accuracy})

# Tabulate the sweep
results_df = pd.DataFrame(results)
print(results_df)

# Optional: persist the table next to the notebook
results_df.to_csv("decision_tree_max_depth_analysis.csv", index=False)


   Depth  Training Accuracy  Testing Accuracy
0      1           0.883931          0.879354
1      2           0.896925          0.893177
2      3           0.902372          0.896495
3      5           0.906741          0.897158
4      7           0.912077          0.897822
5     10           0.925265          0.897269
6     15           0.958665          0.894062
7     20           0.985291          0.884552


# Q3

## Construct Default Decision Tree for Credit Card Default Dataset

In [None]:
# Q3: fit a decision tree with default hyper-parameters on the encoded credit
# card default data and report accuracy on both the training and test sets.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 80/20 train/test split. The notebook-wide RANDOM_SEED (instead of an ad-hoc
# literal) is passed as random_state, as the notebook requires for every
# train_test_split call.
(
    cc_default_features_train,
    cc_default_features_test,
    cc_default_labels_train,
    cc_default_labels_test,
) = train_test_split(
    cc_default_features_onehot,  # one-hot encoded features
    cc_default_labels,           # target labels
    test_size=0.2,               # 20% held out for testing
    random_state=RANDOM_SEED,    # shared seed for reproducibility
)

# Default hyper-parameters; the seed keeps the fitted tree reproducible across runs
dt_default = DecisionTreeClassifier(random_state=RANDOM_SEED)
dt_default.fit(cc_default_features_train, cc_default_labels_train)

# Predict on the data the tree was fitted on and on the held-out data
train_predictions = dt_default.predict(cc_default_features_train)
test_predictions = dt_default.predict(cc_default_features_test)

# Accuracy on each split
train_accuracy = accuracy_score(cc_default_labels_train, train_predictions)
test_accuracy = accuracy_score(cc_default_labels_test, test_predictions)

# Report both accuracies as percentages
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")


Training Accuracy: 99.95%
Testing Accuracy: 72.57%


# Q4

In [None]:
# Q4: compare training vs. testing accuracy on the credit card default data
# for decision trees limited to a range of maximum depths.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# 80/20 train/test split. The notebook-wide RANDOM_SEED (instead of an ad-hoc
# literal) is passed as random_state, as the notebook requires for every
# train_test_split call.
(
    cc_default_features_train,
    cc_default_features_test,
    cc_default_labels_train,
    cc_default_labels_test,
) = train_test_split(
    cc_default_features_onehot,  # one-hot encoded features
    cc_default_labels,           # target labels
    test_size=0.2,               # 20% held out for testing
    random_state=RANDOM_SEED,    # shared seed for reproducibility
)

# Depth limits to evaluate
max_depths = [1, 2, 3, 5, 7, 10, 15, 20]

# One record per depth; converted to a DataFrame once, after the loop
results = []

for depth in max_depths:
    # Fit a tree capped at the current depth
    dt = DecisionTreeClassifier(max_depth=depth, random_state=RANDOM_SEED)
    dt.fit(cc_default_features_train, cc_default_labels_train)

    # Predict on the fitted data and on the held-out data
    train_predictions = dt.predict(cc_default_features_train)
    test_predictions = dt.predict(cc_default_features_test)

    # Accuracy on each split
    train_accuracy = accuracy_score(cc_default_labels_train, train_predictions)
    test_accuracy = accuracy_score(cc_default_labels_test, test_predictions)

    results.append({
        "Depth": depth,
        "Training Accuracy": train_accuracy,
        "Testing Accuracy": test_accuracy
    })

# Tabulate the sweep
results_df = pd.DataFrame(results)
print(results_df)

# Optional: persist the table. A dataset-specific file name is used so this
# does not overwrite "decision_tree_depth_analysis.csv", which the bank
# marketing depth analysis writes.
results_df.to_csv("cc_default_decision_tree_depth_analysis.csv", index=False)


   Depth  Training Accuracy  Testing Accuracy
0      1           0.812125          0.815667
1      2           0.812958          0.816333
2      3           0.818750          0.818833
3      5           0.823625          0.820500
4      7           0.829375          0.820000
5     10           0.848042          0.809500
6     15           0.892667          0.795667
7     20           0.937417          0.768667


## Finish Q4 like Q3, but limiting maximum depth

# Question 5

### Similarities Between Logistic Regression and Classification Tree

Both are supervised machine learning algorithms that can be used for classification.

### Differences Between Logistic Regression and Classification Tree

1. **Model Type:**
   - **Logistic Regression**: Assumes a linear relationship between the features and the log-odds of the outcome.
   - **Classification Tree**: Non-linear model that splits data based on feature values.

2. **Interpretability:**
   - **Logistic Regression**: Coefficients represent feature influence, but harder to visualize.
   - **Classification Tree**: Easy to interpret with a tree structure, showing decision-making steps.

3. **Assumptions:**
   - **Logistic Regression**: Assumes linearity (between the features and the log-odds) and independent observations.
   - **Classification Tree**: Makes no assumptions about data distribution.

4. **Feature Handling:**
   - **Logistic Regression**: Works best with numerical features (needs encoding for categorical).
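   - **Classification Tree**: Can, in principle, handle both numerical and categorical features directly (although scikit-learn's implementation still requires categorical features to be encoded, as was done with one-hot encoding above).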

5. **Overfitting:**
   - **Logistic Regression**: Less prone to overfitting if regularized.
   - **Classification Tree**: More prone to overfitting, especially without pruning.

### When Would They Perform Similarly?

- **Scenario:** **Linearly Separable Data** (e.g., a clear boundary separating classes in feature space).
  - Both algorithms would perform similarly, as logistic regression would fit a linear decision boundary and a classification tree could approximate that same boundary with a small number of threshold splits.

### When Would They Perform Differently?

- **Scenario:** **Non-Linearly Separable Data** (e.g., classes are intertwined or have complex patterns).
  - **Logistic Regression**: Struggles with non-linear separability and performs poorly.
  - **Classification Tree**: Performs well by making complex splits to capture non-linear relationships.

### Summary

- **Similar Performance:** When the data is linearly separable.
- **Different Performance:** In complex, non-linear datasets, classification trees tend to outperform logistic regression due to their flexibility.

# Question 6

I noticed that my coworker's decision tree is overfitting because the depth is set too high (10). With a small dataset (~500 data points), a deep tree becomes too complex and memorizes specific details and noise from the training data. This results in near-perfect training accuracy (~100%) but poor testing accuracy (~60%) because the model struggles to generalize. In small datasets, the model becomes overly specific to the training set, capturing noise instead of general patterns. To fix this, I would recommend reducing the tree’s depth or using pruning techniques to prevent overfitting and improve generalization.

# Question 7

No, the logistic regression and decision tree do not match. Logistic regression uses a continuous, linear relationship between the features (weight, height, and age) and retention, as shown by the equation. In contrast, the decision tree creates discrete splits based on thresholds (e.g., weight > 200 or age > 40). This results in a non-linear, piecewise decision-making process. While both predict retention, the logistic regression model gives a smooth, continuous prediction, whereas the decision tree makes predictions based on specific feature thresholds.