In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.preprocessing import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import cross_validate
import matplotlib.pyplot as plt

# Load the train/test splits; both CSV files are read from the working directory.
train_data = pd.read_csv("train_final.csv")
test_data = pd.read_csv("test_final.csv")

# Model input columns, grouped by the preprocessing each group receives.
numeric_features = [
    'age',
    'fnlwgt',
    'education.num',
    'capital.gain',
    'capital.loss',
    'hours.per.week',
]
categorical_features = [
    'workclass',
    'education',
    'marital.status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native.country',
]

# Handle missing values
def handle_missing_values(train_data, test_data,
                          features=('workclass', 'occupation', 'native.country'),
                          missing_marker='?'):
    """Replace missing-value markers with the most frequent training value.

    For every column in ``features`` the mode is computed from the rows of
    ``train_data`` whose value is not ``missing_marker``; that value then
    replaces the marker in copies of both frames. Only training data is used
    to pick the fill value, so the test frame is imputed with the same
    value as the training frame.

    Args:
        train_data: DataFrame the fill values are derived from.
        test_data: DataFrame imputed with the training fill values.
        features: Column names to impute. Defaults to the three columns the
            original implementation handled.
        missing_marker: Value that denotes a missing entry.

    Returns:
        A ``(train_cleaned, test_cleaned)`` tuple of new DataFrames; the
        inputs are not modified.

    Raises:
        KeyError: If a name in ``features`` is not a column of either frame.
    """
    train_cleaned = train_data.copy()
    test_cleaned = test_data.copy()

    for feature in features:
        observed = train_data.loc[train_data[feature] != missing_marker, feature]
        modes = observed.mode()
        if modes.empty:
            # Every training value is missing: there is nothing to impute
            # with, so leave the markers in place instead of crashing.
            continue
        mode_value = modes.iloc[0]
        train_cleaned[feature] = train_cleaned[feature].replace(missing_marker, mode_value)
        test_cleaned[feature] = test_cleaned[feature].replace(missing_marker, mode_value)

    return train_cleaned, test_cleaned

# Impute missing markers in both splits before any modelling step.
train_data_cleaned, test_data_cleaned = handle_missing_values(train_data, test_data)

# One transformer per column type: scale numerics, one-hot encode categoricals.
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(
    drop='first',
    sparse_output=False,
    handle_unknown='ignore',
)

# Route each feature group to its transformer.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full model: preprocessing followed by a size-limited decision tree
# (CART - Classification and Regression Trees).
dt_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(
        criterion='gini',          # split quality measured by Gini impurity
        max_depth=5,               # shallow tree to limit overfitting
        min_samples_split=50,      # a node needs this many samples to be split
        min_samples_leaf=20,       # every leaf keeps at least this many samples
        random_state=42,           # fixed seed for reproducible trees
    )),
])

# Encode the target column as integer class labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data_cleaned['income>50K'])

# Metrics evaluated on each cross-validation fold (result key -> scorer name).
scoring = {name: name for name in ('accuracy', 'precision', 'recall', 'f1')}

# 5-fold cross-validation of the whole pipeline on the training features.
cv_results = cross_validate(
    dt_model,
    train_data_cleaned.drop(columns='income>50K'),
    y_train,
    cv=5,
    scoring=scoring,
)

# Report the mean of every metric with a +/- two-standard-deviation band.
print("\nDecision Tree Classification Results:")
print("=====================================")
print("Model Type: CART (Classification and Regression Trees)")
print("Criterion: Gini impurity")
print("\nCross-validation metrics:")
for metric in scoring:
    scores = cv_results[f'test_{metric}']
    print(f"{metric.capitalize()}:")
    print(f"  Mean: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Refit the pipeline on every labelled training row.
dt_model.fit(train_data_cleaned.drop(columns='income>50K'), y_train)

# Keep the row identifiers aside, then predict on the remaining test columns.
test_ids = test_data_cleaned['ID'].copy()
test_predictions = dt_model.predict(test_data_cleaned.drop(columns='ID'))

# Submission table: one row per test ID with the prediction mapped back to
# the original label values.
submission = pd.DataFrame({
    'ID': test_ids,
    'income>50K': label_encoder.inverse_transform(test_predictions),
})

# Feature importance analysis
def get_feature_importance(pipeline, feature_names):
    """Return the classifier's feature importances, most important first.

    The feature names are rebuilt from the fitted pipeline itself rather than
    from module-level lists, so the result always matches the pipeline that
    is passed in: the column list registered for the ``'num'`` transformer is
    used as-is, followed by the encoded names the ``'cat'`` transformer
    reports for its own columns.

    Args:
        pipeline: Fitted pipeline exposing ``named_steps['preprocessor']``
            (with ``transformers_`` and ``named_transformers_``; entries named
            ``'num'`` and ``'cat'`` are expected, in that output order) and
            ``named_steps['classifier']`` (with ``feature_importances_``).
        feature_names: Unused; kept so existing callers keep working.

    Returns:
        DataFrame with columns ``'feature'`` and ``'importance'``, sorted by
        ``'importance'`` in descending order.

    Raises:
        KeyError: If the preprocessor has no ``'num'`` or ``'cat'`` entry.
    """
    preprocessor = pipeline.named_steps['preprocessor']

    # Recover the input columns each transformer was fitted on.
    columns_by_step = {
        name: columns for name, _, columns in preprocessor.transformers_
    }
    numeric_columns = list(columns_by_step['num'])
    categorical_columns = list(columns_by_step['cat'])

    # The encoder expands each categorical column into one name per category.
    encoded_columns = preprocessor.named_transformers_['cat'] \
        .get_feature_names_out(categorical_columns)
    all_features = numeric_columns + list(encoded_columns)

    importances = pipeline.named_steps['classifier'].feature_importances_

    feature_importance = pd.DataFrame({
        'feature': all_features,
        'importance': importances
    })

    return feature_importance.sort_values('importance', ascending=False)

# Show the ten features the fitted tree relies on most.
print("\nTop 10 Most Important Features:")
feature_importance = get_feature_importance(
    dt_model,
    [*numeric_features, *categorical_features],
)
print(feature_importance.head(10))

# Write the submission table to disk without the index column.
submission.to_csv('decision_tree_predictions.csv', index=False)

# Echo the tree hyper-parameters actually held by the fitted classifier.
print("\nDecision Tree Parameters:")
tree_params = dt_model.named_steps['classifier'].get_params()
print(f"Maximum depth: {tree_params['max_depth']}")
print(f"Minimum samples split: {tree_params['min_samples_split']}")
print(f"Minimum samples leaf: {tree_params['min_samples_leaf']}")
print(f"Criterion: {tree_params['criterion']}")


Decision Tree Classification Results:
Model Type: CART (Classification and Regression Trees)
Criterion: Gini impurity

Cross-validation metrics:
Accuracy:
  Mean: 0.853 (+/- 0.009)
Precision:
  Mean: 0.779 (+/- 0.036)
Recall:
  Mean: 0.543 (+/- 0.043)
F1:
  Mean: 0.639 (+/- 0.028)

Top 10 Most Important Features:
                              feature  importance
29  marital.status_Married-civ-spouse    0.454778
2                       education.num    0.234920
3                        capital.gain    0.226094
4                        capital.loss    0.054662
5                      hours.per.week    0.018416
0                                 age    0.005586
24                  education_Masters    0.005311
56                           sex_Male    0.000210
93       native.country_United-States    0.000023
9              workclass_Self-emp-inc    0.000000

Decision Tree Parameters:
Maximum depth: 5
Minimum samples split: 50
Minimum samples leaf: 20
Criterion: gini


