In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt

In [7]:
def run_decision_tree_model(file_path, target_variable='county_count', missing_marker=-999):
    """Tune and evaluate a decision-tree regressor on a CSV dataset.

    The CSV is loaded, infinities are treated as missing values, rows whose
    target is missing are dropped, and the remaining numeric feature columns
    have their gaps filled with ``missing_marker``. The data is split 80/20
    into train/test sets, a 5-fold grid search (scored by negative MAE) picks
    the tree hyperparameters on the training part, and the best estimator is
    scored on the held-out test part. A short report is printed.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file to read.
    target_variable : str, default 'county_count'
        Name of the column to predict.
    missing_marker : scalar, default -999
        Sentinel written into missing *feature* cells. It is never written
        into the target column.

    Returns
    -------
    tuple
        ``(best_model, best_params, (mae, mse, r2))`` where ``best_model`` is
        the refitted best estimator from the grid search, ``best_params`` is
        the winning parameter dict, and the metrics are computed on the test
        split.

    Raises
    ------
    KeyError
        If ``target_variable`` is not a column of the CSV.
    ValueError
        If no row has a usable target value, or if there are no numeric
        feature columns to train on.
    """
    df = pd.read_csv(file_path)

    if target_variable not in df.columns:
        raise KeyError(
            f"Target column {target_variable!r} not found in {file_path}"
        )

    # Infinities are handled exactly like missing values from here on.
    df = df.replace([np.inf, -np.inf], np.nan)

    # A missing target must not be replaced by the sentinel: that would make
    # the model learn (and be scored against) a fabricated label. Such rows
    # carry no supervision signal, so they are removed instead.
    n_rows_loaded = len(df)
    df = df.dropna(subset=[target_variable])
    n_rows_dropped = n_rows_loaded - len(df)
    if n_rows_dropped:
        print(
            f"Dropped {n_rows_dropped} row(s) with missing '{target_variable}' "
            f"from {file_path}"
        )
    if df.empty:
        raise ValueError(
            f"No rows with a valid {target_variable!r} value in {file_path}"
        )

    y = df[target_variable]

    # Only numeric predictors are usable by the tree; the sentinel is applied
    # to those feature columns alone.
    X = (
        df.drop(columns=[target_variable])
        .select_dtypes(include=[np.number])
        .fillna(missing_marker)
    )
    if X.shape[1] == 0:
        raise ValueError(f"No numeric feature columns found in {file_path}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    param_grid = {
        'max_depth': [None, 5, 10, 15, 20],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Shuffled folds with a fixed seed keep the search reproducible.
    cv = KFold(n_splits=5, shuffle=True, random_state=42)

    grid_search = GridSearchCV(
        DecisionTreeRegressor(random_state=42),
        param_grid,
        cv=cv,
        scoring='neg_mean_absolute_error',
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # Final, unbiased check on data the search never saw.
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"File: {file_path}")
    print("Best hyperparameters:", grid_search.best_params_)
    print("Test set performance:")
    print(f"Mean Absolute Error (MAE): {mae:.4f}")
    print(f"Mean Squared Error (MSE): {mse:.4f}")
    print(f"R^2 Score: {r2:.4f}")

    return best_model, grid_search.best_params_, (mae, mse, r2)


In [9]:
# CSV inputs, one per cancer type; both are modelled against the same target column.
bladder_file, lung_file = "bladder_no_geo.csv", "lung_no_geo.csv"

# Bladder run: keep the fitted tree, its chosen hyperparameters and the
# (mae, mse, r2) test metrics for use further down the notebook.
print("Bladder Cancer Model")
(bladder_model,
 bladder_params,
 bladder_metrics) = run_decision_tree_model(bladder_file, target_variable='county_count')

# Lung run: same pipeline, different input file.
print("\nLung Cancer Model")
(lung_model,
 lung_params,
 lung_metrics) = run_decision_tree_model(lung_file, target_variable='county_count')

Bladder Cancer Model
File: bladder_no_geo.csv
Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5}
Test set performance:
Mean Absolute Error (MAE): 0.1068
Mean Squared Error (MSE): 0.2514
R^2 Score: 0.8767

Lung Cancer Model
File: lung_no_geo.csv
Best hyperparameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
Test set performance:
Mean Absolute Error (MAE): 0.2913
Mean Squared Error (MSE): 1.4438
R^2 Score: 0.9119
