In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import utils

In [None]:
# Build the working DataFrame through the project's utils helper.
# NOTE(review): which preprocessing steps are applied is defined in utils, not here.
df = utils.pre_process_df()
# Bare expression on the last line so the notebook renders the frame.
df

In [None]:
# Check column types and missing values.
# The dtype check is integer-only (is_integer_dtype), so float columns are
# reported as well; the list keeps the name `non_numeric_columns` so that any
# other cell referring to it keeps working.
non_numeric_columns = [
    (col, df[col].dtype)
    for col in df.columns
    if not pd.api.types.is_integer_dtype(df[col])
]
cols_w_missing_values = [col for col in df.columns if df[col].isnull().any()]

# Print result
if not non_numeric_columns:
    # Wording matches the actual check: only integer dtypes pass.
    print("All columns are int.\n")
else:
    print("There are columns that are not int:")
    for col, dtype in non_numeric_columns:
        print(f"Column: {col}, Type: {dtype}")

if not cols_w_missing_values:
    print("No columns have missing values.")
else:
    print("Columns with missing values:")
    for col in cols_w_missing_values:
        print(f"Column: {col}")

In [None]:
# Delegate the correlation visualisation of `df` to the project helper.
# NOTE(review): which columns/metric it plots is defined in utils — confirm there.
utils.visualize_correlation(df)

In [None]:
# Verify class imbalance of the Target.
# The visualisation itself is produced by the utils helper called below.

utils.visualize_class_imbalance(df)

As we can see, there is an imbalance in the classes. We will use SMOTE (Synthetic Minority Over-sampling Technique) to balance the classes.

In [None]:
# Delegate plotting of the feature distributions of `df` to the project helper.
utils.visualize_feature_distributions(df)

In [None]:
## DECISION TREE

# Fixed random_state keeps the tree reproducible across runs.
tree = DecisionTreeClassifier(random_state=42)

# Score the tree two ways through the project helpers: one holdout split
# (test_size=0.2) and cross-validation (cv_fold=10).
# NOTE(review): whether these helpers fit `tree` in place or work on a clone
# is not visible here; a later cell reuses `tree`, so confirm in utils.
accuracy = utils.holdout_accuracy(df, tree, test_size=0.2)
cv_score = utils.cross_validation_acc(df, tree, cv_fold=10)

# NOTE(review): the "%" suffix assumes the helpers return percentages rather
# than 0-1 fractions — confirm against utils.
print(f"Holdout Accuracy: {accuracy}%\nCross-Validation Accuracy: {cv_score}%")

In [None]:
# Hand the decision tree and the data to the project's complexity analysis helper.
# NOTE(review): presumably expects `tree` to be usable as-is (fitted or refit inside) — confirm in utils.
utils.analyze_tree_complexity(tree, df)

In [None]:
## RANDOM FOREST

# Fixed random_state keeps the forest reproducible across runs.
forest = RandomForestClassifier(random_state=42)

# Score the forest itself (not the decision tree from the earlier cell) with
# the same protocol: one holdout split and 10-fold cross-validation.
accuracy = utils.holdout_accuracy(df, forest, test_size=0.2)
cv_score = utils.cross_validation_acc(df, forest, cv_fold=10)

# NOTE(review): the "%" suffix assumes the helpers return percentages rather
# than 0-1 fractions — confirm against utils.
print(f"Holdout Accuracy: {accuracy}%\nCross-Validation Accuracy: {cv_score}%")

In [None]:
# Run the project's simplification-based XAI helper on the random forest.
# NOTE(review): if the helper expects a trained model, `forest` must already
# be fitted at this point — confirm in utils.
utils.apply_simplification_based_xai(forest,df)

In [None]:
# Run the project's feature-based XAI helper on the random forest.
utils.apply_feature_based_xai(forest,df)

In [None]:
# NOTE(review): this is the same call, with the same arguments, as the
# simplification-based XAI cell two cells earlier — confirm the repeat is
# intentional, or remove it / replace it with the intended analysis.
utils.apply_simplification_based_xai(forest,df)