In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tree dataset; the relative path is resolved against the
# notebook's current working directory.
tree_df = pd.read_csv(filepath_or_buffer="../data/tree_data.csv")

# Print a per-column summary (non-null counts and dtypes) as a quick sanity check.
tree_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 40 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   gender                             7043 non-null   int64  
 1   age                                7043 non-null   int64  
 2   married                            7043 non-null   int64  
 3   number_of_dependents               7043 non-null   int64  
 4   zip_code                           7043 non-null   int64  
 5   population                         7043 non-null   int64  
 6   tenure_in_months                   7043 non-null   int64  
 7   phone_service                      7043 non-null   int64  
 8   avg_monthly_long_distance_charges  7043 non-null   float64
 9   multiple_lines                     7043 non-null   int64  
 10  internet_service                   7043 non-null   int64  
 11  avg_monthly_gb_download            7043 non-null   int64

## Train-Test Split

In [3]:
from sklearn.model_selection import train_test_split

# The target is the `churn_label` column; every other column is a feature.
y = tree_df["churn_label"]
X = tree_df.drop(columns=["churn_label"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Compare class proportions side by side to confirm the stratification.
train_dist = y_train.value_counts(normalize=True).rename("Train Proportion")
test_dist = y_test.value_counts(normalize=True).rename("Test Proportion")
class_distribution = pd.concat(objs=[train_dist, test_dist], axis="columns")

print(class_distribution)

             Train Proportion  Test Proportion
churn_label                                   
0                    0.734647         0.734564
1                    0.265353         0.265436


## Decision Tree Classifier

In [6]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
)

# Fit a decision tree with default hyperparameters; the seed is fixed so
# repeated runs build the same tree.
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Hard class predictions for the held-out rows.
y_pred_dt = dt_model.predict(X_test)
# Predicted probability of the second class (label 1 here), used for ROC AUC.
y_prob_dt = dt_model.predict_proba(X_test)[:, 1]

In [9]:
# Confusion matrix: rows are the actual classes, columns the predicted ones.
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred_dt),
)

# Per-class precision / recall / F1 plus the overall averages.
print(
    "\nClassification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_dt),
)

# ROC AUC is computed from the predicted probabilities, not the hard labels.
auc = roc_auc_score(y_true=y_test, y_score=y_prob_dt)
print(f"ROC AUC Score: {auc:.2f}")

Confusion Matrix:
 [[998  37]
 [ 38 336]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.96      0.96      1035
           1       0.90      0.90      0.90       374

    accuracy                           0.95      1409
   macro avg       0.93      0.93      0.93      1409
weighted avg       0.95      0.95      0.95      1409

ROC AUC Score: 0.93


**Insights:**
- **Accuracy:** 0.95
- **Recall for label 0:** 0.96 (few false positives: only 37 of 1,035 non-churners were predicted as churners)
- **Recall for label 1:** 0.90 (few false negatives)
- **F1 score for label 1:** 0.90
- **ROC AUC Score:** 0.93 (model is able to separate the classes)