<a href="https://colab.research.google.com/github/sangjunlim-dev/ml-practicals/blob/main/decision_trees.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns

sns.set()
from matplotlib import pyplot as plt

%config InlineBackend.figure_format = 'retina'

In [3]:
DATA_URL = "https://raw.githubusercontent.com/Yorko/mlcourse.ai/main/data/"

df = pd.read_csv(DATA_URL + "telecom_churn.csv")
df

Unnamed: 0,State,Account length,Area code,International plan,Voice mail plan,Number vmail messages,Total day minutes,Total day calls,Total day charge,Total eve minutes,Total eve calls,Total eve charge,Total night minutes,Total night calls,Total night charge,Total intl minutes,Total intl calls,Total intl charge,Customer service calls,Churn
0,KS,128,415,No,Yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.70,1,False
1,OH,107,415,No,Yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.70,1,False
2,NJ,137,415,No,No,0,243.4,114,41.38,121.2,110,10.30,162.6,104,7.32,12.2,5,3.29,0,False
3,OH,84,408,Yes,No,0,299.4,71,50.90,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,False
4,OK,75,415,Yes,No,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,AZ,192,415,No,Yes,36,156.2,77,26.55,215.5,126,18.32,279.1,83,12.56,9.9,6,2.67,2,False
3329,WV,68,415,No,No,0,231.1,57,39.29,153.4,55,13.04,191.3,123,8.61,9.6,4,2.59,3,False
3330,RI,28,510,No,No,0,180.8,109,30.74,288.8,58,24.55,191.9,91,8.64,14.1,6,3.81,2,False
3331,CT,184,510,Yes,No,0,213.8,105,36.35,159.6,84,13.57,139.2,137,6.26,5.0,10,1.35,2,False


In [4]:
# Preprocessing
df['International plan'] = df['International plan'].map({'Yes': 1, 'No': 0})
df['Voice mail plan'] = df['Voice mail plan'].map({'Yes': 1, 'No': 0})

df['Churn'] = df['Churn'].astype('int64')

states = df.pop('State')

In [6]:
X, y = df.drop('Churn', axis=1), df['Churn']

In [29]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Splitting the data (X, y) into 70% train and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Initializing a decision tree classifier
tree = DecisionTreeClassifier()

# Training the model on the training data
tree.fit(X_train, y_train)

# Cross Validation
cv_scores = cross_val_score(tree, X_train, y_train, cv=5)

# Print cross-validation scores
print(f"Cross-validation scores on training data: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")


# Predicting on the test set
pred_test = tree.predict(X_test)

# Evaluating the accuracy of the model on the test data
print(f"Test Set Accuracy: {accuracy_score(y_test, pred_test)}")

Cross-validation scores on training data: [0.90792291 0.90578158 0.89507495 0.89055794 0.89270386]
Mean cross-validation score: 0.8984082491659852
Test Set Accuracy: 0.91


In [27]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

# Hyperparameters
params = {'max_depth': np.arange(2, 11), 'min_samples_leaf': np.arange(2, 11)}

# Initialize StratifiedKFold for cross-validation with 5 splits
skf = StratifiedKFold(n_splits=5, shuffle=True)

# Create a GridSearchCV object to tune the hyperparameters of the decision tree using cross-validation
gridsearch = GridSearchCV(estimator=tree,
                          param_grid=params,
                          cv=skf,
                          n_jobs=-1,
                          verbose=1)

# Fit the GridSearchCV object on the training data (X_train, y_train)
gridsearch.fit(X_train, y_train)

# Get the best estimator (model) found by the grid search
best_tree = gridsearch.best_estimator_

# Get the best score (mean cross-validated score of the best estimator) from the grid search
best_score = gridsearch.best_score_

# Print results
print(f"Best Model: {best_tree}")
print(f"Best Score: {best_score}")

# Cross Validation
cv_scores = cross_val_score(best_tree, X_train, y_train, cv=5)

# Print cross-validation scores
print(f"Cross-validation scores on training data: {cv_scores}")
print(f"Mean cross-validation score: {cv_scores.mean()}")


# Test the model on the test set
pred_test = best_tree.predict(X_test)

# Evaluate performance on the test set
test_accuracy = accuracy_score(y_test, pred_test)
print(f"Test Set Accuracy: {test_accuracy}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Model: DecisionTreeClassifier(max_depth=8, min_samples_leaf=4)
Best Score: 0.9348521748720258
Cross-validation scores on training data: [0.92077088 0.95074946 0.93147752 0.94635193 0.95493562]
Mean cross-validation score: 0.9408570824640892
Test Set Accuracy: 0.928
