# The Decision Tree on the Churn Dataset with Cross Validation

In [None]:
import sys
import pandas as pd
from IPython.display import display, HTML
from sklearn import preprocessing
from sklearn.tree import export_text, DecisionTreeClassifier
from sklearn.metrics import accuracy_score 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.model_selection import cross_validate, train_test_split, cross_val_score, GridSearchCV

sys.path.append("..")

## Read the churn file 

In [None]:
# Load the churn dataset; the file uses ";" as the field separator.
inputFile = "../data/churn.csv"
df = pd.read_csv(inputFile, delimiter=";")
display(df)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line.
df.info()

## Data Preparation
### Transform labels into index

In [None]:
# Separate the target column from the predictors.
df_features = df.drop("LEAVE", axis=1)  # drop label attribute from the features
df_labels = df[["LEAVE"]].copy()
display(df_features)
display(df_labels)

# Derive the column groups from the feature frame only, so the label never has
# to be removed by hand, and treat every numeric dtype (not just int64) as
# numeric so that e.g. float columns are scaled instead of ordinal-encoded.
num_attributes = df_features.select_dtypes(include="number").columns.tolist()
cat_attributes = df_features.select_dtypes(exclude="number").columns.tolist()
print(num_attributes)
print(cat_attributes)

# Numeric columns are standardized; all remaining columns are mapped to
# ordinal codes. Both transformers return pandas objects.
# NOTE(review): the transformers are fitted on the full dataset, i.e. before
# the train/test split done later in the notebook, so the scaling statistics
# also reflect the test rows -- confirm this is acceptable.
cat_encoder = OrdinalEncoder()
label_encoder = OrdinalEncoder().set_output(transform="pandas")
transform_pipeline = ColumnTransformer(
    [
        ("num", StandardScaler(), num_attributes),
        ("cat", cat_encoder, cat_attributes),
    ]
).set_output(transform="pandas")
df_features_prepared = transform_pipeline.fit_transform(df_features)
display(df_features_prepared)

# Encode the label column as well and expose it under the name "label".
df_labels_prepared = label_encoder.fit_transform(df_labels)
df_labels_prepared.rename(columns={"LEAVE": "label"}, inplace=True)
display(df_labels_prepared)

### Splitting the dataset into train and test sets

In [None]:
# Hold out 40% of the prepared rows as the test set; the seed is fixed so the
# same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df_features_prepared,
    df_labels_prepared,
    test_size=0.4,
    random_state=1234,
)
# Show the four partitions in the order they were produced.
for partition in (X_train, X_test, y_train, y_test):
    display(partition)

## Build the decision tree model

In [None]:
# Base tree used for the cross-validation below and as the estimator handed to
# the grid search. random_state is fixed because scikit-learn permutes the
# features at every split even with the default "best" splitter, so equally
# good splits could otherwise be resolved differently from run to run.
dt = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_leaf=12,
    max_depth=5,
    random_state=1234,
)

## Cross Validation 

In [None]:
# 5-fold cross-validation on the training data with cross_validate, which
# returns a dict; the per-fold accuracies are read from its "test_score" entry.
scores = cross_validate(dt, X_train, y_train, cv=5, scoring="accuracy")
print(scores)
fold_accuracy = scores["test_score"]
print("Test Error = %0.2f (std=%0.2f)" % (1.0 - fold_accuracy.mean(), fold_accuracy.std()))

# solution with cross_val_score
# Same evaluation, but the call returns the per-fold accuracies directly.
scores = cross_val_score(dt, X_train, y_train, cv=5, scoring="accuracy")
print(scores)
mean_error = 1.0 - scores.mean()
print("Test Error = %0.2f (std=%0.2f)" % (mean_error, scores.std()))

## Nested Cross Validation
### Build a hyperparameter grid

In [None]:
# Two sub-grids for the search:
#   1. entropy trees over every max_depth / min_samples_leaf combination;
#   2. the split criterion alone (entropy vs. gini); parameters not listed here
#      are not varied by this sub-grid.
param_grid = [
    {
        "criterion": ["entropy"],
        "max_depth": [5, 10],
        "min_samples_leaf": [5, 10, 15],
    },
    {
        "criterion": ["entropy", "gini"],
    },
]

### Hyperparameter search

In [None]:
# Exhaustive search over param_grid, scoring each candidate by 5-fold
# cross-validated accuracy on the training data; training-fold scores are
# recorded as well.
grid_search = GridSearchCV(
    estimator=dt,
    param_grid=param_grid,
    cv=5,
    scoring="accuracy",
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Report the winning parameter combination, the corresponding estimator and
# its mean cross-validated accuracy.
print(grid_search.best_params_)
print(grid_search.best_estimator_)
print(grid_search.best_score_)

### Results of the hyperparameter search

In [None]:
# Keep the complete record of the search (the fitted GridSearchCV's
# cv_results_ attribute) under its own name and dump it in raw form.
results = grid_search.cv_results_
print(results)

## Test the model 

In [None]:
# Predict the held-out test set through the fitted grid search object and
# report the misclassification rate (1 - accuracy).
y_pred = grid_search.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
test_error = 1.0 - accuracy
print("Test Error = ", test_error)

### Create the model again for verification

In [None]:
# Rebuild a fresh, unfitted tree with exactly the hyperparameters of the
# candidate the grid search selected. Hard-coding them here could silently
# diverge from the actual search result (e.g. when the criterion-only
# sub-grid wins and the base tree's min_samples_leaf/max_depth apply), which
# would make this verification compare two different models.
dt_opt = DecisionTreeClassifier(**grid_search.best_estimator_.get_params())
dt_opt.fit(X_train, y_train)

# Evaluate the rebuilt model on the same held-out test set.
y_pred_opt = dt_opt.predict(X_test)
accuracy = accuracy_score(y_test, y_pred_opt)
print("Test Error = ", (1.0 - accuracy))