# The Decision Tree on the Churn Dataset

In [None]:
import sys
import pandas as pd
from sklearn import preprocessing
from sklearn.tree import export_text, DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.model_selection import train_test_split

from IPython.display import display, HTML

sys.path.append("..")

## Select the churn file 

In [None]:
# Load the churn dataset; the CSV file uses ";" as its field separator.
inputFile = "../data/churn.csv"
df = pd.read_csv(inputFile, delimiter=";")
display(df)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly: wrapping it in print() emits a stray "None".
df.info()

## Data Preparation
### Scale numeric features and encode categorical features and labels as indices

In [None]:
# Separate the target column ("LEAVE") from the predictor columns.
df_features = df.drop("LEAVE", axis=1)
df_labels = df[["LEAVE"]].copy()
display(df_features)
display(df_labels)

# Partition the predictor columns by dtype. Both lists are derived from
# df_features (not df), so the label column can never end up in either of
# them, and every numeric dtype (not only int64) is routed to the scaler.
num_attributes = df_features.select_dtypes(include="number").columns.tolist()
cat_attributes = df_features.select_dtypes(exclude="number").columns.tolist()
print(num_attributes)
print(cat_attributes)

# Numeric columns are standardised and all remaining columns are mapped to
# ordinal codes; set_output makes the transformer return a DataFrame.
cat_encoder = OrdinalEncoder()
label_encoder = OrdinalEncoder()
transform_pipeline = ColumnTransformer(
    [
        ("num", StandardScaler(), num_attributes),
        ("cat", cat_encoder, cat_attributes),
    ]
).set_output(transform="pandas")

# NOTE: the transformer is fitted on all rows of df_features.
df_features_prepared = transform_pipeline.fit_transform(df_features)
display(df_features_prepared)

# The label gets its own encoder so that encoded values can be mapped back
# to the original label values with label_encoder.inverse_transform.
labels_prepared = label_encoder.fit_transform(df_labels)
df_labels_prepared = pd.DataFrame(
    labels_prepared,
    columns=["label"],
    index=df_labels.index,  # keep the original row index so later joins align
)
display(df_labels_prepared)

### Splitting the dataset into train and test sets

In [None]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_features_prepared,
    df_labels_prepared,
    test_size=0.4,
    random_state=1234,
)

# Show every partition: training/test features first, then training/test labels.
for split_part in (X_train, X_test, y_train, y_test):
    display(split_part)

## Build the decision tree model

In [None]:
# Entropy-based tree, limited to depth 4 and at least 12 samples per leaf.
# random_state is fixed because scikit-learn randomly permutes the features
# at every split; without a seed, equally good splits may be chosen
# differently from run to run and the fitted tree is not reproducible.
dtc = DecisionTreeClassifier(
    criterion="entropy",
    min_samples_leaf=12,
    max_depth=4,
    random_state=1234,
)
dtc_model = dtc.fit(X_train, y_train)

# Use the same column names in the plot and in the text dump so that the
# two representations of the tree can be compared directly.
feature_names = X_train.columns.values.tolist()
plot_tree(dtc_model, feature_names=feature_names)
text_tree = export_text(dtc_model, feature_names=feature_names)
print(text_tree)

## Do the prediction 

In [None]:
# Predicted (encoded) label for every row of the test set.
y_pred = dtc_model.predict(X_test)
# Class probability estimates for every row of the test set.
y_prob = dtc_model.predict_proba(X_test)

# A bare expression is only echoed when it is the last statement of a
# notebook cell, so both arrays are shown explicitly with display().
display(y_pred)
display(y_prob)

## Manual test

In [None]:
# Put the predicted and the true encoded label next to the test features.
result = X_test.assign(prediction=y_pred).join(y_test)

# Add the original (un-encoded) categorical columns and the original label;
# the joins align on the row index.
result_orig = result.join(df_features[cat_attributes], rsuffix='_ORIG')
result_orig = result_orig.join(df_labels)

# Decode the predicted codes back into the original label values.
result_orig["LEAVE_PRED"] = label_encoder.inverse_transform(result[['prediction']])
display(result_orig)

# Count the rows where the decoded prediction matches / misses the true label
# and report the share of misses.
is_hit = result_orig["LEAVE"] == result_orig["LEAVE_PRED"]
correct = int(is_hit.sum())
incorrect = int((~is_hit).sum())
print("Test Error = ", (1 / (correct + incorrect)) * incorrect)


## Evaluate / Test the Model 

In [None]:
# Compare the predictions with the true test labels; the test error is the
# complement of the accuracy.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
test_error = 1.0 - accuracy
print("Test Error = ", test_error)