# Decision Tree

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

import env
import acquire
import prepare

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Shared random seed; passed as random_state to every DecisionTreeClassifier
# in this notebook so that repeated fits give the same tree.
SEED = 21

In [4]:
# Acquire the Titanic data, run the project's prep step, and drop the raw
# 'sex' and 'embark_town' columns. Their dummy-encoded counterparts
# (sex_*, embark_town_*) remain in the frame for modelling.
df = (
    prepare.prep_titanic(acquire.get_titanic_data())
    .drop(columns=['sex', 'embark_town'])
)
df.head()

Unnamed: 0,survived,pclass,sibsp,parch,fare,alone,sex_female,sex_male,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton
0,0,3,1,0,7.25,0,0,1,0,0,1
1,1,1,1,0,71.2833,0,1,0,1,0,0
2,1,3,0,0,7.925,1,1,0,0,0,1
3,1,1,1,0,53.1,0,1,0,0,0,1
4,0,3,0,0,8.05,1,0,1,0,0,1


### What is your baseline prediction? What is your baseline accuracy? 

In [5]:
# Class counts of the target; the most frequent class is the baseline prediction.
df.survived.value_counts()

0    549
1    340
Name: survived, dtype: int64

In [6]:
# survived is coded 0/1, so its mean is the share of 1s; one minus that is the
# share of 0s, i.e. the accuracy of always predicting 0 (the majority class).
1 - df.survived.mean()

0.6175478065241844

The baseline prediction is 0 (did not survive), which is correct about 62% of the time.

### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [7]:
# Three-way split into train / validate / test via the project helper.
# NOTE(review): SEED is not passed here — confirm that split_data fixes its
# own random state, otherwise the split (and every score below) can change
# between runs.
train, val, test = prepare.split_data(df, target='survived')
tuple(part.shape for part in (train, val, test))

((711, 11), (124, 11), (54, 11))

In [8]:
# For each partition, separate the feature matrix (X_*) from the target
# vector (y_*), which is the 'survived' column.
X_train, y_train = train.drop(columns='survived'), train['survived']
X_val, y_val = val.drop(columns='survived'), val['survived']
X_test, y_test = test.drop(columns='survived'), test['survived']

In [9]:
# Shallow tree: at most 3 levels of splits. random_state=SEED makes the fit repeatable.
clf = DecisionTreeClassifier(max_depth=3, random_state=SEED)

In [10]:
# Fit on the training split only; fit() returns the fitted estimator, which is rebound to clf.
clf = clf.fit(X_train, y_train)

### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

On the training sample, this model predicts 'survived' with an accuracy of 81%, compared to a baseline of 62%.

In [11]:
# Mean accuracy on the training split (in-sample).
clf.score(X_train, y_train)

0.8059071729957806

In [12]:
# In-sample predictions: the model predicts the same rows it was fit on.
y_pred = clf.predict(X_train)

In [13]:
# Rows are actual classes and columns are predicted classes, in label order (0, 1).
confusion_matrix(y_train, y_pred)

array([[393,  46],
       [ 92, 180]])

In [14]:
# Per-class precision / recall / F1 on the training split. The report is a
# plain string, so print() is what renders it as an aligned table.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       439
           1       0.80      0.66      0.72       272

    accuracy                           0.81       711
   macro avg       0.80      0.78      0.79       711
weighted avg       0.81      0.81      0.80       711



In [15]:
# Mean accuracy on the validate split (out-of-sample), to compare against the training score.
clf.score(X_val, y_val)

0.8548387096774194

In [16]:
# NOTE: y_pred is rebound here; from this point it holds validate predictions,
# not the training predictions computed earlier.
y_pred = clf.predict(X_val)

In [17]:
# Per-class precision / recall / F1 on the validate split.
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.92      0.89        77
           1       0.85      0.74      0.80        47

    accuracy                           0.85       124
   macro avg       0.85      0.83      0.84       124
weighted avg       0.85      0.85      0.85       124



## Repeat with a different Decision Tree Depth

In [18]:
# Same classifier and seed, but allowed to grow deeper (max_depth=6 instead of 3).
# This replaces the previous model bound to clf.
clf = DecisionTreeClassifier(max_depth=6,random_state=SEED)

In [19]:
# Fit the deeper tree on the same training split.
clf = clf.fit(X_train,y_train)

In [20]:
# Mean accuracy of the deeper tree on the training split (in-sample).
clf.score(X_train, y_train)

0.8466947960618847

In [21]:
# In-sample predictions from the deeper tree; y_pred is rebound again.
y_pred = clf.predict(X_train)

In [22]:
# Rows are actual classes and columns are predicted classes, in label order (0, 1).
confusion_matrix(y_train, y_pred)

array([[414,  25],
       [ 84, 188]])

In [23]:
# Per-class precision / recall / F1 of the deeper tree on the training split.
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.94      0.88       439
           1       0.88      0.69      0.78       272

    accuracy                           0.85       711
   macro avg       0.86      0.82      0.83       711
weighted avg       0.85      0.85      0.84       711



In-sample accuracy is higher with a decision tree depth of 6: 85% on the training sample, up from 81% at a depth of 3.

In [24]:
# Out-of-sample predictions from the deeper tree on the validate split.
y_pred = clf.predict(X_val)

In [25]:
# Per-class precision / recall / F1 of the deeper tree on the validate split.
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.82      0.95      0.88        77
           1       0.89      0.66      0.76        47

    accuracy                           0.84       124
   macro avg       0.85      0.80      0.82       124
weighted avg       0.85      0.84      0.83       124



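A depth of 6 still performs well on our validate set, with an accuracy of 84%. That is slightly below the 85% the depth-3 tree reached on the same set, so the extra depth improves the in-sample fit without improving out-of-sample performance.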

# Work with Telco 

In [36]:
# Acquire the Telco data and run the project's prep step in one expression.
# NOTE: this rebinds df, which until now held the Titanic frame.
df = prepare.prep_telco(acquire.get_telco_data())
df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,...,contract_type_Month-to-month,contract_type_One year,contract_type_Two year,internet_service_type_DSL,internet_service_type_Fiber optic,internet_service_type_None,payment_type_Bank transfer (automatic),payment_type_Credit card (automatic),payment_type_Electronic check,payment_type_Mailed check
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,...,0,1,0,1,0,0,0,0,0,1
1,Male,0,No,No,9,Yes,Yes,No,No,No,...,1,0,0,1,0,0,0,0,0,1
2,Male,0,No,No,4,Yes,No,No,No,Yes,...,1,0,0,0,1,0,0,0,1,0
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,...,1,0,0,0,1,0,0,0,1,0
4,Female,1,Yes,No,3,Yes,No,No,No,No,...,1,0,0,0,1,0,0,0,0,1


Remove columns of dtype 'object'

In [37]:
# Collect every object-dtype column except the target 'churn' and drop them,
# leaving only numeric / dummy-encoded features plus the target.
col_drop = [col for col in df.select_dtypes('object').columns if col != 'churn']
df = df.drop(columns=col_drop)

# Presumably churn_No / churn_Yes are the dummy encoding of the target itself
# (verify in prepare.prep_telco); keeping them as features would leak the answer.
df = df.drop(columns=['churn_No', 'churn_Yes'])

Determine Baseline

In [44]:
# Class proportions of the target; the larger share is the accuracy of always
# predicting the majority class.
df.churn.value_counts(normalize=True)

No     0.73463
Yes    0.26537
Name: churn, dtype: float64

Our baseline prediction is 'No' with an accuracy of 73%

Split my data and prepare for ML

In [38]:
# Three-way split of the Telco frame via the project helper, then separate
# each partition into its feature matrix (X_*) and target vector (y_*).
# NOTE: these names overwrite the Titanic splits defined earlier.
train, val, test = prepare.split_data(df, target='churn')

X_train, y_train = train.drop(columns='churn'), train['churn']
X_val, y_val = val.drop(columns='churn'), val['churn']
X_test, y_test = test.drop(columns='churn'), test['churn']

In [39]:
# Build a depth-6 tree and fit it on the Telco training split in one step;
# fit() returns the fitted estimator, so clf ends up bound to the trained model.
clf = DecisionTreeClassifier(max_depth=6, random_state=SEED).fit(X_train, y_train)

In [40]:
# Mean accuracy on the Telco training split (in-sample).
clf.score(X_train,y_train)

0.8127440539581114

In [41]:
# In-sample predictions. Note the name is y_preds here, not y_pred as in the Titanic section.
y_preds = clf.predict(X_train)

In [42]:
# Per-class precision / recall / F1 on the Telco training split.
print(classification_report(y_train,y_preds))

              precision    recall  f1-score   support

          No       0.85      0.90      0.88      4139
         Yes       0.67      0.57      0.62      1495

    accuracy                           0.81      5634
   macro avg       0.76      0.73      0.75      5634
weighted avg       0.81      0.81      0.81      5634



How well can we predict the validate set?

In [45]:
# Out-of-sample predictions; y_preds is rebound and now holds validate predictions.
y_preds = clf.predict(X_val)

In [46]:
# Per-class precision / recall / F1 on the Telco validate split.
print(classification_report(y_val,y_preds))

              precision    recall  f1-score   support

          No       0.84      0.89      0.86       724
         Yes       0.62      0.53      0.57       262

    accuracy                           0.79       986
   macro avg       0.73      0.71      0.72       986
weighted avg       0.78      0.79      0.78       986

