In [18]:
import acquire
import prepare

import numpy as np
import pandas as pd

from pydataset import data

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
titanic = acquire.get_titanic_data()

In [3]:
titanic.head(3)

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1


### What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.

In [4]:
titanic.survived.mode()[0]

0

In [5]:
titanic['model_baseline'] = titanic.survived.mode()[0]

In [6]:
titanic.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone,model_baseline
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1,0
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1,0


In [7]:
titanic = titanic.drop(columns= 'Unnamed: 0')

In [11]:
train, test, validate = prepare.prep_titanic_data(titanic)

In [13]:
train.shape, test.shape, validate.shape

((498, 11), (214, 11), (179, 11))

### Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [14]:
train.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,alone,model_baseline,sex_male,embark_town_Queenstown,embark_town_Southampton
583,0,1,36.0,0,0,40.125,1,0,1,0,0
165,1,3,9.0,0,2,20.525,0,0,1,0,1
50,0,3,7.0,4,1,39.6875,0,0,1,0,1
259,1,2,50.0,0,1,26.0,0,0,0,0,1
306,1,1,29.678105,0,0,110.8833,1,0,0,0,0


In [26]:
X_train = train.drop(columns=['survived'])

In [22]:
y_train= train.survived

In [27]:
clf = DecisionTreeClassifier(max_depth= 3, random_state=123)

In [28]:
clf.fit(X_train, y_train)

In [30]:
clf.classes_

array([0, 1])

In [32]:
y_pred = clf.predict(X_train)
y_pred[:5]

array([0, 0, 0, 1, 1])

In [34]:
y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:5]

array([[0.65048544, 0.34951456],
       [0.65048544, 0.34951456],
       [0.65048544, 0.34951456],
       [0.03225806, 0.96774194],
       [0.03225806, 0.96774194]])

### Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [35]:
print('Training accuracy{:.2f}'
      .format(clf.score(X_train, y_train)))

Training accuracy0.83


In [36]:
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.83      0.90      0.86       307
           1       0.82      0.70      0.75       191

    accuracy                           0.83       498
   macro avg       0.82      0.80      0.81       498
weighted avg       0.82      0.83      0.82       498



In [38]:
conf = confusion_matrix(y_train, y_pred)
conf

array([[277,  30],
       [ 57, 134]])

In [40]:
clf.score(X_train, y_train)

0.8253012048192772

### Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [47]:
TN, FP, FN, TP = conf.ravel()

In [50]:
accuracy = TP + TN / (TN + FN + TP + TN)
tru_pos_rate = TP / (TP + FN)
false_pos_rate = FP / (FP + TN)
tru_neg_rate = TN / (TN + FP)
false_neg_rate = FN / (FN + TP)

precision = TP / (TP + FP)
recall  = tru_pos_rate
f1_score = 2 * (precision * recall) / (precision + recall)

In [51]:
print(f'''
accuracy is {accuracy}
tru_pos_rate is {tru_pos_rate}
false_pos_rate is {false_pos_rate}
tru_neg_rate is {tru_neg_rate}
false_neg_rate is {false_neg_rate}

precision is {precision}
recall is {recall}
f1_score is {f1_score}
''')


accuracy is 134.37181208053693
tru_pos_rate is 0.7015706806282722
false_pos_rate is 0.09771986970684039
tru_neg_rate is 0.9022801302931596
false_neg_rate is 0.29842931937172773

precision is 0.8170731707317073
recall is 0.7015706806282722
f1_score is 0.7549295774647887



### Run through steps 2-4 using a different max_depth value.

In [59]:
clf = DecisionTreeClassifier(max_depth= 5, random_state=123)
clf.fit(X_train, y_train)

DecisionTreeClassifier
DecisionTreeClassifier(max_depth=3, random_state=123)
clf.classes_
y_pred = clf.predict(X_train)
y_pred[:5]

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:5]

#Evaluate your in-sample results using the model score, confusion matrix, and classification report.
X_train, y_train
print('Training accuracy{:.2f}'
      .format(clf.score(X_train, y_train)))

print(classification_report(y_train, y_pred))

conf
conf = confusion_matrix(y_train, y_pred)
conf

clf.score(X_train, y_train)


#Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
TN, FP, FN, TP = conf.ravel()

accuracy = TP + TN / (TN + FN + TP + TN)
tru_pos_rate = TP / (TP + FN)
false_pos_rate = FP / (FP + TN)
tru_neg_rate = TN / (TN + FP)
false_neg_rate = FN / (FN + TP)

precision = TP / (TP + FP)
recall  = tru_pos_rate
f1_score = 2 * (precision * recall) / (precision + recall)

print(f'''
accuracy is {accuracy}
tru_pos_rate is {tru_pos_rate}
false_pos_rate is {false_pos_rate}
tru_neg_rate is {tru_neg_rate}
false_neg_rate is {false_neg_rate}
​
precision is {precision}
recall is {recall}
f1_score is {f1_score}
''')

Training accuracy0.85
              precision    recall  f1-score   support

           0       0.85      0.92      0.89       307
           1       0.86      0.74      0.80       191

    accuracy                           0.85       498
   macro avg       0.85      0.83      0.84       498
weighted avg       0.85      0.85      0.85       498


accuracy is 142.37384412153236
tru_pos_rate is 0.743455497382199
false_pos_rate is 0.0781758957654723
tru_neg_rate is 0.9218241042345277
false_neg_rate is 0.25654450261780104
​
precision is 0.8554216867469879
recall is 0.743455497382199
f1_score is 0.7955182072829131



### Which model performs better on your in-sample data?

the k = 5 model performs better at .86 precision and .74 recall

### Which model performs best on your out-of-sample data, the validate set?

In [60]:
X_train = validate.drop(columns= 'survived')
y_train = validate.survived

In [61]:
clf = DecisionTreeClassifier(max_depth= 3, random_state=123)
clf.fit(X_train, y_train)

DecisionTreeClassifier
DecisionTreeClassifier(max_depth=3, random_state=123)
clf.classes_
y_pred = clf.predict(X_train)
y_pred[:5]

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:5]

#Evaluate your in-sample results using the model score, confusion matrix, and classification report.
X_train, y_train
print('Training accuracy{:.2f}'
      .format(clf.score(X_train, y_train)))

print(classification_report(y_train, y_pred))

conf
conf = confusion_matrix(y_train, y_pred)
conf

clf.score(X_train, y_train)


#Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
TN, FP, FN, TP = conf.ravel()

accuracy = TP + TN / (TN + FN + TP + TN)
tru_pos_rate = TP / (TP + FN)
false_pos_rate = FP / (FP + TN)
tru_neg_rate = TN / (TN + FP)
false_neg_rate = FN / (FN + TP)

precision = TP / (TP + FP)
recall  = tru_pos_rate
f1_score = 2 * (precision * recall) / (precision + recall)

print(f'''
accuracy is {accuracy}
tru_pos_rate is {tru_pos_rate}
false_pos_rate is {false_pos_rate}
tru_neg_rate is {tru_neg_rate}
false_neg_rate is {false_neg_rate}
​
precision is {precision}
recall is {recall}
f1_score is {f1_score}
''')

Training accuracy0.86
              precision    recall  f1-score   support

           0       0.86      0.93      0.89       110
           1       0.87      0.75      0.81        69

    accuracy                           0.86       179
   macro avg       0.86      0.84      0.85       179
weighted avg       0.86      0.86      0.86       179


accuracy is 52.37362637362637
tru_pos_rate is 0.7536231884057971
false_pos_rate is 0.07272727272727272
tru_neg_rate is 0.9272727272727272
false_neg_rate is 0.2463768115942029
​
precision is 0.8666666666666667
recall is 0.7536231884057971
f1_score is 0.8062015503875969



In [62]:
clf = DecisionTreeClassifier(max_depth= 5, random_state=123)
clf.fit(X_train, y_train)

DecisionTreeClassifier
DecisionTreeClassifier(max_depth=3, random_state=123)
clf.classes_
y_pred = clf.predict(X_train)
y_pred[:5]

y_pred_proba = clf.predict_proba(X_train)
y_pred_proba[:5]

#Evaluate your in-sample results using the model score, confusion matrix, and classification report.
X_train, y_train
print('Training accuracy{:.2f}'
      .format(clf.score(X_train, y_train)))

print(classification_report(y_train, y_pred))

conf
conf = confusion_matrix(y_train, y_pred)
conf

clf.score(X_train, y_train)


#Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.
TN, FP, FN, TP = conf.ravel()

accuracy = TP + TN / (TN + FN + TP + TN)
tru_pos_rate = TP / (TP + FN)
false_pos_rate = FP / (FP + TN)
tru_neg_rate = TN / (TN + FP)
false_neg_rate = FN / (FN + TP)

precision = TP / (TP + FP)
recall  = tru_pos_rate
f1_score = 2 * (precision * recall) / (precision + recall)

print(f'''
accuracy is {accuracy}
tru_pos_rate is {tru_pos_rate}
false_pos_rate is {false_pos_rate}
tru_neg_rate is {tru_neg_rate}
false_neg_rate is {false_neg_rate}
​
precision is {precision}
recall is {recall}
f1_score is {f1_score}
''')

Training accuracy0.92
              precision    recall  f1-score   support

           0       0.89      0.99      0.94       110
           1       0.98      0.80      0.88        69

    accuracy                           0.92       179
   macro avg       0.93      0.89      0.91       179
weighted avg       0.92      0.92      0.91       179


accuracy is 55.37979094076655
tru_pos_rate is 0.7971014492753623
false_pos_rate is 0.00909090909090909
tru_neg_rate is 0.990909090909091
false_neg_rate is 0.2028985507246377
​
precision is 0.9821428571428571
recall is 0.7971014492753623
f1_score is 0.8799999999999999



In [None]:
the k = 5 model performs better 

### Work through these same exercises using the Telco dataset.

### Experiment with this model on other datasets with a higher number of output classes