##Using the titanic data, in your classification-exercises repository, create a notebook, model.ipynb where you will do the following:



In [6]:
# General DS Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Decision Tree and Model Evaluation Imports
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_text
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

import warnings
warnings.filterwarnings("ignore")

import acquire
import prepare

In [7]:
# Acquire Step
df = acquire.get_titanic_data()
df = df.drop(columns='passenger_id')

In [8]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [9]:
train, validate, test = prepare.prep_titanic_data(df)
train.head()

KeyError: "['passenger_id'] not found in axis"

In [None]:
# Preprocessing for modeling
X_train_titanic = train.drop(columns=['survived', 'sex', 'embark_town'])
y_train_titanic = train.survived

X_validate_titanic = validate.drop(columns=['survived', 'sex', 'embark_town'])
y_validate_titanic = validate.survived

X_test_titanic = test.drop(columns=['survived', 'sex', 'embark_town'])
y_test_titanic = test.survived


In [None]:
X_train_titanic.head()

In [None]:
y_train_titanic[:5]


In [None]:
X_train_titanic.shape, X_validate_titanic.shape, X_test_titanic.shape

In [None]:
y_train_titanic.shape, y_validate_titanic.shape, y_test_titanic.shape

### 1. What is your baseline prediction? What is your baseline accuracy? remember: your baseline prediction for a classification problem is predicting the most prevelant class in the training dataset (the mode). When you make those predictions, what is your accuracy? This is your baseline accuracy.



In [None]:
y_train_titanic[0:10]


In [None]:
y_train_titanic.value_counts()


In [None]:
# The mode is a great baseline
baseline_titanic = y_train_titanic.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train_titanic == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

### 2. Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)



In [None]:
# Make the model
tree1 = DecisionTreeClassifier(max_depth=3, random_state=123)

# Fit the model (on train and only train)
tree1.fit(X_train_titanic, y_train_titanic)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions_titanic = tree1.predict(X_train_titanic)

In [None]:
plt.figure(figsize=(12, 7))
plot_tree(tree1, feature_names=X_train_titanic.columns, class_names=['0','1'])
plt.show()

### 3. Evaluate your in-sample results using the model score, confusion matrix, and classification report.



In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree1.score(X_train_titanic, y_train_titanic)))

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree1.score(X_train_titanic, y_train_titanic)))

In [None]:
plot_confusion_matrix(tree1, X_train_titanic, y_train_titanic)


In [None]:
print(classification_report(y_train_titanic, y_predictions_titanic))


In [None]:
# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train_titanic, y_predictions_titanic, output_dict=True)
print("Tree of 1 depth")
pd.DataFrame(report)

### 4. Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [None]:
TN, FP, FN, TP = confusion_matrix(y_train_titanic, y_predictions_titanic).ravel()


The label of positive and negative is arbitrary. What is sklearn considering to be the positive case here?

In [None]:
TN, FP, FN, TP


In [None]:
negative_cases = TN + FP
positive_cases = FN + TP
print(f"Negative Cases: {negative_cases}")
print(f"Positive Cases: {positive_cases}")
print(y_train.value_counts())

Sklearn is calling survival (1) our positive case



In [None]:
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")


### 5. Run through steps 2-4 using a different max_depth value.



In [None]:
# Let's get loopy
for i in range(1, 21):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train, y_train)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions = tree.predict(X_train)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train, y_predictions, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

### 6. Which model performs better on your in-sample data?



In [None]:
# Max depth of 15+ produces the highest accuracy


### 7. Which model performs best on your out-of-sample data, the validate set?



In [None]:
# Let's continue getting loopy, so we can compare in-sample to out-of-sample
metrics = []

for i in range(1, 25):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train_titanic, y_train_titanic)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = tree.score(X_train_titanic, y_train_titanic)
    
    out_of_sample_accuracy = tree.score(X_validate_titanic, y_validate_titanic)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
titanic = pd.DataFrame(metrics)
titanic["difference"] = titanic.train_accuracy - titanic.validate_accuracy
titanic


In [None]:
plt.figure(figsize=(12, 6))
plt.plot(titanic.max_depth, titanic.train_accuracy, marker = 'o', label = 'Train')
plt.plot(titanic.max_depth, titanic.validate_accuracy, marker = 'o', label = 'Validate')
plt.title('Overfitting Occurs at Higher Values for Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
titanic[titanic.difference <= 0.10].sort_values(by=['validate_accuracy', 'difference'], ascending = [False,True])


### 8. Work through these same exercises using the Telco dataset.


In [None]:
# Acquire Step
telco = acquire.get_telco_data()

In [None]:
telco.head()

In [None]:
telco.columns

In [None]:
train, validate, test = prepare.prep_telco_data(telco)
train.head()

In [None]:
# Preprocessing for modeling

#functions can't take strings so i dropped all columns that are strings
drop_columns = list(train.select_dtypes(include='object').columns) + ['churn_encoded']

X_train_telco = train.drop(columns=drop_columns)
y_train_telco = train.churn_encoded

X_validate_telco = validate.drop(columns=drop_columns)
y_validate_telco = validate.churn_encoded

X_test_telco = test.drop(columns=drop_columns)
y_test_telco = test.churn_encoded


In [None]:
y_train_telco

In [None]:
X_train_telco.columns.tolist()

In [None]:
X_train_telco.head()

In [None]:
X_train_telco.shape, X_validate_telco.shape, X_test_telco.shape

In [None]:
y_train_telco.shape, y_validate_telco.shape, y_test_telco.shape


In [None]:
y_train_telco[0:10]

In [None]:
y_train_telco.value_counts()

In [None]:
# The mode is a great baseline
baseline = y_train_telco.mode()

# Produce a boolean array with True representing a match between the baseline prediction and reality
matches_baseline_prediction = (y_train_telco == 0)

baseline_accuracy = matches_baseline_prediction.mean()
print(f"Baseline accuracy: {round(baseline_accuracy, 2)}")

In [None]:
# Make the model
tree2 = DecisionTreeClassifier(max_depth=3, random_state=123)

# Fit the model (on train and only train)
tree2.fit(X_train_telco, y_train_telco)

# Use the model
# We'll evaluate the model's performance on train, first
y_predictions_telco = tree2.predict(X_train_telco)

In [None]:
plt.figure(figsize=(12, 7))
plot_tree(tree2, feature_names=X_train_telco.columns, class_names=['0','1'])
plt.show()

In [None]:
print('Accuracy of Decision Tree classifier on training set: {:.2f}'
      .format(tree2.score(X_train_telco, y_train_telco)))

In [None]:
plot_confusion_matrix(tree2, X_train_telco, y_train_telco)

In [None]:
print(classification_report(y_train_telco, y_predictions_telco))

In [None]:
# Produce the classification report on the actual y values and this model's predicted y values
report = classification_report(y_train_telco, y_predictions_telco, output_dict=True)
print("Tree of 1 depth")
pd.DataFrame(report)


### Question 4: Just for Fun - Calculate Metrics
Compute: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train_telco, y_predictions_telco).ravel()


The label of positive and negative is arbitrary. What is sklearn considering to be the positive case here?



In [None]:
TN, FP, FN, TP



In [None]:
negative_cases = TN + FP
positive_cases = FN + TP
print(f"Negative Cases: {negative_cases}")
print(f"Positive Cases: {positive_cases}")
print(y_train_telco.value_counts())

In [None]:
ALL = TP + FP + FN + TN

accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")

In [None]:
# Let's get loopy
for i in range(1, 21):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train_telco, y_train_telco)

    # Use the model
    # We'll evaluate the model's performance on train, first
    y_predictions_telco = tree.predict(X_train_telco)

    # Produce the classification report on the actual y values and this model's predicted y values
    report = classification_report(y_train_telco, y_predictions_telco, output_dict=True)
    print(f"Tree with max depth of {i}")
    print(pd.DataFrame(report))
    print()

Max depth of 19

In [None]:
# Let's continue getting loopy, so we can compare in-sample to out-of-sample
metrics = []

for i in range(1, 25):
    # Make the model
    tree = DecisionTreeClassifier(max_depth=i, random_state=123)

    # Fit the model (on train and only train)
    tree = tree.fit(X_train_telco, y_train_telco)

    # Use the model
    # We'll evaluate the model's performance on train, first
    in_sample_accuracy = tree.score(X_train_telco, y_train_telco)
    
    out_of_sample_accuracy = tree.score(X_validate_telco, y_validate_telco)

    output = {
        "max_depth": i,
        "train_accuracy": in_sample_accuracy,
        "validate_accuracy": out_of_sample_accuracy
    }
    
    metrics.append(output)
    
telco = pd.DataFrame(metrics)
telco["difference"] = telco.train_accuracy - telco.validate_accuracy
telco

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(telco.max_depth, telco.train_accuracy, marker = 'o', label = 'Train')
plt.plot(telco.max_depth, telco.validate_accuracy, marker = 'o', label = 'Validate')
plt.title('Overfitting Occurs at Higher Values for Max Depth')
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
telco[telco.difference <= 0.10].sort_values(by=['validate_accuracy', 'difference'], ascending = [False,True])


### Continue working in your model file with titanic data to do the following:



In [None]:
# ignore warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns




### 1. Fit the Random Forest classifier to your training sample and transform (i.e. make predictions on the training sample) setting the random_state accordingly and setting min_samples_leaf = 1 and max_depth = 10.



In [None]:
rf = RandomForestClassifier(bootstrap=True, 
                            class_weight=None, 
                            criterion='gini',
                            min_samples_leaf=1,                  
                            max_depth=10, 
                            random_state=123)


In [None]:
rf.fit(X_train, y_train)

In [None]:
print(rf.feature_importances_)

In [None]:
y_pred = rf.predict(X_train)

In [None]:
y_pred_proba = rf.predict_proba(X_train)

### 2. Evaluate your results using the model score, confusion matrix, and classification report.



In [None]:
print('Accuracy of random forest classifier on training set: {:.2f}'
     .format(rf.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_predictions).ravel()

ALL = TP + FP + FN + TN

TP, TN, FP, FN, ALL

In [None]:
accuracy = (TP + TN)/ALL
true_positive_rate = sensitivity = recall = power = TP/(TP+FN)
false_positive_rate = false_alarm_ratio = fallout = FP/(FP+TN)
true_negative_rate = specificity = selectivity = TN/(TN+FP)
false_negative_rate = miss_rate = FN/(FN+TP)
precision = PPV = TP/(TP+FP)
f1_score = 2*(precision*recall)/(precision+recall)
support_pos = TP + FN
support_neg = FP + TN

print(f"Accuracy: {accuracy}", '\n')
print(f"True Positive Rate/Sensitivity/Recall/Power: {true_positive_rate}", '\n')
print(f"False Positive Rate/False Alarm Ratio/Fall-out: {false_positive_rate}", '\n')
print(f"True Negative Rate/Specificity/Selectivity: {true_negative_rate}", '\n')
print(f"False Negative Rate/Miss Rate: {false_negative_rate}", '\n')
print(f"Precision/PPV: {precision}", '\n')
print(f"F1 Score: {f1_score}", '\n')
print(f"Support (0): {support_neg}", '\n')
print(f"Support (1): {support_pos}")

### 4. Run through steps increasing your min_samples_leaf and decreasing your max_depth.



In [None]:
model_prediction = y_train.copy()

# Let's get loopy
# Make the model
for i in range(1, 6):
    for j in range(10, 5, -1):
        rf = RandomForestClassifier(
            max_depth=i, 
            min_samples_leaf=j, 
            random_state=123
        )
        rf.fit(X_train, y_train)
        
        curr_preds = rf.predict(X_train)
        
        model_prediction[f'msl_{i}_md_{j}'] = curr_preds
    

In [None]:
model_prediction

### 5. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



After making a few models, which one has the best performance (or closest metrics) on both train and validate?

In [None]:
actuals = model_prediction.survived
preds = model_prediction.drop(columns = 'survived')

for column in preds.columns:
    accuracy = (actuals == pred[column]).mean()
    print(f'{column} accuracy: {accuracy}')

In [None]:
X_train

In [None]:
import acquire
import prepare

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Data handling
import pandas as pd
import numpy as np

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Sklearn modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

# Data acquisition
from pydataset import data



In [None]:
# Acquire Step
df = acquire.get_titanic_data()
#df = df.drop(columns='passenger_id')

In [None]:
df.head()

In [None]:
train, validate, test = prepare.prep_titanic_data(df)
train.head()

In [None]:
train.shape

In [None]:
# Preprocessing for modeling
X_train_titanic = train.drop(columns=['survived', 'sex', 'embark_town'])
y_train_titanic = train.survived

X_validate_titanic = validate.drop(columns=['survived', 'sex', 'embark_town'])
y_validate_titanic = validate.survived

X_test_titanic = test.drop(columns=['survived', 'sex', 'embark_town'])
y_test_titanic = test.survived


# KNN
### Continue working in your model file with the titanic dataset.

#### 1. Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)


In [None]:
knn_titanic = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [None]:
knn_titanic.fit(X_train, y_train)

In [None]:
y_pred = knn_titanic.predict(X_train)

In [None]:
y_pred_proba = knn_titanic.predict_proba(X_train)


#### 2. Evaluate your results using the model score, confusion matrix, and classification report.



In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn_titanic.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.



In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()
ALL = TP + FP + FN + TN

TP, TN, FP, FN, ALL

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

#### 4. Run through steps 1-3 setting k to 10



#### 1. CREATE & FIT

In [None]:
knn_titanic2 = KNeighborsClassifier(n_neighbors=10, weights='uniform')

In [None]:
knn_titanic2.fit(X_train, y_train)

In [None]:
y_pred = knn_titanic2.predict(X_train)

In [None]:
y_pred_proba = knn_titanic2.predict_proba(X_train)


#### 2. Evaluate your results using the model score, confusion matrix, and classification report.

In [None]:
print('Accuracy of KNN classifier on training set: {:.2f}'
     .format(knn_titanic2.score(X_train, y_train)))

In [None]:
print(confusion_matrix(y_train, y_pred))

In [None]:
print(classification_report(y_train, y_pred))

#### 3. Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [None]:
TN, FP, FN, TP = confusion_matrix(y_train, y_pred).ravel()
ALL = TP + FP + FN + TN

TP, TN, FP, FN, ALL

In [None]:
accuracy = (TP + TN)/ALL
print(f"Accuracy: {accuracy}")

true_positive_rate = TP/(TP+FN)
print(f"True Positive Rate: {true_positive_rate}")

false_positive_rate = FP/(FP+TN)
print(f"False Positive Rate: {false_positive_rate}")

true_negative_rate = TN/(TN+FP)
print(f"True Negative Rate: {true_negative_rate}")

false_negative_rate = FN/(FN+TP)
print(f"False Negative Rate: {false_negative_rate}")

precision = TP/(TP+FP)
print(f"Precision: {precision}")

recall = TP/(TP+FN)
print(f"Recall: {recall}")

f1_score = 2*(precision*recall)/(precision+recall)
print(f"F1 Score: {f1_score}")

support_pos = TP + FN
print(f"Support (0): {support_pos}")

support_neg = FP + TN
print(f"Support (1): {support_neg}")

#### 5. Run through steps 1-3 setting k to 20



#### 6. What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?



#### 7. Which model performs best on our out-of-sample data from validate?