In [1]:
import acquire
import prepare

import pandas as pd

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, classification_report

Using the Titanic Dataset

In [3]:
df = prepare.prep_titanic(acquire.get_titanic_data())

In [4]:
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,embark_town,alone,Cherbourg,Queenstown,Southampton
0,0,0,3,1,22.0,1,0,7.25,2,Third,Southampton,0,0.0,0.0,1.0
1,1,1,1,0,38.0,1,0,71.2833,0,First,Cherbourg,0,1.0,0.0,0.0
2,2,1,3,0,26.0,0,0,7.925,2,Third,Southampton,1,0.0,0.0,1.0
3,3,1,1,0,35.0,1,0,53.1,2,First,Southampton,0,0.0,0.0,1.0
4,4,0,3,1,35.0,0,0,8.05,2,Third,Southampton,1,0.0,0.0,1.0


# Split the Data into Train, Test, Validate

In [5]:
# Split the data once to get .8 train, .2 test
train, test = train_test_split(df, train_size=.8, random_state=115)

In [6]:
# Split the train again, resulting in .6 train, .2 validate, .2 test
train, validate = train_test_split(train, train_size=.8, random_state=115)

In [7]:
print(train.shape, validate.shape, test.shape)

(569, 15) (143, 15) (179, 15)


# Using Fare and Pclass to Predict Survival

In [8]:
train_X = train[['fare', 'pclass']]
train_y = train.survived

In [9]:
fare_pclass = LogisticRegression().fit(train_X, train_y)

In [10]:
# Create a df to hold the predictions
predictions = pd.DataFrame({'actual': train_y})

In [11]:
# Apply the model predictions to the df
predictions['fare_pclass'] = fare_pclass.predict(train_X)

In [12]:
predictions.head()

Unnamed: 0,actual,fare_pclass
90,0,0
381,1,0
546,1,0
844,0,0
859,0,0


In [13]:
# Calculate the probability of survival, predicted by the model
fare_pclass.predict_proba(train_X)

array([[0.75895547, 0.24104453],
       [0.75142847, 0.24857153],
       [0.56450771, 0.43549229],
       ...,
       [0.75992255, 0.24007745],
       [0.75910477, 0.24089523],
       [0.75754849, 0.24245151]])

In [14]:
# Print the coefficients and intercept
print('Coeffecient: \n', fare_pclass.coef_)
print('Intercept: \n', fare_pclass.intercept_)

Coeffecient: 
 [[ 0.00529351 -0.79246604]]
Intercept: 
 [1.187824]


# Evaluate the Model

## Compute the Accuracy

In [15]:
print('Accuracy: {: .2f}'.format(fare_pclass.score(train_X, train_y)))

Accuracy:  0.69


## Create a Confusion Matrix

In [16]:
print(confusion_matrix(predictions.actual, predictions.fare_pclass))

[[303  44]
 [131  91]]


## Compute Precision, Recall, F1-Score, and Support

In [17]:
print(classification_report(predictions.actual, predictions.fare_pclass))

              precision    recall  f1-score   support

           0       0.70      0.87      0.78       347
           1       0.67      0.41      0.51       222

    accuracy                           0.69       569
   macro avg       0.69      0.64      0.64       569
weighted avg       0.69      0.69      0.67       569



# Create another model which includes age in addition to fare and pclass

In [18]:
train_X = train[['age', 'fare', 'pclass']]
train_y = train.survived

In [19]:
age_fare_pclass = LogisticRegression().fit(train_X, train_y)

In [20]:
predictions['age_fare_pclass'] = age_fare_pclass.predict(train_X)

In [21]:
predictions.head()

Unnamed: 0,actual,fare_pclass,age_fare_pclass
90,0,0,0
381,1,0,0
546,1,0,1
844,0,0,0
859,0,0,0


In [22]:
age_fare_pclass.score(train_X, train_y)

0.718804920913884

In [23]:
train_X = train[['sex', 'age', 'fare', 'pclass']]
train_y = train.survived

In [24]:
sex_age_fare_pclass = LogisticRegression().fit(train_X, train_y)

In [25]:
predictions['sex_age_fare_pclass'] = sex_age_fare_pclass.predict(train_X) 

In [26]:
predictions.head()

Unnamed: 0,actual,fare_pclass,age_fare_pclass,sex_age_fare_pclass
90,0,0,0,0
381,1,0,0,1
546,1,0,1,1
844,0,0,0,0
859,0,0,0,0


In [27]:
sex_age_fare_pclass.score(train_X, train_y)

0.789103690685413

In [28]:
train_X = train[['sex', 'age', 'fare', 'pclass', 'parch']]
train_y = train.survived

In [29]:
parch_sex_age_fare_pclass = LogisticRegression().fit(train_X, train_y)

In [30]:
parch_sex_age_fare_pclass.score(train_X, train_y)

0.7873462214411248

# Evaluating the Model

In [31]:
test_X = test[['sex', 'age', 'fare', 'pclass']]
test_y = test.survived

In [32]:
sex_age_fare_pclass.score(test_X, test_y)

0.7932960893854749

# Decision Tree

## Fit the decision tree classifier to your training sample and transform (i.e. make predictions on the training sample)

In [33]:
tree = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=115).fit(train_X, train_y)

## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [34]:
evaluation = pd.DataFrame({'actual':train_y, 'survival ~ sex + age + fare + pclass': tree.predict(train_X)})

In [35]:
evaluation.head()

Unnamed: 0,actual,survival ~ sex + age + fare + pclass
90,0,0
381,1,1
546,1,1
844,0,0
859,0,0


In [36]:
# Accuracy
print('Accuracy of model vs actual is:{: .2f}'.format(tree.score(train_X, train_y)))

Accuracy of model vs actual is: 0.81


In [37]:
# Confusion Matrix
confusion_matrix(train_y, tree.predict(train_X))

array([[311,  36],
       [ 71, 151]])

In [38]:
print(classification_report(evaluation.actual, tree.predict(train_X)))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       347
           1       0.81      0.68      0.74       222

    accuracy                           0.81       569
   macro avg       0.81      0.79      0.80       569
weighted avg       0.81      0.81      0.81       569



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [39]:
row_labels = ['tp', 'fp']
column_labels = ['tn', 'fn']
conf = pd.DataFrame(confusion_matrix(train_y, tree.predict(train_X)), index=row_labels, columns=column_labels)
conf.index.name = "actual"
conf

Unnamed: 0_level_0,tn,fn
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
tp,311,36
fp,71,151


## Run through steps 2-4 using entropy as your measure of impurity.

In [40]:
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=115).fit(train_X, train_y)

In [41]:
evaluation = pd.DataFrame({'actual':train_y, 'survival ~ sex + age + fare + pclass': tree.predict(train_X)})

In [42]:
evaluation.head()

Unnamed: 0,actual,survival ~ sex + age + fare + pclass
90,0,0
381,1,1
546,1,1
844,0,0
859,0,0


In [43]:
# Accuracy
print('Accuracy of model vs actual is:{: .2f}'.format(tree.score(train_X, train_y)))

Accuracy of model vs actual is: 0.81


In [44]:
# Confusion Matrix
confusion_matrix(train_y, tree.predict(train_X))

array([[311,  36],
       [ 71, 151]])

In [45]:
print(classification_report(evaluation.actual, tree.predict(train_X)))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       347
           1       0.81      0.68      0.74       222

    accuracy                           0.81       569
   macro avg       0.81      0.79      0.80       569
weighted avg       0.81      0.81      0.81       569



## Which performs better on your in-sample data?

They both did the same

# Random Forest

In [46]:
forest = RandomForestClassifier(min_samples_leaf=1, n_estimators=100, max_depth=20, random_state=115).fit(train_X, train_y)

## Evaluate your in-sample results using the model score, confusion matrix, and classification report.

In [47]:
evaluation = pd.DataFrame({'actual':train_y, 'survival ~ sex + age + fare + pclass': forest.predict(train_X)})

In [48]:
evaluation.head()

Unnamed: 0,actual,survival ~ sex + age + fare + pclass
90,0,0
381,1,1
546,1,1
844,0,0
859,0,0


In [49]:

# Accuracy
print('Accuracy of model vs actual is:{: .2f}'.format(forest.score(train_X, train_y)))

Accuracy of model vs actual is: 0.98


In [50]:
# Confusion Matrix
confusion_matrix(train_y, forest.predict(train_X))

array([[346,   1],
       [ 10, 212]])

In [51]:
print(classification_report(evaluation.actual, forest.predict(train_X)))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98       347
           1       1.00      0.95      0.97       222

    accuracy                           0.98       569
   macro avg       0.98      0.98      0.98       569
weighted avg       0.98      0.98      0.98       569



## Print and clearly label the following: Accuracy, true positive rate, false positive rate, true negative rate, false negative rate, precision, recall, f1-score, and support.

In [52]:
row_labels = ['tp', 'fp']
column_labels = ['tn', 'fn']
conf = pd.DataFrame(confusion_matrix(train_y, forest.predict(train_X)), index=row_labels, columns=column_labels)
conf.index.name = "actual"
conf

Unnamed: 0_level_0,tn,fn
actual,Unnamed: 1_level_1,Unnamed: 2_level_1
tp,346,1
fp,10,212


## Change the number of min_samples_leaf and max_depth

In [53]:
forest = RandomForestClassifier(min_samples_leaf=1, n_estimators=100, max_depth=20, random_state=115).fit(train_X, train_y)

In [54]:
evaluation = pd.DataFrame({'actual':train_y, 'survival ~ sex + age + fare + pclass': tree.predict(train_X)})

In [55]:
evaluation.head()

Unnamed: 0,actual,survival ~ sex + age + fare + pclass
90,0,0
381,1,1
546,1,1
844,0,0
859,0,0


In [56]:
# Accuracy
print('Accuracy of model vs actual is:{: .2f}'.format(tree.score(train_X, train_y)))

Accuracy of model vs actual is: 0.81


In [57]:
# Confusion Matrix
confusion_matrix(train_y, tree.predict(train_X))

array([[311,  36],
       [ 71, 151]])

In [58]:
print(classification_report(evaluation.actual, tree.predict(train_X)))

              precision    recall  f1-score   support

           0       0.81      0.90      0.85       347
           1       0.81      0.68      0.74       222

    accuracy                           0.81       569
   macro avg       0.81      0.79      0.80       569
weighted avg       0.81      0.81      0.81       569



## Which performs better on your in-sample data?

The first model performed substantially better than the second

# KNN

Scale the train data

In [59]:
train_X = train[['age', 'fare', 'sex', 'pclass']]
train_y = train.survived

In [60]:
scaled_train_X = MinMaxScaler().fit_transform(train_X)

## Fit a K-Nearest Neighbors classifier to your training sample and transform (i.e. make predictions on the training sample)

In [61]:
knn = KNeighborsClassifier().fit(scaled_train_X, train_y)

In [62]:
evaluation = pd.DataFrame({'actual': train_y, 'knn_prediction': knn.predict(scaled_train_X)})

In [63]:
evaluation.head()

Unnamed: 0,actual,knn_prediction
90,0,0
381,1,0
546,1,1
844,0,0
859,0,0


## Evaluate your results using the model score, confusion matrix, and classification report.

In [64]:
print('Accuracy of model: {: .2f}'.format(knn.score(scaled_train_X, train_y)))

Accuracy of model:  0.83


In [65]:
row_labels = ['Actually_Died', 'Actually_Survived']
col_labels = ['Pred_Died', 'Pred_Survived']
confusion = pd.DataFrame(confusion_matrix(evaluation.actual, evaluation.knn_prediction, normalize='true'), columns= col_labels, index= row_labels)

In [66]:
confusion.head()

Unnamed: 0,Pred_Died,Pred_Survived
Actually_Died,0.899135,0.100865
Actually_Survived,0.27027,0.72973


In [67]:
print(classification_report(evaluation.actual, evaluation.knn_prediction))

              precision    recall  f1-score   support

           0       0.84      0.90      0.87       347
           1       0.82      0.73      0.77       222

    accuracy                           0.83       569
   macro avg       0.83      0.81      0.82       569
weighted avg       0.83      0.83      0.83       569



## Run through steps 2-4 setting k to 10

In [68]:
knn = KNeighborsClassifier(n_neighbors=10).fit(scaled_train_X, train_y)

In [69]:
evaluation = pd.DataFrame({'actual': train_y, 'knn_prediction': knn.predict(scaled_train_X)})

In [70]:
evaluation.head()

Unnamed: 0,actual,knn_prediction
90,0,0
381,1,0
546,1,1
844,0,0
859,0,0


In [71]:
print('Accuracy of model: {: .2f}'.format(knn.score(scaled_train_X, train_y)))

Accuracy of model:  0.82


In [72]:
row_labels = ['Actually_Died', 'Actually_Survived']
col_labels = ['Pred_Died', 'Pred_Survived']
confusion = pd.DataFrame(confusion_matrix(evaluation.actual, evaluation.knn_prediction, normalize='true'), columns= col_labels, index= row_labels)

In [73]:
confusion.head()

Unnamed: 0,Pred_Died,Pred_Survived
Actually_Died,0.930836,0.069164
Actually_Survived,0.364865,0.635135


In [74]:
print(classification_report(evaluation.actual, evaluation.knn_prediction))

              precision    recall  f1-score   support

           0       0.80      0.93      0.86       347
           1       0.85      0.64      0.73       222

    accuracy                           0.82       569
   macro avg       0.83      0.78      0.79       569
weighted avg       0.82      0.82      0.81       569



## Run through setps 2-4 setting k to 20

In [75]:
knn = KNeighborsClassifier(n_neighbors=20).fit(scaled_train_X, train_y)

In [76]:
evaluation = pd.DataFrame({'actual': train_y, 'knn_prediction': knn.predict(scaled_train_X)})

In [77]:
evaluation.head()

Unnamed: 0,actual,knn_prediction
90,0,0
381,1,0
546,1,1
844,0,0
859,0,0


In [78]:
print('Accuracy of model: {: .2f}'.format(knn.score(scaled_train_X, train_y)))

Accuracy of model:  0.81


In [79]:
row_labels = ['Actually_Died', 'Actually_Survived']
col_labels = ['Pred_Died', 'Pred_Survived']
confusion = pd.DataFrame(confusion_matrix(evaluation.actual, evaluation.knn_prediction, normalize='true'), columns= col_labels, index= row_labels)

In [80]:
confusion.head()

Unnamed: 0,Pred_Died,Pred_Survived
Actually_Died,0.919308,0.080692
Actually_Survived,0.355856,0.644144


In [81]:
print(classification_report(evaluation.actual, evaluation.knn_prediction))

              precision    recall  f1-score   support

           0       0.80      0.92      0.86       347
           1       0.84      0.64      0.73       222

    accuracy                           0.81       569
   macro avg       0.82      0.78      0.79       569
weighted avg       0.82      0.81      0.81       569



## What are the differences in the evaluation metrics? Which performs better on your in-sample data? Why?

The lower numbers of k-neighbors (5) actually performed better than the the other sets.

# Test

## Using Both the Titanic and Iris Datasets

### Determine which model (with hyperparameters) perform the best

#### Titanic

In [82]:
train_X = train[['sex', 'age', 'fare', 'pclass', 'parch']]
train_y = train.survived

In [83]:
val_X = validate[['sex', 'age', 'fare', 'pclass', 'parch']]
val_y = validate.survived

In [84]:
test_X = test[['sex', 'age', 'fare', 'pclass', 'parch']]
test_y = test.survived

In [85]:
scaler = MinMaxScaler()

In [86]:
X_train_scaled = scaler.fit_transform(train_X)

In [87]:
X_val_scaled = scaler.transform(val_X)

In [88]:
for i in range(1, 20):
    knn = KNeighborsClassifier(n_neighbors=i).fit(X_train_scaled, train_y)
    print(i, knn.score(X_val_scaled, val_y))

1 0.7552447552447552
2 0.7622377622377622
3 0.7482517482517482
4 0.7902097902097902
5 0.7692307692307693
6 0.7622377622377622
7 0.7762237762237763
8 0.8041958041958042
9 0.7902097902097902
10 0.7902097902097902
11 0.7902097902097902
12 0.8041958041958042
13 0.8041958041958042
14 0.8041958041958042
15 0.7902097902097902
16 0.7972027972027972
17 0.7972027972027972
18 0.7972027972027972
19 0.7832167832167832


In [89]:
knn = KNeighborsClassifier(n_neighbors=8).fit(X_train_scaled, train_y)

In [90]:
knn.score(X_train_scaled, train_y)

0.8189806678383128

In [91]:
knn.score(X_val_scaled, val_y)

0.8041958041958042

#### Iris

In [96]:
df = prepare.prep_iris(acquire.get_iris_data())

In [97]:
df = df[['sepal_area', 'petal_area', 'petal_length', 'petal_width', 'species']]

In [98]:
train_X = train[['sepal_area', 'petal_area', 'petal_length', 'petal_width']]

KeyError: "None of [Index(['sepal_area', 'petal_area', 'petal_length', 'petal_width'], dtype='object')] are in the [columns]"

### 