# DAT 203 - Lab Assignment #6

- Author: Colin Bowers
- Date: Jun 17, 2023

**Instructions**


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk 

**Instructions**
1. Upload Titanic dataset
1. Define Survived column as TARGET variable
1. Select ALL features that can be predictive of the survival status
1. Transform ALL Possible categorical features with Dummy values
1. Import Decision Tree function, train and test function from sklearn library
1. Apply Decision Tree on the split train/test dataset and report the accuracy
1. Part II
    1. Change and expand the possible parameters and the number of cv for the hyper-parameter tuning
    1. Report feature importance using “feature_importances_” attribute
    1. Report the best features according to the decision tree using n_features_ attribute
    1. Compute your model’s confusion matrix, accuracy, and f1-score. 

**1 - Upload Titanic dataset**

In [None]:
# Load the Titanic passenger data; the path is relative to the notebook's working directory.
df = pd.read_csv("data/Titanic_original.csv")
# Preview three randomly chosen rows. No seed is passed, so the rows shown differ between runs.
df.sample(3)

**2 - Define Survived column as TARGET variable**

In [None]:
# Name of the label column; it is used later to pull y_train / y_test out of the split frames.
target = 'Survived'

**3 - Select ALL features that can be predictive of the survival status**

In [None]:
# Candidate feature columns for the survival model.
# 'Sex' and 'Embarked' are categorical; they are listed here by their raw
# column names and swapped for their dummy columns further down.
predictors = [
    'Pclass',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Sex',
    'Embarked',
]

In [None]:
# To reduce column count, use PassengerId as the index.
# The guard keeps this cell re-runnable: once PassengerId has become the index
# it is no longer a column, and an unconditional set_index would raise KeyError.
if "PassengerId" in df.columns:
    df = df.set_index("PassengerId")

# Name, Ticket and Cabin are not in the predictor list, and the decision tree
# fitted below needs numeric inputs, so drop them.
# errors="ignore" turns a second run of this cell into a no-op instead of a KeyError.
df = df.drop(columns=["Name", "Ticket", "Cabin"], errors="ignore")

In [None]:
# Impute missing Age values with the median of the observed ages.
median = df["Age"].median()
df["Age"] = df["Age"].fillna(median)

**4 - Transform ALL Possible categorical features with Dummy values**

In [None]:
# One-hot encode the categorical columns.
# Step 1: mark them as pandas categoricals.
dummies = ['Embarked', 'Sex']
df = df.astype({column: 'category' for column in dummies})

# Step 2: build one indicator column per category level, then swap the raw
# categorical columns for their indicators (indicators go at the end of df2).
dummy_data = pd.get_dummies(df[dummies])
df2 = pd.concat([df.drop(columns=dummies), dummy_data], axis=1)

In [None]:
# Replace the raw categorical names in the predictor list with their dummy columns.
# The list is rebuilt rather than mutated with remove()/+= so the cell can be
# re-run safely: remove() raises ValueError once the names are gone, and +=
# would append the dummy columns a second time.
categorical_columns = ["Embarked", "Sex"]
dummy_columns = ["Embarked_C", "Embarked_Q", "Embarked_S", "Sex_female", "Sex_male"]
predictors = [
    name
    for name in predictors
    if name not in categorical_columns and name not in dummy_columns
] + dummy_columns

**5 - Import Decision Tree function, train and test function from sklearn library**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn import metrics

**6 - Apply Decision Tree on the split train/test dataset and report the accuracy**

In [None]:
# Split the data into training (80%) and test (20%).
# random_state pins the shuffle so the split, and every metric reported below,
# is the same on each Restart & Run All (0 matches the seed used for the tree).
train, test = train_test_split(df2, test_size=0.2, random_state=0)

# Feature matrix / label vector for fitting
X_train = train[predictors]
y_train = train[target]

# Feature matrix / label vector held out for evaluation
X_test = test[predictors]
y_test = test[target]


In [None]:
# Baseline tree with default hyper-parameters; fit() returns the estimator
# itself, so construction and training can be chained.
model = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

In [None]:
# Score the baseline tree on the same rows it was fitted on (TRAINING data).
y_pred_train = model.predict(X_train)
train_report = metrics.classification_report(y_train, y_pred_train)
print(train_report)


In [None]:
# Score the baseline tree on the held-out TEST data.
y_pred = model.predict(X_test)

# Baseline accuracy, kept in `score` for the comparison with the tuned model.
score = metrics.accuracy_score(y_test, y_pred)

test_report = metrics.classification_report(y_test, y_pred)
print(test_report)

In [None]:
# Confusion matrix for the baseline tree on the test set.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual classes and columns the predicted ones. Passing the arguments the
# other way round yields the transposed matrix.
mat = metrics.confusion_matrix(y_test, y_pred)
mat

In [None]:
#fig = plt.figure(figsize=(25,20))
#_ = tree.plot_tree(
#    model, 
#    feature_names= predictors,  
#    class_names = target,
#    filled = True
#)

## Part II

In [None]:
# Hyper-parameter tuning for the decision tree via exhaustive grid search.

from sklearn.model_selection import GridSearchCV

# Seed the base estimator: with max_features set to 'log2' or 'sqrt' each split
# looks at a random subset of features, so an unseeded tree makes the search
# (and the selected parameters) change from run to run.
clf = tree.DecisionTreeClassifier(random_state=0)

# Every combination of these values is evaluated.
parameters = {
    'max_features': ['log2', 'sqrt'],
    'criterion': ['entropy', 'gini'],
    'max_depth': [2, 3, 5, 10],
    'min_samples_split': [2, 3, 5],
    'min_samples_leaf': [1, 5, 8]
}

# 5-fold cross-validation on the training data only; the test set stays untouched.
grid_obj = GridSearchCV(clf, parameters, cv=5)
grid_obj = grid_obj.fit(X_train, y_train)


In [None]:
# Show the parameter combination the grid search selected.
grid_obj.best_params_

In [None]:
# Tuned model selected by the grid search.
# GridSearchCV refits the best parameter combination on the whole training set
# by default (refit=True), so best_estimator_ is already fitted. Fitting it
# again is redundant and, for an unseeded tree, would replace it with a
# different randomly built tree.
model2 = grid_obj.best_estimator_

# Evaluate the tuned model on the held-out test data.
y_pred = model2.predict(X_test)
score2 = metrics.accuracy_score(y_test, y_pred)
print(metrics.classification_report(y_test, y_pred))


In [None]:
# Difference in test accuracy between the tuned model (score2) and the baseline tree (score).
# A negative value means the tuned model scored lower than the baseline.
print("Accuracy improved by: ", score2 - score)

**7 - Change and expand the possible parameters and the number of cv for the hyper-parameter tuning**

In [None]:
# Explore Max Depth
#
# Fits one tree per depth and records its accuracy on the TRAINING data, so the
# curve shows how closely the tree can fit the data it was trained on rather
# than how well it generalises.
# Loop-local names are deliberately distinct from `score`, which holds the
# baseline test accuracy computed earlier and must not be overwritten here.

depth_range = range(2, 20)
accuracy = []
for depth in depth_range:
    depth_model = tree.DecisionTreeClassifier(max_depth=depth, random_state=0)
    depth_model.fit(X_train, y_train)
    accuracy.append(depth_model.score(X_train, y_train))

plt.plot(depth_range, accuracy, "o-")
plt.xlabel("Depth")
plt.ylabel("Accuracy")
plt.show()

In [None]:
# TODO: Explore CV - vary the number of cross-validation folds (not implemented yet)

**8 - Report feature importance using “feature_importances_” attribute**

In [None]:
# Per-feature importance scores of the tuned tree, one value per column of X_train.
imp = model2.feature_importances_ 

In [None]:
# Tabulate the importances by feature name and plot them, largest first.
# A dedicated name is used instead of `df`, which holds the passenger data:
# rebinding it here would break any earlier cell that is re-run afterwards.
importance_df = pd.DataFrame(
    imp, index=X_train.columns, columns=["Importance"]
).sort_values(by='Importance', ascending=False)

importance_df.plot(kind='bar', figsize=(8,6))
plt.show()

**9 - Report the best features according to the decision tree using n_features_ attribute**

In [None]:
# Number of feature columns the tuned tree was fitted on. This is a count, not a ranking.
# NOTE(review): the assignment text names `n_features_`; `n_features_in_` is used here,
# presumably because newer scikit-learn versions no longer expose `n_features_` -
# confirm against the installed version.
model2.n_features_in_

In [None]:
# Names of the feature columns the tuned tree was fitted on (taken from X_train's columns).
model2.feature_names_in_

**10 - Compute your model’s confusion matrix, accuracy, and f1-score.**

In [None]:
# Final evaluation of the tuned tree on the held-out test data.
y_pred = model2.predict(X_test)

# Overall accuracy, kept in score2.
score2 = metrics.accuracy_score(y_test, y_pred)

# Per-class precision, recall and f1-score.
final_report = metrics.classification_report(y_test, y_pred)
print(final_report)

In [None]:
# Confusion matrix for the tuned tree on the test set.
# scikit-learn's signature is confusion_matrix(y_true, y_pred): rows are the
# actual classes and columns the predicted ones. Passing the arguments the
# other way round yields the transposed matrix.
mat2 = metrics.confusion_matrix(y_test, y_pred)
mat2