In [None]:
from sklearn import tree
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
from IPython.display import display
from IPython.display import SVG
from graphviz import Source
from IPython.display import HTML
# Optional CSS that sets inline <svg> output to 30% width/height.
# It only takes effect if the HTML(style) line below is uncommented.
style = "<style>svg{width:30% !important;height:30% !important;}</style>"
# HTML(style)

In [None]:
# Titanic training set; data description: https://www.kaggle.com/c/titanic/data
# The CSV location can be overridden with the TITANIC_TRAIN_CSV environment variable so the
# notebook also runs outside the author's machine; the default is the original path.
import os

TITANIC_TRAIN_CSV = os.environ.get(
    "TITANIC_TRAIN_CSV",
    "/home/octopus/Documents/2scripts/ScientificPython/3_ML/Lecture10_DT/files/train.csv",
)
titanic_data = pd.read_csv(TITANIC_TRAIN_CSV)
print(titanic_data.shape)  # (number of rows, number of columns)
titanic_data.head()

TASK: train a decision tree that will predict whether the passenger survived or did not survive, based on the passenger data from the train.csv dataset.


In [None]:
# Boolean mask of missing values for the first five rows (True = value is missing).
titanic_data.head().isna()

In [None]:
# Number of missing values in each column.
titanic_data.isna().sum()

In [None]:
# Median passenger age (missing values are skipped by pandas).
titanic_data["Age"].median()

In [None]:
# Impute missing ages with the median age; only the Age column is filled.
median_age = titanic_data["Age"].median()
titanic_data = titanic_data.fillna({'Age': median_age})

In [None]:
# Re-count missing values per column to check the effect of the Age imputation.
titanic_data.isna().sum()

In [None]:
# Which columns will we delete?

In [None]:
# Preview the first rows to decide which columns to keep as features.
titanic_data.head()

In [None]:
# axis=1 means "drop columns" (axis=0 would drop rows); `columns=` states that explicitly.
# Remove the target (Survived) and the columns that are not used as features.
dropped_columns = ["PassengerId", "Survived", "Name", "Ticket", "Cabin"]
X = titanic_data.drop(columns=dropped_columns)

In [None]:
# Display the feature table (Jupyter truncates long frames automatically).
X

In [None]:
# Missing values remaining in each feature column.
X.isna().sum()

In [None]:
# Target vector: the Survived column.
y = titanic_data["Survived"]
y.head()

In [None]:
# Decision tree that chooses splits using the entropy criterion; depth is not limited.
clf = tree.DecisionTreeClassifier(criterion='entropy')

In [None]:
# First attempt: fit on X exactly as it is. scikit-learn trees require numeric input, so this
# is expected to fail while X still contains string-valued columns (it is one-hot encoded
# further below). The error is caught and shown so that "Restart & Run All" does not stop here.
try:
    clf.fit(X, y)
except ValueError as err:
    print(f"clf.fit(X, y) failed: {err}")

In [None]:
# Look at the features again -- presumably to spot the non-numeric columns (TODO confirm dtypes).
X

In [None]:
# One-hot encode the non-numeric columns; numeric columns are passed through unchanged.
X = pd.get_dummies(X)

In [None]:
# Preview the encoded feature table.
X.head()

In [None]:
# Fit the tree on the whole (now encoded) dataset.
clf.fit(X, y)

In [None]:
# graph = Source(tree.export_graphviz(clf, out_file=None,
#                                    feature_names=list(X),
#                                    class_names=['Negative','Positive'],
#                                    filled = True))
# display(SVG(graph.pipe(format='svg')))

In [None]:
import os

# Write the fitted tree in Graphviz DOT format. The context manager guarantees the file is
# flushed and closed even if export_graphviz raises (the manual open/close pair did not).
with open("dtree2.dot", 'w') as dotfile:
    tree.export_graphviz(clf,
                         out_file=dotfile,
                         feature_names=list(X),
                         class_names=['Negative', 'Positive'],
                         filled=True)

# Render the DOT file to PNG with the Graphviz `dot` executable (must be on PATH).
# The cell output is the shell exit status: 0 means the image was written.
os.system("dot -Tpng dtree2.dot -o dtree2.png")

In [None]:
# Mean accuracy on the same data the tree was trained on (an optimistic estimate).
clf.score(X, y)

In [None]:
# The model has learned to classify this particular dataset, but that does not mean it has learned general rules/patterns

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows as the test set; the remaining 70% form the training set.
# random_state seeds the shuffle so the split is reproducible.
split_parts = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Preview the training features (row order is shuffled by the split).
X_train.head()

In [None]:
# (rows, columns) of the training split.
X_train.shape

In [None]:
# (rows, columns) of the test split.
X_test.shape

In [None]:
# (rows, columns) of the full feature table, for comparison with the two splits.
X.shape

In [None]:
# Re-fit the same classifier using the training split only.
clf.fit(X_train, y_train)

In [None]:
# Mean accuracy on the data the model was trained on.
clf.score(X_train, y_train)

In [None]:
# Mean accuracy on the held-out test split.
clf.score(X_test, y_test)

In [None]:
# Looks like overfitting.
# We want to avoid it; there are several ways to do that:

In [None]:
from sklearn import set_config
# Make estimator reprs list every parameter, not only those changed from their defaults.
set_config(print_changed_only=False)
# Re-fit so that the cell output shows the classifier with its full parameter list.
clf.fit(X_train, y_train)

In [None]:
# Limit the maximum depth of the tree to 3 in an attempt to avoid overfitting.
clf = tree.DecisionTreeClassifier(criterion='entropy', 
                                  max_depth=3)

In [None]:
# Fit the depth-limited tree on the training split.
clf.fit(X_train, y_train)

In [None]:
# Training accuracy of the depth-limited tree.
clf.score(X_train, y_train)

In [None]:
# Test accuracy of the depth-limited tree.
clf.score(X_test, y_test)

In [None]:
# Much better, but is this max_depth optimal?

In [None]:
# Candidate tree depths to evaluate: 1 through 99.
max_depth_values = range(1, 100)

In [None]:
# Reset the results table before the depth sweep.
scores_data = pd.DataFrame()

In [None]:
# Sweep max_depth and record the accuracy on the training and test splits.
# The rows are collected in a list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic in any case.
score_records = []
for max_depth in max_depth_values:
    clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=max_depth)
    clf.fit(X_train, y_train)

    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)

    score_records.append({'max_depth': max_depth,
                          'train_score': train_score,
                          'test_score': test_score})

# Passing `columns` keeps the expected schema even if max_depth_values is empty.
scores_data = pd.DataFrame(score_records,
                           columns=['max_depth', 'train_score', 'test_score'])

In [None]:
# First rows of the sweep results (one row per max_depth).
scores_data.head()

In [None]:
# Reshape to long format: one row per (max_depth, set_type) pair with its value in `score`,
# which is the layout seaborn's `hue` expects.
score_columns = ['train_score', 'test_score']
scores_data_long = scores_data.melt(id_vars=['max_depth'],
                                    value_vars=score_columns,
                                    var_name='set_type',
                                    value_name='score')

In [None]:
# First rows of the long-format table.
scores_data_long.head()

In [None]:
# Accuracy as a function of tree depth, one line per set type (train / test).
sns.lineplot(data=scores_data_long, x="max_depth", y="score", hue="set_type")

In [None]:
# There is one more problem: we always have the same test data.

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# Fresh classifier with a fixed depth of 4 for the cross-validation examples below.
clf = tree.DecisionTreeClassifier(criterion='entropy', 
                                  max_depth=4) 

In [None]:
# Returns one score per held-out fold. `cv` sets the number of folds
# (5 here, which is also the default); change it to split differently.
cross_val_score(clf, 
                X, 
                y, 
                cv=5,
                verbose=True)

In [None]:
# Train: [2 3 4 5 6 7 8 9] | test: [0 1]
# Train: [0 1 4 5 6 7 8 9] | test: [2 3]
# Train: [0 1 2 3 6 7 8 9] | test: [4 5]
# Train: [0 1 2 3 4 5 8 9] | test: [6 7]
# Train: [0 1 2 3 4 5 6 7] | test: [8 9]

In [None]:
# Average of the per-fold scores (default number of folds).
cross_val_score(clf, X, y).mean()

In [None]:
from sklearn.model_selection import cross_validate

In [None]:
# Like cross_val_score, but returns the full result dict (timings as well as test scores).
cross_validate(clf, X, y)

In [None]:
# Reset the results table before the second depth sweep.
scores_data = pd.DataFrame()

In [None]:
# Sweep max_depth again, this time also recording the mean cross-validation score.
# The rows are collected in a list and turned into a DataFrame once: DataFrame.append was
# removed in pandas 2.0, and growing a frame row by row is quadratic in any case.
score_records = []
for max_depth in max_depth_values:
    clf = tree.DecisionTreeClassifier(criterion='entropy',
                                      max_depth=max_depth)

    # 1) single train/test split
    clf.fit(X_train, y_train)
    train_score = clf.score(X_train, y_train)
    test_score = clf.score(X_test, y_test)

    # 2) cross-validation (cross_val_score fits clones, so `clf` itself is not modified)
    # NOTE(review): this runs on the full X, y, which includes the X_test rows. If the depth
    # chosen from it is later judged on X_test, consider using X_train, y_train here instead.
    mean_cross_val_score = cross_val_score(clf, X, y).mean()

    score_records.append({'max_depth': max_depth,
                          'train_score': train_score,
                          'test_score': test_score,
                          'cross_val_score': mean_cross_val_score})

# Passing `columns` keeps the expected schema even if max_depth_values is empty.
scores_data = pd.DataFrame(score_records,
                           columns=['max_depth', 'train_score', 'test_score', 'cross_val_score'])

In [None]:
# First rows of the sweep results, now including the cross_val_score column.
scores_data.head()

In [None]:
# Reshape to long format: one row per (max_depth, set_type) pair with its value in `score`.
score_columns = ['train_score', 'test_score', 'cross_val_score']
scores_data_long = scores_data.melt(id_vars=['max_depth'],
                                    value_vars=score_columns,
                                    var_name='set_type',
                                    value_name='score')

In [None]:
# First rows of the long-format table.
scores_data_long.head()

In [None]:
# Accuracy as a function of tree depth, one line per set type (train / test / cross-validation).
sns.lineplot(data=scores_data_long, x="max_depth", y="score", hue="set_type")

In [None]:
# First 20 cross-validation rows of the long-format table.
scores_data_long.query("set_type == 'cross_val_score'").head(20)

In [None]:
# Keep only the cross-validation rows, used below to look up the best score.
qq = scores_data_long.query("set_type == 'cross_val_score'")

In [None]:
# Row with the highest mean cross-validation score; its max_depth is the optimal depth.
# (qq.score.max() alone returns the best score, not the depth that achieved it; the original
# note reported a depth of 7 for the author's run. On ties the shallowest depth is shown.)
qq.loc[qq["score"].idxmax()]

In [None]:
# Use the depth with the best mean cross-validation score instead of a hard-coded 7: the
# classifiers above are built without a fixed random_state, so the best depth is not
# guaranteed to be the same on every run. On ties idxmax keeps the first (shallowest) depth.
cv_scores = scores_data_long.query("set_type == 'cross_val_score'")
best_max_depth = int(cv_scores.loc[cv_scores["score"].idxmax(), "max_depth"])
best_clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=best_max_depth)

In [None]:
# Train the selected model on the training split.
best_clf.fit(X_train, y_train)



In [None]:
# Mean accuracy of the selected model on the held-out test split.
best_clf.score(X_test, y_test)