In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found,
# so the available dataset files are visible in the cell output.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

import warnings
warnings.filterwarnings('ignore')
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Path of the training CSV inside the read-only Kaggle input directory
train_fname = '/kaggle/input/tabular-playground-series-apr-2021/train.csv'
# Read the whole file into a DataFrame; later cells derive everything from it
df_train = pd.read_csv(filepath_or_buffer=train_fname)

In [None]:
# Preview the first five rows of the raw training frame
df_train.head(n=5)

### How many missing values are there?

In [None]:
# Number of missing (NA) entries in each column of the raw training frame
df_train.isnull().sum()

Pick a few columns to work with and fill the missing data.
For the numerical columns we can replace the NAs with the median values. For the categorical column, we can create a new category for the missing values.

In [None]:
# Pick a few columns: the features used in this notebook plus the
# 'Survived' target.
col = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked', 'Survived'
]
# Take an explicit copy so the column assignments below write to an
# independent frame instead of to a selection of df_train. Without it the
# assignments are chained assignment on a slice (SettingWithCopyWarning),
# and df_train stays untouched either way only by accident.
df_clean = df_train[col].copy()

# Handle the missing data
# Numerical columns: replace NAs with the column median.
df_clean['Fare'] = df_clean['Fare'].fillna(df_clean['Fare'].median())
df_clean['Age'] = df_clean['Age'].fillna(df_clean['Age'].median())
# Categorical column: missing values become their own category, labelled
# with the literal string 'NaN'.
df_clean['Embarked'] = df_clean['Embarked'].fillna('NaN')

Double check that we now have no missing data.

In [None]:
# Sanity check: every per-column NA count should now be zero
df_clean.isnull().sum()

Pandas has a nice `get_dummies` function that will create binary variables for categorical features. The `Pclass` column is a numerical column but is actually just a categorical feature, so we convert it to string before calling `get_dummies`.

In [None]:
# Pclass is numerical but really a category label, so cast it to str first;
# get_dummies then turns it, like the other string-valued columns, into
# binary indicator columns.
pclass_as_text = df_clean['Pclass'].astype(str)
df_clean['Pclass'] = pclass_as_text
dft = pd.get_dummies(data=df_clean)

In class, we talked a bit about training and validation accuracy. In general, we always want to evaluate the effectiveness of our models on data the model has never seen before. This means we should set aside some data before training our model to use as validation data. Scikit-learn has a nice `KFold` class that lets us split our data into training and test sets to do precisely this.

We will look at how the `max_depth` parameter of the `DecisionTreeClassifier` affects the training/validation accuracies. We expect the training accuracy to increase as we increase `max_depth` since a higher depth means our model can fit the data better. We also expect the validation accuracy to increase up to a point before our model starts to overfit to our training data.

In [None]:
# Fixed seed so the fold assignment and the fitted trees are identical on
# every Restart & Run All (otherwise the accuracies, and the best depth,
# change from run to run).
SEED = 42

# Features are every column of the encoded frame except the target.
cols = [c for c in dft.columns if c != 'Survived']
x_train = dft[cols]
y_train = dft['Survived']

# With an integer random_state the shuffled folds are the same on every
# call to split(), so each max_depth value is scored on identical folds.
kf = KFold(n_splits=5, shuffle=True, random_state=SEED)

# max depth values to try
pvals = range(1, 20)

# Mean accuracy over the folds, one entry per depth (same order as pvals)
mean_tr_accs = []
mean_val_accs = []
# One fitted tree per depth -- see the note at the append below
models = []
for pval in pvals:
    # Per-fold accuracies for the current depth
    tr_accs = []
    val_accs = []
    for train_index, test_index in kf.split(x_train):
        # test_index marks the held-out (validation) rows of this fold
        x_tr, x_val = x_train.iloc[train_index], x_train.iloc[test_index]
        y_tr, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

        dt = DecisionTreeClassifier(max_depth=pval, random_state=SEED)
        dt.fit(x_tr, y_tr)

        # Score on the rows the tree was fitted on and on the held-out rows
        yv_pred = dt.predict(x_val)
        yt_pred = dt.predict(x_tr)
        train_acc = accuracy_score(y_tr, yt_pred)
        val_acc = accuracy_score(y_val, yv_pred)
        tr_accs.append(train_acc)
        val_accs.append(val_acc)
    # Only the tree fitted on the last fold is kept for each depth, so
    # models[i] corresponds to pvals[i] but was trained on 4/5 of the data.
    models.append(dt)

    # Average over the folds once and reuse for both the log line and lists
    mean_tr_acc = np.mean(tr_accs)
    mean_val_acc = np.mean(val_accs)
    print('Depth: {:2d} | Train acc: {:.3f} | Val acc: {:.3f}'.format(
            pval,
            mean_tr_acc,
            mean_val_acc
    ))
    mean_tr_accs.append(mean_tr_acc)
    mean_val_accs.append(mean_val_acc)

In [None]:
# Plot mean train and validation accuracy against the max_depth setting
curves = (
    (mean_tr_accs, 'b', 'Train acc'),
    (mean_val_accs, 'r', 'Val acc'),
)
for accs, colour, tag in curves:
    plt.plot(pvals, accs, color=colour, label=tag)
plt.xlabel('Max Depth')
plt.ylabel('Accuracy')
plt.title('Train vs Val acc over Max Depth settings')
plt.legend()
plt.show()

We can see that increasing the max depth of the Decision Tree classifier allows us to fit the training data better. The validation accuracy peaks at max depth = 7 and decreases beyond that.