In [2]:
# Import necessary libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
from IPython.display import clear_output

import tensorflow as tf
tf.random.set_seed(0)

In [3]:
# Import the data.
def get_dfs(data_dir=None):
    """Load the credit-card-default training and test sets.

    Args:
        data_dir: Directory holding ``credit_card_default_train.csv`` and
            ``credit_card_default_test.csv``. When omitted, the ``Data``
            folder one level above the working directory is used.

    Returns:
        Tuple ``(train_df, test_df)`` of DataFrames as read by ``pd.read_csv``.
    """
    # Function-scope import keeps this cell self-contained; os is stdlib.
    import os

    if data_dir is None:
        # Built with os.path.join instead of a backslash-separated literal:
        # sequences such as "\D" and "\c" are unrecognised string escapes and
        # only happened to resolve as path separators on Windows.
        data_dir = os.path.join('..', 'Data')
    return (pd.read_csv(os.path.join(data_dir, 'credit_card_default_train.csv')),
            pd.read_csv(os.path.join(data_dir, 'credit_card_default_test.csv')))

In [4]:
# Clean & transform dataframe.
def del_columns(df, columns):
    """Remove each named column from ``df`` in place and return ``df``.

    Args:
        df: DataFrame to modify.
        columns: Iterable of column labels to remove.

    Returns:
        The same DataFrame object, without the listed columns.

    Raises:
        KeyError: If a listed column is not present.
    """
    for name in columns:
        # pop() removes the column in place; the popped data is discarded.
        df.pop(name)
    return df

def label_encoder(df, columns):
    """Replace the given columns with integer category codes, in place.

    Only the columns named in ``columns`` are encoded. The previous
    implementation selected every categorical-dtype column in the frame after
    casting, so categorical columns that were not requested were silently
    converted to codes as well.

    Args:
        df: DataFrame to modify.
        columns: Iterable of column labels to encode.

    Returns:
        The same DataFrame object. Each listed column holds the codes of its
        pandas categorical representation (missing values become -1).
    """
    for column in columns:
        df[column] = df[column].astype('category').cat.codes
    return df

def one_hot_encoder(df, columns):
    """Swap each listed column for its one-hot indicator columns.

    Indicator columns are named ``<column>_<value>``, stored as int64 and
    appended on the right of the frame.

    Note: the first listed column is removed from the caller's frame in place;
    every later step works on the concatenated copy that is returned.

    Args:
        df: Input DataFrame.
        columns: Iterable of column labels to expand.

    Returns:
        A DataFrame with the listed columns replaced by indicator columns.
    """
    for name in columns:
        # pop() hands back the column and removes it from the current frame.
        indicators = pd.get_dummies(df.pop(name), prefix=name, drop_first=False).astype('int64')
        df = pd.concat([df, indicators], axis=1)
    return df

def age_encoder(df, columns):
    """Encode age-band labels as ordinal integers, in place.

    For the i-th listed column (1-based) a new column ``Age_<i>`` is added and
    the source column is removed. The bands map as
    ``'Less than 30' -> 0``, ``'31-45' -> 1``, ``'46-65' -> 2``,
    ``'More than 65' -> 3``; any other value is passed through unchanged.

    Args:
        df: DataFrame to modify.
        columns: Iterable of column labels holding age-band strings.

    Returns:
        The same DataFrame object.
    """
    age_codes = {'Less than 30': 0, '31-45': 1, '46-65': 2, 'More than 65': 3}
    for i, column in enumerate(columns, 1):
        # An explicit lookup instead of Series.replace(list, list): replace()
        # relied on silently downcasting the object result to integers, which
        # pandas has deprecated (FutureWarning on 2.2).
        df['Age_' + str(i)] = df[column].map(lambda band: age_codes.get(band, band))
        del df[column]
    return df
    
def str_to_currency(df, columns):
    """Convert amounts such as ``'100K'`` or ``'1.5M'`` to floats, in place.

    For the i-th listed column (1-based) a new column ``Limit_<i>`` is added
    and the source column is removed. A trailing ``K`` multiplies by 1e3, a
    trailing ``M`` by 1e6, and values without a suffix are taken as-is.

    Args:
        df: DataFrame to modify.
        columns: Iterable of column labels holding the amount strings.

    Returns:
        The same DataFrame object.

    Raises:
        ValueError: If a value cannot be parsed as a number, or carries a
            suffix other than a single ``K`` or ``M`` (e.g. ``'5KM'``).
    """
    multipliers = {'K': 10.0 ** 3, 'M': 10.0 ** 6}
    for i, column in enumerate(columns, 1):
        raw = df[column]
        # Numeric part: strip the trailing unit letters and parse as float.
        amount = raw.replace(r'[KM]+$', '', regex=True).astype(float)
        # Unit part: NaN where the value has no K/M suffix.
        suffix = raw.str.extract(r'[\d\.]+([KM]+)', expand=False)
        # Explicit lookup instead of fillna(1).replace([...]).astype(int):
        # that chain mixed ints into a string column and relied on replace()
        # downcasting the result, which pandas has deprecated.
        scale = suffix.map(multipliers)
        unknown = suffix.notna() & scale.isna()
        if unknown.any():
            raise ValueError('Unrecognised currency suffix in column {!r}: {}'.format(
                column, sorted(set(suffix[unknown]))))
        df['Limit_' + str(i)] = amount * scale.fillna(1.0)
        del df[column]
    return df

def one_hot_cat_column(feature_name, vocab):
    """Build an indicator feature column over a fixed vocabulary.

    Args:
        feature_name: Key of the feature in the input dictionary.
        vocab: Values passed as the vocabulary list of the categorical column.

    Returns:
        The ``tf.feature_column.indicator_column`` wrapping a
        ``categorical_column_with_vocabulary_list`` for this feature.
    """
    categorical = tf.feature_column.categorical_column_with_vocabulary_list(feature_name, vocab)
    return tf.feature_column.indicator_column(categorical)

In [5]:
def preprocess_df(df):
    """Index a raw frame by client and encode its age and limit columns.

    Steps: use ``Client_ID`` as the index, turn ``AGE`` into ``Age_1`` via
    ``age_encoder`` and ``Balance_Limit_V1`` into ``Limit_1`` via
    ``str_to_currency``.

    The caller's frame is left untouched: ``set_index`` now returns a new
    frame instead of running with ``inplace=True``, so the in-place helper
    functions only ever modify that new frame and the function can be re-run
    on the same raw input.

    Args:
        df: Raw DataFrame containing ``Client_ID``, ``AGE`` and
            ``Balance_Limit_V1`` columns.

    Returns:
        A new, preprocessed DataFrame indexed by ``Client_ID``.
    """
    df = df.set_index('Client_ID')
    df = age_encoder(df, ['AGE'])
    df = str_to_currency(df, ['Balance_Limit_V1'])
    return df

# Run the same preprocessing over the train and test frames, then split the
# label column off the training features.
X_train, X_test = (preprocess_df(frame) for frame in get_dfs())
y_train = X_train.pop('NEXT_MONTH_DEFAULT')

# Hold out 15% of the training rows for validation, stratified on the label.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.15,
    stratify=y_train,
    random_state=0,
)

In [6]:
# Columns fed to the model as one-hot indicators.
CATEGORICAL_COLUMNS = ['Gender', 'EDUCATION_STATUS', 'MARITAL_STATUS']

# Columns fed to the model as float32 numeric inputs.
NUMERIC_COLUMNS = [
    'Age_1', 'Limit_1',
    'PAY_JULY', 'PAY_AUG', 'PAY_SEP', 'PAY_OCT', 'PAY_NOV', 'PAY_DEC',
    'DUE_AMT_JULY', 'DUE_AMT_AUG', 'DUE_AMT_SEP', 'DUE_AMT_OCT', 'DUE_AMT_NOV', 'DUE_AMT_DEC',
    'PAID_AMT_JULY', 'PAID_AMT_AUG', 'PAID_AMT_SEP', 'PAID_AMT_OCT', 'PAID_AMT_NOV', 'PAID_AMT_DEC',
]

# Categorical columns first; each vocabulary is the set of values seen in the
# training split.
FEATURE_COLUMNS = [
    one_hot_cat_column(name, X_train[name].unique())
    for name in CATEGORICAL_COLUMNS
]

# Numeric columns follow, in the order listed above.
FEATURE_COLUMNS.extend(
    tf.feature_column.numeric_column(name, dtype=tf.float32)
    for name in NUMERIC_COLUMNS
)

In [7]:
def make_input_fn(X, y=None, n_epochs=None, batch=32, shuffle=True):
    """Create an Estimator input function over an in-memory DataFrame.

    Args:
        X: Feature DataFrame; converted to a dict of column lists.
        y: Optional labels. When given, the dataset yields
            ``(features, label)`` pairs, otherwise features only.
        n_epochs: Value passed to ``Dataset.repeat``.
        batch: Batch size passed to ``Dataset.batch``.
        shuffle: Whether to shuffle with a buffer of 1000 elements.

    Returns:
        A zero-argument callable that builds the ``tf.data.Dataset``.
    """
    def input_fn():
        features = X.to_dict(orient='list')
        tensors = features if y is None else (features, y)
        dataset = tf.data.Dataset.from_tensor_slices(tensors)
        if shuffle:
            dataset = dataset.shuffle(1000)
        return dataset.repeat(n_epochs).batch(batch)
    return input_fn

# Input functions: training shuffles and repeats, while the validation and
# test functions make one ordered pass over their data.
train_input_fn = make_input_fn(X_train, y_train)
val_input_fn = make_input_fn(X_val, y_val, n_epochs=1, shuffle=False)
test_input_fn = make_input_fn(X_test, n_epochs=1, shuffle=False)

In [19]:
# The Estimator does not pick up `tf.random.set_seed` (the captured Estimator
# config logged further down shows `_tf_random_seed: None`), so the seed is
# passed through RunConfig to make weight initialisation and input shuffling
# repeatable.
est = tf.estimator.DNNClassifier(feature_columns=FEATURE_COLUMNS,
                                 hidden_units=[10, 5],
                                 config=tf.estimator.RunConfig(tf_random_seed=0))
# NOTE(review): max_steps=10 at the default batch size of 32 sees only a few
# hundred rows, and the numeric inputs are unscaled. The recorded validation
# accuracy (0.22) is below the majority-class baseline (0.78) - consider
# training longer and normalising the numeric columns.
est.train(train_input_fn, max_steps=10)

# Evaluation on the held-out validation split; the TF logs are cleared so only
# the metrics table is displayed.
results = est.evaluate(val_input_fn)
clear_output()
pd.Series(results).to_frame()

Unnamed: 0,0
accuracy,0.2225
accuracy_baseline,0.778056
auc,0.499462
auc_precision_recall,0.221759
average_loss,9133.829102
label/mean,0.221944
loss,9120.820312
precision,0.221758
prediction/mean,0.99861
recall,0.997497


In [20]:
# Predicted class = index of the largest class probability for each test row.
y_pred = np.array([np.argmax(pred['probabilities'])
                   for pred in est.predict(test_input_fn)])
# Keep the Client_ID index so predictions can be matched back to clients.
y_test = pd.DataFrame({'NEXT_MONTH_DEFAULT': y_pred}, index=X_test.index)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from C:\Users\TRABEY~1\AppData\Local\Temp\tmpj31g6sna\model.ckpt-10
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [21]:
# Forward slashes resolve on Windows and POSIX alike; the previous
# backslash-separated literal contained unrecognised string escapes and was
# only a valid path on Windows.
y_test.to_csv('../Day 3/Outputs/DNNClassifier_Pred.csv', index=True)

In [30]:
import matplotlib.pyplot as plt
import seaborn as sns
# Colour-blind-friendly palette shared by the explanation plots below.
sns_colors = sns.color_palette('colorblind')

In [32]:
# `experimental_predict_with_explanations` (and `experimental_feature_importances`,
# used further down) exist only on the boosted-trees estimators, not on the
# DNNClassifier trained above, so this cell fails on a fresh kernel. Train a
# boosted-trees model here for the explanation cells; `est` is rebound to it
# from this point on.
# The whole training set is fed as a single batch to match n_batches_per_layer=1,
# and center_bias=True is required for directional feature contributions.
# NOTE(review): n_trees / max_depth / max_steps are starting values - tune as needed.
bt_train_input_fn = make_input_fn(X_train, y_train, batch=len(y_train))
est = tf.estimator.BoostedTreesClassifier(feature_columns=FEATURE_COLUMNS,
                                          n_batches_per_layer=1,
                                          n_trees=50,
                                          max_depth=3,
                                          center_bias=True)
est.train(bt_train_input_fn, max_steps=100)

# One dict per validation row, including 'probabilities', 'dfc' and 'bias'.
pred_dicts = list(est.experimental_predict_with_explanations(val_input_fn))

INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\TRABEY~1\\AppData\\Local\\Temp\\tmp_ohbux4w', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:te

In [None]:
# Collect labels, positive-class probabilities and directional feature
# contributions (DFCs) for the validation rows; one DFC row per example.
labels = y_val.values
probs = pd.Series([explained['probabilities'][1] for explained in pred_dicts])
df_dfc = pd.DataFrame([explained['dfc'] for explained in pred_dicts])
df_dfc.describe().transpose()

In [None]:
# Sanity check: sum of DFCs + bias == predicted probability for every example.
bias = pred_dicts[0]['bias']
dfc_prob = df_dfc.sum(axis='columns') + bias
np.testing.assert_almost_equal(dfc_prob.values, probs.values)

In [None]:
# Boilerplate code for plotting :)
def _get_color(value):
    """Pick the bar colour for a DFC: green when >= 0, red when negative."""
    # Entries 2 and 3 of the current seaborn palette serve as green and red.
    positive_color, negative_color = sns.color_palette()[2:4]
    return positive_color if value >= 0 else negative_color

def _add_feature_values(feature_values, ax):
    """Display feature's values on left of plot."""
    x_coord = ax.get_xlim()[0]
    OFFSET = 0.15
    for y_coord, (feat_name, feat_val) in enumerate(feature_values.items()):
        t = plt.text(x_coord, y_coord - OFFSET, '{}'.format(feat_val), size=12)
        t.set_bbox(dict(facecolor='white', alpha=0.5))
    from matplotlib.font_manager import FontProperties
    font = FontProperties()
    font.set_weight('bold')
    t = plt.text(x_coord, y_coord + 1 - OFFSET, 'feature\nvalue',
    fontproperties=font, size=12)

def plot_example(example, example_id=None, top_n=8):
    """Plot the largest directional feature contributions for one example.

    Args:
        example: Series of DFC values for a single row, indexed by feature name.
        example_id: Positional row in ``X_val`` whose feature values are
            printed next to the bars. When omitted, the notebook-level ``ID``
            is used, which matches the previous behaviour.
        top_n: Number of features, ranked by absolute contribution, to show.

    Returns:
        The matplotlib axes holding the horizontal bar chart.
    """
    if example_id is None:
        # Backward-compatible fallback; pass example_id explicitly to avoid
        # depending on a global that is defined in a later cell.
        example_id = ID
    sorted_ix = example.abs().sort_values().iloc[-top_n:].index  # Sort by magnitude.
    example = example[sorted_ix]
    colors = example.map(_get_color).tolist()
    ax = example.to_frame().plot(kind='barh',
                                 color=[colors],
                                 legend=None,
                                 alpha=0.75,
                                 figsize=(10, 6))
    ax.grid(False, axis='y')
    ax.set_yticklabels(ax.get_yticklabels(), size=14)

    # Add feature values, in the same order as the bars.
    _add_feature_values(X_val.iloc[example_id][sorted_ix], ax)
    return ax

In [None]:
# Plot the contributions for a single validation example.
ID = 1
example = df_dfc.iloc[ID]  # Row ID of the evaluation-set DFC table.
TOP_N = 8  # Number of features to show.
sorted_ix = example.abs().sort_values().iloc[-TOP_N:].index  # Largest |DFC| last.
ax = plot_example(example)
ax.set_title(
    'Feature contributions for example {}\n pred: {:1.2f}; label: {}'.format(ID, probs[ID], labels[ID])
)
ax.set_xlabel('Contribution to predicted probability', size=14)
plt.show()

In [None]:
# Boilerplate plotting code.
def dist_violin_plot(df_dfc, ID):
    """Plot one example's top DFCs against their validation-set distributions.

    For the eight features with the largest absolute contribution for row
    ``ID``, a violin shows the distribution of that feature's contribution
    over all rows of ``df_dfc`` and a square marker shows the example's own
    contribution.

    Args:
        df_dfc: DataFrame of directional feature contributions, one row per
            example and one column per feature.
        ID: Positional row index of the example, used for both ``df_dfc`` and
            the notebook-level ``X_val``.
    """
    # Initialize plot.
    fig, ax = plt.subplots(1, 1, figsize=(10, 6))

    # Create example dataframe.
    TOP_N = 8  # View top 8 features.
    example = df_dfc.iloc[ID]
    ix = example.abs().sort_values().iloc[-TOP_N:].index
    example = example[ix]
    example_df = example.to_frame(name='dfc')

    # Add contributions of entire distribution.
    parts = ax.violinplot([df_dfc[w] for w in ix],
                          vert=False,
                          showextrema=False,
                          widths=0.7,
                          positions=np.arange(len(ix)))
    face_color = sns_colors[0]
    alpha = 0.15
    for pc in parts['bodies']:
        pc.set_facecolor(face_color)
        pc.set_alpha(alpha)

    # Add feature values. Uses the local `ix` so the labels always line up
    # with the features plotted here; the previous code read the notebook
    # global `sorted_ix`, which other cells rebind.
    _add_feature_values(X_val.iloc[ID][ix], ax)

    # Add local contributions.
    ax.scatter(example,
               np.arange(example.shape[0]),
               color=sns.color_palette()[2],
               s=100,
               marker="s",
               label='contributions for example')

    # Legend
    # Proxy plot, to show violinplot dist on legend.
    ax.plot([0, 0], [1, 1], label='eval set contributions\ndistributions',
            color=face_color, alpha=alpha, linewidth=10)
    legend = ax.legend(loc='lower right', shadow=True, fontsize='x-large',
                       frameon=True)
    legend.get_frame().set_facecolor('white')

    # Format plot.
    ax.set_yticks(np.arange(example.shape[0]))
    ax.set_yticklabels(example.index)
    ax.grid(False, axis='y')
    ax.set_xlabel('Contribution to predicted probability', size=14)

In [None]:
# Violin view of the same example, titled with its prediction and true label.
dist_violin_plot(df_dfc, ID)
plt.title(
    'Feature contributions for example {}\n pred: {:1.2f}; label: {}'.format(ID, probs[ID], labels[ID])
)
plt.show()

In [None]:
# Normalised feature importances reported by the estimator.
importances = est.experimental_feature_importances(normalize=True)
df_imp = pd.Series(importances)

# Visualize importances: the first N entries, reversed so that the first
# entry is drawn at the top of the horizontal bar chart.
N = 8
ax = df_imp.head(N)[::-1].plot.barh(color=sns_colors[0],
                                    title='Gain feature importances',
                                    figsize=(10, 6))
ax.grid(False, axis='y')

In [None]:
# Mean absolute DFC per feature, showing the N largest.
dfc_mean = df_dfc.abs().mean()
N = 8
# dfc_mean is already a mean of absolute values, so sorting it directly
# ranks features by average magnitude.
sorted_ix = dfc_mean.sort_values().iloc[-N:].index
ax = dfc_mean[sorted_ix].plot.barh(color=sns_colors[1],
                                   title='Mean |directional feature contributions|',
                                   figsize=(10, 6))
ax.grid(False, axis='y')

In [None]:
# Contribution of a single feature plotted against that feature's value.
FEATURE = 'PAY_JULY'
feature = pd.Series(df_dfc[FEATURE].values, index=X_val[FEATURE].values).sort_index()
# The data vectors are passed by keyword: seaborn >= 0.12 no longer accepts
# them positionally.
ax = sns.regplot(x=feature.index.values, y=feature.values, lowess=True)
ax.set_ylabel('contribution')
ax.set_xlabel(FEATURE)
# NOTE(review): the fixed 0-100 window may not suit this feature's range -
# confirm against the values of PAY_JULY before relying on the plot.
ax.set_xlim(0, 100)
plt.show()