In [111]:
import functools
import numpy as np
import tensorflow as tf
import pandas as pd
from tensorflow import feature_column
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
import pprint
import tensorflow_model_analysis as tfma
from google.protobuf import text_format

In [2]:
TRAIN_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/train.csv"
TEST_DATA_URL = "https://storage.googleapis.com/tf-datasets/titanic/eval.csv"

train_file_path = tf.keras.utils.get_file("train.csv", TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file("eval.csv", TEST_DATA_URL)

In [3]:
!head {train_file_path}

survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,male,22.0,1,0,7.25,Third,unknown,Southampton,n
1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
1,female,26.0,0,0,7.925,Third,unknown,Southampton,y
1,female,35.0,1,0,53.1,First,C,Southampton,n
0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
0,male,2.0,3,1,21.075,Third,unknown,Southampton,n
1,female,27.0,0,2,11.1333,Third,unknown,Southampton,n
1,female,14.0,1,0,30.0708,Second,unknown,Cherbourg,n
1,female,4.0,1,1,16.7,Third,G,Southampton,n


In [57]:
train_df = pd.read_csv(train_file_path, header='infer')
test_df = pd.read_csv(test_file_path, header='infer')
train_df.dropna()
test_df.dropna()

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,35.0,0,0,8.0500,Third,unknown,Southampton,y
1,0,male,54.0,0,0,51.8625,First,E,Southampton,y
2,1,female,58.0,0,0,26.5500,First,C,Southampton,y
3,1,female,55.0,0,0,16.0000,Second,unknown,Southampton,y
4,1,male,34.0,0,0,13.0000,Second,D,Southampton,y
...,...,...,...,...,...,...,...,...,...,...
259,1,female,25.0,0,1,26.0000,Second,unknown,Southampton,n
260,0,male,33.0,0,0,7.8958,Third,unknown,Southampton,y
261,0,female,39.0,0,5,29.1250,Third,unknown,Queenstown,n
262,0,male,27.0,0,0,13.0000,Second,unknown,Southampton,y


In [58]:
train_df

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.2500,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.9250,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1000,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
...,...,...,...,...,...,...,...,...,...,...
622,0,male,28.0,0,0,10.5000,Second,unknown,Southampton,y
623,0,male,25.0,0,0,7.0500,Third,unknown,Southampton,y
624,1,female,19.0,0,0,30.0000,First,B,Southampton,y
625,0,female,28.0,1,2,23.4500,Third,unknown,Southampton,n


In [59]:
LABEL_COLUMN = 'survived'
LABELS = [0, 1]

train_ds = tf.data.experimental.make_csv_dataset(
      train_file_path,
      batch_size=3,
      label_name=LABEL_COLUMN,
      na_value="?",
      num_epochs=1,
      ignore_errors=True)

In [60]:
test_ds = tf.data.experimental.make_csv_dataset(
      test_file_path,
      batch_size=3,
      label_name=LABEL_COLUMN,
      na_value="?",
      num_epochs=1,
      ignore_errors=True)

In [61]:
for batch, label in train_ds.take(1):
  print(label)
  for key, value in batch.items():
    print("{}: {}".format(key,value.numpy()))


tf.Tensor([0 0 1], shape=(3,), dtype=int32)
sex: [b'male' b'male' b'male']
age: [28. 28. 19.]
n_siblings_spouses: [0 0 0]
parch: [0 0 0]
fare: [ 7.75   56.4958 10.5   ]
class: [b'Third' b'Third' b'Second']
deck: [b'unknown' b'unknown' b'unknown']
embark_town: [b'Queenstown' b'Southampton' b'Southampton']
alone: [b'y' b'y' b'y']


In [62]:
feature_columns = []

# numeric cols
for header in ['age', 'n_siblings_spouses', 'parch', 'fare']:
  feature_columns.append(feature_column.numeric_column(header))

In [63]:
train_df.describe()

Unnamed: 0,survived,age,n_siblings_spouses,parch,fare
count,627.0,627.0,627.0,627.0,627.0
mean,0.38756,29.631308,0.545455,0.379585,34.385399
std,0.487582,12.511818,1.15109,0.792999,54.59773
min,0.0,0.75,0.0,0.0,0.0
25%,0.0,23.0,0.0,0.0,7.8958
50%,0.0,28.0,0.0,0.0,15.0458
75%,1.0,35.0,1.0,0.0,31.3875
max,1.0,80.0,8.0,5.0,512.3292


In [64]:
# We will use this batch to demonstrate several types of feature columns
example_batch = next(iter(train_ds))[0]
# A utility method to create a feature column
# and to transform a batch of data
def demo(feature_column):
  feature_layer = layers.DenseFeatures(feature_column)
  print(feature_layer(example_batch).numpy())

In [65]:
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(age, boundaries=[23, 28, 35])
demo(age_buckets)

[[0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]]


In [66]:
h = {}
for col in train_df:
  if col in ['sex', 'class', 'deck', 'embark_town', 'alone']:
    print(col, ':', train_df[col].unique())
    h[col] = train_df[col].unique()

sex : ['male' 'female']
class : ['Third' 'First' 'Second']
deck : ['unknown' 'C' 'G' 'A' 'B' 'D' 'F' 'E']
embark_town : ['Southampton' 'Cherbourg' 'Queenstown' 'unknown']
alone : ['n' 'y']


In [67]:
type(h.get('sex').tolist())

list

In [68]:
sex_type = feature_column.categorical_column_with_vocabulary_list(
      'sex', ['male' 'female'])
sex_type_one_hot = feature_column.indicator_column(sex_type)

## use key to lookup value and pass the value to one-hot encoding.
sex_type = feature_column.categorical_column_with_vocabulary_list(
      'sex', h.get('sex').tolist())
sex_type_one_hot = feature_column.indicator_column(sex_type)

class_type = feature_column.categorical_column_with_vocabulary_list(
      'class', ['Third' 'First' 'Second'])
class_type_one_hot = feature_column.indicator_column(class_type)

deck_type = feature_column.categorical_column_with_vocabulary_list(
      'deck', h.get('deck').tolist())
deck_type_one_hot = feature_column.indicator_column(deck_type)

embark_town_type = feature_column.categorical_column_with_vocabulary_list(
      'embark_town', h.get('embark_town').tolist())
embark_town_type_one_hot = feature_column.indicator_column(embark_town_type)

alone_type = feature_column.categorical_column_with_vocabulary_list(
      'alone', h.get('alone').tolist())
alone_one_hot = feature_column.indicator_column(alone_type)

In [69]:
deck = feature_column.categorical_column_with_vocabulary_list(
      'deck', train_df.deck.unique())
deck_embedding = feature_column.embedding_column(deck, dimension=3)

In [70]:
class_hashed = feature_column.categorical_column_with_hash_bucket(
      'class', hash_bucket_size=4)
demo(feature_column.indicator_column(class_hashed))

[[0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]]


In [71]:
feature_column

<module 'tensorflow._api.v2.feature_column' from '/Users/mbp16/Documents/projects/tf23/lib/python3.8/site-packages/tensorflow/_api/v2/feature_column/__init__.py'>

In [72]:
crossed_feature = feature_column.crossed_column([sex_type, class_type], hash_bucket_size=5)
demo(feature_column.indicator_column(crossed_feature))

[[0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 1.]
 [0. 1. 0. 0. 0.]]


In [73]:
crossed_feature

CrossedColumn(keys=(VocabularyListCategoricalColumn(key='sex', vocabulary_list=('male', 'female'), dtype=tf.string, default_value=-1, num_oov_buckets=0), VocabularyListCategoricalColumn(key='class', vocabulary_list=('ThirdFirstSecond',), dtype=tf.string, default_value=-1, num_oov_buckets=0)), hash_bucket_size=5, hash_key=None)

In [74]:
feature_columns = []

# numeric cols
for header in ['age', 'n_siblings_spouses', 'parch', 'fare']:
  feature_columns.append(feature_column.numeric_column(header))

In [75]:
# bucketized cols
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(age, boundaries=[23, 28, 35])

In [76]:
feature_columns.append(age_buckets)

In [77]:
# indicator_columns
indicator_column_names = ['sex', 'class', 'deck', 'embark_town', 'alone']
for col_name in indicator_column_names:
  categorical_column = feature_column.categorical_column_with_vocabulary_list(
      col_name, train_df[col_name].unique())
  indicator_column = feature_column.indicator_column(categorical_column)
  feature_columns.append(indicator_column)

In [78]:
# append embedding columns
deck = feature_column.categorical_column_with_vocabulary_list(
      'deck', train_df.deck.unique())
deck_embedding = feature_column.embedding_column(deck, dimension=3)
feature_columns.append(deck_embedding)

In [79]:
# append crossed columns
cross_type_feature = feature_column.crossed_column(['sex', 'class'], hash_bucket_size=5)
feature_columns.append(feature_column.indicator_column(cross_type_feature))

In [80]:
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

In [81]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def pandas_to_dataset(dataframe, shuffle=True, batch_size=32):
  dataframe = dataframe.copy()
  labels = dataframe.pop('survived')
  ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [82]:
val_df, test_df = train_test_split(test_df, test_size=0.4)

In [83]:
print(len(train_df), 'train examples')
print(len(val_df), 'validation examples')
print(len(test_df), 'test examples')

627 train examples
158 validation examples
106 test examples


In [84]:
train_df

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
0,0,male,22.0,1,0,7.2500,Third,unknown,Southampton,n
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
2,1,female,26.0,0,0,7.9250,Third,unknown,Southampton,y
3,1,female,35.0,1,0,53.1000,First,C,Southampton,n
4,0,male,28.0,0,0,8.4583,Third,unknown,Queenstown,y
...,...,...,...,...,...,...,...,...,...,...
622,0,male,28.0,0,0,10.5000,Second,unknown,Southampton,y
623,0,male,25.0,0,0,7.0500,Third,unknown,Southampton,y
624,1,female,19.0,0,0,30.0000,First,B,Southampton,y
625,0,female,28.0,1,2,23.4500,Third,unknown,Southampton,n


In [142]:
'''
batch_size = 33
train_ds = df_to_dataset(train_df, batch_size=batch_size)
'''

'\nbatch_size = 33\ntrain_ds = df_to_dataset(train_df, batch_size=batch_size)\n'

In [32]:
batch_size = 33
labels = train_df.pop('survived')
working_ds = tf.data.Dataset.from_tensor_slices((dict(train_df), labels))
working_ds = working_ds.shuffle(buffer_size=len(train_df))
train_ds = working_ds.batch(batch_size)

In [33]:
dict(train_df)

{'sex': 0        male
 1      female
 2      female
 3      female
 4        male
         ...  
 622      male
 623      male
 624    female
 625    female
 626      male
 Name: sex, Length: 627, dtype: object,
 'age': 0      22.0
 1      38.0
 2      26.0
 3      35.0
 4      28.0
        ... 
 622    28.0
 623    25.0
 624    19.0
 625    28.0
 626    32.0
 Name: age, Length: 627, dtype: float64,
 'n_siblings_spouses': 0      1
 1      1
 2      0
 3      1
 4      0
       ..
 622    0
 623    0
 624    0
 625    1
 626    0
 Name: n_siblings_spouses, Length: 627, dtype: int64,
 'parch': 0      0
 1      0
 2      0
 3      0
 4      0
       ..
 622    0
 623    0
 624    0
 625    2
 626    0
 Name: parch, Length: 627, dtype: int64,
 'fare': 0       7.2500
 1      71.2833
 2       7.9250
 3      53.1000
 4       8.4583
         ...   
 622    10.5000
 623     7.0500
 624    30.0000
 625    23.4500
 626     7.7500
 Name: fare, Length: 627, dtype: float64,
 'class': 0       Third
 

In [85]:
val_ds = pandas_to_dataset(val_df, shuffle=False, batch_size=batch_size)
test_ds = pandas_to_dataset(test_df, shuffle=False, batch_size=batch_size)

In [86]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_features_3 (DenseFeatu multiple                  24        
_________________________________________________________________
dense_9 (Dense)              multiple                  4608      
_________________________________________________________________
dense_10 (Dense)             multiple                  16512     
_________________________________________________________________
dropout_3 (Dropout)          multiple                  0         
_________________________________________________________________
dense_11 (Dense)             multiple                  129       
Total params: 21,273
Trainable params: 21,273
Non-trainable params: 0
_________________________________________________________________


In [41]:
model = tf.keras.Sequential([
  feature_layer,
  layers.Dense(128, activation='relu'),
  layers.Dense(128, activation='relu'),
  layers.Dropout(.1),
  layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_ds,
          validation_data=val_ds,
          epochs=10)

Epoch 1/10
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Consider rewriting this model with the Functional API.
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe3e086f0d0>

In [87]:
prediction_raw = model.predict(test_ds)

In [88]:
prediction_raw

array([[0.6699142 ],
       [0.6239286 ],
       [0.06013593],
       [0.80649877],
       [0.1638726 ],
       [0.218701  ],
       [0.18982252],
       [0.9612068 ],
       [0.8671681 ],
       [0.217652  ],
       [0.73804766],
       [0.27055115],
       [0.16897348],
       [0.97456145],
       [0.5109846 ],
       [0.16969743],
       [0.98396575],
       [0.99717927],
       [0.2284449 ],
       [0.17796624],
       [0.6546713 ],
       [0.9586839 ],
       [0.14072827],
       [0.12421191],
       [0.1337139 ],
       [0.6490319 ],
       [0.1492177 ],
       [0.4314343 ],
       [0.573238  ],
       [0.67310584],
       [0.29760784],
       [0.9440007 ],
       [0.7306901 ],
       [0.5690205 ],
       [0.588595  ],
       [0.13869563],
       [0.720448  ],
       [0.62128365],
       [0.99787366],
       [0.8370784 ],
       [0.5758164 ],
       [0.6024462 ],
       [0.33588302],
       [0.5559689 ],
       [0.17466038],
       [0.15874505],
       [0.5229116 ],
       [0.227

In [89]:
#Recode prediction to binary. If p >= 0.5, then 1, otherwise 0
#prediction_np = np.where(prediction_raw >= 0.5, 1, 0)

In [90]:
#prediction_np.shape

(106, 1)

In [134]:
prediction_list = prediction_raw.squeeze().tolist()

In [135]:
test_df['predicted'] = prediction_list

In [141]:
# Put predicted as first col, next to survived.
cols = list(test_df.columns)
cols = [cols[-1]] + cols[:-1]
test_df = test_df[cols]

In [142]:
test_df

Unnamed: 0,predicted,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
247,0.669914,1,male,32.0,0,0,56.4958,Third,unknown,Southampton,y
112,0.623929,0,female,20.0,0,0,8.6625,Third,unknown,Southampton,y
129,0.060136,0,male,28.0,0,0,0.0000,Second,unknown,Southampton,y
29,0.806499,1,female,28.0,1,1,22.3583,Third,F,Cherbourg,n
226,0.163873,0,male,16.0,0,0,7.7750,Third,unknown,Southampton,y
...,...,...,...,...,...,...,...,...,...,...,...
19,0.102746,0,male,29.0,0,0,8.0500,Third,unknown,Southampton,y
164,0.097697,1,male,32.0,0,0,7.9250,Third,unknown,Southampton,y
203,0.167355,0,male,42.0,0,0,7.6500,Third,F,Southampton,y
190,0.946212,0,male,58.0,0,2,113.2750,First,D,Cherbourg,n


In [137]:
OUTPUT_PATH = './titanic-fairness'

In [138]:
# Specify Fairness Indicators in eval_config.
eval_config = text_format.Parse("""
  model_specs {
    prediction_key: 'predicted',
    label_key: 'survived'
  }
  metrics_specs {
    metrics {class_name: "AUC"}
    metrics {
      class_name: "FairnessIndicators"
      config: '{"thresholds": [0.1, 0.50, 0.90]}'
    }
    metrics { class_name: "ExampleCount" }
  }
  
  slicing_specs {
    feature_keys: ['sex', 'class']
  }
  slicing_specs {}
  """, tfma.EvalConfig())

# Run TensorFlow Model Analysis.
eval_result = tfma.analyze_raw_data(
  data= test_df,
  eval_config=eval_config,
  output_path=OUTPUT_PATH)



In [139]:
!jupyter nbextension enable tensorflow_model_analysis --user --py

Enabling notebook extension tensorflow_model_analysis/extension...
      - Validating: [32mOK[0m


In [143]:
# Render Fairness Indicators.
tfma.addons.fairness.view.widget_view.render_fairness_indicator(eval_result)

FairnessIndicatorViewer(slicingMetrics=[{'sliceValue': 'Third_X_male', 'slice': 'class_X_sex:Third_X_male', 'm…

In [161]:
sel_df = test_df[(test_df['sex'] == 'male') & (test_df['class'] == 'Second')]
sel_df

Unnamed: 0,predicted,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
129,0.060136,0,male,28.0,0,0,0.0,Second,unknown,Southampton,y
67,0.218701,0,male,54.0,1,0,26.0,Second,unknown,Southampton,n
244,0.867168,1,male,1.0,0,2,37.0042,Second,unknown,Cherbourg,n
62,0.270551,0,male,24.0,0,0,10.5,Second,unknown,Southampton,y
246,0.673106,1,male,0.83,1,1,18.75,Second,unknown,Southampton,n
4,0.335883,1,male,34.0,0,0,13.0,Second,D,Southampton,y
64,0.200719,0,male,29.0,0,0,10.5,Second,unknown,Southampton,y
37,0.182209,0,male,42.0,0,0,13.0,Second,unknown,Southampton,y
254,0.258147,0,male,21.0,1,0,11.5,Second,unknown,Southampton,n
77,0.182209,1,male,42.0,0,0,13.0,Second,unknown,Southampton,y


In [163]:
sel_df = train_df[(train_df['sex'] == 'female') & (train_df['class'] == 'First')]
sel_df

Unnamed: 0,survived,sex,age,n_siblings_spouses,parch,fare,class,deck,embark_town,alone
1,1,female,38.0,1,0,71.2833,First,C,Cherbourg,n
3,1,female,35.0,1,0,53.1000,First,C,Southampton,n
24,1,female,28.0,1,0,146.5208,First,B,Cherbourg,n
40,1,female,49.0,1,0,76.7292,First,D,Cherbourg,n
48,1,female,38.0,0,0,80.0000,First,B,unknown,y
...,...,...,...,...,...,...,...,...,...,...
599,1,female,28.0,1,0,89.1042,First,C,Cherbourg,n
601,1,female,16.0,0,1,39.4000,First,D,Southampton,n
603,1,female,45.0,1,1,164.8667,First,unknown,Southampton,n
613,1,female,47.0,1,1,52.5542,First,D,Southampton,n


In [165]:
train_df.groupby(['sex', 'class', 'survived' ]).size().reset_index(name='counts')

Unnamed: 0,sex,class,survived,counts
0,female,First,0,2
1,female,First,1,67
2,female,Second,0,5
3,female,Second,1,50
4,female,Third,0,41
5,female,Third,1,52
6,male,First,0,56
7,male,First,1,34
8,male,Second,0,64
9,male,Second,1,8
