In [1]:
import tensorflow as tf
import pandas as pd
import os
import numpy as np

In [49]:
train = pd.read_csv(os.path.join(os.getcwd(), 'train.csv'))
test = pd.read_csv(os.path.join(os.getcwd(), 'test.csv'))

In [50]:
numerical_cols = ['rectal_temp', 'pulse', 'respiratory_rate','nasogastric_reflux_ph', 
                  'packed_cell_volume', 'total_protein', 'abdomo_protein', 'lesion_1']
categorical_num_cols = []
categorical_str_cols = ['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 
                        'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 
                        'abdomo_appearance', 'cp_data']
target_col = ['outcome']

In [51]:
train = train[numerical_cols + categorical_num_cols + categorical_str_cols + target_col]
test = test[numerical_cols + categorical_num_cols + categorical_str_cols]
target = train.pop('outcome')
target = target.astype('category')
target = target.cat.codes
train['target'] = target
train.shape

(1235, 24)

In [123]:
train.dtypes

Index(['died', 'euthanized', 'lived'], dtype='object')

In [52]:
# handling missing values
for name in numerical_cols:
    train[name].fillna(train[name].mean())
for name in categorical_str_cols:
    train[name].fillna(train[name].value_counts().iloc[0])
# fix data types
train[numerical_cols] = train[numerical_cols].astype('float32')
train[categorical_str_cols] = train[categorical_str_cols].astype('str')


In [53]:
# handling missing values
for name in numerical_cols:
    test[name].fillna(train[name].mean())
for name in categorical_str_cols:
    test[name].fillna(train[name].value_counts().iloc[0])
# fix data types
test[numerical_cols] = test[numerical_cols].astype('float32')
test[categorical_str_cols] = test[categorical_str_cols].astype('str')

In [54]:
# create tensorflow dataset
def df_to_dataset(dataframe, shuffle=True, batch_size=128):
    df = dataframe.copy()
    labels = df.pop('target')
    df = {name: np.array(value)[:, tf.newaxis] for name, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((df, labels))

    if shuffle==True:
        ds = ds.shuffle(buffer_size=1000)
    
    ds = ds.batch(batch_size=batch_size)
    ds = ds.prefetch(buffer_size=1000)

    return ds


In [9]:
# convert train to tensor dataset
train_ds = df_to_dataset(train, batch_size=5)

In [10]:
# testing train dataset
[(train_features, label_batch)] = train_ds.take(1)
label_batch

<tf.Tensor: shape=(5,), dtype=int8, numpy=array([1, 2, 1, 0, 0], dtype=int8)>

In [55]:
# normalization features transformation function
def get_normalization_layer(name, dataset):
    normalizer = tf.keras.layers.Normalization(axis=None)
    feature_ds = dataset.map(lambda x,y: x[name])
    normalizer.adapt(feature_ds)
    return normalizer

In [12]:
# test normalization feature transformation function
normalizer_layer = get_normalization_layer('rectal_temp', train_ds)
normalizer_layer(train_features['rectal_temp'])

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([-1.2712613, -0.256483 ,  1.3925325, -0.7638746,  1.3925325],
      dtype=float32)>

In [56]:
# categorical featuers transformation function
def get_category_encoding_layer(name, dataset, dtype='string', max_tokens=None):
    if dtype == 'string':
        index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
    else:
        index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

    feature_ds = dataset.map(lambda x,y: x[name])
    index.adapt(feature_ds)
    encoding = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())

    return lambda feature: encoding(index(feature))

In [14]:
# test categorical encoding function
encoding_layer = get_category_encoding_layer('pain', train_ds, 'string')
encoding_layer(train_features['pain'])

<tf.Tensor: shape=(8,), dtype=float32, numpy=array([0., 1., 1., 1., 0., 0., 0., 0.], dtype=float32)>

In [57]:
# create input and output for model
batch_size = 128
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)

all_inputs = []
encoded_features = []
for name in numerical_cols:
    num_col = tf.keras.Input(shape=(1,), name=name, dtype='float32')
    normalization_layer = get_normalization_layer(name, train_ds)
    encoded_numeric_col = normalization_layer(num_col)
    all_inputs.append(num_col)
    encoded_features.append(encoded_numeric_col)

for name in categorical_str_cols:
    categorical_col = tf.keras.Input(shape=(1,), name=name, dtype='string')
    encoding_str_layer = get_category_encoding_layer(name, train_ds, 'string')
    encoded_str_col = encoding_str_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_str_col)



In [64]:
# creating the model
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(units=64, activation='relu')(all_features)
x = tf.keras.layers.Dense(units=32, activation='relu')(x)
x = tf.keras.layers.Dropout(0.3)(x)
output = tf.keras.layers.Dense(units=3)(x)

model = tf.keras.Model(all_inputs, output)

In [65]:
# compile model
model.compile(optimizer='adam', 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [66]:
# run model
model.fit(train_ds, epochs=100)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.src.callbacks.History at 0x22857673110>

In [67]:
test_dict = {name : np.array(value)[:, tf.newaxis] for name, value in test.items()}
test_ds = tf.data.Dataset.from_tensor_slices(test_dict)

In [121]:
test.columns

Index(['id', 'surgery', 'age', 'hospital_number', 'rectal_temp', 'pulse',
       'respiratory_rate', 'temp_of_extremities', 'peripheral_pulse',
       'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis',
       'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux',
       'nasogastric_reflux_ph', 'rectal_exam_feces', 'abdomen',
       'packed_cell_volume', 'total_protein', 'abdomo_appearance',
       'abdomo_protein', 'surgical_lesion', 'lesion_1', 'lesion_2', 'lesion_3',
       'cp_data'],
      dtype='object')

In [106]:
prediction = model.predict(test_ds)

  1/824 [..............................] - ETA: 28s



In [118]:
prediction_argmax = [np.argmax(x) for x in prediction]

In [119]:
prediction_argmax

[2,
 0,
 2,
 1,
 2,
 0,
 2,
 0,
 2,
 2,
 0,
 0,
 1,
 1,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 1,
 2,
 0,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 2,
 2,
 1,
 2,
 1,
 0,
 1,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 0,
 0,
 1,
 2,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 2,
 0,
 2,
 1,
 0,
 2,
 2,
 0,
 2,
 2,
 2,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 2,
 1,
 2,
 2,
 2,
 1,
 2,
 2,
 0,
 2,
 0,
 1,
 1,
 1,
 2,
 2,
 0,
 0,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 0,
 2,
 2,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 0,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 0,
 1,
 2,
 2,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 1,
 0,
 2,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 2,
 1,
 0,
 0,
 0,
 2,
 1,
 2,
 1,
 2,
 0,
 2,
 2,
 1,
 1,
 1,
 2,
 2,
 0,
 2,
 2,
 0,
 0,
 0,
 2,
 0,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 0,
 0,
 2,
 0,
 0,
 0,
 1,
 2,
 2,
 0,
 2,
 1,
 2,
 2,
 2,
 2,
 2,
 2,
 2,
 0,
 2,
 2,
 2,
 1,
 2,
 0,
 1,
 1,
 2,
 2,
 1,
 0,
 2,
 0,
 2,
 2,
 2,
 0,
 2,
 0,
 2,
 0,
 0,
 0,
 1,
 2,
 0,
 2,
 1,
 2,
 2,
 0,
 0,
 0,
 2,


In [122]:
prediction_df = prediction_df.astype('category')

Unnamed: 0,0,1,2
0,-3.593696,-4.121152,5.824048
1,3.991136,-3.015255,0.003747
2,1.564877,-5.081720,2.993718
3,-3.924227,5.715120,-1.311954
4,-0.737739,-5.472135,5.715425
...,...,...,...
819,2.533762,-0.677752,-1.388781
820,-0.634482,1.858314,-1.062467
821,2.311933,-3.466755,1.228221
822,0.387666,-4.923575,2.656766


In [117]:
prediction

array([[-3.5936959e+00, -4.1211519e+00,  5.8240476e+00],
       [ 3.9911361e+00, -3.0152555e+00,  3.7468821e-03],
       [ 1.5648769e+00, -5.0817199e+00,  2.9937179e+00],
       ...,
       [ 2.3119330e+00, -3.4667549e+00,  1.2282214e+00],
       [ 3.8766631e-01, -4.9235754e+00,  2.6567659e+00],
       [-2.6443252e-01, -3.8670473e+00,  4.2103019e+00]], dtype=float32)

In [125]:
categories = ['died', 'euthanized', 'lived']

'died'

In [128]:
submission_prediction = [categories[x] for x in prediction_argmax]
submission_id = test['id']

In [129]:
submission = pd.DataFrame({'id': submission_id, 'outcome': submission_prediction})

In [131]:
submission.to_csv('submission.csv', index=False)

In [None]:
train.dtypes