In [1]:
import tensorflow as tf
import pandas as pd
import os
import numpy as np

In [18]:
def most_common_null_filler(dataframe):
    for name in dataframe.columns:
        dataframe.loc[dataframe[name].isnull(), name] = train[name].value_counts().index[0] 
# tf.keras.utils.get_file -> used to get file from web
train = pd.read_csv(os.path.join(os.getcwd(),'train.csv'))
train['target'] = np.where(train.Transported, 1,0)
train = train.drop(columns=['PassengerId', 'Cabin', 'Name', 'Transported'])
most_common_null_filler(train)
# train = train.convert_dtypes(convert_integer=False)
train[['CryoSleep', 'VIP']] = train[['CryoSleep', 'VIP']].astype(dtype='bool')

test = pd.read_csv(os.path.join(os.getcwd(),'test.csv'))
test = test.drop(columns=['PassengerId', 'Cabin', 'Name'])
most_common_null_filler(test)
test[['CryoSleep', 'VIP']] = test[['CryoSleep', 'VIP']].astype(dtype='bool')


In [63]:
train

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,target
0,Europa,False,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,0
1,Earth,False,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,1
2,Europa,False,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,0
3,Europa,False,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,0
4,Earth,False,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,1
...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,0
8689,Earth,True,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,0
8690,Earth,False,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,1
8691,Europa,False,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,0


In [19]:
# create input pipeline. Tensorflow supports dictionary
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    df = dataframe.copy()
    labels = df.pop('target')
    df = {key: np.array(value)[:, tf.newaxis] for key, value in df.items()}
    ds = tf.data.Dataset.from_tensor_slices((df, labels))
    if shuffle==True:
        ds = ds.shuffle(buffer_size=1000)
    ds = ds.batch(batch_size=batch_size)
    ds = ds.prefetch(buffer_size=1000)
    return ds

In [None]:
# Use this batch size for testing
train_ds = df_to_dataset(train, batch_size=5)

In [None]:
# Test input pipeline
[(train_features, label_batch)] = train_ds.take(1)
print(train_features['Age'])

In [20]:
def get_normalization_layer(name, dataset):
    # create a layer for normalization
    normalizer = tf.keras.layers.Normalization(axis=None)

    # create a dataset that yields the train_features
    feature_ds = dataset.map(lambda x,y: x[name])

    # learn the statistics of the feature
    normalizer.adapt(feature_ds)

    return normalizer

In [None]:
# Test normalization
food_court_col = train_features['FoodCourt']
layer = get_normalization_layer('FoodCourt', train_ds)
layer(food_court_col)

In [21]:
def get_category_encoding_layer(name, dataset, dtype, max_tokens=None):
    # Create a layer that turns strings into integer indices
    if dtype=='string':
        index = tf.keras.layers.StringLookup(max_tokens=max_tokens)
    # Otherwise, create a layer that turns integers into integer indices
    else:
        index = tf.keras.layers.IntegerLookup(max_tokens=max_tokens)

    # Create a dataset that yeilds the features
    feature_ds = dataset.map(lambda x, y: x[name])

    # Learn the dataset indices
    index.adapt(feature_ds)

    # create the encoding layer for the integer indices
    encoding = tf.keras.layers.CategoryEncoding(num_tokens=index.vocabulary_size())
                                                
    # create multi-hot-encoding from the encodings. The lambda function captures the
    # layer, so you can use them, or include them in the Keras Functional model later.
    return lambda feature: encoding(index(feature))

In [None]:
# Test categorical encoding
test_homeplanet_col = train_features['HomePlanet']
test_homeplanet_layer = get_category_encoding_layer('HomePlanet', train_ds, 'string')
test_homeplanet_layer(test_homeplanet_col)

In [22]:
# create bigger batch size data
batch_size = 64
train_ds = df_to_dataset(train, batch_size=batch_size)
# create input and output placeholders for model (encode numberical and categorical data)
# group them in 1 list
all_inputs = []
encoded_features = []

for header in ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']:
    numeric_col = tf.keras.Input(shape=(1,), name=header)
    normalization_layer = get_normalization_layer(header, train_ds)
    encoded_numeric_col = normalization_layer(numeric_col)
    all_inputs.append(numeric_col)
    encoded_features.append(encoded_numeric_col)

In [23]:
# category encoded features
age_col = tf.keras.Input(shape=(1,), name='Age')
encoding_layer = get_category_encoding_layer(name='Age', dataset=train_ds, dtype='float32', max_tokens=5)
encoded_age_col = encoding_layer(age_col)
all_inputs.append(age_col)
encoded_features.append(encoded_age_col)

for header in ['CryoSleep','VIP']:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='float32')
    encoding_layer = get_category_encoding_layer(name=header, dataset=train_ds, dtype='bool')
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

for header in ['HomePlanet', 'Destination']:
    categorical_col = tf.keras.Input(shape=(1,), name=header, dtype='string')
    encoding_layer = get_category_encoding_layer(name=header, dataset=train_ds, dtype='string', max_tokens=5)
    encoded_categorical_col = encoding_layer(categorical_col)
    all_inputs.append(categorical_col)
    encoded_features.append(encoded_categorical_col)

In [67]:
# create model
all_features = tf.keras.layers.concatenate(encoded_features)
x = tf.keras.layers.Dense(units=256, activation='relu')(all_features)
x = tf.keras.layers.Dense(units=128, activation='relu')(x)
x = tf.keras.layers.Dense(units=64, activation='relu')(x)
x = tf.keras.layers.Dense(units=16, activation='relu')(x)
x = tf.keras.layers.Dropout(0.1)(x)
output = tf.keras.layers.Dense(units=1, activation='sigmoid')(x)

model = tf.keras.Model(all_inputs, output)

In [68]:
# compile model
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# To see the model tree structure
# tf.keras.utils.plot_model(model, show_shapes=True, rankdir="LR")

In [69]:
# run model
model.fit(train_ds, epochs=30)

Epoch 1/30


Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x1ebc1fbb910>

In [12]:
test_dict = {name: np.array(value)[:, tf.newaxis] for name, value in test.items()}
test_ds = tf.data.Dataset.from_tensor_slices(test_dict)

In [None]:
for element in test_ds.take(5):
    print(element['Age'])

In [71]:
prediction = model.predict(test_ds)

  25/4277 [..............................] - ETA: 8s 



In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Dense(units=128, activation='relu', input_shape=[tensor_shape]),
    tf.keras.layers.Dense(units=64, activation = 'relu'),
    tf.keras.layers.Dense(units=1, activation='sigmoid')
])

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy')

model.fit(x_train, y_train
          validation_data=[x_valid, y_valid],
          epochs=100,
          )

In [82]:
submission_pred = np.round(prediction).flatten()
submission_id = pd.read_csv(os.path.join(os.getcwd(),'test.csv'))
submission_id = submission_id['PassengerId']

In [86]:
submission = pd.DataFrame({'PassengerId': submission_id, 'Transported': submission_pred.astype('bool')})

In [88]:
submission.to_csv('submission.csv', index=False)