In [6]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file.
# The second tuple element (the sub-directory names) is not needed, hence `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the containing directory and the file name into one printable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [21]:
# Load the Titanic CSV files from the Kaggle input directory.
# `train_data` contains the `Survived` column used as the label later on;
# `test_data` is the split we generate submission predictions for.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

In [22]:
# Preview the first rows of the training data to check the columns that were loaded.
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [23]:
# Import dependencies: TensorFlow and TensorFlow Decision Forests (TF-DF).
import tensorflow as tf
import tensorflow_decision_forests as tfdf

# Print the installed TF-DF version so the environment of this run is visible in the output.
print(f"Found TF-DF {tfdf.__version__}")

Found TF-DF 1.8.1


### Prepare Dataset
We will apply the following transformations on the dataset.
* Tokenize the names. Punctuation is stripped first, so "Braund, Mr. Owen Harris" will become ["Braund", "Mr", "Owen", "Harris"].
* Extract any prefix in the ticket. For example, ticket "STON/O2. 3101282" will be split into the prefix "STON/O2." and the number "3101282". Tickets without a prefix get the placeholder prefix "NONE".

In [24]:
def preprocess(df):
    """Return a cleaned copy of a Titanic passenger frame.

    The input frame is not modified. In the returned copy:

    * ``Name`` has the characters ``,()[].`` and both quote characters
      stripped from the ends of every space-separated word.
    * ``Ticket_Number`` holds the last space-separated piece of ``Ticket``.
    * ``Ticket_Item`` holds everything before that last piece, with the
      remaining spaces replaced by underscores, or ``"NONE"`` when the
      ticket contains no space at all.

    Args:
        df: DataFrame with string columns ``Name`` and ``Ticket``.

    Returns:
        A new DataFrame with ``Name`` normalized and the two ticket columns
        appended.
    """
    strip_chars = ",()[].\"'"

    def normalize_name(raw_name):
        # Strip punctuation from both ends of each word, keeping the word order.
        return " ".join(word.strip(strip_chars) for word in raw_name.split(" "))

    def ticket_number(ticket):
        # Everything after the last space (the whole ticket if there is none).
        return ticket.rsplit(" ", 1)[-1]

    def ticket_item(ticket):
        prefix, separator, _ = ticket.rpartition(" ")
        if not separator:
            # No space in the ticket means there is no prefix part.
            return "NONE"
        return prefix.replace(" ", "_")

    result = df.copy()
    result["Name"] = result["Name"].apply(normalize_name)
    result["Ticket_Number"] = result["Ticket"].apply(ticket_number)
    result["Ticket_Item"] = result["Ticket"].apply(ticket_item)
    return result

# Apply the same preprocessing to both splits so they get the same derived columns.
preprocessed_train_data = preprocess(train_data)
preprocessed_test_data = preprocess(test_data)

# Preview the result; `Ticket_Number` and `Ticket_Item` are the columns added by preprocess().
preprocessed_train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Ticket_Number,Ticket_Item
0,1,0,3,Braund Mr Owen Harris,male,22.0,1,0,A/5 21171,7.25,,S,21171,A/5
1,2,1,1,Cumings Mrs John Bradley Florence Briggs Thayer,female,38.0,1,0,PC 17599,71.2833,C85,C,17599,PC
2,3,1,3,Heikkinen Miss Laina,female,26.0,0,0,STON/O2. 3101282,7.925,,S,3101282,STON/O2.
3,4,1,1,Futrelle Mrs Jacques Heath Lily May Peel,female,35.0,1,0,113803,53.1,C123,S,113803,NONE
4,5,0,3,Allen Mr William Henry,male,35.0,0,0,373450,8.05,,S,373450,NONE


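Let's keep a list of the input features of the model. Notably, we don't want to train our model on the `PassengerId` and `Ticket` columns: `PassengerId` is just a row identifier that likely has nothing to do with whether or not the passenger survived, and the raw `Ticket` string has already been split into `Ticket_Number` and `Ticket_Item`. The `Survived` column is also removed from the list because it is the label, not an input.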

In [25]:
# Start from every column of the preprocessed training frame, then drop the ones
# the model must not use as inputs.
input_features = list(preprocessed_train_data.columns)
# Raw ticket string: preprocess() already split it into Ticket_Number / Ticket_Item.
input_features.remove("Ticket")
# Row identifier rather than a passenger attribute.
input_features.remove("PassengerId")
# Label column: it is passed as the label when the training dataset is built.
input_features.remove("Survived")

print(f"Input features: {input_features}")

Input features: ['Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Cabin', 'Embarked', 'Ticket_Number', 'Ticket_Item']


### Convert Pandas dataset to TensorFlow dataset

In [26]:
def tokenize_names(features, labels=None):
    """Replace the "Name" feature with its tokens.

    TF-DF can consume text tokens natively, so the name string is split
    with ``tf.strings.split`` and stored back under the same key. The
    ``features`` mapping that is passed in is updated in place.

    Args:
        features: Mapping of feature name to tensor; must contain "Name".
        labels: Optional labels, passed through unchanged.

    Returns:
        The ``(features, labels)`` pair, with "Name" tokenized.
    """
    name_tokens = tf.strings.split(features["Name"])
    features["Name"] = name_tokens
    return (features, labels)

# Convert both preprocessed frames to TensorFlow datasets and tokenize the names.
# Only the training dataset is built with the "Survived" label; the serving
# dataset is built without a label, so tokenize_names() receives labels=None.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_train_data, label = "Survived").map(tokenize_names)
serving_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preprocessed_test_data).map(tokenize_names)

### Train model with default parameters
#### Train model
First, we train a `GradientBoostedTreesModel` with the default hyperparameters as a baseline.

In [27]:
# Baseline: gradient boosted trees restricted to the columns in `input_features`
# (exclude_non_specified_features=True tells the model to use only the listed features).
model = tfdf.keras.GradientBoostedTreesModel(verbose = 0, 
                                            features = [tfdf.keras.FeatureUsage(name=n) for n in input_features],
                                            exclude_non_specified_features=True,
                                            # All other hyperparameters are left at their TF-DF defaults for this baseline.
                                            # A fixed seed keeps the run repeatable.
                                            random_seed = 1234, )
model.fit(train_ds)
# Read the evaluation that TF-DF recorded during training.
# NOTE(review): for gradient boosted trees this is presumably computed on an
# internally held-out validation split rather than on the full training set - confirm in the TF-DF docs.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss: {self_evaluation.loss}")

[INFO 24-07-08 18:52:36.5681 UTC kernel.cc:1233] Loading model from path /tmp/tmp9h3pik8p/model/ with prefix 4c0fd90ea0f54c64
[INFO 24-07-08 18:52:36.5745 UTC quick_scorer_extended.cc:903] The binary was compiled without AVX2 support, but your CPU supports it. Enable it for faster model inference.
[INFO 24-07-08 18:52:36.5749 UTC abstract_model.cc:1344] Engine "GradientBoostedTreesQuickScorerExtended" built
[INFO 24-07-08 18:52:36.5749 UTC kernel.cc:1061] Use fast generic engine


Accuracy: 0.8260869383811951 Loss: 0.8608942627906799


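### Train the model with tuned hyperparameters
Now we train the same model again, this time overriding several of the default hyperparameters (for example `shrinkage`, `num_trees`, and sparse oblique splits). Note that in the run recorded below, the self-evaluation accuracy (about 0.78) is lower than the one obtained with the defaults (about 0.83), so these settings should not be assumed to be an improvement without further validation.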

In [30]:
# Same model and feature list as the baseline cell, but with explicit hyperparameters.
# NOTE: this rebinds `model`, so the baseline model is no longer reachable under that
# name; the later summary and prediction cells operate on this model.
# NOTE(review): the training log reports a model with 40 roots even though
# num_trees=2000 is requested - presumably training stopped early; confirm in the TF-DF docs.
model = tfdf.keras.GradientBoostedTreesModel(verbose = 0, 
                                            features = [tfdf.keras.FeatureUsage(name=n) for n in input_features],
                                            exclude_non_specified_features=True,
                                            # Hyperparameters overridden relative to the baseline:
                                            min_examples = 1,
                                            categorical_algorithm="RANDOM",
                                            shrinkage=0.05,
                                            split_axis="SPARSE_OBLIQUE",
                                            sparse_oblique_normalization="MIN_MAX",
                                            sparse_oblique_num_projections_exponent = 2.0,
                                            num_trees=2000,
                                            # Same seed as the baseline so the two runs are comparable.
                                            random_seed = 1234, )
model.fit(train_ds)
# Read the evaluation that TF-DF recorded during training (see the baseline cell's output
# for the numbers to compare against).
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss: {self_evaluation.loss}")

[INFO 24-07-08 18:54:42.3359 UTC kernel.cc:1233] Loading model from path /tmp/tmp8vmifbjo/model/ with prefix cfdbb29b5e084be6
[INFO 24-07-08 18:54:42.3448 UTC decision_forest.cc:660] Model loaded with 40 root(s), 2106 node(s), and 10 input feature(s).
[INFO 24-07-08 18:54:42.3448 UTC abstract_model.cc:1344] Engine "GradientBoostedTreesGeneric" built
[INFO 24-07-08 18:54:42.3448 UTC kernel.cc:1061] Use fast generic engine


Accuracy: 0.782608687877655 Loss: 1.0586705207824707


In [31]:
# Print the trained model's summary: task, input features and variable importances.
model.summary()

Model: "gradient_boosted_trees_model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
Total params: 1 (1.00 Byte)
Trainable params: 0 (0.00 Byte)
Non-trainable params: 1 (1.00 Byte)
_________________________________________________________________
Type: "GRADIENT_BOOSTED_TREES"
Task: CLASSIFICATION
Label: "__LABEL"

Input Features (11):
	Age
	Cabin
	Embarked
	Fare
	Name
	Parch
	Pclass
	Sex
	SibSp
	Ticket_Item
	Ticket_Number

No weights

Variable Importance: INV_MEAN_MIN_DEPTH:
    1.           "Sex"  0.585997 ################
    2.           "Age"  0.364636 #######
    3.          "Fare"  0.266191 ###
    4.          "Name"  0.207054 #
    5.        "Pclass"  0.179191 
    6. "Ticket_Number"  0.178806 
    7.      "Embarked"  0.177803 
    8.   "Ticket_Item"  0.177009 
    9.         "Parch"  0.175276 
   10.         "SibSp"  0.171694 

Variable Importance: NUM_AS_ROOT:
    1.  "Sex" 34.000000 ######

## Make Predictions

In [35]:
def prediction_to_kaggle_format(model, threshold=0.5):
    """Turn model predictions on the serving dataset into a submission frame.

    Reads the notebook-level ``serving_ds`` (prediction input) and
    ``test_data`` (source of the passenger ids).

    Args:
        model: Trained model exposing ``predict``.
        threshold: Predictions greater than or equal to this value are
            labelled 1, all others 0.

    Returns:
        DataFrame with columns ``PassengerId`` and ``Survived``.
    """
    raw_predictions = model.predict(serving_ds, verbose =0)
    # Keep only the first output column of the prediction matrix.
    survival_probability = raw_predictions[:, 0]
    passenger_ids = test_data["PassengerId"]
    survived_flags = (survival_probability >= threshold).astype(int)
    submission = pd.DataFrame({
        "PassengerId": passenger_ids,
        "Survived": survived_flags,
    })
    return submission
def make_submission(kaggle_preds, path="/kaggle/working/submission.csv"):
    """Write Kaggle-formatted predictions to a CSV file.

    Args:
        kaggle_preds: DataFrame of predictions to export, e.g. the frame
            returned by ``prediction_to_kaggle_format``.
        path: Destination CSV file. Defaults to the submission location in
            the Kaggle working directory.
    """
    # Export the frame that was passed in. The previous version wrote the
    # notebook-level global `kaggle_predictions` and ignored its argument,
    # so it failed (or silently exported stale data) for any other frame.
    kaggle_preds.to_csv(path, index=False)
    print(f"Submission exported to {path}")
    
# Predict with the most recently trained `model` and export the result as the submission file.
kaggle_predictions = prediction_to_kaggle_format(model)
make_submission(kaggle_predictions)
!head /kaggle/working/submission.csv

Submission exported to /kaggle/working/submission.csv
PassengerId,Survived
892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,0
900,1
