In [None]:
import numpy as np
import pandas as pd
import os
import tensorflow as tf
# import ydf
import tensorflow_decision_forests as tfdf

# Report whether TensorFlow can see any GPU devices in this session.
gpus = tf.config.list_physical_devices('GPU')
print(f"GPUs found: {gpus}" if gpus else "No GPUs found.")

In [None]:
# Load the raw competition CSVs from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_df = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

In [None]:
# Preprocessing: impute the spending columns and derive cabin, group and name features.
def preprocess(df):
    """Return a feature-engineered copy of a passenger DataFrame.

    The input frame is not modified; a new frame is returned with:

    * missing values in the five spending columns replaced by that column's
      median, computed on the frame that was passed in (so the train and test
      frames are each imputed with their own medians);
    * ``Total_Spent``: the row-wise sum of the five spending columns;
    * ``Name``: the first whitespace-separated token dropped and surrounding
      punctuation stripped from the remaining tokens (missing names become "");
    * ``Deck``, ``Num``, ``Side``: the three ``/``-separated parts of ``Cabin``;
    * ``Group``, ``Place_In_Group``: the two ``_``-separated parts of
      ``PassengerId``;
    * ``CryoSleep`` and ``VIP`` converted to floats (NaN stays NaN).

    Args:
        df: DataFrame containing at least the columns ``PassengerId``,
            ``Cabin``, ``Name``, ``CryoSleep``, ``VIP``, ``RoomService``,
            ``FoodCourt``, ``ShoppingMall``, ``Spa`` and ``VRDeck``.

    Returns:
        A new DataFrame with the columns described above added or replaced.
    """
    spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

    def normalize_name(x):
        # str() turns a missing name (NaN) into the single token "nan", which
        # the [1:] slice then discards, so missing names end up as "".
        x = str(x)
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")][1:])

    def split_fixed(value, sep, width):
        # Always produce exactly `width` parts so every row yields the same
        # number of columns: missing values become all-None, short values are
        # padded with None, and any extra separators stay in the last part.
        if pd.isna(value):
            return [None] * width
        parts = str(value).split(sep, width - 1)
        return parts + [None] * (width - len(parts))

    def split_column(series, sep, names):
        # Expand one string column into len(names) object columns aligned on
        # the original index.
        rows = [split_fixed(value, sep, len(names)) for value in series]
        return pd.DataFrame(rows, index=series.index, columns=names, dtype=object)

    # fillna (without inplace) returns a new frame, so the caller's DataFrame
    # is left untouched by everything below.
    out = df.fillna(df[spend_columns].median())

    # skipna=False keeps NaN propagation identical to adding the columns with "+".
    out['Total_Spent'] = out[spend_columns].sum(axis=1, skipna=False)
    out['Name'] = out['Name'].apply(normalize_name)

    cabin_parts = split_column(out['Cabin'], '/', ['Deck', 'Num', 'Side'])
    id_parts = split_column(out['PassengerId'], '_', ['Group', 'Place_In_Group'])
    for parts in (cabin_parts, id_parts):
        for column in parts.columns:
            out[column] = parts[column]

    out['CryoSleep'] = out['CryoSleep'].apply(float)
    out['VIP'] = out['VIP'].apply(float)

    return out
    
# Apply the same feature engineering to both the training and test frames.
train_df = preprocess(train_df)
test_df = preprocess(test_df)

In [None]:
# Show the column summary of the preprocessed training frame.
train_df.info()

In [None]:
# Preview the first five preprocessed rows (head() defaults to n=5).
train_df.head()

In [None]:
# Use every column as a model input except the label and the raw identifier /
# cabin strings. 'Name' is deliberately kept as a feature.
excluded_columns = ('Transported', 'PassengerId', 'Cabin')
input_features = list(train_df.columns)
for excluded in excluded_columns:
    input_features.remove(excluded)

In [None]:
def tokenize_names(features, labels=None):
    """Replace the 'Name' feature with its whitespace-split tokens.

    Intended for use with ``Dataset.map``: the ``features`` mapping is updated
    in place and returned together with ``labels`` (``None`` when the dataset
    has no label).
    """
    name_tokens = tf.strings.split(features['Name'])
    features['Name'] = name_tokens
    return features, labels

# Convert both frames to TensorFlow datasets, then tokenize the 'Name' column.
# Only the training dataset carries the 'Transported' label.
train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(train_df, label="Transported")
train_ds = train_ds.map(tokenize_names)

test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(test_df)
test_ds = test_ds.map(tokenize_names)

In [None]:
# Train a gradient-boosted trees model restricted to the selected input features.
feature_usages = [tfdf.keras.FeatureUsage(name=n) for n in input_features]
model = tfdf.keras.GradientBoostedTreesModel(
    features=feature_usages,
    exclude_non_specified_features=True,  # Only use the features in "features"
    random_seed=1234,
    verbose=0,  # Very few logs
)
model.fit(train_ds)

# Report the evaluation exposed by the trained model's inspector.
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

In [None]:
# Print the trained model's summary.
model.summary()

In [None]:
def prediction_to_kaggle_format(model,df,ds, threshold=0.5):
    """Build a submission-style frame from model predictions.

    Args:
        model: Object whose ``predict(ds, verbose=0)`` returns a 2-D array;
            column 0 is used as the predicted probability.
        df: DataFrame providing the ``PassengerId`` column, in the same row
            order as ``ds``.
        ds: Dataset passed straight through to ``model.predict``.
        threshold: Probabilities greater than or equal to this value are
            reported as "True", the rest as "False".

    Returns:
        DataFrame with columns ``PassengerId`` and ``Transported`` (the latter
        holding the strings "True" / "False").
    """
    probabilities = model.predict(ds, verbose=0)
    transported_proba = probabilities[:, 0]
    is_transported = transported_proba >= threshold
    columns = {
        "PassengerId": df["PassengerId"],
        "Transported": is_transported.astype(str),
    }
    return pd.DataFrame(columns)


def make_submission(kaggle_predictions, path="/kaggle/working/submission.csv"):
    """Write a predictions frame to a CSV file and report where it went.

    Args:
        kaggle_predictions: DataFrame to export; it is written without its
            index.
        path: Destination CSV path. Defaults to the submission file in the
            Kaggle working directory, so existing single-argument calls behave
            exactly as before.
    """
    kaggle_predictions.to_csv(path, index=False)
    print(f"Submission exported to {path}")
    
kaggle_predictions = prediction_to_kaggle_format(model,test_df,test_ds)
make_submission(kaggle_predictions)
!head /kaggle/working/submission.csv

In [None]:
kaggle_predictions = prediction_to_kaggle_format(model,train_df,train_ds)
make_submission(kaggle_predictions)
!head /kaggle/working/submission.csv