## Imports

In [None]:

import pandas as pd
import numpy as np
#import random
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt


from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Dropout, LeakyReLU
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

import xgboost

from sklearn.metrics import accuracy_score, mean_absolute_error


## Exploratory Data Analysis

In [None]:
# Load the training split of the Spaceship Titanic data into the working frame.
train_csv_path = '../input/spaceship-titanic/train.csv'
df = pd.read_csv(train_csv_path)

In [None]:
# Report the (rows, columns) size first, then show the leading rows.
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(n=5)

In [None]:
## Null values
# Per-column count of missing entries. It is printed explicitly because a cell
# only auto-displays its last expression, so the bare call showed nothing.
print(df.isna().sum())

# Cell-level map of missing values (rows = passengers, columns = features).
sns.heatmap(df.isnull(), cbar=False)

In [None]:
## PassengerId
# The part of PassengerId before the underscore is used as the group key.

print("Number of passengers: {}".format(df['PassengerId'].nunique(dropna=False)))

df['Group'] = df['PassengerId'].str.split('_').str[0]

print("Number of groups: {}".format(df['Group'].nunique(dropna=False)))

## Define the group size as a new variable
# A single grouped pass gives every row the size of its group. The previous
# per-row df.query rescanned the whole frame for each passenger (quadratic).
df['Group_size'] = df.groupby('Group')['Group'].transform('size')

In [None]:
## HomePlanet

sns.countplot(data=df,x='HomePlanet')

# The ordinal code of a planet is its position in this list.
planet = ['Earth', 'Mars', 'Europa']

# Missing planets stay NaN. pd.notna replaces the old `type(x) != np.float`
# test: the np.float alias was removed in NumPy 1.24 and raises AttributeError.
df['HomePlanet_n'] = df['HomePlanet'].apply(lambda x: planet.index(x) if pd.notna(x) else np.nan)
print(df[['HomePlanet_n','Transported']].corr())

In [None]:
## CryoSleep

# Encode the flag as 0/1 and keep missing entries as NaN. pd.notna replaces the
# removed np.float alias and also makes the cell safe to re-run: the old type
# check turned the already-converted float values into NaN on a second pass.
df['CryoSleep'] = df['CryoSleep'].apply(lambda x: int(x) if pd.notna(x) else np.nan)

sns.countplot(data=df,x='CryoSleep')

In [None]:
## Cabin
# Cabin is split on '/'; the first field is kept as Deck and the third as Side.
# Missing cabins propagate as NaN. pd.notna replaces the `np.float` type check,
# which fails on NumPy >= 1.24 where that alias no longer exists.

df['Deck'] = df['Cabin'].apply(lambda x: x.split('/')[0] if pd.notna(x) else x)
df['Side'] = df['Cabin'].apply(lambda x: x.split('/')[2] if pd.notna(x) else x)

# Side is encoded by position in this list (S -> 0, P -> 1).
side = ['S','P']
df['Side'] = df['Side'].apply(lambda x: side.index(x) if pd.notna(x) else np.nan)
fig, (ax1,ax2) = plt.subplots(1, 2, figsize=(10,5))

# The ordinal deck code is the position in this list.
decks = ['B', 'A', 'D', 'C', 'G', 'E', 'F', 'T']

df['Deck_n'] = df['Deck'].apply(lambda x: decks.index(x) if pd.notna(x) else np.nan)

print(df[['Deck_n','Transported']].corr())

sns.countplot(data=df,x='Deck',ax=ax1)
sns.countplot(data=df,x='Side',ax=ax2)

In [None]:
## Destination

# Lookup table keyed by destination name; no active statement in this cell
# reads it (the 'Distance' feature that used it is disabled).
distances = {'TRAPPIST-1e':39.46,
             '55 Cancri e':40.00,
             'PSO J318.5-22':80}

# The ordinal destination code is the position in this list.
destinations = ['TRAPPIST-1e','PSO J318.5-22','55 Cancri e']

# Missing destinations stay NaN. pd.notna replaces the removed np.float alias.
df['Destination_n'] = df['Destination'].apply(lambda x: destinations.index(x) if pd.notna(x) else np.nan)

print(df[['Destination_n','Transported']].corr())

sns.countplot(data=df,x='Destination')

In [None]:
## Age
# One bar per distinct age value, drawn on the current axes.
age_axes = plt.gca()
sns.countplot(x='Age', data=df, ax=age_axes)

In [None]:
## VIP

# Encode the flag as 0/1 and keep missing entries as NaN. pd.notna replaces the
# `np.float` type check, which raises AttributeError on NumPy >= 1.24.
df['VIP'] = df['VIP'].apply(lambda x: int(x) if pd.notna(x) else np.nan)

sns.countplot(data=df,x='VIP')

In [None]:
### Services

sns.heatmap(df[['Age','CryoSleep','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Transported']].corr(),annot=True,cmap='Blues' )

# Raw spend columns and the names of their per-passenger share counterparts.
service_columns = ['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
service_columns_ = ['RoomService_','FoodCourt_','ShoppingMall_','Spa_','VRDeck_']

# Row-wise total; sum() skips NaN, so the total itself is never missing.
df['total_service'] = df[service_columns].sum(axis=1)

# Share of each service in the passenger's total spend. The epsilon keeps
# zero-spend rows at 0 instead of 0/0. Vectorised division replaces the row-wise
# apply that indexed the row with x[-1]; positional integer access on a
# label-indexed Series is deprecated in pandas 2.x.
df[service_columns_] = df[service_columns].div(df['total_service'] + 1e-10, axis=0).values

In [None]:
sns.set(rc={'figure.figsize':(17.7,11.27)})

fig, axes_grid = plt.subplots(3, 3)

# (column, element style, discrete bins?) for each panel, in row-major order.
panel_specs = [
    ('Age', 'step', False),
    ('Spa', 'step', False),
    ('FoodCourt', 'step', False),
    ('CryoSleep', 'bars', True),
    ('RoomService', 'step', False),
    ('ShoppingMall', 'step', False),
    ('VRDeck', 'step', False),
    ('Deck', 'bars', True),
    ('Destination', 'bars', True),
]

# Every panel is split by the target so the two classes can be compared.
for panel_ax, (column, element, is_discrete) in zip(axes_grid.ravel(), panel_specs):
    discrete_kwargs = {'discrete': True} if is_discrete else {}
    sns.histplot(data=df, x=column, bins=30, hue="Transported",
                 element=element, ax=panel_ax, **discrete_kwargs)

## One-Hot Encoding

In [None]:
## OneHotEncoder

# Categorical columns expanded into indicator columns.
oh_columns = ['Deck',
              'Destination',
              'HomePlanet']

# A dense array is needed to build the DataFrame below. The keyword was renamed
# from `sparse` to `sparse_output` in scikit-learn 1.2 and the old name was
# removed in 1.4, so try the new spelling first and fall back for old releases.
try:
    oh_encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    oh_encoder = OneHotEncoder(sparse=False)

oh_values = oh_encoder.fit_transform(df[oh_columns])
oh_features = pd.DataFrame(oh_values,columns=oh_encoder.get_feature_names_out(),index=df.index)

# Drop any indicator columns left from a previous run of this cell so that
# re-running it does not append duplicate columns.
df = pd.concat([df.drop(columns=oh_features.columns, errors='ignore'),oh_features],axis=1)

## NaN Values

In [None]:
# For passengers flagged as in cryosleep, missing values in these columns are
# replaced with 0; values that are already present are left untouched.
cryo_zero_cols = ['RoomService','VRDeck','VIP','Spa','FoodCourt','ShoppingMall']
cry_index = df.index[df['CryoSleep'] == 1]
cryo_filled = df.loc[cry_index, cryo_zero_cols].fillna(0)
df.loc[cry_index, cryo_zero_cols] = cryo_filled.values


### Fills

In [None]:
# Passengers whose VIP flag is still missing are treated as non-VIP (0).
vip_known = df['VIP'].notna()
df['VIP'] = df['VIP'].where(vip_known, 0)

# Independent working copy: later imputation steps write into df_1.
df_1 = df.copy(deep=True)

### Age

In [None]:
# Rows usable by the age model: every predictor must be present. 'Cabin' is
# the completeness check here, while the model input itself uses 'Deck_n'.
age_required_cols = ['FoodCourt', 'Spa', 'Group_size', 'VRDeck', 'Cabin']
df_n = df_1.dropna(subset=age_required_cols)
df_n_index = df_n.index

# Predictors and target; the target still contains the NaN ages to be filled.
age_feature_cols = ['FoodCourt', 'Spa', 'Group_size', 'VRDeck', 'Deck_n']
X_age = df_n[age_feature_cols]
y_age = df_n['Age']


In [None]:
# Hold out 20% of the rows with a known age to evaluate the age model.
# random_state pins the shuffle so a re-run reproduces the same split and score.
train_age_index, test_age_index = train_test_split(y_age[y_age.notna()].index,test_size=0.2,random_state=42)

In [None]:
## Create a model
# Gradient-boosted regressor that predicts Age from the selected features.
age_model_params = dict(learning_rate=0.01,
                        max_depth=5,
                        n_estimators=200,
                        seed=42)
model_age = xgboost.XGBRegressor(**age_model_params)

# Slice predictors and targets for both partitions up front.
train_age, train_y_age = X_age.loc[train_age_index], y_age.loc[train_age_index]
test_age, test_y_age = X_age.loc[test_age_index], y_age.loc[test_age_index]

model_age.fit(train_age, train_y_age)

# Mean absolute error on the held-out rows.
pred_age = model_age.predict(test_age)
mean_absolute_error(pred_age, test_y_age)

In [None]:
## Fill NaN
# Rows of the age frame whose Age is missing get the model's prediction.
missing_age_index = y_age.index[y_age.isna()]
age_predictors = ['FoodCourt', 'Spa', 'Group_size', 'VRDeck', 'Deck_n']

df_age = df_1.loc[missing_age_index, age_predictors]
age = model_age.predict(df_age)

df_1.loc[missing_age_index, 'Age'] = age

### CryoSleep

In [None]:
# Fill missing ShoppingMall spend with the column mean on df_1, the working
# copy every later step reads. The fill used to be applied to df after df_1
# had been copied from it, so it never reached the rows selected below.
df_1['ShoppingMall'] = df_1['ShoppingMall'].fillna(df_1['ShoppingMall'].mean())

# Rows usable by the CryoSleep model: every predictor must be present.
df_n_index = df_1[['RoomService',
                 'FoodCourt',
                 'ShoppingMall',
                 'Spa',
                 'VRDeck',
                 'Deck_n']].dropna().index
df_n = df_1.loc[df_n_index]

# Predictors and target; the target still contains the NaN flags to be filled.
X_cry = df_n[['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck','Deck_n']]
y_cry = df_n['CryoSleep']

In [None]:
# Hold out 20% of the rows with a known CryoSleep flag to evaluate the classifier.
# random_state pins the shuffle so a re-run reproduces the same split and score.
train_cr_index, test_cr_index = train_test_split(y_cry[y_cry.notna()].index,test_size=0.2,random_state=42)

In [None]:
## Create Model
# Gradient-boosted classifier that predicts the CryoSleep flag.
# `n_estimators` was misspelled as `n_estimatos`, so the requested 200 trees
# were never applied and the library default was used instead.
model_cr = xgboost.XGBClassifier(learning_rate=0.01,
                 max_depth=5,
                 n_estimators=200,
                 seed=42)

train_cr = X_cry.loc[train_cr_index]
# Labels are stored as floats (0.0 / 1.0); the classifier is given integers.
train_y_cry = y_cry.loc[train_cr_index].apply(lambda x: int(x))

model_cr.fit(train_cr,train_y_cry)

test_cr = X_cry.loc[test_cr_index]
test_y_cry = y_cry.loc[test_cr_index].apply(lambda x: int(x))

pred_cr = model_cr.predict(test_cr)

# Hold-out accuracy, passed in scikit-learn's (y_true, y_pred) order.
accuracy_score(test_y_cry,pred_cr)

In [None]:
## Fill NaN
# Rows of the CryoSleep frame whose flag is missing get the classifier's prediction.
missing_cryo_index = y_cry.index[y_cry.isna()]
cryo_predictors = ['RoomService', 'FoodCourt', 'ShoppingMall',
                   'Spa', 'VRDeck', 'Deck_n']

df_cr = df_1.loc[missing_cryo_index, cryo_predictors]
cr = model_cr.predict(df_cr)

df_1.loc[missing_cryo_index, 'CryoSleep'] = cr

In [None]:
## Fill the rest with KNN Imputer
# Any value still missing in these columns is taken from the 80 nearest rows,
# all weighted equally.
imputer = KNNImputer(n_neighbors=80, weights='uniform')
col_imputer = ['Side', 'Age', 'CryoSleep', 'VIP',
               'RoomService_', 'FoodCourt_', 'ShoppingMall_', 'Spa_', 'VRDeck_',
               'Deck_n', 'Destination_n', 'HomePlanet_n']
imputed_values = imputer.fit_transform(df_1[col_imputer])
df_1[col_imputer] = imputed_values

# Neighbour averaging yields fractional values; coded categorical columns are
# rounded back to the nearest whole code.
int_cols = ['Side', 'CryoSleep', 'Deck_n', 'Destination_n', 'HomePlanet_n']
df_1[int_cols] = df_1[int_cols].round()

## Train Preprocessing

In [None]:
# Keep only rows that are complete in the columns the classifier depends on.
# (HomePlanet, Cabin and VIP were tried as requirements and left out.)
model_required_cols = ['Age', 'Destination_n', 'CryoSleep',
                       'RoomService_', 'FoodCourt_', 'ShoppingMall_',
                       'Spa_', 'VRDeck_']
df_n = df_1.dropna(subset=model_required_cols)
df_n_index = df_n.index
df_n.shape

In [None]:
sns.set(rc={'figure.figsize':(17.7,11.27)})

# Pairwise correlations of the engineered features together with the target.
corr_columns = ['Side', 'Age', 'Group_size', 'CryoSleep', 'VIP',
                'RoomService_', 'FoodCourt_', 'ShoppingMall_', 'Spa_', 'VRDeck_',
                'total_service', 'Destination_n', 'Deck_n', 'HomePlanet_n',
                'Transported']
corr_matrix = df_n[corr_columns].corr()
sns.heatmap(corr_matrix, annot=True, cmap='Blues')

In [None]:
## Train Split
# Split on unique group ids rather than rows, so all members of a group fall
# on the same side. random_state makes the partition reproducible.
train_groups, test_groups = train_test_split(df_n['Group'].drop_duplicates(),test_size=0.2,random_state=42)

# Explicit copies: the next cell writes scaled values into these frames, and
# writing into a filtered slice triggers pandas' SettingWithCopyWarning.
df_train = df_n[df_n['Group'].isin(train_groups)].copy()
df_test = df_n[df_n['Group'].isin(test_groups)].copy()

In [None]:
## Scaling
# Columns to standardise. (Distance, ShoppingMall_ and HomePlanet_n were
# candidates that are left unscaled.)
scaler_columns = ['Age', 'RoomService_', 'FoodCourt_', 'Spa_',
                  'Deck_n', 'Destination_n', 'VRDeck_', 'total_service']

scaler = StandardScaler()

# Fit the statistics on the training partition only, then reuse them unchanged
# for the validation partition.
train_scaled = scaler.fit_transform(df_train[scaler_columns])
df_train[scaler_columns] = pd.DataFrame(train_scaled, index=df_train.index, columns=scaler_columns)

test_scaled = scaler.transform(df_test[scaler_columns])
df_test[scaler_columns] = pd.DataFrame(test_scaled, index=df_test.index, columns=scaler_columns)

In [None]:
# Model inputs: numeric/ordinal features first, then the one-hot indicator
# columns for deck and home planet. (Group_size, Distance, ShoppingMall, VIP,
# Deck_n and HomePlanet_n were tried and are currently excluded.)
base_features = ['Age', 'RoomService_', 'FoodCourt_', 'Spa_', 'VRDeck_',
                 'total_service', 'Side', 'Destination_n', 'CryoSleep']
deck_indicators = ['Deck_' + letter for letter in 'ABCDEFG']
planet_indicators = ['HomePlanet_' + name for name in ('Earth', 'Europa', 'Mars')]
train_columns = base_features + deck_indicators + planet_indicators

# Feature matrices and targets for the training and validation partitions.
X_train, y_train = df_train[train_columns], df_train['Transported']
X_test, y_test = df_test[train_columns], df_test['Transported']

In [None]:
# Display the training feature matrix that is fed to the network.
X_train

## Train

In [None]:
# Fully connected binary classifier: three hidden stages (Dense -> LeakyReLU ->
# Dropout) followed by a single sigmoid unit.
inp = Input(shape=(X_train.shape[1],))

hidden_widths = (128, 512, 256)

x = inp
for width in hidden_widths:
    x = Dense(width,
              kernel_regularizer=tf.keras.regularizers.l1(4e-6),
              kernel_initializer='he_uniform')(x)
    x = LeakyReLU(0.1)(x)
    x = Dropout(0.3)(x)

# Output unit: one sigmoid value per passenger.
x = Dense(1,
          activation='sigmoid',
          kernel_regularizer=tf.keras.regularizers.l1(4e-6),
          kernel_initializer='he_uniform')(x)

model_deck = Model(inputs=inp, outputs=x)

optimizer = Adam(learning_rate=0.001)

model_deck.compile(optimizer=optimizer,
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

In [None]:
# Multiply the learning rate by 0.2 when validation loss has not improved for
# 5 epochs, never going below 1e-5.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss',
                                                 factor=0.2,
                                                 patience=5,
                                                 min_lr=0.00001)

# The held-out groups serve as the validation set during training.
fit_options = dict(epochs=30,
                   batch_size=32,
                   callbacks=[reduce_lr],
                   validation_data=(X_test, y_test))

model_deck.fit(X_train, y_train, **fit_options)

In [None]:
 y_pred = model_deck.predict(X_test)>0.5

accuracy_score(y_pred,y_test)

# Test

In [None]:
# Load the competition test split; it is processed with the objects fitted on train.
test_csv_path = '../input/spaceship-titanic/test.csv'
test = pd.read_csv(test_csv_path)

In [None]:
# Number of missing values in each column of the test set.
test.isna().sum()

In [None]:
# Rebuild the engineered features on the test set, reusing the category lists
# (decks, side, destinations, planet) and the encoder defined for the training data.
# pd.notna replaces every `type(x) != np.float` check: the np.float alias was
# removed in NumPy 1.24 and raises AttributeError.
test['CryoSleep'] = test['CryoSleep'].apply(lambda x: int(x) if pd.notna(x) else np.nan)
test['Deck'] = test['Cabin'].apply(lambda x: x.split('/')[0] if pd.notna(x) else x)
test['Side'] = test['Cabin'].apply(lambda x: x.split('/')[2] if pd.notna(x) else x)
test['Deck_n'] = test['Deck'].apply(lambda x: decks.index(x) if pd.notna(x) else np.nan)
test['Side'] = test['Side'].apply(lambda x: side.index(x) if pd.notna(x) else np.nan)
test['Destination_n'] = test['Destination'].apply(lambda x: destinations.index(x) if pd.notna(x) else np.nan)
test['HomePlanet_n'] = test['HomePlanet'].apply(lambda x: planet.index(x) if pd.notna(x) else np.nan)

test['VIP'] = test['VIP'].apply(lambda x: int(x) if pd.notna(x) else np.nan)

# For passengers flagged as in cryosleep, missing values in these columns become 0.
cryo_zero_cols = ['RoomService','VRDeck','VIP','Spa','FoodCourt','ShoppingMall']
cry_index = test.query('CryoSleep == 1')[cryo_zero_cols].index
test.loc[cry_index,cryo_zero_cols] = test.loc[cry_index,cryo_zero_cols].fillna(0).values

# Total spend and each service's share of it. The epsilon keeps zero-spend rows
# at 0. Vectorised division avoids the deprecated positional x[-1] lookup on a
# label-indexed row.
test['total_service'] = test[service_columns].sum(axis=1)
test[service_columns_] = test[service_columns].div(test['total_service'] + 1e-10, axis=0).values

test['Group'] = test['PassengerId'].apply(lambda x: x.split('_')[0])
# Group size must be counted within the test set itself. The previous lookup
# queried the training frame `df`, so the count reflected train passengers
# rather than the test passenger's own group.
test['Group_size'] = test.groupby('Group')['Group'].transform('size')

### OneHotEncoder
# Use the encoder fitted on the training data (transform only, no refit).
oh_features_test = pd.DataFrame(oh_encoder.transform(test[oh_columns]),columns=oh_encoder.get_feature_names_out(),index=test.index)

# Drop indicator columns left from a previous run so re-running this cell does
# not append duplicates.
test = pd.concat([test.drop(columns=oh_features_test.columns, errors='ignore'),oh_features_test],axis=1)


In [None]:
# Passengers whose VIP flag is still missing are treated as non-VIP (0).
test['VIP'] = test['VIP'].where(test['VIP'].notna(), 0)

## Fill NaN Age
# Only rows with every age predictor present can be scored by the age model;
# other missing ages are left for the KNN imputer further down.
age_required_cols = ['FoodCourt', 'Spa', 'Group_size', 'VRDeck', 'Cabin']
df_age_index = test.dropna(subset=age_required_cols).index

y_age = test.loc[df_age_index, 'Age']
missing_age_index = y_age.index[y_age.isna()]

df_age = test.loc[missing_age_index, ['FoodCourt', 'Spa', 'Group_size', 'VRDeck', 'Deck_n']]
age = model_age.predict(df_age)

test.loc[missing_age_index, 'Age'] = age

## Fill NaN CryoSleep
# Missing ShoppingMall spend is filled with the mean of the test column itself.
shopping_mean = test['ShoppingMall'].mean()
test['ShoppingMall'] = test['ShoppingMall'].fillna(shopping_mean)

cryo_predictors = ['RoomService', 'FoodCourt', 'ShoppingMall',
                   'Spa', 'VRDeck', 'Deck_n']
df_cry_index = test.dropna(subset=cryo_predictors).index

y_cry = test.loc[df_cry_index, 'CryoSleep']
missing_cryo_index = y_cry.index[y_cry.isna()]

df_cr = test.loc[missing_cryo_index, cryo_predictors]
cr = model_cr.predict(df_cr)

test.loc[missing_cryo_index, 'CryoSleep'] = cr

# Remaining gaps: reuse the KNN imputer fitted on the training data, then round
# the coded categorical columns back to whole codes.
test[col_imputer] = imputer.transform(test[col_imputer])
test[int_cols] = test[int_cols].round()

In [None]:
# Standardise the test features with the scaler fitted on the training
# partition. This overwrites the columns in place, so run the cell only once.
test_scaled_values = scaler.transform(test[scaler_columns])
test[scaler_columns] = test_scaled_values

In [None]:
# Inspect the test-set columns that are passed to the model for prediction.
test[train_columns]

## Submission

In [None]:
# Score the test passengers with the trained network.
X_submission = test[train_columns]
submission = model_deck.predict(X_submission)

In [None]:
# Convert scores to boolean labels: True where the score exceeds 0.5.
submission = np.greater(submission, 0.5)
submission

In [None]:
# Use the provided sample file as the template and overwrite its target column
# with the predicted labels.
sample_csv_path = '../input/spaceship-titanic/sample_submission.csv'
sample = pd.read_csv(sample_csv_path)
sample['Transported'] = submission

# Write the submission without the index column, then display it.
output_csv_path = 'submission.csv'
sample.to_csv(output_csv_path, index=False)
sample