In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import seaborn as sb
from sklearn import ensemble
from tensorflow.keras import layers

# Load the raw competition files. These frames are kept untouched so that
# every later preprocessing cell works on independent copies.
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')

# Features are every column except the last one; the last column is the target.
# .copy() detaches the result from train_df, so later column assignments do not
# write into a slice of the raw frame (SettingWithCopyWarning).
X_train = train_df.iloc[:, :-1].copy()
Y_train = train_df.iloc[:, -1].copy()

# Copy instead of aliasing so preprocessing X_test never mutates test_df.
X_test = test_df.copy()

# Data analysis

In [None]:
# Print column dtypes and non-null counts, then display the first ten rows.
train_df.info()
train_df.head(n=10)

A look at the general structure of the data:
1. All columns except *PassengerId* and *Transported* have missing values.
2. *CryoSleep* and *VIP* are boolean.
3. *HomePlanet* and *Destination* are categorical.
4. We need to parse *PassengerId* and *Cabin* to get the group and cabin number for each passenger, because it is more likely that people in the same group or cabin were transported together.
5. We also don't need the *Name* column.

# Feature scaling
To scale the *Age* column, we divide every value by the maximum age in the training set.
To scale the *RoomService*, *FoodCourt*, *ShoppingMall*, *Spa* and *VRDeck* columns, we use a logarithmic scale.

In [None]:
# Age: divide by the training-set maximum. The same factor is applied to the
# test set so both sets share one scale.
max_age = X_train['Age'].max()
X_train['Age'] = X_train['Age'] / max_age
X_test['Age'] = X_test['Age'] / max_age

# Spending columns: log(1 + x), divided by the log of the training-set maximum
# of each column. One vectorised transform replaces five copy-pasted stanzas.
# np.log1p is the library form of log(x + 1).
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
log_max = np.log1p(X_train[spend_cols].max())  # Series indexed by column name
X_train[spend_cols] = np.log1p(X_train[spend_cols]) / log_max
X_test[spend_cols] = np.log1p(X_test[spend_cols]) / log_max

# Feature engineering

In [None]:
# Drop the columns that are not used as features. Rebinding (rather than
# inplace=True) keeps this step free of writes into a slice or alias of the
# raw frames.
X_train = X_train.drop(columns=['Name', 'PassengerId'])
X_test = X_test.drop(columns=['Name', 'PassengerId'])

# Replace destination names with short string codes.
d = {'55 Cancri e': '1', 'PSO J318.5-22':'2', 'TRAPPIST-1e': '3'}

# Assign the result back: calling .replace(..., inplace=True) on a column
# selection acts on a temporary object and does not update the frame under
# pandas Copy-on-Write.
X_train['Destination'] = X_train['Destination'].replace(d)
X_test['Destination'] = X_test['Destination'].replace(d)

In [None]:
def cabin_parser(row):
    """Split a passenger's cabin string into its deck and side parts.

    The cabin value is expected to look like ``'deck/num/side'``; the middle
    part is discarded.

    Parameters
    ----------
    row : pandas.Series or sequence
        One passenger record. The cabin is read from the ``'Cabin'`` label
        when available, otherwise from position 2 (the original behaviour).

    Returns
    -------
    list
        ``[deck, side]``, or ``[nan, nan]`` when the cabin is missing.

    Raises
    ------
    ValueError
        If the cabin string does not split into exactly three parts.
    """
    try:
        # Look up by label: positional integer access on a label-indexed
        # Series is deprecated and silently depends on column order.
        cabin = row['Cabin']
    except (KeyError, TypeError, IndexError):
        # Fallback for rows without a 'Cabin' label or for plain sequences.
        cabin = row.iloc[2] if hasattr(row, 'iloc') else row[2]

    # pd.isna covers None and every NaN object, unlike an identity test
    # against np.nan.
    if pd.isna(cabin):
        return [np.nan] * 2

    deck, _, side = cabin.split('/')
    return [deck, side]

In [None]:
new_cols = ['Deck', 'Side']

# Same two steps for each frame, train first and then test: expand the parsed
# cabin into the Deck/Side columns, then remove the raw Cabin column in place.
for frame in (X_train, X_test):
    frame[new_cols] = frame.apply(cabin_parser, axis=1, result_type='expand')
    frame.drop(columns=["Cabin"], inplace=True)

Let's convert the categorical features to one-hot vectors.

In [None]:
# prefix[i] is the name prefix used for the dummy columns created from cols[i].
names = ['CryoSleep', 'VIP', 'Side', 'Dest', 'Planet', 'Deck']
cols = ['CryoSleep', 'VIP', 'Side', 'Destination', 'HomePlanet', 'Deck']

X_train = pd.get_dummies(X_train, prefix=names, columns=cols, dtype='float64')

X_test = pd.get_dummies(X_test, prefix=names, columns=cols, dtype='float64')

# The two frames are encoded independently, so a category present in only one
# of them would give train and test different columns. Align test to the
# training layout: categories missing from test become all-zero columns,
# categories unseen in training are dropped, and column order matches.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0.0)

For the test dataset, find the number of passengers who were in the same group as passengers from the train dataset.

Now we get rid of the remaining gaps by filling them with zeros.

In [None]:
# Replace every remaining missing value with 0, in place, in both frames.
for frame in (X_train, X_test):
    frame.fillna(0, axis=0, inplace=True)

Let's look at the processed dataset.

In [None]:
# Display the first five rows of the processed training features.
X_train.head(n=5)

# Visualization of correlations

In [None]:
# Work on a copy of the features with the target attached as an extra column,
# so the correlation matrix includes Transported.
df = X_train.assign(Transported=Y_train.copy())

sb.set(rc={'figure.figsize': (15, 10)})

# Fix the colour scale to the full [-1, 1] correlation range.
corr_matrix = df.corr()
sb.heatmap(corr_matrix, vmin=-1, vmax=1, cmap='coolwarm');

We can see that the *Transported* column is strongly correlated with the *CryoSleep* column and more weakly correlated with the spending columns.

# Building a model

In [None]:
def titanic_nn_model(input_shape, hidden_units=(2000, 5000, 2000)):
    """Build an (uncompiled) fully connected binary classifier.

    Parameters
    ----------
    input_shape : int or tuple of int
        Shape of one input sample, without the batch dimension. A bare
        integer ``n`` is treated as ``(n,)``.
    hidden_units : sequence of int, optional
        Width of each hidden ReLU layer, in order. The default reproduces
        the original 2000-5000-2000 stack.

    Returns
    -------
    tf.keras.Model
        Model mapping the input to a single sigmoid output.
    """
    # Normalise a bare feature count to a shape tuple so the input layer
    # always receives an iterable shape.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (int(input_shape),)

    X = layers.Input(shape=input_shape)

    Y = X
    for units in hidden_units:
        Y = layers.Dense(units, activation='relu')(Y)
    Y = layers.Dense(1, activation='sigmoid')(Y)

    model = tf.keras.Model(inputs=X, outputs=Y)

    return model

In [None]:
# Container for every fitted ensemble member; the training cells append to it.
models = list()

In [None]:
# Train 15 independently initialised networks and add them to the ensemble.
epochs = 5  # loop-invariant, so set once
n_features = X_train.shape[1]  # derive the input width from the data instead of hard-coding it

for _ in range(15):
    model_nn = titanic_nn_model((n_features,))
    # Keyword arguments: the positional order of compile() after `loss`
    # differs between Keras versions, so a positional metrics list can be
    # bound to the wrong parameter.
    model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    model_nn.fit(X_train, Y_train, epochs=epochs, verbose=0)
    models.append(model_nn)

In [None]:
# One gradient-boosting classifier per ensemble size: 100, 110, ..., 290 trees.
for n_trees in range(100, 300, 10):
    # fit() returns the estimator itself, so construction and fitting chain.
    model_boosting = ensemble.GradientBoostingClassifier(n_estimators=n_trees).fit(X_train, Y_train)
    models.append(model_boosting)

In [None]:
# Fifteen random forests of 200 trees each, all with the same configuration.
for _ in range(15):
    # fit() returns the estimator itself, so construction and fitting chain.
    model_forest = ensemble.RandomForestClassifier(n_estimators=200).fit(X_train, Y_train)
    models.append(model_forest)

In [None]:
# s[k] counts how many models produced a prediction above 0.5 for test row k.
s = np.zeros(len(X_test))
for model in models:
    # Threshold the prediction at 0.5 and flatten it to one vote per row.
    votes = model.predict(X_test) > 0.5
    s = s + votes.reshape(-1)

In [None]:
# A row is predicted True only when strictly more than 37 models voted for it.
# NOTE(review): the training cells append 15 + 20 + 15 = 50 models when each is
# run exactly once, so this is a 38-of-50 supermajority - confirm it is intended.
Y_pred = np.greater(s, 37)

# Submission

In [None]:
# Start from the sample submission, overwrite its Transported column with the
# ensemble prediction, and write the result without the index column.
submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv').assign(Transported=Y_pred)
submission.to_csv('submission.csv', index=False)