In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### About the data

The task is to predict whether a passenger was transported to an alternate dimension during the Spaceship Titanic's collision with the spacetime anomaly. To make these predictions,  a set of personal records recovered from the ship's damaged computer system is given.

Following are the attributes:
- PassengerId - A unique Id for each passenger. Each Id takes the form gggg_pp where gggg indicates a group the passenger is travelling with and pp is their number within the group. People in a group are often family members, but not always.
- HomePlanet - The planet the passenger departed from, typically their planet of permanent residence.
- CryoSleep - Indicates whether the passenger elected to be put into suspended animation for the duration of the voyage. Passengers in cryosleep are confined to their cabins.
- Cabin - The cabin number where the passenger is staying. Takes the form deck/num/side, where side can be either P for Port or S for Starboard.
- Destination - The planet the passenger will be debarking to.
- Age - The age of the passenger.
- VIP - Whether the passenger has paid for special VIP service during the voyage.
- RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Amount the passenger has billed at each of the Spaceship Titanic's many luxury amenities.
- Name - The first and last names of the passenger.
- Transported - Whether the passenger was transported to another dimension. This is the target, the column you are trying to predict.

In [None]:
train_data = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test_data = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')

## EDA

In [None]:
train_data.head()

In [None]:
train_data.shape

The training data consists of 8693 training examples and 14 attributes.

In [None]:
train_data.info()

In [None]:
train_data.isnull().sum()

I can see there are null values present for some attributes such as HomePlanet, CryoSleep, Cabin, Age, etc. These need to be taken care of during data preprocessing.

#### A careful handling of missing values is needed for each attribute.

In [None]:
categorical_features = train_data.select_dtypes('object').columns.to_list()
print(categorical_features)

numerical_features = train_data.drop(['Transported'], axis=1).select_dtypes(np.number).columns.to_list()
print(numerical_features)

#### Straightforward ones - RoomService, FoodCourt, ShoppingMall, Spa, VRDeck - Fill with 0s

In [None]:
# Spend columns are all numeric features except Age ('Transported' is dropped up front as well).
spend_feature_list = (
    train_data.drop(['Transported', 'Age'], axis=1)
    .select_dtypes(np.number)
    .columns.to_list()
)
# A missing bill is treated as "nothing spent"; the same rule is applied to train and test.
for frame in (train_data, test_data):
    frame[spend_feature_list] = frame[spend_feature_list].fillna(0)

#### Use KNNImputer for Age

In [None]:
from sklearn.impute import KNNImputer

train_data_num = train_data[numerical_features]
numerical_imputer = KNNImputer(n_neighbors = 2)
train_num_transformed = numerical_imputer.fit_transform(train_data_num)

train_num_df = pd.DataFrame(train_num_transformed,
                           columns=numerical_features,
                           index=train_data.index)
print(train_num_df.head())
print('------------------')
print('Missing values:')
print(train_num_df.isnull().sum())

In [None]:
# Impute the test set with the imputer that was already fitted on the training
# data. Only `transform` is called: re-fitting on the test rows would fill the
# train and test frames from two different neighbour sets.
test_data_num = test_data[numerical_features]
test_num_transformed = numerical_imputer.transform(test_data_num)

# Rebuild a DataFrame with the original column names and row labels.
test_num_df = pd.DataFrame(test_num_transformed,
                           columns=numerical_features,
                           index=test_data.index)
print(test_num_df.head())
print('------------------')
print('Missing values:')
print(test_num_df.isnull().sum())

In [None]:
target = train_data['Transported']
train_cat_df = train_data[categorical_features]
train_df = pd.concat([train_cat_df, train_num_df], axis=1)
train_df['Transported'] = target
train_df

In [None]:
test_cat_df = test_data[categorical_features]
test_df = pd.concat([test_cat_df, test_num_df], axis=1)
test_df

#### All numeric attribute missing values are taken care of.

In [None]:
train_df.describe()

##### We need some more understanding before handling missing values for categorical attributes. 

Let's use seaborn for some visualizations!

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Correlation is only defined for numeric/boolean columns. train_df still holds
# string columns (PassengerId, HomePlanet, Cabin, ...), and calling corr() on
# the whole frame raises on pandas >= 2.0, so select the usable columns first.
plt.figure(figsize=(10,8))
sns.heatmap(train_df.select_dtypes(include=['number', 'bool']).corr(), annot=True)

In [None]:
train_df['Transported'] = train_df['Transported'].astype(int)

In [None]:
sns.countplot(data=train_df, x='Transported')

#### Looks good! Dataset is class balanced.

In [None]:
fig, ax = plt.subplots(2,2, figsize=(15,10))
sns.countplot(x='CryoSleep', data=train_df, hue='Transported', ax=ax[0][0])
sns.countplot(x='VIP', data=train_df, hue='Transported', ax=ax[0][1])
sns.countplot(x='Destination', data=train_df, hue='Transported', ax=ax[1][0])
sns.countplot(x='HomePlanet', data=train_df, hue='Transported', ax=ax[1][1])

Some conclusions:
- If CryoSleep = False, more likely to not get transported. If CryoSleep = True, more likely to get transported.
- VIP status doesn't matter in your fate for being transported or not.
- If Destination = 55 Cancri, then more likely to get transported. If Destination = TRAPPIST-1e, then slightly  lower chances of getting transported. If Destination = PSO J318.5-22, then 50-50 chances, can't really say :/
- If HomePlanet = Europa, then slightly better chances of getting transported compared to the other two home planets.

#### Age

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(data=train_df, x='Age')

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(data=train_df,x='Transported', y='Age')

From the above boxplot, we can conclude that there is no visible correlation between the age of a person and the target variable.

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(data=train_df, x = 'Age', hue = 'Transported', multiple = 'stack')

#### Age Categorization

In [None]:
def age_categorize(row):
    """Map a passenger's 'Age' to a coarse age-group label.

    Bands (lower bound inclusive, upper bound exclusive):
    [0, 13) -> 'Child', [13, 20) -> 'Teen', [20, 35) -> 'Young Adult',
    [35, 55) -> 'Middle-aged Adult'. Every other value -- 55 and above,
    negative numbers, or NaN -- falls through to 'Elderly'.

    `row` is anything indexable by the key 'Age' (e.g. a DataFrame row).
    """
    age = row['Age']
    bands = (
        (0, 13, 'Child'),
        (13, 20, 'Teen'),
        (20, 35, 'Young Adult'),
        (35, 55, 'Middle-aged Adult'),
    )
    for lower, upper, label in bands:
        if lower <= age < upper:
            return label
    return 'Elderly'

In [None]:
# Derive the AgeGroup label row by row; the function is passed directly, no lambda wrapper needed.
train_df['AgeGroup'] = train_df.apply(age_categorize, axis=1)
test_df['AgeGroup'] = test_df.apply(age_categorize, axis=1)

In [None]:
plt.figure(figsize=(8,6))
sns.histplot(data=train_df, x = 'AgeGroup', hue = 'Transported', multiple='stack', shrink=0.8)

#### PassengerId
PassengerId is of the form gggg_pp where gggg is the group and pp is ID.

Split the PassengerId column into two columns - Group and PersonNumber. I think, PersonNumber has no significance of its own though.

In [None]:
train_df['PassengerId']

In [None]:
train_df[['Group', 'PersonNumber']] = train_df['PassengerId'].str.split('_', expand=True)
test_df[['Group', 'PersonNumber']] = test_df['PassengerId'].str.split('_', expand=True)

In [None]:
train_df['Group'].value_counts()

There are 6217 unique groups. Now, a group can have passengers from same family, but not always.
Let's find out how many families are on board.

#### Name - handle missing values carefully.
Passengers from same family should come from same home planet, and we also assume that they go to same destination, for simplicity. Use this assumption to fill empty fields.

In [None]:
train_df['Name'].isnull().sum()

In [None]:
missing_names = train_df[train_df['Name'].isna()][['HomePlanet','Destination','Group','Name']]
missing_names 

In [None]:
missing_names.groupby('HomePlanet').count()

#### Look for passengers in the same group who share the home planet or the destination, and take their last name. The passenger with the missing name is probably from the same family.

In [None]:
def find_passenger(ind, hp, dst, grp, df):
    """Return the 'Name' values of group-mates sharing a home planet or destination.

    A row of `df` is selected when its 'Group' equals `grp` and, in addition,
    its 'HomePlanet' equals `hp` or its 'Destination' equals `dst`. The names
    are returned as a plain list in the row order of `df`; missing names are
    not filtered out, so the list may contain NaN entries.

    `ind` is accepted but not used.
    """
    same_group = df['Group'] == grp
    same_route = (df['HomePlanet'] == hp) | (df['Destination'] == dst)
    return df.loc[same_group & same_route, 'Name'].tolist()

In [None]:
# Work on an explicit copy so the assignments below do not trigger
# SettingWithCopyWarning on the slice taken from train_df.
missing_names = missing_names.copy()

recovered_names = {}
for index, row in missing_names.iterrows():
    homeplanet = row['HomePlanet']
    dest = row['Destination']
    group = row['Group']
    name_list = find_passenger(index, homeplanet, dest, group, train_df)
    # The candidate list can contain NaN (typically the nameless passenger
    # itself). Keep only real names so that a trailing NaN cannot wipe out a
    # surname that was already found.
    known_names = [name for name in name_list if isinstance(name, str)]
    if known_names:
        recovered_names[index] = 'Dummy ' + known_names[-1].split()[-1]

# Write through .at on the frame itself: the `row` objects yielded by
# iterrows() may be copies, so assigning to them is not guaranteed to stick.
for index, name in recovered_names.items():
    missing_names.at[index, 'Name'] = name

In [None]:
missing_names

In [None]:
missing_names.isna().sum()

#### Still some missing names, try to match them using only group.

In [None]:
def find_passenger_with_group(ind, group, df):
    """Return the 'Name' values of every row of `df` whose 'Group' equals `group`.

    The result is a plain list in the row order of `df`; missing names are
    kept as NaN entries. `ind` is accepted but not used.
    """
    in_group = df['Group'].eq(group)
    return df.loc[in_group, 'Name'].tolist()

In [None]:
# Work on an explicit copy so the assignments below do not trigger
# SettingWithCopyWarning on the slice taken from train_df.
missing_names = missing_names.copy()

recovered_names = {}
for index, row in missing_names.iterrows():
    if isinstance(row['Name'], str):
        continue  # this passenger already has a (dummy) name
    name_list = find_passenger_with_group(index, row['Group'], train_df)
    # Ignore NaN candidates (the passenger itself is part of its own group) so
    # that a trailing NaN cannot erase a surname that was found.
    known_names = [name for name in name_list if isinstance(name, str)]
    if known_names:
        recovered_names[index] = 'Dummy ' + known_names[-1].split()[-1]

# Write through .at on the frame itself: the `row` objects yielded by
# iterrows() may be copies, so assigning to them is not guaranteed to stick.
for index, name in recovered_names.items():
    missing_names.at[index, 'Name'] = name

In [None]:
missing_names.isna().sum()

Move these values back to original dataframe.

In [None]:
# missing_names was sliced out of train_df with a boolean mask, so it still
# carries train_df's row labels. A label-aligned assignment therefore copies
# each recovered name to the right passenger without a positional counter.
org_indices_with_missing_names = missing_names.index
print(org_indices_with_missing_names)
train_df.loc[org_indices_with_missing_names, 'Name'] = missing_names['Name']

#### I'm gonna use ffill for the rest of missing names

In [None]:
# Series.ffill() is the supported spelling; fillna(method='ffill') is
# deprecated since pandas 2.1. Each remaining gap takes the previous row's name.
train_df['Name'] = train_df['Name'].ffill()
test_df['Name'] = test_df['Name'].ffill()
train_df['Name'].isnull().sum()

In [None]:
test_df['Name'].isnull().sum()

#### Split Name into FirstName and LastName

In [None]:
train_df[['FirstName', 'LastName']] = train_df['Name'].str.split(' ', expand = True)
test_df[['FirstName', 'LastName']] = test_df['Name'].str.split(' ', expand = True)

In [None]:
train_df['LastName'].value_counts()

#### Total number of families on board = 2217

Out of curiosity, do passengers belonging to same family have same destination and home planets? Let's try to check.

In [None]:
train_df.loc[train_df['LastName'] == 'Acobson']

Not necessarily the same destination.

In [None]:
train_df.groupby(by = ['LastName'])['Destination'].unique()

In [None]:
train_df.groupby(by = ['LastName'])['HomePlanet'].nunique()

However, passengers from same family are most likely coming from same planet. And it makes complete sense.

#### Now, how many relatives or family members does a passenger have in the spacecraft?

In [None]:
train_relatives = train_df.groupby('LastName')['PassengerId'].count().reset_index()
train_relatives = train_relatives.rename(columns = {'PassengerId':'NumRelatives'})
train_relatives

In [None]:
train_df = train_df.merge(train_relatives[['LastName','NumRelatives']], how='left', on=['LastName'])
train_df.head()

In [None]:
test_relatives = test_df.groupby('LastName')['PassengerId'].count().reset_index()
test_relatives = test_relatives.rename(columns = {'PassengerId':'NumRelatives'})
test_df = test_df.merge(test_relatives[['LastName', "NumRelatives"]], how='left', on=['LastName'])

#### Number of people traveling together using Group

In [None]:
train_grpsize = train_df.groupby('Group')['PassengerId'].count().reset_index()
train_grpsize = train_grpsize.rename(columns = {'PassengerId':'GroupSize'})
train_grpsize

In [None]:
train_df = train_df.merge(train_grpsize[['Group','GroupSize']], how='left', on=['Group'])
train_df.head()

In [None]:
test_grpsize = test_df.groupby('Group')['PassengerId'].count().reset_index()

In [None]:
test_grpsize = test_grpsize.rename(columns = {'PassengerId':'GroupSize'})
test_df = test_df.merge(test_grpsize[['Group','GroupSize']], how='left', on=['Group'])

In [None]:
test_df.head()

#### Total amount spent by the passengers during their space journey.

In [None]:
train_df['TotalCost'] = train_df['RoomService'] + train_df['FoodCourt'] + train_df['ShoppingMall'] + train_df['Spa'] + train_df['VRDeck']
test_df['TotalCost'] = test_df['RoomService'] + test_df['FoodCourt'] + test_df['ShoppingMall'] + test_df['Spa'] + test_df['VRDeck']
train_df.head()

#### Which family spent the most?

In [None]:
# Select TotalCost before aggregating. Summing every column per family would
# also try to add up the text columns (some still contain NaN at this point),
# which is wasteful and can raise on recent pandas versions.
train_df.groupby('LastName')['TotalCost'].sum().sort_values()

The Hetforhaft family spent the most of all!

In [None]:
train_df[train_df['LastName'] == 'Hetforhaft']

### HomePlanet attribute

In [None]:
print(train_df['HomePlanet'].isnull().sum())
print(test_df['HomePlanet'].isnull().sum())
print(train_df['HomePlanet'].mode()[0])

In [None]:
train_df['HomePlanet'] = train_df['HomePlanet'].fillna(train_df['HomePlanet'].mode()[0])
test_df['HomePlanet'] = test_df['HomePlanet'].fillna(train_df['HomePlanet'].mode()[0])
print(train_df['HomePlanet'].isnull().sum())
print(test_df['HomePlanet'].isnull().sum())

### CryoSleep attribute


In [None]:
train_df['CryoSleep'].isnull().sum()

In [None]:
train_df['CryoSleep']

In [None]:
def cryosleep_values(row):
    """Fill a missing 'CryoSleep' flag from the passenger's total spend.

    A 'CryoSleep' value whose type is exactly `float` is treated as missing
    (presumably the NaN marker of a missing entry). Such a value becomes True
    when 'TotalCost' is 0 and False otherwise; passengers in cryosleep are
    confined to their cabins, so zero spend is taken as the cryosleep signal.
    Any non-float value is returned unchanged.
    """
    current = row['CryoSleep']
    if type(current) is not float:
        return current
    return True if row['TotalCost'] == 0 else False

train_df['CryoSleep'] = train_df.apply(lambda row: cryosleep_values(row), axis=1)
    

In [None]:
train_df['CryoSleep'].isnull().sum()

#### Got rid of all missing values of CryoSleep!
Now apply the same rule to the test set.

In [None]:
test_df['CryoSleep'].isnull().sum()

In [None]:
test_df['CryoSleep'] = test_df.apply(lambda row: cryosleep_values(row), axis=1)
print(test_df['CryoSleep'].isnull().sum())

### Splitting the Cabin attribute.

In [None]:
print(train_df['Cabin'].isnull().sum())
print(test_df['Cabin'].isnull().sum())

In [None]:
train_df['Cabin'] = train_df['Cabin'].fillna(train_df['Cabin'].mode()[0])
test_df['Cabin'] = test_df['Cabin'].fillna(train_df['Cabin'].mode()[0])
print(train_df['Cabin'].isnull().sum())
print(test_df['Cabin'].isnull().sum())

In [None]:
train_df[['CabinDeck', 'CabinNum', 'CabinSide']] = train_df['Cabin'].str.split('/', expand = True)
test_df[['CabinDeck', 'CabinNum', 'CabinSide']] = test_df['Cabin'].str.split('/', expand = True)
train_df.head(10)

In [None]:
sns.histplot(data=train_df, x = 'CabinDeck', hue = 'Transported', multiple = 'stack', shrink=0.8)

In [None]:
sns.histplot(data=train_df, x = 'CabinSide', hue = 'Transported', multiple = 'stack', shrink=.8)

### VIP attribute

In [None]:
print(train_df['VIP'].isnull().sum())
print(test_df['VIP'].isnull().sum())

In [None]:
train_df['VIP'].value_counts()

#### What is special about VIP passengers? I don't know.

In [None]:
sns.displot(train_df[train_df['VIP']==True]['Age'])
plt.title('Age distribution of VIP passengers')

In [None]:
sns.displot(train_df[train_df['VIP']==True]['TotalCost'])
plt.title('Spend distribution of VIP passengers')

In [None]:
train_df['VIP'] = train_df['VIP'].fillna(False)
test_df['VIP'] = test_df['VIP'].fillna(False)

In [None]:
# Turn the two boolean flags into their string form ('True' / 'False') in both
# frames so they are picked up as categorical (object) columns later on.
for frame in (train_df, test_df):
    for flag_column in ('VIP', 'CryoSleep'):
        frame[flag_column] = frame[flag_column].map(str)

In [None]:
train_df.dtypes

### Final data preprocessing

In [None]:
train_df1 = train_df.drop(['PassengerId',
               'Name', 
#                'FirstName', 
#                'LastName', 
               'PersonNumber', 
               'CabinNum'], axis = 1)
test_df1 = test_df.drop(['PassengerId',
              'Name', 
#               'FirstName', 
#               'LastName', 
              'PersonNumber', 
              'CabinNum'], axis = 1)

#### Label encoding 

In [None]:
from sklearn.preprocessing import LabelEncoder
def label_encoding(train, test, columns_train, columns_test):
    """Integer-encode categorical columns of `train` and `test` consistently.

    Columns listed in both `columns_train` and `columns_test` are encoded with
    ONE encoder fitted on the values of both frames, so a given category maps
    to the same integer in train and test. Fitting a separate encoder per
    frame would assign codes by each frame's own sorted categories, and the
    same code would then mean different things in the two frames whenever
    their category sets differ. Columns that appear in only one of the lists
    are encoded on their own frame.

    Parameters
    ----------
    train, test : DataFrame
        Input frames; they are copied and never modified in place.
    columns_train, columns_test : list of str
        Names of the columns to encode in `train` and `test` respectively.

    Returns
    -------
    (DataFrame, DataFrame)
        Encoded copies of `train` and `test`.
    """
    train = train.copy()
    test = test.copy()
    train_only = [col for col in columns_train if col not in columns_test]
    test_only = [col for col in columns_test if col not in columns_train]
    shared = [col for col in columns_train if col in columns_test]

    for col in shared:
        encoder = LabelEncoder()
        # Fit on the union of both frames so the code book is shared.
        encoder.fit(pd.concat([train[col], test[col]], ignore_index=True))
        train[col] = encoder.transform(train[col])
        test[col] = encoder.transform(test[col])
    for col in train_only:
        train[col] = LabelEncoder().fit_transform(train[col])
    for col in test_only:
        test[col] = LabelEncoder().fit_transform(test[col])
    return train, test

In [None]:
categorical_features_train = train_df1.select_dtypes('object').columns.to_list()
categorical_features_test = test_df1.select_dtypes('object').columns.to_list()
print(categorical_features_train)
print(train_df1.dtypes)
train_df2, test_df2 = label_encoding(train_df1, test_df1, categorical_features_train, categorical_features_test)

In [None]:
train_df2.head()

In [None]:
test_df2.head()

#### One-hot encoding

In [None]:
# One-hot encode both frames with the same column list.
one_hot_columns = ['HomePlanet','Destination','AgeGroup','CabinDeck','CabinSide','CryoSleep', 'VIP']
train_df2 = pd.get_dummies(train_df2, columns=one_hot_columns)
test_df2 = pd.get_dummies(test_df2, columns=one_hot_columns)

# get_dummies only creates columns for the categories present in each frame,
# so align the test frame to the training feature columns (same set, same
# order). Categories missing from test become all-zero columns; dummy columns
# that exist only in test are dropped because no model is trained on them.
feature_columns = train_df2.columns.drop('Transported')
test_df2 = test_df2.reindex(columns=feature_columns, fill_value=0)

In [None]:
train_df2

### Now we can start training our models.

### Train-test split

In [None]:
from sklearn.model_selection import train_test_split
X = train_df2.drop(['Transported'], axis=1)
y = train_df2['Transported']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

## Models


### Data standardization

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Random Forest

In [None]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [None]:
from sklearn.ensemble import RandomForestClassifier

random_forest_clf = RandomForestClassifier(max_depth=10,
                                          random_state=101)
# cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 101)
# n_scores = cross_val_score(random_forest_clf, X_train_scaled, y_train, scoring='accuracy', cv=cv, n_jobs=1, error_score='raise')
# print(np.mean(n_scores))
random_forest_clf.fit(X_train_scaled, y_train)
# random_forest_clf.score(X_train_scaled, y_train)

In [None]:
from sklearn.metrics import accuracy_score
rf_predictions = random_forest_clf.predict(X_test_scaled)
rf_acc = accuracy_score(y_test, rf_predictions)
rf_acc

Prediction on given test set

In [None]:
test_df.head()

In [None]:
# test_df_scaled = scaler.transform(test_df)

In [None]:
# The forest was fitted on standardized features (X_train_scaled), so the
# competition test set has to pass through the same fitted scaler first.
y_pred_rf = random_forest_clf.predict(scaler.transform(test_df2))
y_pred_rf

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logistic_regression_clf = LogisticRegression(max_iter=10000)

logistic_regression_clf.fit(X_train_scaled, y_train)
# logistic_regression_clf.score(X_train_scaled, y_train)

In [None]:
lr_predictions = logistic_regression_clf.predict(X_test_scaled)
lr_acc = accuracy_score(y_test, lr_predictions)
lr_acc

### Support Vector Classifier

In [None]:
from sklearn.svm import SVC

svc = SVC()
svc.fit(X_train_scaled, y_train)
# svc.score(X_train_scaled, y_train)

In [None]:
svc_predictions = svc.predict(X_test_scaled)
svc_acc = accuracy_score(y_test, svc_predictions)
svc_acc

### CatBoost

In [None]:
from catboost import CatBoostClassifier

catboost_clf = CatBoostClassifier(iterations=1000, eval_metric='Accuracy', verbose=100)
catboost_clf.fit(X_train_scaled, y_train);

In [None]:
catboost_predictions = catboost_clf.predict(X_test_scaled)
# catboost_clf.plot_tree(0)
accuracy_score(y_test, catboost_predictions)

### XGBoost

In [None]:
from xgboost import XGBClassifier

xgb_clf = XGBClassifier()
xgb_clf.fit(X_train_scaled, y_train)

In [None]:
xgb_predictions = xgb_clf.predict(X_test_scaled)
accuracy_score(y_test, xgb_predictions)

### Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

n_features = X_train.shape[1]

early_stop = EarlyStopping(patience = 15, monitor='val_loss', restore_best_weights=True)
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(BatchNormalization())
model.add(Dense(64, activation = 'relu', kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Dense(32, activation = 'relu', kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Dense(32, activation='relu', kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Dense(16, activation='relu', kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Dense(16, activation='relu', kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Dense(8, activation='relu', kernel_initializer='he_normal'))
model.add(BatchNormalization())
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

In [None]:
history = model.fit(X_train_scaled, y_train, epochs=100, verbose=1, validation_split=0.15, batch_size=32, callbacks=[early_stop])

In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Loss')
plt.legend(['Train','Val'])

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Accuracy')
plt.legend(['Train','Val'])

In [None]:
nn_predictions = model.predict(X_test_scaled).round().astype(int)
# nn_predictions = np.argmax(nn_predictions, axis=-1)
accuracy_score(y_test, nn_predictions)

#### VotingClassifier

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

clf1 = LogisticRegression(max_iter=10000, random_state=101)
clf2 = RandomForestClassifier(n_estimators=100, random_state=101)
clf3 = CatBoostClassifier(iterations=1000, eval_metric='Accuracy', verbose=100)
clf4 = XGBClassifier()


eclf = VotingClassifier(
    estimators=[("lr", clf1), ("rf", clf2), ("cat", clf3), ("xgb", clf4)],
    voting="soft",
    weights=[1, 1, 5, 1],
)

# predict class probabilities for all classifiers
probas = [c.fit(X_train_scaled, y_train).predict_proba(X_train_scaled) for c in (clf1, clf2, clf3, clf4, eclf)]

# get class probabilities for the first sample in the dataset
class1_1 = [pr[0, 0] for pr in probas]
class2_1 = [pr[0, 1] for pr in probas]


# plotting
N = 5  # number of groups: four base classifiers + the voting ensemble
ind = np.arange(N)  # group positions
width = 0.35  # bar width

fig, ax = plt.subplots()

# bars for the four base classifiers (the ensemble slot is left at 0)
p1 = ax.bar(ind, class1_1[:-1] + [0], width, color="green", edgecolor="k")
p2 = ax.bar(
    ind + width,
    class2_1[:-1] + [0],
    width,
    color="lightgreen",
    edgecolor="k",
)

# bars for VotingClassifier
p3 = ax.bar(ind, [0, 0, 0, 0, class1_1[-1]], width, color="blue", edgecolor="k")
p4 = ax.bar(
    ind + width, [0, 0, 0, 0, class2_1[-1]], width, color="steelblue", edgecolor="k"
)

# plot annotations
# Dashed separator between the last base classifier and the ensemble group.
# It is derived from N so it stays in the gap before the final group (3.8 for
# N = 5) instead of a position hard-coded for a three-classifier plot.
plt.axvline(N - 1.2, color="k", linestyle="dashed")
ax.set_xticks(ind + width)
ax.set_xticklabels(
    [
        "LogisticRegression\nweight 1",
        "RandomForestClassifier\nweight 1",
        "CatBoostClassifier\nweight 5",
        "XGBoostClassifier\nweight 1",
        "VotingClassifier\n(average probabilities)",
    ],
    rotation=40,
    ha="right",
)
plt.ylim([0, 1])
plt.title("Class probabilities for sample 1 by different classifiers")
plt.legend([p1[0], p2[0]], ["class 1", "class 2"], loc="upper left")
plt.tight_layout()
plt.show()

In [None]:
ensemble_predictions = eclf.predict(X_test_scaled)
accuracy_score(y_test, ensemble_predictions)

### Submission

##### Catboost prediction on test set

In [None]:
# CatBoost was trained on standardized features (X_train_scaled); apply the
# same fitted scaler to the competition test set before predicting.
y_catboost_predictions = catboost_clf.predict(scaler.transform(test_df2))

##### NN, CatBoost and ensemble predictions on the scaled test set

In [None]:
test_df_scaled = scaler.transform(test_df2)
y_nn_predictions = model.predict(test_df_scaled).round().astype(int)
y_catboost_predictions = catboost_clf.predict(test_df_scaled)
y_ensemble_predictions = eclf.predict(test_df_scaled)
accuracy_score(y_catboost_predictions, y_ensemble_predictions)

In [None]:
y_ensemble_predictions

In [None]:
submission = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')
# The ensemble predicts 0/1 labels while the submission column holds booleans.
# Cast explicitly instead of mapping through replace(), which relies on
# implicit dtype downcasting (deprecated in pandas 2.2).
submission['Transported'] = np.asarray(y_ensemble_predictions).astype(bool)
submission

In [None]:
submission.to_csv('submission.csv', index=False)