In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import pandas_profiling
from xgboost import XGBClassifier
from sklearn.base import TransformerMixin
from sklearn.model_selection import RandomizedSearchCV, train_test_split, KFold
from matplotlib import pyplot as plt

sns.set()

In [None]:
# Raw competition data. These two frames are only read by the EDA cells.
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')

# Features are every column but the last; the last column is the target.
# Independent copies are taken so that the in-place feature engineering applied
# to X_train / X_test later neither mutates the raw frames nor operates on a
# slice of train_df (which triggers SettingWithCopyWarning).
X_train = train_df.iloc[:, :-1].copy()
Y_train = train_df.iloc[:, -1]

X_test = test_df.copy()

# Exploratory data analysis

In [None]:
# Print column dtypes and non-null counts, then display the first ten rows.
train_df.info()
train_df.head(10)

Let's look at data with pandas profiling report.

In [None]:
# Profiling report for the training set; shown as the cell output.
pandas_profiling.ProfileReport(train_df)

In [None]:
# Profiling report for the test set; shown as the cell output.
pandas_profiling.ProfileReport(test_df)

Take a look at the general structure of the data:

1. All columns with exception of PassengerId and Transported have data gaps.
2. CryoSleep and VIP are boolean.
3. HomePlanet and Destination are categorical.
4. We need to parse PassengerId and Cabin to get the group and cabin number for each passenger, because it is more likely that people in the same group and cabin were transported together.
5. We need to scale the expense columns (a log scale will probably work).
6. We don't need Name column.
7. We can also see that the target column is balanced.

Now we will compare distributions for train and test set.

## Categorical features analysis

At first, look at categorical features.

In [None]:
cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']

fig, axes = plt.subplots(len(cat_cols), 1, figsize=(15, 30))

for axis, col in zip(axes, cat_cols):
    parts = []
    for set_name, frame in (('Train', train_df), ('Test', test_df)):
        # Share of each category among ALL rows of the set (rows with a missing
        # value stay in the denominator), so sets of different size compare.
        shares = frame[col].value_counts() / len(frame)
        # Explicit column names: the name of the column produced by
        # value_counts().to_frame() differs between pandas versions.
        parts.append(pd.DataFrame({
            'Category': [str(value) for value in shares.index],
            'Share': shares.to_numpy(),
            'Set': set_name,
        }))
    col_data = pd.concat(parts, ignore_index=True)
    sns.barplot(data=col_data, x='Category', y='Share', hue='Set', ax=axis)
    axis.set(xlabel=None, ylabel=None, title=col)

Categorical features for both sets have the same distribution.

Now let's look at their connection with target column.

In [None]:
fig, axes = plt.subplots(len(cat_cols), 1, figsize=(15, 30))

for axis, col in zip(axes, cat_cols):
    parts = []
    # Y_train == 1 marks transported passengers, Y_train == 0 the others.
    for target_value, label in ((0, 'Not transported'), (1, 'Transported')):
        counts = train_df.loc[Y_train == target_value, col].value_counts()
        # Explicit column names: the name of the column produced by
        # value_counts().to_frame() differs between pandas versions.
        parts.append(pd.DataFrame({
            'Category': [str(value) for value in counts.index],
            'Count': counts.to_numpy(),
            'Target': label,
        }))
    col_data = pd.concat(parts, ignore_index=True)
    sns.barplot(data=col_data, x='Category', y='Count', hue='Target', ax=axis)
    axis.set(xlabel=None, ylabel=None, title=col)

## Numerical feature analysis

Now we should look at numeric features.

In [None]:
num_cols = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

fig, axes = plt.subplots(len(num_cols), 1, figsize=(15, 30))

for i, (axis, col) in enumerate(zip(axes, num_cols)):
    # The first column (Age) uses a linear axis; all others use a base-10 log axis.
    log_scale = (i > 0) * 10
    # Name the two columns so seaborn builds the legend itself. Relabelling the
    # legend by position afterwards depends on the order in which the artists
    # were drawn and can attach the wrong label to a distribution.
    col_data = pd.concat([train_df[col], test_df[col]], axis=1, keys=['Train', 'Test'])
    if log_scale:
        # Zero cannot be placed on a log axis, so mask it out.
        col_data = col_data.replace({0: np.nan})
        sns.histplot(data=col_data, ax=axis, kde=True, log_scale=log_scale)
    else:
        sns.histplot(data=col_data, ax=axis, kde=True, bins=40)
    axis.set(xlabel=col)

Numeric features also have almost the same distribution. 

Let's look at the correlation of the numeric features with the target column and with each other.

In [None]:
money_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Expense columns on the log(1 + x) scale in base 1000, with the target attached.
log_expenses = np.log1p(X_train[money_cols]) / np.log(1000)
df = log_expenses.assign(Transported=Y_train.copy())

sns.set(rc={'figure.figsize': (15, 10)})
sns.heatmap(df.corr(), vmin=-1, vmax=1, cmap='coolwarm');

All money columns have high positive correlations with each other and high negative correlation with target column. So it may be useful to find sum of the money columns.

# Feature engineering and scaling

Get rid of *Name* column.

In [None]:
# Remove the Name column from both feature frames, modifying them in place.
for frame in (X_train, X_test):
    frame.drop(columns=['Name'], inplace=True)

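For the *RoomService, FoodCourt, ShoppingMall, Spa, VRDeck* columns, find their sum (*SummaryExpenses*) and replace each column with its share of that sum. NaN values are filled with zeroes (because in these columns at least 60% of the non-empty values equal 0). Then apply a logarithmic scale to the sum, $\log(1 + x) / \log(1000)$, i.e. a logarithm in base 1000.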

In [None]:
for frame in (X_train, X_test):
    # Total spending per passenger; sum() skips missing amounts.
    total = frame[money_cols].sum(axis=1)

    # Each service as a share of the total. A zero total (0 / 0) or a missing
    # amount yields NaN, which is replaced by 0.
    frame[money_cols] = frame[money_cols].div(total, axis=0).fillna(0)

    # Total on the log(1 + x) scale in base 1000. The result is assigned back
    # instead of calling fillna(inplace=True) on a selected column, which is
    # chained assignment and does not update the frame under Copy-on-Write.
    frame['SummaryExpenses'] = (np.log1p(total) / np.log(1000)).fillna(0)

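Fill NaN in the *VIP* column with its most common value, False (95%). The most common values of *HomePlanet* and *Destination* are Earth (54%) and Trappist-1e (68%) respectively, but these two columns are not filled in the cell below; their missing values are left for the target encoder applied later. To fill the *CryoSleep* column we use the information about expenses, because people who were in cryosleep did not use any services.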

In [None]:
for frame in (X_train, X_test):
    # Assign the result back: fillna(inplace=True) on a selected column is
    # chained assignment and does not update the frame under Copy-on-Write.
    # The explicit astype(bool) replaces the implicit object -> bool downcast
    # that fillna performs only on some pandas versions.
    frame['VIP'] = frame['VIP'].fillna(False).astype(bool)

    # A passenger with no recorded spending is assumed to have been in cryosleep.
    no_spending = frame['SummaryExpenses'] == 0
    frame['CryoSleep'] = frame['CryoSleep'].fillna(no_spending).astype(bool)

Fill NaN in *Age* column with the mean value.

In [None]:
# The mean is taken from the training set only and reused for the test set.
mean = X_train['Age'].mean()

for frame in (X_train, X_test):
    # Assign the result back: fillna(inplace=True) on a selected column is
    # chained assignment and does not update the frame under Copy-on-Write.
    frame['Age'] = frame['Age'].fillna(mean)

## Deal with PassengerId

From PassengerId column we need number of the group and number in the group.

In [None]:
def id_parser(row):
    """Return the group number encoded in ``row['PassengerId']``.

    The identifier is split on ``'_'`` into exactly two parts; the first part
    is returned as an ``int`` and the second part is discarded.
    """
    group_part, _member_part = row['PassengerId'].split('_')
    return int(group_part)

In [None]:
new_col = 'GroupNumber'

# Store the parsed group number in a new column, then discard the raw identifier.
for frame in (X_train, X_test):
    frame[new_col] = frame.apply(id_parser, axis=1)
    frame.drop(columns=['PassengerId'], inplace=True)

Look at new columns.

Now we will try to extract useful information from *GroupNumber*.

In [None]:
def get_group_dict(group_column):
    """Return a dict mapping each value of ``group_column`` to its number of occurrences.

    The counts come from ``group_column.value_counts()``.
    """
    return dict(group_column.value_counts())

In [None]:
group_dict = get_group_dict(X_train['GroupNumber'])
# Number of passengers sharing each passenger's group number. map() does a
# direct dictionary lookup; replace() with a large dict is far slower.
group_count = X_train['GroupNumber'].map(group_dict)

# Share of transported passengers for every group size from 1 to 8. A size
# that does not occur becomes NaN (a gap in the line) instead of raising
# ZeroDivisionError.
sizes = range(1, 9)
transported_rate = Y_train.groupby(group_count).mean().reindex(sizes)

ax = sns.lineplot(x=list(sizes), y=transported_rate.to_numpy())
ax.set(xlabel='Passengers in group', ylabel='Share transported');

We also see that the number of people in a group can influence the target column. Let's add this column to our data.

In [None]:
# Replace each group number with the number of passengers sharing it, counted
# separately inside each frame. The result is assigned back: calling
# replace(inplace=True) on a selected column is chained assignment and does not
# update the frame under Copy-on-Write.
for frame in (X_train, X_test):
    group_dict = get_group_dict(frame['GroupNumber'])
    frame['GroupNumber'] = frame['GroupNumber'].map(group_dict)

## Deal with Cabin

From Cabin column parse Deck (categorical), Side (boolean) and number of cabin features.

In [None]:
def cabin_parser(row):
    """Split ``row['Cabin']`` (``'deck/number/side'``) into three features.

    Returns a list ``[deck, number, is_side_s]`` where ``deck`` is the first
    part as a string, ``number`` is the second part as an ``int`` and
    ``is_side_s`` is ``True`` when the third part equals ``'S'``.
    A missing cabin yields ``[nan, nan, nan]``.
    """
    s = row['Cabin']
    # pd.isna recognises every missing-value marker (any float NaN, None, pd.NA),
    # not only the np.nan singleton that an identity check would match.
    if pd.isna(s):
        return [np.nan] * 3
    deck, number, side = s.split('/')
    return [deck, int(number), side == 'S']

In [None]:
new_cols = ['Deck', 'CabinNumber', 'IsSideS']

# Expand the parsed cabin parts into three columns, then discard the raw Cabin column.
for frame in (X_train, X_test):
    frame[new_cols] = frame.apply(cabin_parser, axis=1, result_type='expand')
    frame.drop(columns=["Cabin"], inplace=True)

Look at the connection between the new categorical columns and the target column.

In [None]:
new_cat_cols = ['Deck', 'IsSideS']
fig, axes = plt.subplots(len(new_cat_cols), 1, figsize=(15, 20))

for axis, col in zip(axes, new_cat_cols):
    parts = []
    # Y_train == 1 marks transported passengers, Y_train == 0 the others.
    for target_value, label in ((0, 'Not transported'), (1, 'Transported')):
        counts = X_train.loc[Y_train == target_value, col].value_counts()
        # Explicit column names: the name of the column produced by
        # value_counts().to_frame() differs between pandas versions.
        parts.append(pd.DataFrame({
            'Category': [str(value) for value in counts.index],
            'Count': counts.to_numpy(),
            'Target': label,
        }))
    col_data = pd.concat(parts, ignore_index=True)
    sns.barplot(data=col_data, x='Category', y='Count', hue='Target', ax=axis)
    axis.set(xlabel=None, ylabel=None, title=col)

Perform the same operation on the *CabinNumber* column as on *GroupNumber*.

In [None]:
group_dict = get_group_dict(X_train['CabinNumber'])
# Number of passengers sharing each passenger's CabinNumber value; passengers
# without a cabin stay NaN. CabinNumber holds only the numeric part of Cabin,
# so equal numbers on different decks or sides are counted together.
cabin_count = X_train['CabinNumber'].map(group_dict)

# Share of transported passengers for every count from 1 to 18. A count that
# does not occur becomes NaN (a gap in the line) instead of raising
# ZeroDivisionError.
sizes = range(1, 19)
transported_rate = Y_train.groupby(cabin_count).mean().reindex(sizes)

ax = sns.lineplot(x=list(sizes), y=transported_rate.to_numpy())
ax.set(xlabel='Passengers with the same cabin number', ylabel='Share transported');

We see that the number of people sharing a cabin number can probably influence the target column. Let's add this column to our data.

In [None]:
# Replace each cabin number with the number of passengers sharing it, counted
# separately inside each frame; missing cabin numbers stay NaN. The result is
# assigned back: calling replace(inplace=True) on a selected column is chained
# assignment and does not update the frame under Copy-on-Write.
for frame in (X_train, X_test):
    group_dict = get_group_dict(frame['CabinNumber'])
    frame['CabinNumber'] = frame['CabinNumber'].map(group_dict)

Let's look at the processed data.

In [None]:
# Column dtypes and non-null counts of the processed training features.
X_train.info()

In [None]:
# Summary statistics of the processed training features.
X_train.describe()

In [None]:
# First rows of the processed training features.
X_train.head()

# Visualization of correlations

In [None]:
df = X_train.copy()
df['Transported'] = Y_train.copy()
sns.set(rc={'figure.figsize': (15, 10)})

# Correlate numeric and boolean columns only. Recent pandas versions no longer
# drop string columns from corr() silently and raise on them instead.
numeric_df = df.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_df.corr(), cmap='coolwarm', vmin=-1, vmax=1);

We can see that the *Transported* column has a large correlation with *CryoSleep* and the expense columns.

# KFold Mean Target Encoding Implementation

Use mean target encoding to encode categorical features.

In [None]:
class KFoldTargetEncoder(TransformerMixin):
    """Replace categorical columns with a smoothed per-category mean of the target.

    For every column in ``col_names`` the target mean of each category is
    computed on the training part of each of ``n_folds`` shuffled ``KFold``
    splits and averaged over the folds (a category absent from a fold
    contributes the global target mean for that fold). The average is then
    shrunk towards the global mean:

        (fold_mean * count + smooth * global_mean) / (count + smooth)

    Every row of a category receives the same encoded value, including the
    rows passed to ``fit``.

    Parameters
    ----------
    col_names : list of str
        Columns to encode.
    n_folds : int, default 10
        Number of ``KFold`` splits.
    smooth : float, default 0
        Smoothing weight given to the global mean.
    random_state : int, RandomState instance or None, default None
        Passed to ``KFold`` to make the shuffling reproducible.
    """

    def __init__(self, col_names, n_folds=10, smooth=0, random_state=None):
        self.col_names = col_names
        self.n_folds = n_folds
        self.smooth = smooth
        self.random_state = random_state
        self.replaces = {}

    def fit(self, X, y):
        """Learn the category -> encoded value mapping of every column.

        Sets ``self.global_mean`` to the mean of ``y`` and ``self.replaces`` to
        a dict of Series (one per column, indexed by category). Returns ``self``.
        """
        self.global_mean = y.mean()
        # Start from an empty mapping so that refitting with other columns
        # does not keep stale entries.
        self.replaces = {}

        # Per column: an empty frame indexed by all observed categories,
        # followed by one Series of per-category target means per fold.
        # Collected in lists and concatenated once instead of growing a
        # DataFrame inside the loop.
        fold_means = {
            col_name: [pd.DataFrame(index=pd.unique(X[col_name]))]
            for col_name in self.col_names
        }

        kf = KFold(self.n_folds, shuffle=True, random_state=self.random_state)
        for train_ind, _ in kf.split(X):
            X_fold = X.iloc[train_ind]
            y_fold = y.iloc[train_ind]
            for col_name in self.col_names:
                fold_means[col_name].append(y_fold.groupby(X_fold[col_name]).mean())

        for col_name in self.col_names:
            # Categories missing from a fold fall back to the global mean.
            per_fold = pd.concat(fold_means[col_name], axis=1).fillna(self.global_mean)
            counts = X[col_name].value_counts()
            self.replaces[col_name] = (
                per_fold.mean(axis=1) * counts + self.smooth * self.global_mean
            ) / (counts + self.smooth)

        return self

    def transform(self, X):
        """Return a copy of ``X`` with every encoded column replaced.

        Each column ``c`` in ``col_names`` is dropped and a column
        ``c + '_encoded'`` is added. Categories not seen during ``fit`` and
        missing values receive ``self.global_mean``.
        """
        X = X.copy()
        for col_name in self.col_names:
            encoded = X[col_name].map(self.replaces[col_name])
            # Assign the filled Series back: fillna(inplace=True) on a selected
            # column is chained assignment and does not update the frame under
            # Copy-on-Write.
            X[col_name + '_encoded'] = encoded.fillna(self.global_mean)
            X = X.drop(columns=col_name)
        return X

In [None]:
# Columns that are replaced by their target encoding.
to_target_encoding = [
    'HomePlanet',
    'Destination',
    'GroupNumber',
    'Deck',
    'CabinNumber',
    'IsSideS',
]

In [None]:
# Encoder for the columns listed above; fold count and smoothing keep their defaults.
encoder = KFoldTargetEncoder(col_names=to_target_encoding)

# Creating a model

Use XGBClassifier for prediction. The parameters were found earlier with a random search.

In [None]:
# debug code
#x_train, x_val, y_train, y_val = train_test_split(X_train, Y_train, test_size=0.2)
#x_train = encoder.fit_transform(x_train, y_train)
#x_val = encoder.transform(x_val)

In [None]:
# A shuffled KFold without an explicit random_state draws from NumPy's global
# random state, so seed it here to make the encoded features (and therefore
# the submission) reproducible across runs.
np.random.seed(42)

X_train = encoder.fit_transform(X_train, Y_train)
X_test = encoder.transform(X_test)

In [None]:
# Hyper-parameters passed to the XGBoost classifier.
params = dict(
    learning_rate=0.15,
    min_child_weight=0.65,
    gamma=0.1,
    subsample=0.75,
    colsample_bytree=0.85,
    colsample_bylevel=0.65,
    max_depth=4,
    n_estimators=100,
    reg_lambda=2.25,
    # NOTE(review): positional constraint tuple - it applies to the first two
    # feature columns in their order at fit time; confirm this is intended.
    monotone_constraints='(0, -1)',
    eval_metric='logloss',
    use_label_encoder=False,
)

In [None]:
# Create the classifier, apply the hyper-parameters and train on the full training set.
model = XGBClassifier().set_params(**params)
model.fit(X_train, Y_train)

# Submission

In [None]:
# Predicted classes for the test set, converted to booleans.
Y_pred = model.predict(X_test).astype(bool)

In [None]:
# Take the sample submission as the template, overwrite its Transported column
# with the predictions (matched by row position) and write the result to disk.
submission = (
    pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
    .assign(Transported=Y_pred)
)
submission.to_csv('submission.csv', index=False)