In [None]:
#import library
import pandas as pd
from sklearn.feature_selection import mutual_info_regression
import matplotlib.pyplot as plt
import numpy as np

import seaborn as sns
import xgboost as xgb
from xgboost import XGBClassifier


# Load the competition files.
train = pd.read_csv('/kaggle/input/spaceship-titanic/train.csv')
test = pd.read_csv('/kaggle/input/spaceship-titanic/test.csv')
submission = pd.read_csv('/kaggle/input/spaceship-titanic/sample_submission.csv')

# Stack train on top of test so both receive identical preprocessing.
# ignore_index=True gives the combined frame a unique 0..n-1 index; without it
# the row labels of `train` and `test` overlap (both start at 0), which makes
# any later label-based selection or alignment ambiguous.
data = pd.concat([train, test], axis=0, ignore_index=True)
print("train:",train.shape,"test:",test.shape,"all data:",data.shape)
data.head(5)

In [None]:
# Count the missing (NaN) entries in every column of the combined frame
data.isna().sum()

In [None]:
# Print a summary of the combined frame (columns, non-null counts, dtypes)
data.info()

There are many missing values; the next section fills them in.

# Preprocess Data

In [None]:
# PassengerId is split on '_' into two integer parts: group number and
# passenger number within the group
passenger_parts = data['PassengerId'].str.split('_', expand=True).astype(int)
data[['group_num', 'passenger_num']] = passenger_parts

# Cabin is split on '/' into three parts: deck, number and side
cabin_parts = data['Cabin'].str.split('/', expand=True)
data[['cabin_deck', 'cabin_num', 'cabin_side']] = cabin_parts

In [None]:
# Fill the missing values. Every result is assigned back to its column:
# calling fillna(..., inplace=True) on a column selection is deprecated in
# recent pandas and no longer updates `data` once copy-on-write is enabled.

# Boolean flags: an unknown flag is treated as False.
for flag_col in ['CryoSleep', 'VIP']:
    data[flag_col] = data[flag_col].fillna(False).astype(bool)

# Text categoricals: fill with the most frequent value. They deliberately stay
# plain strings (no "category" dtype) because the label-encoding step later in
# the notebook selects the columns to encode by object dtype.
for cat_col in ['HomePlanet', 'Destination', 'Name', 'cabin_deck', 'cabin_side']:
    data[cat_col] = data[cat_col].fillna(data[cat_col].mode()[0])

# Cabin number: fill with the most frequent value, then store it as an integer.
# The cast has to be assigned; a bare `.astype(int)` returns a new Series and
# leaves the column as strings, which would later be label-encoded by order of
# appearance and lose the numeric ordering.
data['cabin_num'] = data['cabin_num'].fillna(data['cabin_num'].mode()[0]).astype(int)

# Numerical columns: Age gets the mean, the spending columns get 0.
data['Age'] = data['Age'].fillna(data['Age'].mean())
spend_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
data[spend_cols] = data[spend_cols].fillna(0)

# Total amount of money spent across all services
data['TotalSpend'] = data[spend_cols].sum(axis=1)

In [None]:
# Cabin and PassengerId were split into separate columns earlier, so the raw
# versions are dropped; then re-count the missing values per column
df = data.drop(columns=['Cabin', 'PassengerId'])
df.isna().sum()

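No missing values remain in the feature columns (`Transported` is still empty for the test rows, which is expected because it is the target to predict).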


# Feature selection with correlation

In [None]:
# Work on a copy so that `df` keeps the original, un-encoded values
df_train = df.copy()

# Encode the target explicitly instead of factorizing it with the features:
# factorize() numbers values in order of first appearance, so the meaning of
# 0 and 1 would depend on which value the first row happens to hold.
# Rows without a target (the test rows) keep the placeholder -1.
df_train['Transported'] = (
    df_train['Transported']
    .map({False: 0, True: 1})
    .fillna(-1)
    .astype(int)
)

# Label-encode the remaining text columns (codes follow order of appearance)
for colname in df_train.select_dtypes("object"):
    df_train[colname], _ = df_train[colname].factorize()

# Columns shown in the correlation heatmap (target first)
heatmap_cols = [
    'Transported',
    'HomePlanet',
    'CryoSleep',
    'Destination',
    'Age',
    'VIP',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'Name',
    'TotalSpend',
    'group_num',
    'passenger_num',
    'cabin_deck',
    'cabin_num',
    'cabin_side'
]

# Only the first train.shape[0] rows come from the training file and have a real
# target. The test rows hold a placeholder code in `Transported`; including
# them would distort every correlation with the target, so they are left out.
features = df_train.iloc[:train.shape[0]][heatmap_cols]

plt.figure(figsize=(20,18))
sns.heatmap(features.corr(), linewidths=0.1, vmax=1.0, square=True, cmap=plt.cm.RdBu, annot=True);

`Name` and `VIP` are only weakly correlated with the target.

# Train the model

- I use an XGBoost model.
- I keep the label encoding of the categorical features that was used for the correlation analysis.

In [None]:
# Split the encoded frame back into its train and test parts: the first
# train.shape[0] rows come from the training file, the rest from the test file
n_train = train.shape[0]
train_part = df_train.iloc[:n_train]
test_part = df_train.iloc[n_train:]

# Separate the target from the features
x_train = train_part.drop(columns=['Transported'])
y_train = train_part['Transported']
x_test = test_part.drop(columns=['Transported'])
print("x_train:",x_train.shape, "y_train:",y_train.shape,"x_test:",x_test.shape)

In [None]:
# Features fed to the model. HomePlanet, VIP, Name and group_num are not used.
trn_feat = [
    'CryoSleep',
    'Destination',
    'Age',
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
    'TotalSpend',
    'passenger_num',
    'cabin_deck',
    'cabin_num',
    'cabin_side',
]

# Hyper-parameters of the gradient-boosted tree classifier
xgb_params = dict(
    booster="gbtree",
    objective='binary:logistic',
    n_estimators=500,
    learning_rate=0.3,
    max_depth=4,
    min_child_weight=2,
    max_delta_step=1,
    gamma=2,
    subsample=1,
    colsample_bytree=0.65,
    colsample_bylevel=1,
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    n_jobs=1,
    nthread=None,
    random_state=0,
    seed=None,
)
model = XGBClassifier(**xgb_params)

# Fit on the selected training features
model.fit(x_train[trn_feat], y_train)

# Predict the target for the test rows
preds = model.predict(x_test[trn_feat])

# Accuracy on the training rows themselves (not a held-out estimate)
print(model.score(x_train[trn_feat], y_train))

In [None]:
# Horizontal bar chart of the fitted model's feature importances, sorted ascending
plot_x = x_train[trn_feat]
importances = pd.Series(model.feature_importances_, index=plot_x.columns)
importances.sort_values(ascending=True).plot(kind='barh')

In [None]:
from sklearn import model_selection

# 8-fold cross-validated accuracy of the same model configuration
scores = model_selection.cross_val_score(
    estimator=model,
    X=x_train[trn_feat],
    y=y_train,
    scoring='accuracy',
    cv=8,
)
mean_acc, std_acc = scores.mean(), scores.std()
print("Kfold on XGBClassifier: %0.4f (+/- %0.4f)" % (mean_acc, std_acc))

# Submission

In [None]:
# `preds` holds the model's integer class codes, because the target was
# label-encoded before training. The submission's `Transported` column is
# expected to be boolean, so the codes are converted back.
# NOTE(review): assumes class code 1 means Transported=True — confirm against
# the label encoding of the target.
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    "Transported": preds.astype(bool),
})

In [None]:
# Write the submission to submission.csv without the DataFrame index column
submission.to_csv('submission.csv', index=False)