In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**IMPORT LIBRARIES**

In [None]:
import gc
from tqdm.auto import tqdm

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score

from catboost import CatBoostClassifier
import xgboost as xgb

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Load both competition splits, using the first CSV column as the row index.
train_df, test_df = [
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "../input/spaceship-titanic/train.csv",
        "../input/spaceship-titanic/test.csv",
    )
]

In [None]:
# Preview the first rows of the training frame (head() with its default row count).
train_df.head()

In [None]:
# Bar chart of how many rows fall into each value of the target column.
# Author's observation: the two classes come out very close in size.
transported_counts = train_df['Transported'].value_counts()
ax = transported_counts.plot.bar()
ax.set_title('Distribution of Transported', fontsize=15);

**DATA TYPES AND THEIR QUANTITY**

Knowing how many features there are of each data type helps in choosing the right algorithms for modelling.

Based on the plot below, the number of object-typed columns is equal to the number of float columns.

There are also some Boolean columns.

In [None]:
# Bar chart of how many columns the training frame has of each dtype.
dtype_counts = train_df.dtypes.value_counts()
ax = dtype_counts.plot.bar()
ax.set_title("Distribution of data types", fontsize=15);

**MISSING DATA IN BOTH TRAINING AND TEST SET**

It's also important to handle missing values in appropriate ways, depending on the data types.

With that in mind, it is useful to visualize which features have missing values and how much of each feature is missing.

In [None]:
# Report each frame's shape and its total number of missing cells.
train_missing_total = train_df.isna().sum().sum()
test_missing_total = test_df.isna().sum().sum()

print("Shape of the training set: ", train_df.shape)
print("Total missing data in training set: ", train_missing_total)
print("\nShape of the test set: ", test_df.shape)
print("Total missing data in test set: ", test_missing_total)

In [None]:
def _missing_pct_table(frame):
    """Return a (feature, MissingPct) table for the columns of ``frame`` that have missing values.

    Only columns with at least one missing value are listed. ``MissingPct`` is the
    percentage of rows that are missing, and the table is sorted so that the
    feature with the most missing data comes first.
    """
    n_rows = len(frame)
    rows = []
    for col in frame.columns.tolist():
        n_missing = frame[col].isna().sum()
        if n_missing > 0:
            rows.append((col, (n_missing / n_rows) * 100))
    table = pd.DataFrame(rows, columns=['feature', 'MissingPct'])
    return table.sort_values(by='MissingPct', ascending=False)


# One table per split, each sorted from most to least missing.
missing_train_data = _missing_pct_table(train_df)
missing_test_data = _missing_pct_table(test_df)

**VISUAL DISTRIBUTION OF MISSING VALUES**

In [None]:
# Histogram (10 bins) of the per-feature missing percentages in the training set.
ax = sns.histplot(data=missing_train_data, x='MissingPct', bins=10)
ax.set_title('Distribution of missing data from the training set', fontsize=15);

In [None]:
# Histogram (10 bins) of the per-feature missing percentages in the test set.
ax = sns.histplot(data=missing_test_data, x='MissingPct', bins=10)
ax.set_title('Distribution of missing values from the test set');

**SEPARATING NUMERICAL AND CATEGORICAL DATA**

Now, it's time to work on numerical and categorical data independently.

In [None]:
# Derive the feature lists from the test set columns
# (presumably because the test frame does not contain the target column — confirm).
# .select_dtypes picks columns by dtype. `np.bool_` is used instead of the
# `np.bool8` alias, which was deprecated and then removed in NumPy 2.0.
numerical_cols = test_df.select_dtypes(include=[np.number, np.bool_]).columns.tolist()

# the categorical features are the object and category dtypes
categorical_col = test_df.select_dtypes(include=['object', 'category']).columns.tolist()

# Name of the target column
target = 'Transported'

**FILL IN THE MISSING VALUES**

Now that the numerical and categorical features have been separated, the missing values can be filled in.

This is done with scikit-learn's `SimpleImputer`.

In [None]:
# Impute the numerical features with the column mean.
# The imputer is fitted on the training set only and then applied to the test
# set, so both frames are filled with the same (training) statistics and no
# information from the test set is used to fit the preprocessing.
imputer_num = SimpleImputer(strategy='mean')
train_df[numerical_cols] = imputer_num.fit_transform(train_df[numerical_cols])
test_df[numerical_cols] = imputer_num.transform(test_df[numerical_cols])

# Impute the categorical features with SimpleImputer's constant fill value,
# again fitting on train and only transforming test.
imputer_cat = SimpleImputer(strategy='constant')
train_df[categorical_col] = imputer_cat.fit_transform(train_df[categorical_col])
test_df[categorical_col] = imputer_cat.transform(test_df[categorical_col])

**CORRELATION**

Correlation mainly works with numerical features, so it's good to write a function that can be reused anywhere.

In [None]:
# train_df[numerical_cols].corrwith(train_df[target])

# the correlation function
def corr_func(df, features=numerical_cols, target=target):
    """Correlate each feature column of ``df`` with the target column.

    Parameters
    ----------
    df : DataFrame holding both the feature columns and the target column.
    features : list of column names to correlate (defaults to the numerical columns).
    target : name of the target column.

    Returns
    -------
    DataFrame with columns ``feature`` and ``correlation``, sorted from the
    highest to the lowest (signed) correlation.
    """
    correlations = df[features].corrwith(df[target])
    table = pd.DataFrame({
        'feature': correlations.index,
        'correlation': correlations.values,
    })
    return table.sort_values(by='correlation', ascending=False)

In [None]:
# Correlation of the numerical features (corr_func's default feature list) with the target.
# The values are signed; taking absolute values was considered but is not applied here.
corr_func(train_df)

**PREPROCESSING**

In [None]:
# Convert the categoricals into integer codes.
# .factorize() is used here; label encoding, dummies or one-hot encoding are
# other options.
# factorize() returns (codes, uniques). The uniques learned on the training set
# are reused to encode the test set, so a given category always maps to the same
# integer in both frames. Factorizing each frame on its own numbers categories by
# order of first appearance, which gives train and test unrelated codes.
# Test values that never occur in the training column are encoded as -1.
for col in categorical_col:
    train_codes, train_categories = train_df[col].factorize()
    train_df[col] = train_codes
    test_df[col] = train_categories.get_indexer(test_df[col])

In [None]:
# Correlation of the integer-encoded categorical features with the target.
# Author's observation: these come out stronger than the numeric features.
# NOTE(review): the integer codes come from factorize(), so for unordered
# categories the size and sign of these correlations depend on that arbitrary
# numbering — interpret with care.
corr_func(train_df, features=categorical_col)

In [None]:
# Correlation of the numerical features with the target (corr_func's default feature list).
corr_func(train_df)

**MODELLING**

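Considering the correlations, CatBoost looks like the best choice for a first model, but any kind of model would work.

I originally planned to start with xgboost, simply because I like it.

Note: the cells below actually fit three scikit-learn ensembles (AdaBoost, HistGradientBoosting and ExtraTrees); the xgboost code is left commented out.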

In [None]:
# Train on every feature (numerical first, then categorical); the author found
# this gave better results than a hand-picked subset.
features = numerical_cols + categorical_col
# Alternative kept for reference: only the features with positive correlations
# features = ['CryoSleep', 'Cabin', 'Destination', 'Name', 'FoodCourt', 'ShoppingMall']

In [None]:
# Hold out 10% of the training rows as a validation set (shuffled, fixed seed).
X_all = train_df[features]
y_all = train_df[target]
X_train, X_valid, y_train, y_valid = train_test_split(
    X_all,
    y_all,
    test_size=0.1,
    random_state=1223,
    shuffle=True,
)

In [None]:
# params = {
#     'eta': 1e-3,
#     'objective': 'binary:logitraw',
#     'eval_metric': 'auc',
# #     'use_label_encoder': False
# }

In [None]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.svm import SVC

# search = RandomizedSearchCV(
#     n_iter=10,
#     estimator=ExtraTreesClassifier(),
#     param_distributions=params,
#     random_state=1223,
#     n_jobs=-1
# )

# Give every estimator a fixed seed so that "Restart & Run All" reproduces the
# same fitted models; the value is the same seed used for the train/validation split.
ada_clf = AdaBoostClassifier(n_estimators=300, learning_rate=0.265, random_state=1223)
hist_clf = HistGradientBoostingClassifier(learning_rate=0.02, random_state=1223)
extra_clf = ExtraTreesClassifier(n_estimators=600, max_leaf_nodes=13, max_depth=9, random_state=1223)

# Fit each model on the training split. scikit-learn's fit() returns the
# estimator itself, so each model_* name refers to the same object as its *_clf.
model_ada = ada_clf.fit(X_train, y_train)
model_hist = hist_clf.fit(X_train, y_train)
model_extra = extra_clf.fit(X_train, y_train)
# search.fit(X_train, y_train)

# model = xgb.XGBClassifier(**params).fit(X_train, y_train)

# pred_tr = model.predict(X_train)
# print("Training classification report\n", classification_report(pred_tr, y_train))
# pred_val = model.predict(X_valid)
# print("\nValidation classification report\n", classification_report(pred_val, y_valid))

In [None]:
# print metrics
def print_metrics(model, name='boosting'):
    """Print classification reports for ``model`` on the training and validation splits.

    Reads the module-level ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``.
    name : label printed in the report header.
    """
    pred_tr = model.predict(X_train)
    print(f"\nModel {name}")
    print('-'*60)
    # classification_report expects the true labels first and the predictions
    # second; passing them the other way round swaps precision and recall and
    # reports the support of the predictions instead of the true classes.
    print("Training classification report:\n", classification_report(y_true=y_train, y_pred=pred_tr))
    pred_val = model.predict(X_valid)
    print("\nValidation classification report: \n", classification_report(y_true=y_valid, y_pred=pred_val))
    print("."*60)

In [None]:
# Print the train/validation reports for each fitted model, in a fixed order.
fitted_models = (
    (model_ada, 'AdaBoost'),
    (model_extra, 'ExtraTrees'),
    (model_hist, 'HistBoost'),
)
for fitted_model, label in fitted_models:
    print_metrics(fitted_model, label)

In [None]:
# ensembling
preds_ada = model_ada.predict(test_df[features])
preds_extra = model_extra.predict(test_df[features])
preds_hist = model_hist.predict(test_df[features])
preds_ensemble = 0.65*preds_ada + 0.1*preds_extra + 0.25*preds_hist

In [None]:
# Round the blended score to the nearest whole number (0 decimals).
predictions = np.round(preds_ensemble, decimals=0)

In [None]:
# Start from the sample submission and overwrite its 'Transported' column with
# the boolean predictions (assigned by position).
sample_df = (
    pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
    .assign(Transported=predictions.astype(bool))
)

**SUBMISSION**

In [None]:
# Write the submission file to the working directory, without the row index.
sample_df.to_csv('submission.csv', index=False)

In [None]:
# Read the file back to display what was written.
pd.read_csv('submission.csv')