## Feature Engineering

In [1]:
def do_dtype_convertion(X, dtypes):
    """Cast the listed columns of ``X`` to the requested dtypes.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to convert. It is modified in place and also returned.
    dtypes : dict
        Maps a dtype key (``'int'``, ``'float32'``, ``'bool'`` or ``'object'``)
        to an iterable of column names. Any other key is ignored, as is any
        column name that is absent from ``X``.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object.

    Notes
    -----
    Every cast is done with ``errors='ignore'``, so a column that cannot be
    converted is left as it was instead of raising.
    """
    # The builtin ``bool`` replaces ``np.bool``: that alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    targets = {'int': 'int', 'float32': 'float32', 'bool': bool, 'object': 'object'}
    for dtype, features in dtypes.items():
        if dtype not in targets:
            continue
        for feature in features:
            if feature in X.columns:
                X[feature] = X[feature].astype(targets[dtype], errors='ignore')
    return X

def do_skewed_target_normalization(y):
    """Return the element-wise natural logarithm of ``y`` (``np.log``)."""
    log_target = np.log(y)
    return log_target

def do_skewed_features_normalization(X_train, X_test):
    """Apply ``log1p`` to the numeric features that are right-skewed in the train set.

    The skewness of every numeric (or boolean) column of ``X_train`` is
    measured with ``scipy.stats.skew``; columns whose skewness exceeds 0.5 are
    transformed in both frames. Only strictly positive values are transformed;
    zero, negative and missing values are kept unchanged.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Neither input is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        Transformed copies ``(X_train, X_test)``.
    """
    from scipy.stats import skew

    def _log1p_positive(column):
        # ``np.log1p(x, where=mask)`` without ``out=`` leaves the masked-out
        # slots uninitialised, so the selection is done explicitly instead.
        values = column.astype(float)
        positive = values > 0
        return values.where(~positive, np.log1p(values.where(positive, 0.0)))

    numeric_features = X_train.select_dtypes(include=['number', 'bool']).columns
    skewness = pd.Series(
        {name: skew(X_train[name].astype(float)) for name in numeric_features},
        dtype=float,
    )
    # A column containing NaN has NaN skewness and is therefore not selected.
    skew_index = skewness[skewness > 0.5].index

    X_train_nskew = X_train.copy()
    X_test_nskew = X_test.copy()
    for i in skew_index:
        X_train_nskew[i] = _log1p_positive(X_train_nskew[i])
        # The test frame may lack a column that exists in the train frame.
        if i in X_test_nskew.columns:
            X_test_nskew[i] = _log1p_positive(X_test_nskew[i])
    return X_train_nskew, X_test_nskew

def do_missing_values_imputation(X, na_strategy):
    """Fill missing values column by column according to ``na_strategy``.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    na_strategy : dict
        Maps a strategy name to an iterable of column names. Supported
        strategies are ``'median'``, ``'mean'``, ``'most_common'`` (first
        mode) and ``'zero'``. Unknown strategies and columns absent from ``X``
        are ignored.

    Returns
    -------
    pandas.DataFrame
        The same ``X`` object.

    Notes
    -----
    The fill value is computed from the column of ``X`` itself. A column whose
    statistic is undefined (for example an all-NaN column) is left untouched.
    """
    def _fill_value(column, strategy):
        if strategy == 'median':
            return column.median()
        if strategy == 'mean':
            return column.mean()
        if strategy == 'most_common':
            modes = column.mode()
            # ``mode()`` is empty when every value is missing.
            return modes.iloc[0] if len(modes) else None
        return 0  # 'zero'

    supported = ('median', 'mean', 'most_common', 'zero')
    for strategy, features in na_strategy.items():
        if strategy not in supported:
            continue
        for feature in features:
            if feature not in X.columns:
                continue
            value = _fill_value(X[feature], strategy)
            if value is None or pd.isna(value):
                continue
            # Assign the result back: ``X[feature].fillna(..., inplace=True)``
            # acts on a temporary and does nothing under Copy-on-Write.
            X[feature] = X[feature].fillna(value)
    return X

def do_feature_ordinal_encoding(X, mapping) :
    """Replace category labels by the codes given in ``mapping``.

    ``mapping`` maps a column name to a ``{label: code}`` dict. Columns that
    are not in ``X`` are skipped; labels without an entry become NaN (the
    behaviour of ``Series.map``). ``X`` is modified in place and returned.
    """
    encodable = [name for name in mapping if name in X.columns]
    for name in encodable:
        codes = mapping[name]
        X[name] = X[name].map(codes)
    return X

def do_feature_encoding(X_train, X_test) :
    """One-hot encode the object columns of both frames.

    The encoder is fitted on the object-dtype columns of ``X_train`` (cast to
    ``str``) and then applied to ``X_test``; categories unseen during fitting
    are handled with ``handle_unknown='ignore'``. The original object columns
    are dropped and the indicator columns are appended on the right.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        ``X_test`` must contain every object column of ``X_train``.

    Returns
    -------
    tuple of pandas.DataFrame
        New frames ``(X_train, X_test)``; the inputs are not modified.
    """
    object_cols = X_train.select_dtypes(include=['object']).columns
    if len(object_cols) == 0:
        # Nothing to encode; avoid fitting an encoder on zero features.
        return X_train.copy(), X_test.copy()

    from sklearn.preprocessing import OneHotEncoder

    # ``sparse`` was renamed ``sparse_output`` in scikit-learn 1.2 and later
    # removed; fall back to the old spelling on older releases.
    try:
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    except TypeError:
        OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
    OH_encoder.fit(X_train[object_cols].astype(str))

    # ``get_feature_names`` was replaced by ``get_feature_names_out``.
    if hasattr(OH_encoder, 'get_feature_names_out'):
        encoded_names = OH_encoder.get_feature_names_out(object_cols)
    else:
        encoded_names = OH_encoder.get_feature_names(object_cols)

    def _encode(frame):
        indicators = pd.DataFrame(
            OH_encoder.transform(frame[object_cols].astype(str)),
            index=frame.index,
            columns=encoded_names,
        )
        return pd.concat([frame.drop(object_cols, axis=1), indicators], axis=1)

    return _encode(X_train), _encode(X_test)

def do_feature_clustering(X_train, X_test, y_train, features) :
    """Replace categorical levels by a K-Means cluster id built from target statistics.

    For every ``feature -> n_clusters`` entry of ``features`` that is a column
    of ``X_train``, the levels of the feature are clustered on the
    ``describe()`` statistics of the target within each level, and the column
    is replaced in both frames by the cluster id of each level. Levels that
    received no cluster id (e.g. seen only in ``X_test``) become NaN through
    ``Series.map``. Both frames are modified in place and returned.
    """

    def get_cluster_dict(table, feature, target, n_clusters = 5):
        """Return ``{level: cluster_id}`` for ``feature`` and print the grouping."""
        from sklearn.cluster import KMeans
        target_stats = table.groupby([feature])[target].describe().fillna(0)
        kmeans = KMeans(n_clusters=n_clusters, random_state = 42)
        kmeans.fit(target_stats)
        cluster_table = pd.DataFrame({
            'feature': list(target_stats.index),
            'mean_target_value': list(target_stats.loc[:, 'mean']),
            'cluster': list(kmeans.labels_),
        })
        # Levels belonging to each cluster id, computed once for the whole loop.
        levels_by_cluster = cluster_table.groupby('cluster')['feature'].unique()
        cluster_dict = {}
        print('Clustering for '+feature+' :')
        for i in range(len(levels_by_cluster)):
            levels = levels_by_cluster[i]
            print(str(i)+'-'+str(levels))
            for f in levels:
                cluster_dict[f] = i
        print()
        return cluster_dict

    target_name = y_train.name
    df_train = pd.concat([y_train, X_train], axis=1)
    for feature, n_clusters in features.items():
        if feature not in X_train.columns:
            continue
        level_to_cluster = get_cluster_dict(df_train, feature, target_name, n_clusters)
        X_test[feature] = X_test[feature].map(level_to_cluster)
        X_train[feature] = X_train[feature].map(level_to_cluster)

    return X_train, X_test
    
def do_feature_scaling(X_train, X_test) :
    """Scale every non-object column with scikit-learn's ``RobustScaler``.

    The scaler is fitted on each column of ``X_train`` in turn and the fitted
    state is applied to the same column of ``X_test``. Both frames are
    modified in place and returned.
    """
    from sklearn.preprocessing import RobustScaler
    robust = RobustScaler()
    columns_to_scale = X_train.select_dtypes(exclude=['object']).columns
    for name in columns_to_scale:
        train_column = X_train[[name]]
        X_train[name] = robust.fit_transform(train_column)
        test_column = X_test[[name]]
        X_test[name] = robust.transform(test_column)
    return X_train, X_test

def do_categorical_dimension_reduction(X_cat, min_counts) :
    """Collapse infrequent levels of a categorical Series into ``'Other'``.

    Every level that occurs fewer than ``min_counts`` times is overwritten
    with the label ``'Other'``. ``X_cat`` is modified in place and returned.
    """
    frequencies = X_cat.value_counts()
    rare_levels = frequencies.loc[frequencies.lt(min_counts)].index
    is_rare = X_cat.isin(rare_levels)
    X_cat[is_rare] = 'Other'
    return X_cat

def drop_outliers_values(X_train, y_train) :
    """Drop the training rows that an ``IsolationForest`` flags as outliers.

    The forest (``random_state=0``) is fitted on the non-object columns of the
    target and the features together, with missing values replaced by 0 for
    the fit only. Rows predicted as ``1`` are kept.

    Returns
    -------
    tuple
        ``(X_train, y_train)`` restricted to the kept rows.
    """
    from sklearn.ensemble import IsolationForest
    target_name = y_train.name
    combined = pd.concat([y_train, X_train], axis=1)
    non_object_cols = combined.dtypes[(combined.dtypes != object)].index
    detector = IsolationForest(random_state=0)
    verdicts = pd.Series(
        detector.fit_predict(combined[non_object_cols].fillna(0)),
        index=combined.index,
    )
    kept_rows = combined.loc[verdicts.index[verdicts == 1], :]
    return kept_rows.drop(columns=target_name), kept_rows.loc[:, target_name]

def adapt_test_set_to_train_set_data_structure(X_train, X_test) :
    """Give ``X_test`` exactly the columns of ``X_train``, in the same order.

    * Columns of the test set that are not in the train set are dropped.
    * Columns of the train set that are missing from the test set are added
      and filled with 0.
    * Values of the columns present in both frames are left untouched
      (including any NaN they contain).

    Returns
    -------
    pandas.DataFrame
        A new frame; ``X_test`` itself is not modified.
    """
    # ``reindex`` drops, adds and reorders in one step. The previous version
    # re-columned the frame first and only then looked for missing columns, so
    # it never found any and the added columns stayed NaN instead of 0.
    return X_test.reindex(columns=X_train.columns, fill_value=0)

def build_classifier(classifier, 
                    init_hyperparameters, 
                    tuning_hyperparameters, 
                    hyperparameters, 
                    X_train, 
                    y_train, 
                    settings) :
    """Tune or train a binary classifier and return the fitted object.

    Parameters
    ----------
    classifier : callable
        Estimator class (or factory); it is called with keyword arguments.
    init_hyperparameters : dict
        Keyword arguments of the base estimator used when tuning.
    tuning_hyperparameters : dict
        Search space handed to the hyper-parameter search when tuning.
    hyperparameters : dict
        Keyword arguments of the estimator trained when not tuning.
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Binary training target.
    settings : dict
        Optional keys (the dict itself is never modified):

        * ``'do_tuning'`` (bool, default False) - run a hyper-parameter search
          instead of training with ``hyperparameters``.
        * ``'do_kfold'`` (bool, default True) - when not tuning, print 5-fold
          stratified cross-validation scores before the final fit.
        * ``'tuning_type'`` (``'grid'``, ``'randomized'`` or ``'bayes'``,
          default ``'randomized'``).
        * ``'tuning_n_iter'`` (int, default 10, capped at 1000) - number of
          candidates for the randomized and Bayesian searches.

    Returns
    -------
    object
        The fitted search object when tuning, otherwise the estimator fitted
        on the whole training set.
    """
    from datetime import datetime

    from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold
    from sklearn.metrics import roc_auc_score

    def timer(start_time=None):
        # Without argument: return the current time. With a start time: print
        # the time elapsed since then.
        if start_time is None:
            return datetime.now()
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))

    # Work on a copy so the caller's dict is left alone, and fall back to the
    # defaults for missing or invalid entries.
    settings = dict(settings or {})
    if settings.get('do_tuning') not in [True, False] :
        settings['do_tuning'] = False
    if settings.get('do_kfold') not in [True, False] :
        settings['do_kfold'] = True
    if settings.get('tuning_type') not in ['grid', 'randomized', 'bayes'] :
        settings['tuning_type'] = 'randomized'
    settings['tuning_n_iter'] = min(settings.get('tuning_n_iter', 10), 1000)

    if settings['do_tuning'] :

        estimator = classifier(**init_hyperparameters)
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        # The splitter is passed as-is; the search calls ``split`` itself.
        shared = dict(estimator=estimator, scoring='roc_auc', verbose=0, cv=kf)

        if settings['tuning_type'] == 'bayes' :
            # BayesSearchCV comes from scikit-optimize. Use the name if an
            # earlier cell imported it, otherwise import it on demand.
            try:
                bayes_search = BayesSearchCV
            except NameError:
                from skopt import BayesSearchCV as bayes_search
            model = bayes_search(search_spaces=tuning_hyperparameters,
                                 n_iter=settings['tuning_n_iter'],
                                 random_state=42,
                                 **shared)

        elif settings['tuning_type'] == 'randomized' :
            # Seeded so that the sampled candidates are reproducible.
            model = RandomizedSearchCV(param_distributions=tuning_hyperparameters,
                                       n_iter=settings['tuning_n_iter'],
                                       random_state=42,
                                       **shared)

        else :  # 'grid'
            model = GridSearchCV(param_grid=tuning_hyperparameters, **shared)

        start_time = timer()
        model.fit(X_train, np.ravel(y_train))
        timer(start_time)

        # The search is scored with 'roc_auc', so best_score_ is a ROC AUC.
        print('Best ROC AUC for a single model : {}.'.format(model.best_score_))
        print('Best hyperparameters: {}.'.format(model.best_params_))
        return model

    start_time = timer()
    estimator = classifier(**hyperparameters)

    if settings['do_kfold'] :
        kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

        accuracy = []
        auc_train = []
        auc_val = []
        for train_index, val_index in kf.split(X_train, y_train.values):
            X_train_split, y_train_split = X_train.iloc[train_index], y_train.iloc[train_index]
            X_val_split, y_val_split = X_train.iloc[val_index], y_train.iloc[val_index]
            estimator.fit(X_train_split, np.ravel(y_train_split))
            # Probability of the second class (column 1) is used as the score.
            y_score_val_split = estimator.predict_proba(X_val_split)[:, 1]
            y_score_train_split = estimator.predict_proba(X_train_split)[:, 1]
            accuracy.append(estimator.score(X_val_split, y_val_split))
            auc_val.append(roc_auc_score(y_val_split, y_score_val_split))
            auc_train.append(roc_auc_score(y_train_split, y_score_train_split))

        print('AUC - validation :', np.round(np.mean(auc_val), 3))
        print('AUC - train :', np.round(np.mean(auc_train), 3))
        print('Accuracy score :', np.round(np.mean(accuracy), 3))

    # Final fit on the whole training set; this is the model that is returned.
    estimator.fit(X_train, np.ravel(y_train))
    timer(start_time)
    return estimator

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno

# Load the competition files; the target column is split off the train set
df_train = pd.read_csv('../input/spaceship-titanic/train.csv')
X_test = pd.read_csv('../input/spaceship-titanic/test.csv')
y_train = df_train['Transported']
X_train = df_train.drop(columns=['Transported'])

# Column groups shared by the dtype conversion and the imputation below
flag_features = ['CryoSleep', 'VIP']
amount_features = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
text_features = ['HomePlanet', 'Destination', 'Cabin', 'Name']

# Dtype conversion ('Transported' is absent from both feature frames, so the
# 'bool' entry is skipped by do_dtype_convertion)
dtypes = {
    'int' : list(flag_features),
    'float32' : list(amount_features),
    'bool' : ['Transported'],
    'object' : list(text_features)
}
X_train = do_dtype_convertion(X_train, dtypes)
X_test = do_dtype_convertion(X_test, dtypes)

# Missing values imputation: median for the amounts, most frequent value for
# the text and flag features, zero for every remaining column
columns_with_own_strategy = amount_features + text_features + flag_features
na_strategy = {
    'median' : list(amount_features),
    'most_common' : text_features + flag_features,
    'zero' : X_train.drop(columns=columns_with_own_strategy).columns
}
X_train = do_missing_values_imputation(X_train, na_strategy)
X_test = do_missing_values_imputation(X_test, na_strategy)

# Feature creation
def do_feature_creation(table):
    """Derive the engineered features from the raw passenger columns.

    ``table`` is modified in place and returned. It must contain the columns
    ``PassengerId`` (``'<group>_<number>'``), ``Cabin``
    (``'<deck>/<num>/<side>'``), ``CryoSleep``, ``VIP``, ``Destination`` and
    the five spending columns.

    Created / rewritten columns:

    * ``CryoSleep``, ``VIP`` - 1 where the value equals ``True``, else 0.
    * ``PassengerGroupID``, ``PassengerNumberID`` - integer parts of the id.
    * ``GroupSize`` - number of rows of ``table`` sharing the same group.
    * ``CabinDeck``, ``CabinNum``, ``CabinSide`` - components of the cabin.
    * ``CabinSize``, ``CabinDeckSize``, ``CabinNumSize`` - number of rows of
      ``table`` sharing the same cabin, deck and cabin number respectively.
    * ``TotalSpent`` - sum of the five spending columns.
    * ``Destination`` - stripped of every non-alphanumeric character.

    A missing or incomplete ``PassengerId`` / ``Cabin`` does not raise: its
    string components are set to ``'0'``, its numeric components and its
    counts to 0.
    """
    def _split_components(column, separator, n_parts):
        # Split once, keep the first ``n_parts`` components, and flag the rows
        # that are present and provide every one of them.
        parts = (column.astype(str)
                       .str.split(separator, expand=True)
                       .reindex(columns=range(n_parts)))
        complete = column.notna() & parts.notna().all(axis=1)
        return parts, complete

    def _occurrences(keys, complete):
        # Number of complete rows sharing each key; 0 for incomplete rows.
        counts = keys[complete].value_counts()
        return keys.map(counts).where(complete).fillna(0).astype('int')

    table['CryoSleep'] = (table['CryoSleep'] == True).astype('int')
    table['VIP'] = (table['VIP'] == True).astype('int')

    id_parts, id_ok = _split_components(table['PassengerId'], '_', 2)
    group = id_parts[0].where(id_ok, '0')
    table['PassengerGroupID'] = group.astype('int')
    table['PassengerNumberID'] = id_parts[1].where(id_ok, '0').astype('int')
    table['GroupSize'] = _occurrences(group, id_ok)

    cabin = table['Cabin'].astype(str)
    cabin_parts, cabin_ok = _split_components(table['Cabin'], '/', 3)
    deck = cabin_parts[0].where(cabin_ok, '0')
    num = cabin_parts[1].where(cabin_ok, '0')
    side = cabin_parts[2].where(cabin_ok, '0')
    table['CabinDeck'] = deck
    table['CabinNum'] = num.astype('int')
    table['CabinSide'] = side

    table['CabinSize'] = _occurrences(cabin, cabin_ok)
    table['CabinDeckSize'] = _occurrences(deck, cabin_ok)
    table['CabinNumSize'] = _occurrences(num, cabin_ok)

    # Plain addition: a missing amount makes the total missing as well.
    table['TotalSpent'] = table['RoomService']+table['FoodCourt']+table['ShoppingMall']+table['Spa']+table['VRDeck']

    # Removing non-alphanumeric characters
    table['Destination'] = table['Destination'].str.replace('[^a-zA-Z0-9]', '', regex=True).str.strip()

    return table

X_train = do_feature_creation(X_train)
X_test = do_feature_creation(X_test)

# Feature selection: drop the raw identifiers and free-text columns
columns_to_exclude = ['PassengerId', 'Cabin', 'Name', 'CabinDeckSize']
X_train = X_train.drop(columns_to_exclude, axis=1)
X_test = X_test.drop(columns_to_exclude, axis=1)

# Ordinal features encoding (labels without an entry below become NaN)
# NOTE(review): 'T' under CabinSide looks like a deck letter, and CabinDeck has
# no 'T' entry - confirm against the data.
mapping = {
    'CabinDeck' : {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7},
    'CabinSide' : {'T':1, 'P':2, 'S':3}
}
X_train = do_feature_ordinal_encoding(X_train, mapping)
X_test = do_feature_ordinal_encoding(X_test, mapping)

# Feature encoding: one-hot encode the remaining object columns
X_train, X_test = do_feature_encoding(X_train, X_test)

# Dummy encoding
# NOTE(review): presumably a no-op once the object columns have been one-hot
# encoded above - confirm before removing.
X_train = pd.get_dummies(X_train, prefix_sep='_', drop_first=True)
X_test = pd.get_dummies(X_test, prefix_sep='_', drop_first=True)

# Features synchronization: give the test set the train set's columns
X_test = adapt_test_set_to_train_set_data_structure(X_train, X_test)

# Zero-fill whatever is still missing. This is applied to BOTH sets so that the
# model sees test rows preprocessed exactly like the rows it was trained on.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)



## Build Classifier

### XGBoost Model

In [3]:
from xgboost import XGBClassifier

# XGBoost: base estimator arguments used only when tuning
xgb_init_hyperparameters = {
    'n_estimators': 100,
    'random_state': 42,
    'use_label_encoder': False
}

# XGBoost: search space explored when tuning
xgb_tuning_hyperparameters = {
    'scale_pos_weight' : [1, 1.3, 1.6],
    'objective'  : ['reg:logistic'],
    'alpha' : [0.0001, 0.01, 0.1, 1, 5, 10],
    'gamma' : [1, 2, 5, 8, 10, 14, 18, 30],
    'reg_lambda' : [1, 2, 5, 8, 10, 14, 18, 30],
    'learning_rate' : [0.001, 0.01, 0.1, 1, 5, 10],
    'colsample_bytree' : [0.6, 0.7, 0.8, 0.9, 0.95],
    'max_delta_step': [2, 4, 6, 8, 10, 15, 20, 50],
    'min_child_weight' : [5, 10, 15, 20, 25, 30, 50],
    'subsample' : [0.60, 0.70, 0.80, 0.85, 0.90, 0.95],
    'max_depth': [3, 6, 9, 15, 20, 25, 30, 35, 45, None]
}

# XGBoost: fixed configuration trained when tuning is off
xgb_hyperparameters = {
    'subsample': 0.85,
    'scale_pos_weight': 1.6,
    'reg_lambda': 1,
    'objective': 'reg:logistic',
    'min_child_weight': 10,
    'max_depth': 45,
    'max_delta_step': 20,
    'learning_rate': 0.1,
    'gamma': 8,
    'colsample_bytree': 0.8,
    'alpha': 5,
    'use_label_encoder': False
}

xgb_settings = {
    'do_tuning' : False,
    'do_kfold' : True,
    'tuning_type' : 'randomized',
    'tuning_n_iter' : 60
}

model_xgb = build_classifier(
    classifier=XGBClassifier,
    init_hyperparameters=xgb_init_hyperparameters,
    tuning_hyperparameters=xgb_tuning_hyperparameters,
    hyperparameters=xgb_hyperparameters,
    X_train=X_train,
    y_train=y_train,
    settings=xgb_settings,
)

AUC - validation : 0.902
AUC - train : 0.936
Accuracy score : 0.803

 Time taken: 0 hours 0 minutes and 11.29 seconds.


### AdaBoost Model

In [4]:
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost: base estimator arguments used only when tuning. With 'do_tuning'
# off, build_classifier instantiates the model from `hyperparameters` alone, so
# the SVC below is not used by this run.
adb_init_hyperparameters = {
    'base_estimator' : SVC(probability=True, kernel='linear')
}

# AdaBoost: search space explored when tuning
adb_tuning_hyperparameters = {
    'n_estimators' : [20, 50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.10, 1],
    'algorithm' : ['SAMME', 'SAMME.R']
}

# AdaBoost: fixed configuration trained when tuning is off
adb_hyperparameters = {
    'n_estimators': 100,
    'learning_rate': 1,
}

adb_settings = {
    'do_tuning' : False,
    'do_kfold' : True,
    'tuning_type' : 'randomized',
    'tuning_n_iter' : 20
}

model_adb = build_classifier(
    classifier=AdaBoostClassifier,
    init_hyperparameters=adb_init_hyperparameters,
    tuning_hyperparameters=adb_tuning_hyperparameters,
    hyperparameters=adb_hyperparameters,
    X_train=X_train,
    y_train=y_train,
    settings=adb_settings,
)

AUC - validation : 0.88
AUC - train : 0.898
Accuracy score : 0.794

 Time taken: 0 hours 0 minutes and 7.81 seconds.


### Random Forest Model

In [5]:
from sklearn.ensemble import RandomForestClassifier

# Random forest.
# 'max_features' uses 'sqrt' instead of 'auto': for a classifier the two are the
# same setting, but 'auto' was deprecated in scikit-learn 1.1 and is rejected
# from 1.3 on. In the search space the duplicate 'auto' entry is replaced by
# 'log2' so that the search really compares two options.
model_rf = build_classifier(classifier = RandomForestClassifier, 
                            init_hyperparameters = {
                                'n_estimators' : 300, 
                                'random_state' : 42
                            },
                            tuning_hyperparameters = {
                                'bootstrap': [True, False],
                                'criterion' : ['gini', 'entropy'],
                                'max_features': ['sqrt', 'log2'],
                                'max_depth': [10, 20, 30, 40, 50, 60, 70, None],
                                'min_samples_leaf': [1, 2, 4, 8, 20],
                                'min_samples_split': [2, 5, 10]
                            }, 
                            hyperparameters = {
                                'min_samples_split': 10, 
                                'min_samples_leaf': 4, 
                                'max_features': 'sqrt', 
                                'max_depth': None, 
                                'criterion': 'entropy', 
                                'bootstrap': False
                            },
                            X_train = X_train, 
                            y_train = y_train, 
                            settings = {
                                'do_tuning' : False,
                                'do_kfold' : True,
                                'tuning_type' : 'randomized',
                                'tuning_n_iter' : 60
                            })  

AUC - validation : 0.894
AUC - train : 0.995
Accuracy score : 0.805

 Time taken: 0 hours 0 minutes and 12.33 seconds.


### DecisionTree Model

In [6]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree: search space explored when tuning
dt_tuning_hyperparameters = {
    'max_depth' : [2, 4, 5, 6, 7, 8, 10, 15, 20],
    'min_samples_split' : [2, 4, 6, 8, 10, 20, 30],
    'min_samples_leaf' : [2, 4, 6, 8, 10, 20, 30],
    'max_features' : [4, 6, 8, 10, 15, 20],
    'splitter' : ['best', 'random']
}

# Decision tree: fixed configuration trained when tuning is off
dt_hyperparameters = {
    'splitter': 'best',
    'min_samples_split': 20,
    'min_samples_leaf': 30,
    'max_features': 10,
    'max_depth': 15
}

dt_settings = {
    'do_tuning' : False,
    'do_kfold' : True,
    'tuning_type' : 'randomized',
    'tuning_n_iter' : 20
}

model_dt = build_classifier(
    classifier=DecisionTreeClassifier,
    init_hyperparameters={},
    tuning_hyperparameters=dt_tuning_hyperparameters,
    hyperparameters=dt_hyperparameters,
    X_train=X_train,
    y_train=y_train,
    settings=dt_settings,
)

AUC - validation : 0.866
AUC - train : 0.907
Accuracy score : 0.775

 Time taken: 0 hours 0 minutes and 0.23 seconds.


### LightGBM Model

In [7]:
from lightgbm import LGBMClassifier

# LightGBM: base estimator arguments used only when tuning
lgbm_init_hyperparameters = {
    'objective' : 'binary',
    'random_state' : 1
}

# LightGBM: search space explored when tuning
lgbm_tuning_hyperparameters = {
    'objective' : ['binary'],
    'boosting_type': ['dart'],
    'num_leaves': np.arange(10, 15, 1),
    'max_depth': np.arange(6, 8, 1),
    'learning_rate': np.arange(0.07, 0.12, 0.01),
    'n_estimators': np.arange(480, 510, 2),
    'reg_alpha': np.arange(0.4, 0.6, 0.02),
    'min_child_samples': np.arange(40, 60, 2),
    'reg_lambda': np.arange(0.85, 1, 0.01)
}

# LightGBM: fixed configuration trained when tuning is off
lgbm_hyperparameters = {
    'reg_lambda': 1.0,
    'reg_alpha': 0.48,
    'num_leaves': 10,
    'n_estimators': 508,
    'min_child_samples': 52,
    'max_depth': 6,
    'learning_rate': 0.10,
    'boosting_type': 'dart'
}

lgbm_settings = {
    'do_tuning' : False,
    'do_kfold' : True,
    'tuning_type' : 'randomized',
    'tuning_n_iter' : 40
}

model_lgbm = build_classifier(
    classifier=LGBMClassifier,
    init_hyperparameters=lgbm_init_hyperparameters,
    tuning_hyperparameters=lgbm_tuning_hyperparameters,
    hyperparameters=lgbm_hyperparameters,
    X_train=X_train,
    y_train=y_train,
    settings=lgbm_settings,
)

AUC - validation : 0.902
AUC - train : 0.94
Accuracy score : 0.808

 Time taken: 0 hours 0 minutes and 14.76 seconds.


### Model stacking

In [8]:
from sklearn.ensemble import StackingClassifier

# Stacking: the XGBoost, random forest and LightGBM models built above are the
# base learners; a small random forest combines their outputs.
stacking_base_learners = [
    ('xgb', model_xgb),
    ('rf', model_rf),
    ('lgbm', model_lgbm)
]

stacking_meta_learner = RandomForestClassifier(
    n_estimators=50,
    min_samples_split=2,
    min_samples_leaf=2,
    max_features='sqrt',
    max_depth=5,
    criterion='gini',
    bootstrap=False,
)

stacking_hyperparameters = {
    'estimators' : stacking_base_learners,
    'final_estimator' : stacking_meta_learner,
    'cv' : 5
}

stacking_settings = {
    'do_tuning' : False,
    'do_kfold' : True,
    'tuning_type' : 'randomized',
    'tuning_n_iter' : 30
}

model_stacking = build_classifier(
    classifier=StackingClassifier,
    init_hyperparameters={},
    tuning_hyperparameters={},
    hyperparameters=stacking_hyperparameters,
    X_train=X_train,
    y_train=y_train,
    settings=stacking_settings,
)

AUC - validation : 0.881
AUC - train : 0.993
Accuracy score : 0.795

 Time taken: 0 hours 3 minutes and 2.06 seconds.


## Save submission

In [10]:
# Write the LightGBM predictions into the sample submission file's layout.
# NOTE(review): assumes the rows of X_test are in the same order as the rows of
# sample_submission.csv - confirm.
submission = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
prediction = model_lgbm.predict(X_test)
submission['Transported'] = prediction
submission.to_csv('./submission.csv', index = False)