In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install missingpy

In [None]:
# Compatibility shim: register the private module ``sklearn.neighbors._base``
# under the name ``sklearn.neighbors.base`` so that a later
# ``import sklearn.neighbors.base`` resolves to it.
# NOTE(review): presumably required by missingpy (installed in the previous
# cell, imported further down), which appears to target an older scikit-learn
# module layout -- confirm before removing.
import sklearn.neighbors._base
import sys
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'
# Hide every FutureWarning emitted by the libraries used below.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from category_encoders import TargetEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [None]:
# Read the train and test CSV files of the spaceship-titanic input dataset.
train = pd.read_csv('../input/spaceship-titanic/train.csv')
test = pd.read_csv('../input/spaceship-titanic/test.csv')
# Only the last expression of a cell is rendered automatically, so `train`
# needs an explicit display() call while `test` does not.
display(train)
test

# Data Preparation/Cleaning

In [None]:
## Split the raw training columns by dtype: int/float -> numerical, object -> categorical.

numfs, catfs = [], []
for column in train.columns:
    column_dtype = train[column].dtype
    if column_dtype == int or column_dtype == float:
        numfs.append(column)
    if column_dtype == object:
        catfs.append(column)

print("Numerical Features: ",numfs)
print("Categorical Features: ",catfs)

In [None]:
f, axs = plt.subplots(2, 2, figsize=(20, 10))

# One count plot per raw categorical column, filling the 2x2 grid row by row.
for column, axis in zip(["HomePlanet", "CryoSleep", "Destination", "VIP"], axs.ravel()):
    sns.countplot(x=column, data=train, ax=axis)


In [None]:
## to split data into new features.

def split_data(series,character):
    """Split every string of ``series`` on ``character`` into parallel lists.

    The number of output lists is fixed by the first non-missing value: if it
    splits into ``k`` parts, the result has the keys ``'list_0'`` ...
    ``'list_{k-1}'``, each holding one entry per element of ``series``.

    Parameters
    ----------
    series : iterable of str or missing values
        Values to split. Missing values (as detected by ``pd.isna``) produce
        ``np.nan`` in every output list.
    character : str
        Separator handed to ``str.split``.

    Returns
    -------
    dict[str, list]
        Mapping ``'list_i'`` -> list of the i-th parts. Empty when ``series``
        is empty or contains only missing values, because the number of parts
        cannot be determined in that case.

    Notes
    -----
    * Missing values that appear *before* the first string are supported; they
      are back-filled with ``np.nan`` once the number of parts is known.
    * A string with fewer parts than the first one is padded with ``np.nan``;
      extra parts beyond the first string's count are ignored.
    """
    columns = {}
    width = None           # number of parts, fixed by the first non-missing value
    pending_missing = 0    # missing values seen before the width is known

    for value in series:

        if pd.isna(value):
            if width is None:
                # Cannot emit NaNs yet: the output lists do not exist.
                pending_missing += 1
            else:
                for j in range(width):
                    columns[f'list_{j}'].append(np.nan)
            continue

        parts = value.split(character)

        if width is None:
            width = len(parts)
            # Create the lists, pre-filled for any leading missing values.
            for i in range(width):
                columns[f'list_{i}'] = [np.nan] * pending_missing

        for j in range(width):
            columns[f'list_{j}'].append(parts[j] if j < len(parts) else np.nan)

    return columns

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

## transformer that creates new features out of existing feature.

class SplitFeatureTransformer(BaseEstimator,TransformerMixin):
    """Stateless transformer that derives new columns by splitting text columns.

    * ``PassengerId`` split on ``'_'`` -> ``GroupId``, ``MemberId``
    * ``Cabin`` split on ``'/'`` -> ``Deck``, ``Num``, ``Side``
    * ``Name`` split on ``' '`` -> ``FirstName``, ``SecondName``

    ``Cabin`` and ``Name`` are removed from the result; ``PassengerId`` is kept.
    """

    def __init__(self):
        pass

    def fit(self,X,y = None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self,X,y = None):
        """Return a copy of ``X`` with the derived columns appended."""
        # (source column, separator, names for the resulting parts)
        split_plan = (
            ("PassengerId", '_', ("GroupId", "MemberId")),
            ("Cabin", '/', ("Deck", "Num", "Side")),
            ("Name", ' ', ("FirstName", "SecondName")),
        )

        result = X.copy()
        for source, separator, targets in split_plan:
            parts = split_data(result[source], separator)
            for position, target in enumerate(targets):
                result[target] = parts[f'list_{position}']

        return result.drop(columns=['Cabin', 'Name'])

In [None]:
## transformer that removes preceding zeros from existing feature values.

class RmvPreZerosTransformer(BaseEstimator,TransformerMixin):
    """Stateless transformer that strips leading ``'0'`` characters from the
    ``GroupId`` and ``MemberId`` columns; missing values are left untouched."""

    def __init__(self):
        pass

    def fit(self,X,y = None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self,X,y = None):
        """Return a copy of ``X`` with both id columns stripped of leading zeros."""

        def strip_leading_zeros(value):
            # Missing entries pass through unchanged.
            return value if pd.isna(value) else value.lstrip('0')

        cleaned = X.copy()
        for column in ('GroupId', 'MemberId'):
            cleaned[column] = cleaned[column].apply(strip_leading_zeros)

        return cleaned

In [None]:
## transformer that adds new feature called 'luxury'.

class AddLuxuryTransformer(BaseEstimator,TransformerMixin):
    """Stateless transformer that adds a ``luxury`` column and fixes the column order.

    ``luxury`` is ``FoodCourt + ShoppingMall + Spa + VRDeck`` (``RoomService``
    is not part of the sum). If any of the four values is missing, the sum for
    that row is NaN.

    Parameters
    ----------
    flag : bool
        ``True``  -> output the training layout (no ``PassengerId``, with
        ``Transported`` as last column).
        ``False`` -> output the test layout (with ``PassengerId``, without
        ``Transported``).
        Any other value leaves the column set untouched.
    """

    def __init__(self, flag):
        self.flag = flag

    def fit(self,X,y = None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self,X,y = None):
        """Return a copy of ``X`` with ``luxury`` added and columns reordered."""

        X_ = X.copy()

        # Vectorised column addition: same left-to-right sum (and the same NaN
        # propagation) as adding the four values row by row, without the
        # Python-level loop over rows.
        X_['luxury'] = X_['FoodCourt'] + X_['ShoppingMall'] + X_['Spa'] + X_['VRDeck']

        if self.flag == True:
            X_ = X_[['GroupId', 'MemberId', 'HomePlanet', 'CryoSleep','Deck', 'Num', 'Side', 'Destination',
                 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstName', 'SecondName', 'luxury', 'Transported']]
        elif self.flag == False:
            X_ = X_[['PassengerId', 'GroupId', 'MemberId', 'HomePlanet', 'CryoSleep','Deck', 'Num', 'Side', 'Destination',
                 'Age','VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'FirstName', 'SecondName', 'luxury']]

        return X_

In [None]:
## transformer that selects and returns columns by name.

class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that keeps only the columns named at construction."""

    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` indexed by the configured column selection."""
        wanted = self.columns
        return X[wanted]

In [None]:
# pipelines for preparing train and test data sets.

def _build_prep_pipeline(is_train):
    """Split features -> strip leading zeros -> add 'luxury' and order columns."""
    return make_pipeline(
        SplitFeatureTransformer(),
        RmvPreZerosTransformer(),
        AddLuxuryTransformer(flag = is_train))

# Same steps for both splits; only the final column layout differs.
data_prep_tr = _build_prep_pipeline(True)
data_prep_te = _build_prep_pipeline(False)

In [None]:
# Each pipeline is fitted on, and applied to, its own split.
train_prep = data_prep_tr.fit_transform(train)
test_prep = data_prep_te.fit_transform(test)

display(train_prep)
test_prep

In [None]:
# Encode the target as 1 (equal to True) / 0 (anything else).
train_prep['Transported'] = (train_prep['Transported'] == True).astype('int64')
y = train_prep["Transported"]

# Keep the test ids for the submission file.
passengerid = test["PassengerId"]
passengerid

# Exploratory Data Analysis

In [None]:
# Report how many distinct (non-missing) values each derived feature takes.
derived_features = ['GroupId','MemberId','Deck','Num','Side','FirstName','SecondName']
for feature in derived_features:
    n_categories = train_prep[feature].nunique()
    print(f'Number of categories in {feature} : {n_categories}')

In [None]:
f, axs = plt.subplots(3, 1, figsize=(10, 20))

# One count plot per low-cardinality derived feature, stacked vertically.
for column, axis in zip(["MemberId", "Deck", "Side"], axs):
    sns.countplot(x=column, data=train_prep, ax=axis)


In [None]:
# Bring `catfs` in line with the prepared frame: drop the raw columns that were
# split up, add the columns derived from them. Written so the cell can be
# re-run safely (no duplicate entries, no ValueError from a second remove()).
_derived_catfs = ['GroupId','MemberId','Deck','Num','Side','FirstName','SecondName']
_replaced_catfs = ('Cabin', 'Name', 'PassengerId')

# Slice assignment keeps the same list object, as the in-place original did.
catfs[:] = [feature for feature in catfs if feature not in _replaced_catfs]
catfs.extend(feature for feature in _derived_catfs if feature not in catfs)
catfs

In [None]:
f, ax = plt.subplots(2, 3, figsize=(20, 10))

sns.set(palette='RdBu')

# First panel: Age as a histogram with a KDE overlay.
sns.histplot(train_prep['Age'], kde=True, ax=ax[0][0])

# Remaining panels: KDE of each spending column, filled row by row.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for column, axis in zip(spend_columns, ax.ravel()[1:]):
    sns.kdeplot(data=train_prep, x=column, ax=axis)

In [None]:
# Kernel density estimate of the engineered 'luxury' column.
sns.kdeplot(data = train_prep, x = 'luxury')

In [None]:
f, ax = plt.subplots(2, 3, figsize=(20, 10))

sns.set(palette='RdBu')

# One box plot per numerical column, filling the 2x3 grid row by row.
boxplot_columns = ["Age", "RoomService", "FoodCourt", "ShoppingMall", "Spa", "VRDeck"]
for column, axis in zip(boxplot_columns, ax.ravel()):
    sns.boxplot(x=train_prep[column], ax=axis)

# Data Preprocessing

In [None]:
# Show the column labels of the prepared training frame.
train_prep.columns

In [None]:
# Heatmap of a boolean mask: True where a value in the `numfs` columns equals 0.
sns.heatmap(train_prep[numfs] == 0)

In [None]:
# Heatmap of a boolean mask: True where a value of the prepared training frame is missing.
sns.heatmap(train_prep.isnull())

> # Ages of 0 are treated as invalid and replaced with NaN.

In [None]:

# An Age of 0 is treated as missing in both splits.
for frame in (train_prep, test_prep):
    frame['Age'] = frame['Age'].replace(0, np.nan)

In [None]:
# Columns that go through the arcsinh + StandardScaler pipeline further down.
numerical_features =  ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'luxury']
# Columns handed to OrdinalEncoder further down (also used as the categorical
# variables of the imputer).
ordinal_features = ['VIP','CryoSleep']
# Columns handed to the cross-fold target encoder further down.
nominal_features = ['HomePlanet','Destination','MemberId','Deck','Side','GroupId','Num','FirstName','SecondName']

> # Feature Encoding in the presence of NaNs.

In [None]:

class OHE_with_nan(BaseEstimator,TransformerMixin):
    """One-hot encode categorical columns while keeping missing values as NaN.

    Missing entries are first replaced by the placeholder ``"missing"`` so that
    ``OneHotEncoder`` can run. Afterwards, for every input column, the rows
    that were missing get NaN in all dummy columns of that input column and
    the ``<column>_missing`` indicator itself is dropped.

    The encoder is fitted inside ``transform`` on the data it receives, so two
    calls on different data can yield different output columns.

    Parameters
    ----------
    copy : bool, default True
        Stored on the instance; not used by ``fit`` or ``transform``.
    """
    def __init__(self, copy=True):
        self.copy = copy

    def fit(self, X, y = None):
        """ This transformer does not use a fit procedure """
        return self

    def transform(self, X, y = None):
        """Return the one-hot encoded frame (default integer index) with NaNs restored.

        ``X`` must be a DataFrame (its ``columns`` attribute is used).
        """
        # Replace NaNs with "missing" such that OneHotEncoder can work.
        enc_missing = SimpleImputer(strategy="constant",fill_value="missing")
        data1 = pd.DataFrame(columns=X.columns,data = enc_missing.fit_transform(X))
        # Perform standard OHE.
        # NOTE(review): recent scikit-learn releases rename `sparse` to
        # `sparse_output`; kept as-is to match the installed version -- confirm.
        OHE = OneHotEncoder(sparse=False,handle_unknown="ignore")
        OHE_fit = OHE.fit_transform(data1)
        # Keep the generated "<column>_<category>" names.
        data_OHE = pd.DataFrame(columns=OHE.get_feature_names_out(data1.columns),data = OHE_fit)

        # The encoder emits one contiguous run of dummy columns per input
        # column, in input order, `len(categories)` wide. Slicing by position
        # avoids prefix clashes between column names.
        encoded_blocks = []
        offset = 0
        for column, categories in zip(data1.columns, OHE.categories_):
            width = len(categories)
            block = data_OHE.iloc[:, offset:offset + width].copy()
            offset += width

            # Only blank rows / drop the indicator when this column really had
            # missing values; otherwise every dummy column is a real category.
            missing_name = f"{column}_missing"
            if missing_name in block.columns:
                block.loc[block[missing_name] == 1, :] = np.nan
                block = block.drop(columns=missing_name)

            encoded_blocks.append(block)

        if not encoded_blocks:
            return pd.DataFrame()
        return pd.concat(encoded_blocks, axis=1)

In [None]:
from sklearn.model_selection import KFold

class TargetEncoderCV(TargetEncoder):
    """Target encoder that encodes training rows out-of-fold.

    ``fit`` trains one inner ``TargetEncoder`` on all rows; it is used by
    ``transform`` when no target is passed. When ``transform`` receives ``y``,
    the rows are split with ``KFold`` and each fold is encoded by an encoder
    fitted on the remaining folds.

    Parameters
    ----------
    n_splits : int, default 3
        Number of folds passed to ``KFold``.
    shuffle : bool, default True
        Passed to ``KFold``.
    cols : list or None
        Columns to encode; passed to every ``TargetEncoder`` created here.
    """
    
    def __init__(self, n_splits=3, shuffle=True, cols=None):
        # Only these three settings are stored; TargetEncoder.__init__ is not called.

        self.n_splits = n_splits
        self.shuffle = shuffle
        self.cols = cols
        

    def fit(self, X, y):
        """Fit the inner full-data encoder on ``X`` and ``y``; returns ``self``."""

        self._target_encoder = TargetEncoder(cols=self.cols, handle_missing='return_nan')
        self._target_encoder.fit(X, y)
        return self

    
    def transform(self, X, y=None):
        """Encode ``X``.

        Without ``y`` the encoder fitted in ``fit`` is applied. With ``y`` a
        copy of ``X`` is returned in which every fold's rows are replaced by
        the output of an encoder fitted on the other folds.

        Raises ``TypeError`` if ``y`` is given and ``X`` is neither a
        DataFrame nor an ndarray.
        """

        # Use target encoding from fit() if this is test data
        if y is None:
            return self._target_encoder.transform(X)

        # Compute means for each fold
        self._train_ix = []
        self._test_ix = []
        self._fit_tes = []
        # NOTE(review): no random_state is passed, so with shuffle=True the
        # fold assignment is presumably different on every call -- confirm
        # whether reproducible encodings are needed.
        kf = KFold(n_splits=self.n_splits, shuffle=self.shuffle)
        for train_ix, test_ix in kf.split(X):
            self._train_ix.append(train_ix)
            self._test_ix.append(test_ix)
            te = TargetEncoder(cols=self.cols, handle_missing='return_nan')
            if isinstance(X, pd.DataFrame):
                self._fit_tes.append(te.fit(X.iloc[train_ix,:],
                                            y.iloc[train_ix]))
            elif isinstance(X, np.ndarray):
                self._fit_tes.append(te.fit(X[train_ix,:],
                                            y[train_ix]))
            else:
                raise TypeError('X must be DataFrame or ndarray')

        # Apply means across folds
        Xo = X.copy()
        for ix in range(len(self._test_ix)):
            test_ix = self._test_ix[ix]
            # Each held-out fold is encoded by the encoder fitted without it.
            if isinstance(X, pd.DataFrame):
                Xo.iloc[test_ix,:] = \
                    self._fit_tes[ix].transform(X.iloc[test_ix,:])
            elif isinstance(X, np.ndarray):
                Xo[test_ix,:] = \
                    self._fit_tes[ix].transform(X[test_ix,:])
            else:
                raise TypeError('X must be DataFrame or ndarray')
        return Xo

            
    def fit_transform(self, X, y=None):
        """Run ``fit(X, y)`` followed by ``transform(X, y)``."""

        return self.fit(X, y).transform(X, y)

> # Encoding nominal features with K-fold target encoding (TargetEncoderCV)

In [None]:
# Cross-fold Target encode the nominal data.

# fit() returns the encoder itself, so construction and fitting can be chained.
te = TargetEncoderCV(cols=nominal_features).fit(train_prep, y)

# Passing y -> out-of-fold encoding for the training rows;
# omitting y -> the full-data encoder is applied to the test rows.
train_enc = te.transform(train_prep, y)
test_enc = te.transform(test_prep)


In [None]:
# The target and the passenger id must not reach the model as features.
train_enc = train_enc.drop(columns=['Transported'])
test_enc = test_enc.drop(columns=['PassengerId'])
y

In [None]:
# Show both target-encoded frames (only the last expression renders implicitly).
display(train_enc)
test_enc

In [None]:
# Show the column labels of the prepared (pre-encoding) training frame.
train_prep.columns

In [None]:
# Ordinal-encode the ordinal features; every other column is passed through.
# Categories not seen during fit are encoded as NaN.

ct = ColumnTransformer([("ordinal encode", OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=np.nan),ordinal_features)],
                       remainder='passthrough')

ct.fit(train_enc)

# ColumnTransformer returns a bare array: the encoded columns first, then the
# passthrough columns in their input order. Derive the labels from the frame
# instead of hard-coding them, so they cannot drift from the real order.
columns = ordinal_features + [column for column in train_enc.columns
                              if column not in ordinal_features]

train_enc = pd.DataFrame(ct.transform(train_enc), columns = columns)
test_enc = pd.DataFrame(ct.transform(test_enc), columns = columns)

display(train_enc)
test_enc

In [None]:
# Positional indexes of the categorical columns, as needed by the imputer's `cat_vars`.
categorical_features = ordinal_features
cat_indexes = [train_enc.columns.get_loc(name) for name in categorical_features]

cat_indexes

> # Imputation.

In [None]:
from missingpy import MissForest

# Random-forest based imputer from missingpy; seeded via random_state and
# running on all cores (n_jobs=-1).
# NOTE(review): the remaining keyword arguments look like the library defaults
# spelled out explicitly -- confirm against the installed missingpy version.
imputer = MissForest(max_iter=10, decreasing=False, missing_values=np.nan,
             copy=True, n_estimators=100, criterion=('mse', 'gini'),
             max_depth=None, min_samples_split=2, min_samples_leaf=1,
             min_weight_fraction_leaf=0.0, max_features='auto',
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             bootstrap=True, oob_score=False, n_jobs=-1, random_state=1337,
             verbose=0, warm_start=False, class_weight=None)

# Fit on the training data only; cat_vars marks the categorical column positions.
imputer.fit(train_enc, cat_vars=cat_indexes)

# The imputer's output is wrapped back into DataFrames with the labels kept in `columns`.
train_imp = pd.DataFrame(imputer.transform(train_enc),columns = columns)
test_imp = pd.DataFrame(imputer.transform(test_enc),columns = columns)
display(train_imp)
display(test_imp)

> # Transforming and Scaling.

In [None]:
from sklearn.preprocessing import FunctionTransformer

# Numerical columns: apply arcsinh, then standardise.
transf_pipeline = Pipeline(steps = [
    ('skew', FunctionTransformer(np.arcsinh)),
    ('scaler',StandardScaler())
])

# Every column not listed in numerical_features is passed through unchanged.
preprocess = ColumnTransformer(
    transformers=[
        ("transform", transf_pipeline, numerical_features)
    ],
    remainder='passthrough'
)

# ColumnTransformer returns a bare array: the transformed columns first, then
# the passthrough columns in their *input* order. The labels are therefore
# derived from train_imp; a hand-written list that does not follow that order
# would attach the wrong names to the passthrough columns.
columns = numerical_features + [column for column in train_imp.columns
                                if column not in numerical_features]

preprocess.fit(train_imp)
train_preprocessed = pd.DataFrame(preprocess.transform(train_imp), columns = columns)
test_preprocessed = pd.DataFrame(preprocess.transform(test_imp), columns = columns)

display(train_preprocessed)
test_preprocessed

In [None]:
f, ax = plt.subplots(2, 3, figsize=(20, 10))

sns.set(palette='RdBu')

# NOTE(review): Age is drawn from train_imp (before scaling) while the other
# panels use train_preprocessed -- confirm this is intended.
sns.histplot(train_imp['Age'], kde=True, ax=ax[0][0])

# Remaining panels: KDE of each transformed spending column, row by row.
spend_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
for column, axis in zip(spend_columns, ax.ravel()[1:]):
    sns.kdeplot(data=train_preprocessed, x=column, ax=axis)

# Feature Selection.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Seeded so the importances (and the features read off them) are reproducible
# across runs; 1337 matches the seed used for the imputer.
model = ExtraTreesClassifier(random_state=1337)
model.fit(train_preprocessed, y)
print(model.feature_importances_)

# Label each importance with its column and plot the ten largest.
feat_importances = pd.Series(model.feature_importances_, index=train_preprocessed.columns)
feat_importances.nlargest(10).plot.bar()
plt.show()
list1=feat_importances.keys().to_list()

In [None]:
# Absolute correlation of every feature with the target, strongest first.
corr_matrix = train_preprocessed.corrwith(y).abs()
corr_target = corr_matrix.sort_values(ascending=False)

# Keep the features whose absolute correlation exceeds 0.1.
relevant_features = corr_target[corr_target>0.1]
list2 = list(relevant_features.index)
list2

In [None]:
import statsmodels.api as sm

# Fit a statsmodels logistic regression of y on all preprocessed features and
# print its summary table.
# NOTE(review): no constant column is added to train_preprocessed in this
# cell, so the model presumably has no intercept -- confirm this is intended.
logit_model = sm.Logit(y,train_preprocessed)
result = logit_model.fit()
print(result.summary2())

In [None]:
# Hand-written feature subsets, one per selection method.
# NOTE(review): presumably copied from the outputs of the three preceding cells
# (ExtraTrees importances, correlation with the target, Logit summary); none of
# these lists is referenced again in the visible notebook. The names depend on
# the column labels of train_preprocessed -- re-derive them if those change.
ETC_features = ['Age', 'RoomService', 'Spa', 'VRDeck', 'luxury',
                  'MemberId','FirstName' ,'SecondName', 'Side', 'Num']

pearson_features = ['MemberId','luxury','Spa','RoomService','VRDeck','Num','Deck','ShoppingMall','FoodCourt','VIP','Destination']

logit_features = ['Age', 'RoomService', 'FoodCourt', 'Spa', 'VRDeck', 'luxury',
                  'MemberId', 'CryoSleep', 'Deck', 'Destination', 'VIP', 'SecondName']

# Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Fit a PCA with all components and plot the running total of explained variance.
pca = PCA().fit(train_preprocessed)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

plt.plot(cumulative_variance)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance')

In [None]:
# The PCA model
pca = PCA(n_components=10) # keep the first 10 principal components
X = pca.fit_transform(train_preprocessed) # fit on the training data and project it into the PCA space
X_test = pca.transform(test_preprocessed) # project the test data with the same fitted components

In [None]:
# Show the PCA-projected training matrix.
X

# Model Building

In [None]:
# The grid-search experiment below is disabled: it is wrapped in a string
# literal, so none of it runs (the cell merely evaluates to that string).
'''

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

lr = LogisticRegression()
rf = RandomForestClassifier()
gbc = GradientBoostingClassifier()
abc = AdaBoostClassifier()

param_grid_lr = {'solver': ['liblinear'],
              'penalty': ['l2'],
              'C': np.logspace(-4, 4, 4)}

param_grid_rf = {'n_estimators': [10, 100, 1000],
                 'max_features': ['sqrt', 'log2']}

param_grid_gbc = {'n_estimators':[100,500], 
            'learning_rate': [0.1,0.05,0.02],
            'max_depth':[4], 
            'min_samples_leaf':[3], 
            'max_features':[1.0] }

param_grid_abc = {'n_estimators' : [10, 50, 100, 500], 'learning_rate' : [0.0001, 0.001, 0.01, 0.1, 1.0]}

cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

grid_search = GridSearchCV(estimator=lr, param_grid=param_grid_lr, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(train_preprocessed, y)
print('linear regression',grid_result.best_score_,grid_result.best_params_)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid_rf, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(train_preprocessed, y)
print('random forest',grid_result.best_score_,grid_result.best_params_)

grid_search = GridSearchCV(estimator=gbc, param_grid=param_grid_gbc, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(train_preprocessed, y)
print('gradient boosted',grid_result.best_score_,grid_result.best_params_)

grid_search = GridSearchCV(estimator=abc, param_grid=param_grid_abc, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(train_preprocessed, y)
print('ada boosted',grid_result.best_score_,grid_result.best_params_)
'''

In [None]:
import tensorflow as tf
from tensorflow import keras

# Seed TensorFlow so weight initialisation and shuffling are repeatable.
tf.random.set_seed(1337)

# Small fully connected binary classifier on the PCA components.
# The first argument of Dense is the number of units in the layer, so the first
# hidden layer is sized by the number of input features (X.shape[1]); using
# len(X) would create one unit per training row.
model_all =tf.keras.Sequential([
    tf.keras.layers.Dense(X.shape[1],activation='relu'),
    tf.keras.layers.Dense(10,activation='relu'),
    tf.keras.layers.Dense(10,activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')])

model_all.compile(loss = tf.keras.losses.BinaryCrossentropy(),
              optimizer = tf.keras.optimizers.Adam(),
              metrics = ["accuracy"])

# Trains on the full training set; no validation data is held out here.
history = model_all.fit(X,y,epochs=125,verbose=1)

In [None]:
# Sigmoid outputs -> rounded to 0/1 -> int64 -> size-1 dimensions removed.
test_probabilities = model_all.predict(X_test)
rounded_labels = tf.cast(tf.round(test_probabilities), dtype=tf.int64)
predictions = tf.squeeze(rounded_labels)

In [None]:
# Pair every test passenger id with its predicted label.
output = pd.DataFrame({'PassengerId': passengerid, 'Transported': predictions})

# Convert the labels to booleans: 1 -> True, anything else -> False.
output['Transported'] = output['Transported'].apply(lambda label: bool(label == 1))

In [None]:
# Write the submission to the current working directory, without the index column.
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")