In [None]:
#Loading in Initial Libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# Inspecting training data

In [None]:
# Read the competition training set, then show its schema (dtypes and
# non-null counts) followed by the first rows as the cell output.
data = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/train.csv'
)
data.info()
data.head()

The training data is missing many values in Cabin and some in Age, Ticket, Fare and Embarked. Next, let's see how balanced the target is within the dataset.

In [None]:
# Count survivors with one vectorized comparison instead of a per-row Python
# loop; this also avoids label-based `data['Survived'][i]` lookups, which only
# work while the frame still has its default 0..n-1 index.
count = int((data['Survived'] == 1).sum())

print('{} Passengers survived while {} did not!'.format(count, (data.shape[0]-count)))

So ~43% of passengers survived while ~57% did not. It's a bit imbalanced, but let's see how it comes out when we do the train/test split. <br>
For the train/test split, `stratify` will be set to the target so that each split keeps class proportions as close to the original as possible.
# Splitting data
PassengerId, Ticket and Cabin were dropped because, as we know from the original Titanic dataset, they are not going to be relevant.

In [None]:
from sklearn.model_selection import train_test_split

# Drop the columns that are not used as features. `drop` already returns a new
# frame, so `data` itself is left untouched and no explicit copy is needed.
data_d = data.drop(['PassengerId', 'Ticket', 'Cabin'], axis=1)

# Select the target by name rather than by column position, so the split does
# not silently change if the column order does.
# A fixed random_state makes the shuffled, stratified split reproducible
# across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data_d.drop('Survived', axis=1), data_d['Survived'],
    test_size=0.20, shuffle=True, stratify=data_d['Survived'],
    random_state=42)

def index_reseter(variable):
    """Return ``variable`` renumbered with a fresh 0..n-1 index.

    The previous index is discarded rather than turned into a column, so the
    function works regardless of the index name and even when the data already
    has a column called ``index``.

    Parameters
    ----------
    variable : pandas.DataFrame or pandas.Series
        Object whose index should be reset.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        A Series when the input is a Series or a single-column DataFrame
        (also when it holds a single row), otherwise a DataFrame.
    """
    variable = variable.reset_index(drop=True)
    # A single-column frame is handed back as a Series. Selecting the column
    # (instead of squeeze()) keeps a one-row result a Series, not a scalar.
    if variable.ndim == 2 and variable.shape[1] == 1:
        return variable.iloc[:, 0]
    return variable

# Renumber the rows of every split in one pass; the two feature frames and the
# two target objects are processed in the same order as they are unpacked.
X_train, X_test, y_train, y_test = (
    index_reseter(part) for part in (X_train, X_test, y_train, y_test)
)

In [None]:
# Vectorized count of the positive class instead of a per-row Python loop.
count = int((y_train == 1).sum())

# Compute the percentage once and reuse it for the check and both messages.
survived_pct = count/y_train.shape[0]*100

# Report whether the survived share lies in the (40, 60] percent band.
if 40 < survived_pct <= 60:
    print('Training data has {}% survived labels'.format(survived_pct))
else:
    print('Labels are far from balanced with {}% survived labels'.format(survived_pct))

The above code demonstrates that the split kept the class proportions as close to the original as possible.
# Modeling

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, MinMaxScaler
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.compose import ColumnTransformer
import random
from sklearn.neural_network import MLPClassifier as MLP
from sklearn.metrics import accuracy_score

#Setting up a function to transform name into intials
class NameSplitter(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces ``Name`` with two initial columns.

    ``Name`` is cut at its first comma. ``First_Name`` receives the first
    character of the text before the comma and ``Last_Name`` the first
    character of the text after it (spaces removed). The ``Name`` column is
    then dropped. Each method prints a progress message.

    NOTE(review): if names are stored as "Surname, Given", the column labels
    are the wrong way round -- confirm against the data. The labels are kept
    as they were so downstream code sees the same column names.
    """

    def __init__(self):
        print('Initiate')

    def fit(self, X, y=None):
        """Nothing is learned; print a marker and return ``self``."""
        print('Started')
        return self

    def transform(self, X):
        """Return a copy of ``X`` with ``Name`` replaced by the two initials.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing a string column called ``Name``.

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` without ``Name`` and with ``First_Name`` and
            ``Last_Name`` appended. An initial is NaN where the name is
            missing or has no text on that side of the comma.
        """
        #Print shows custom transformer works
        print('Transform')
        X_new = X.copy()
        # partition always yields exactly three columns (before, separator,
        # after), so names with several commas or none at all no longer
        # break the two-column assignment that split(expand=True) required.
        parts = X_new['Name'].str.partition(',')
        X_new['First_Name'] = parts[0].str[0]
        X_new['Last_Name'] = parts[2].str.replace(' ', '').str[0]
        X_new = X_new.drop('Name', axis=1)
        print('Return new array')
        return X_new

    def fit_transform(self, X, y=None):
        """Print a marker, then fit and transform ``X`` in one call."""
        print('Working')
        return self.fit(X, y).transform(X)

# Stand-alone NameSplitter instance.
ns = NameSplitter()

# Numeric branch: constant-value imputation (no fill_value is given, so the
# imputer's default applies) followed by standardisation.
numericals = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
n_t = Pipeline([
    ('imputer', SimpleImputer(strategy='constant')),
    ('scaler', StandardScaler()),
])

# Categorical branch: split Name into initials, fill gaps with the most
# frequent value, then ordinal-encode every column.
categoricals = X_train.select_dtypes(include=['object', 'bool']).columns.tolist()
c_t = Pipeline([
    ('ns', NameSplitter()),
    ('imputer', SimpleImputer(strategy="most_frequent")),
    ('scaler', OrdinalEncoder()),
])

# Route each group of columns through its own branch.
column_trans = ColumnTransformer([
    ('num', n_t, numericals),
    ('cat', c_t, categoricals),
])

#MLP
# random_state pins the network's random behaviour so repeated runs give the
# same model.
# NOTE(review): per the scikit-learn docs, learning_rate='adaptive' only takes
# effect with solver='sgd'; with 'adam' it appears to be ignored -- confirm.
mlp = MLP(hidden_layer_sizes=(5,), solver='adam', activation='relu',
          learning_rate_init=0.1, learning_rate='adaptive',
          max_iter=1000, batch_size=7000, early_stopping=True,
          random_state=42)

#Setting up sequential feature selector
# Floating backward selection scored by F1 with 5-fold cross-validation.
# The upper bound is one more than the raw column count; presumably this
# accounts for NameSplitter turning the single Name column into two -- verify.
sfs = SFS(estimator=mlp,
          k_features=(1, (X_train.shape[1]+1)),
          scoring='f1',
          floating=True,
          forward=False,
          cv=5)

#Setting Pipeline
# Preprocess -> select features -> fit the final classifier.
pipe = Pipeline(steps=[
                ('col_t', column_trans),
                ('sfs', sfs),
                ('mlp', mlp)])



#Modeling
# Fit the whole pipeline on the training split.
model = pipe.fit(X_train, y_train)
print('Done')
print('Predicting')
# accuracy_score returns a fraction between 0 and 1; scale it by 100 so the
# value matches the '%' suffix in the message.
print('Score is {:.2f}%'.format(accuracy_score(y_test, model.predict(X_test)) * 100))

In [None]:
# Read the competition test set and remove the same three columns that were
# dropped from the training data, then show its schema and first rows.
test = (
    pd.read_csv('/kaggle/input/tabular-playground-series-apr-2021/test.csv')
    .drop(['PassengerId', 'Ticket', 'Cabin'], axis=1)
)
test.info()
test.head()

In [None]:
# Saving the result
# Predict on the prepared test frame, place the predictions in the sample
# submission's Survived column, and write the file without the index.
X_pred = test
submission = pd.read_csv(
    '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv'
).assign(Survived=model.predict(X_pred))
submission.to_csv("submission.csv", index=False)