In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

A simple, quick baseline for the special Titanic edition of Kaggle's Tabular Playground Series (April 2021): https://www.kaggle.com/c/tabular-playground-series-apr-2021


## Load data

In [None]:
# Read the competition's train and test splits.
train = pd.read_csv('../input/tabular-playground-series-apr-2021/train.csv')
test = pd.read_csv('../input/tabular-playground-series-apr-2021/test.csv')

# Flag each row's origin so the combined frame can be split apart again
# after feature engineering.
train['is_train'] = True
test['is_train'] = False

# Stack train on top of test with a fresh 0..n-1 index.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# replacement and ignore_index=True matches the old reset_index(drop=True).
tnt = pd.concat([train, test], ignore_index=True)

tnt

## Build some features

In [None]:

# Feature engineering on the combined train+test frame `tnt`.
# NOTE: the Fare/Age means below are computed over train and test together.

# Cabin: first character is kept as a categorical letter, the remainder as a
# number. Missing cabins become 'unknown' / -1. to_numeric(errors='coerce')
# maps any non-numeric remainder to -1 as well instead of raising.
tnt['Cabin_Letter'] = tnt['Cabin'].str[0].fillna('unknown')
tnt['Cabin_Number'] = (
    pd.to_numeric(tnt['Cabin'].str[1:], errors='coerce')
    .fillna(-1)
    .astype('int')
)

# Categorical columns: give missing values an explicit 'unknown' level.
tnt['Embarked'] = tnt['Embarked'].fillna('unknown')
tnt['Sex'] = tnt['Sex'].fillna('unknown')

# Numeric columns: impute missing values with the column mean.
tnt['Fare'] = tnt['Fare'].fillna(tnt['Fare'].mean())
tnt['Age'] = tnt['Age'].fillna(tnt['Age'].mean())

# Small-cardinality counts/classes are cast to strings so they are treated as
# categories (and one-hot encoded) rather than as ordered numbers.
tnt['Parch'] = tnt['Parch'].astype('str')
tnt['SibSp'] = tnt['SibSp'].astype('str')
tnt['Pclass'] = tnt['Pclass'].astype('str')

# Split Ticket at its LAST space into a prefix (ticket_pt1) and the trailing
# part (ticket_pt2). rpartition always returns three columns
# (before, separator, after), so this works for tickets with zero, one, or
# several spaces.
ticket_parts = tnt['Ticket'].str.rpartition(' ')
ticket_prefix = ticket_parts[0]
# Tickets without a space have an empty prefix; store it as missing so it does
# not become its own category.
tnt['ticket_pt1'] = ticket_prefix.mask(ticket_prefix == '')
tnt['ticket_pt2'] = ticket_parts[2]


In [None]:
# Preview the first rows of the combined frame, including the new feature columns.
tnt.head()

In [None]:
# Optional check (left disabled): uncomment to list column dtypes and non-null counts.
#tnt.info()

In [None]:
# Select the model inputs, one-hot encode them, and split back into the
# train and test parts.

target = ['Survived']
features = ['Sex', 'Embarked', 'Cabin_Letter',
            'Parch', 'Age', 'SibSp', 'Pclass', 'ticket_pt1',
            'Cabin_Number', 'Fare']

# Keep `is_train` next to the features so the frame can still be split after
# encoding.
tnt = tnt[features + target + ['is_train']]

# One-hot encode the string-typed columns; numeric and boolean columns pass
# through unchanged.
tnt = pd.get_dummies(tnt)

# Split into train and test datasets. Explicit copies are taken because a
# later cell writes a 'Survived' column into `tte`; without .copy() that write
# would target a slice of `tnt` (chained assignment).
tt = tnt[tnt['is_train']].copy()
tte = tnt[~tnt['is_train']].copy()

## Train model and predict

In [None]:
%%time

N_SPLITS = 5
skf = StratifiedKFold(n_splits=N_SPLITS)

target = ['Survived']
features = list(set(tnt.columns)-set(target))

X = tt[features]
y = tt[target].values

test_preds = np.zeros((tte.shape[0], N_SPLITS))

acc_all = 0

for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y[train_index].ravel(), y[test_index].ravel()

    model = GradientBoostingClassifier(random_state=0,).fit(X_train, y_train)
    
    acc = accuracy_score(y_test, model.predict(X_test))
    acc_all += acc / N_SPLITS
    print(f'Accuracy of fold #{fold}: {acc}')
    
    test_preds[:, fold] = model.predict(tte[features])

tte[target] = test_preds.mean(axis=1).round().astype('int')
print(f'\nMean accuracy: {acc_all}')

## Create submission

In [None]:
# Load the provided sample submission file and use it as the output template.
sample_path = '../input/tabular-playground-series-apr-2021/sample_submission.csv'
submission = pd.read_csv(sample_path)

# Overwrite the template's 'Survived' column with the predictions stored on
# `tte`. `.values` drops the index, so rows are matched purely by position.
predicted_labels = tte['Survived'].values
submission.loc[:, 'Survived'] = predicted_labels

# Write the file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)

submission.head()