In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # data visualisation

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the Kaggle input directory.
for folder, _subdirs, files_here in os.walk('/kaggle/input'):
    for entry in files_here:
        print(os.path.join(folder, entry))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test tables, using PassengerId as the row index.
train_csv_path = '/kaggle/input/tabular-playground-series-apr-2021/train.csv'
test_csv_path = '/kaggle/input/tabular-playground-series-apr-2021/test.csv'

synthanic_raw = pd.read_csv(train_csv_path, index_col='PassengerId')
synthanic_test_raw = pd.read_csv(test_csv_path, index_col='PassengerId')

In [None]:
# Preview the first five rows of the untouched training data.
synthanic_raw.head(n=5)

In [None]:
# Work on copies so that the frames loaded from disk are never modified.
synthanic, synthanic_test = synthanic_raw.copy(), synthanic_test_raw.copy()

# Target encoding

In [None]:
# Replace each raw text column with a one-character feature: the first
# character of its string value. Missing entries are filled with 'Unknown'
# beforehand, so they end up as 'U'. The raw column is dropped afterwards.
for raw_col, alpha_col in (('Cabin', 'CabinAlpha'), ('Ticket', 'TicketAlpha')):
    filled_text = synthanic[raw_col].fillna('Unknown').astype(str)
    synthanic[alpha_col] = filled_text.map(lambda text: text[0])
    synthanic.drop(columns=raw_col, inplace=True)

synthanic.head()

In [None]:
# Same first-character extraction as for the training frame, applied to the
# test frame: fill missing values with 'Unknown', keep only the first
# character, then drop the raw column.
for raw_col, alpha_col in (('Cabin', 'CabinAlpha'), ('Ticket', 'TicketAlpha')):
    filled_text = synthanic_test[raw_col].fillna('Unknown').astype(str)
    synthanic_test[alpha_col] = filled_text.map(lambda text: text[0])
    synthanic_test.drop(columns=raw_col, inplace=True)

In [None]:
# Step 1: drop high cardinality categorical columns
# Count the distinct values of every object-typed column of the training frame.
object_col_cardinality = synthanic.select_dtypes(include=['object']).nunique()
display(object_col_cardinality)

# Columns with more than 20 distinct values are removed from both frames;
# the decision is taken on the training frame only.
high_cardinality_cols = object_col_cardinality[object_col_cardinality > 20].index

synthanic.drop(columns=high_cardinality_cols, inplace=True)
synthanic_test.drop(columns=high_cardinality_cols, inplace=True)

synthanic.head()

In [None]:
# Step 2: drop rows with any null entries
# dropna removes rows by position rather than by index label, so (unlike
# dropping the labels of the null rows) it can never take out a complete row
# that merely shares an index label with an incomplete one.
# Only the training frame is filtered here; the test frame keeps its nulls.
synthanic.dropna(axis=0, how='any', inplace=True)

synthanic.head()

# Target encoding (for categorical variables)

In [None]:
# Target encoding: every object-typed or integer-typed feature is replaced by
# the mean of 'Survived' observed for that category in the training frame.
# Other columns are passed through unchanged.
#
# NOTE(review): the category means are computed on the whole training frame,
# i.e. before any train/validation split made later on, so validation scores
# computed from this encoding may be optimistic - confirm this is acceptable.

# The target must not be encoded with itself: it is an integer column and would
# otherwise be rewritten as 0 -> 0.0 / 1 -> 1.0, turning the label into floats.
encoded_feature_cols = [
    col for col in synthanic.select_dtypes(include=['object', 'int']).columns
    if col != 'Survived'
]

# One lookup table per feature (category -> mean of 'Survived'), learned from
# the training frame only and reused for the test frame.
category_survival_means = {
    col: synthanic.groupby(col)['Survived'].mean() for col in encoded_feature_cols
}

# Fallback for categories that never occur in the training frame.
overall_survival_mean = synthanic['Survived'].mean()


def _target_encode(frame):
    """Return a copy of `frame` with the encoded feature columns replaced by survival means.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame to encode. Columns without a lookup table are left untouched;
        lookup tables for columns missing from `frame` are skipped.

    Returns
    -------
    pandas.DataFrame
        Same index and column order as `frame`. In each encoded column:
        known categories become their training-set survival mean, categories
        not seen in training become the overall training survival mean, and
        missing values stay missing.
    """
    encoded = frame.copy()
    for col, means in category_survival_means.items():
        if col not in encoded.columns:
            continue
        mapped = frame[col].map(means)
        # A value that is present but has no entry in the lookup table is an
        # unseen category. Leaving it as its raw string/integer would mix raw
        # values with survival rates in one column, so use the overall mean.
        unseen_category = frame[col].notna() & mapped.isna()
        encoded[col] = mapped.mask(unseen_category, overall_survival_mean)
    return encoded


synthanic_target = _target_encode(synthanic)
synthanic_test_target = _target_encode(synthanic_test)

# Persist both encoded frames (the index is written as the first CSV column).
synthanic_target.to_csv('synthanic_target_encoded.csv')
synthanic_test_target.to_csv('synthanic_test_target_encoded.csv')

# Simple model

In [None]:
from sklearn.model_selection import train_test_split

# Separate the label from the features of the encoded training frame.
X = synthanic_target.drop(columns='Survived')
y = synthanic_target['Survived']

# The encoded test frame is used as-is for the final predictions.
X_test = synthanic_test_target

# Hold out 20% of the labelled rows for validation; the seed fixes the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=1,
)

In [None]:
from xgboost import XGBClassifier

# Hyper-parameters of the gradient-boosted classifier, gathered in one place.
xgb_params = {
    'objective': 'binary:logistic',
    'use_label_encoder': False,
    'n_estimators': 200,
    'verbosity': 1,
    'learning_rate': 0.05,
    'random_state': 1,
}

model = XGBClassifier(**xgb_params)

In [None]:
# Evaluate on both the training split and the hold-out split after each round.
eval_sets = [(X_train, y_train), (X_valid, y_valid)]

# NOTE(review): newer xgboost releases reportedly expect early_stopping_rounds
# and eval_metric on the estimator constructor instead of fit() - confirm
# against the installed xgboost version.
model.fit(
    X_train,
    y_train,
    eval_set=eval_sets,
    early_stopping_rounds=5,
    eval_metric='logloss',
    verbose=1,
)

# Per-round metric history recorded for the evaluation sets passed above.
evals_result = model.evals_result()

In [None]:
# Class predictions for every row of X.
# NOTE(review): X is the full labelled feature frame, so it includes the
# X_train rows the model was fitted on; `preds` is also not read by any later
# cell visible here. Confirm whether X_valid was intended for evaluation.
preds = model.predict(X)

In [None]:
# Load the competition's sample submission to inspect the expected layout.
sample_submission_path = '/kaggle/input/tabular-playground-series-apr-2021/sample_submission.csv'
sample_submission = pd.read_csv(sample_submission_path)
sample_submission

In [None]:
# Build the submission table: one predicted class per test row, labelled with
# the index of X_test.
submission = pd.DataFrame({'Survived': model.predict(X_test)}, index=X_test.index)

# Move the index into a regular column so it becomes the first data column.
submission.reset_index(inplace=True)

# index=False is required here: after reset_index the frame only carries a
# throwaway 0..n-1 index, and writing it would add an unnamed extra first
# column to the submission file.
submission.to_csv('submission.csv', index=False)