# Load files

In [None]:
# Relative paths to the train and test CSV files.
filename_train, filename_test = (
    '../input/tabular-playground-series-apr-2021/train.csv',
    '../input/tabular-playground-series-apr-2021/test.csv',
)

In [None]:
import pandas as pd

# Read both splits the same way, using PassengerId as the row index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in (filename_train, filename_test)
)

In [None]:
# Display the training frame loaded above.
df_train

In [None]:
# Count the missing values in each training column.
df_train.isna().sum()

In [None]:
# Display the test frame loaded above.
df_test

In [None]:
# Count the missing values in each test column.
df_test.isna().sum()

# Feature engineering

## Categorize cabin

In [None]:
import numpy as np

# Keep only the first two characters of each cabin code; missing cabins stay NaN.
# The vectorised .str accessor propagates missing values on its own, which
# replaces the fragile `x is not np.nan` identity test: a NaN object that is
# not the np.nan singleton would have been sliced and raised TypeError.
df_train['Cabin'] = df_train['Cabin'].str[:2]
df_train

In [None]:
# Keep only the first two characters of each cabin code; missing cabins stay NaN.
# The vectorised .str accessor propagates missing values on its own, which
# replaces the fragile `x is not np.nan` identity test: a NaN object that is
# not the np.nan singleton would have been sliced and raised TypeError.
df_test['Cabin'] = df_test['Cabin'].str[:2]
df_test

## Categorize ticket

In [None]:
# Reduce each ticket to its first whitespace-separated token.
# Note: str() turns a missing ticket (NaN) into the literal string 'nan'.
df_train['Ticket'] = [str(ticket).split()[0] for ticket in df_train['Ticket']]
df_train

In [None]:
# Reduce each ticket to its first whitespace-separated token.
# Note: str() turns a missing ticket (NaN) into the literal string 'nan'.
df_test['Ticket'] = [str(ticket).split()[0] for ticket in df_test['Ticket']]
df_test

## Add family size (SibSp + Parch)

In [None]:
# Combine the SibSp and Parch counts into a single Family feature.
df_train['Family'] = df_train['SibSp'].add(df_train['Parch'])
df_train

In [None]:
# Combine the SibSp and Parch counts into a single Family feature.
df_test['Family'] = df_test['SibSp'].add(df_test['Parch'])
df_test

# Preprocessing

## Explanatory variables

In [None]:
# Name of the target column.
obj_col = 'Survived'
# Columns removed from the feature matrices (SibSp and Parch are summed into Family).
drop_cols = ['Name', 'SibSp', 'Parch']

In [None]:
# Training features: every column except the dropped ones and the target.
X_train = df_train.drop(columns=[*drop_cols, obj_col])
X_train

In [None]:
# Test features: the test frame has no target column, so only drop_cols go.
X_test = df_test.drop(columns=drop_cols)
X_test

## Objective variables

In [None]:
# Target vector. Index by obj_col so the target name is defined in one place
# instead of repeating the 'Survived' literal.
y_train = df_train[obj_col]
y_train

## Change object-type to category-type

In [None]:
def to_category(df):
    """Convert every text column of ``df`` to pandas ``category`` dtype.

    A column counts as text when its dtype is ``object`` or a pandas
    ``StringDtype`` (which some pandas versions may infer for string data).
    All other columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned so callers can reassign it.
    """
    # Collect the column names first so the frame is not modified while its
    # dtypes are being walked; naming the loop variable `dtype` also avoids
    # shadowing the builtin `type`.
    text_cols = [
        col for col, dtype in zip(df.columns, df.dtypes)
        if dtype == object or isinstance(dtype, pd.StringDtype)
    ]
    for col in text_cols:
        df[col] = df[col].astype('category')

    return df

In [None]:
# Apply the text -> category conversion to both feature matrices.
X_train, X_test = to_category(X_train), to_category(X_test)

# Modeling

In [None]:
import numpy as np
import lightgbm as lgb
from sklearn.metrics import accuracy_score

def acc_score(preds, data):
    """Custom evaluation metric for LightGBM: accuracy at a 0.5 threshold.

    Parameters
    ----------
    preds : numpy.ndarray
        Predicted scores; values strictly above 0.5 count as class 1.
    data : object exposing ``get_label()``
        Dataset whose ``get_label()`` supplies the true labels.

    Returns
    -------
    tuple
        ``('acc', accuracy, True)``; the trailing ``True`` marks the metric
        as higher-is-better in LightGBM's custom-eval convention.
    """
    predicted_labels = (preds > 0.5).astype(int)
    accuracy = accuracy_score(data.get_label(), predicted_labels)
    return 'acc', accuracy, True

# Wrap the training features and labels for LightGBM.
lgb_train = lgb.Dataset(X_train, y_train)

params = {
    'objective': 'binary',
    #'min_data_in_leaf': 15,
    #'force_row_wise': True,
}

# 5-fold cross-validation with early stopping after 20 rounds without
# improvement. Early stopping is passed as a callback because the
# `early_stopping_rounds=` keyword was removed from lgb.cv in LightGBM 4.x;
# the callback form is accepted by both older and newer releases.
cv_rslt = lgb.cv(params,
                 lgb_train,
                 nfold=5,
                 num_boost_round=10000,
                 callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)],
                 feval=acc_score,
                 return_cvbooster=True)

In [None]:
# Cross-validation booster, present in the result because
# return_cvbooster=True was passed to lgb.cv.
cvbooster = cv_rslt['cvbooster']

In [None]:
# Mean CV accuracy per boosting round. The result key is 'acc-mean' in older
# LightGBM releases and 'valid acc-mean' in newer ones, so match on the suffix.
cv_rslt[next(key for key in cv_rslt if key.endswith('acc-mean'))]

# Pseudo-Labeling

In [None]:
# Import the stats submodule explicitly: a bare `import scipy` does not
# guarantee that `scipy.stats` is available as an attribute.
import scipy.stats

# Predict the test set with the CV booster
# (presumably one array of probabilities per fold — confirm).
y_preds = cvbooster.predict(X_test, num_iteration=cvbooster.best_iteration)
# Round to hard 0/1 votes, then take the most common vote along axis 0.
y_preds = np.round(y_preds)
y_pred = scipy.stats.mode(y_preds)[0].squeeze()
# Pseudo-labels for the test rows, named like the target column.
y_pred = pd.Series(y_pred, name=obj_col).astype(int)
y_pred

In [None]:
# Stack the train and test features row-wise for pseudo-label training.
X_merge = pd.concat([X_train, X_test])
X_merge

In [None]:
# Re-run the conversion on the merged frame.
# NOTE(review): presumably needed because concatenating categorical columns
# whose category sets differ falls back to object dtype — confirm against the
# dtypes displayed below.
X_merge = to_category(X_merge)
X_merge.dtypes

In [None]:
# Append the pseudo-labels after the true labels and renumber the rows from 0
# so the labels line up positionally with the rows of X_merge.
y_merge = pd.concat([y_train, pd.Series(y_pred)], ignore_index=True)
y_merge

In [None]:
# Retrain with cross-validation on the train rows plus the pseudo-labelled
# test rows. Note that the validation folds therefore contain pseudo-labels,
# so the reported accuracy is partly measured against model-generated labels.
lgb_train = lgb.Dataset(X_merge, y_merge)

# Early stopping is passed as a callback because the `early_stopping_rounds=`
# keyword was removed from lgb.cv in LightGBM 4.x; the callback form is
# accepted by both older and newer releases.
cv_rslt = lgb.cv(params,
                 lgb_train,
                 nfold=5,
                 num_boost_round=10000,
                 callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)],
                 feval=acc_score,
                 return_cvbooster=True)

In [None]:
# Mean CV accuracy per boosting round. The result key is 'acc-mean' in older
# LightGBM releases and 'valid acc-mean' in newer ones, so match on the suffix.
cv_rslt[next(key for key in cv_rslt if key.endswith('acc-mean'))]

In [None]:
# Replace the earlier booster with the one from the pseudo-label CV run
# (present in the result because return_cvbooster=True was passed to lgb.cv).
cvbooster = cv_rslt['cvbooster']

# Prediction

In [None]:
# Import the stats submodule explicitly: a bare `import scipy` does not
# guarantee that `scipy.stats` is available as an attribute.
import scipy.stats

# Predict the test set with the CV booster
# (presumably one array of probabilities per fold — confirm).
y_preds = cvbooster.predict(X_test, num_iteration=cvbooster.best_iteration)
# Round to hard 0/1 votes, then take the most common vote along axis 0.
y_preds = np.round(y_preds)
y_test = scipy.stats.mode(y_preds)[0].squeeze()
# Final predictions, named like the target column.
y_test = pd.Series(y_test, name=obj_col).astype(int)
y_test

In [None]:
# Build the submission frame: the test index as a regular column with a fresh
# 0..n-1 row index, joined positionally with the predictions.
answer = df_test.index.to_frame(index=False).join(y_test)
answer

In [None]:
# Write the submission file without the row index.
filename_output = './submission.csv'
answer.to_csv(path_or_buf=filename_output, index=False)