#### Please upvote if you find the notebook interesting/useful :)

## We can see:
- from version 4 - we can receive **0.81420** without pseudolabelling
- from version 6 - **0.81549** if we add OOF and test preds from [catboost model](https://www.kaggle.com/gomes555/tps-apr2021-catboost-run-pseudo-label)

Let's modify version 6, using our best submission (score 0.81581) as the pseudo-labels for CatBoost.

#### Please upvote Fellipe's kernel for his great work as well - really appreciate it!

# Install [AutoWoe](https://github.com/sberbank-ai-lab/AutoMLWhitebox) library

This library is a part of the [LightAutoML](https://github.com/sberbank-ai-lab/LightAutoML) framework and is used in its Whitebox preset, but here we will show how to use it separately.

In [None]:
!pip install -U autowoe

# Imports 

In [None]:
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from matplotlib import pyplot as plt

from autowoe import AutoWoE, ReportDeco

# Data loading

In [None]:
# Relative path of the folder that holds the competition CSV files.
INPUT_PATH = '../input/tabular-playground-series-apr-2021/'
# Training rows; later cells read the 'Survived' column from this frame.
train_data = pd.read_csv(INPUT_PATH + 'train.csv')
train_data

In [None]:
# Test rows, read from the same competition folder as the training file.
test_data = pd.read_csv(INPUT_PATH + 'test.csv')
test_data

In [None]:
# Submission template; its 'Survived' column is overwritten in the "Create submissions" section.
submission = pd.read_csv(INPUT_PATH + 'sample_submission.csv')
submission

In [None]:
print('TRAIN TARGET MEAN = {:.3f}'.format(train_data['Survived'].mean()))

# Load OOFs and Test predictions

In [None]:
import joblib
# NOTE(review): joblib.load unpickles the file, and unpickling can execute arbitrary code —
# only load artifacts from a trusted source.
# The file is unpacked as a 2-tuple (oofs, test_preds); presumably out-of-fold predictions for
# the train rows and predictions for the test rows from the CatBoost notebook — verify against
# the notebook that produced it.
oofs, test_preds = joblib.load('../input/catboost-run-pseudolabel-to-recieve-oof/predictions_newcb_as_pseudo.pkl')
# Store the loaded predictions as a 'preds' column on both frames (lengths must match the frames).
train_data['preds'] = oofs
test_data['preds'] = test_preds

In [None]:
train_data

In [None]:
test_data

# Extra feature creation functions

In [None]:
def create_extra_features_1(data_0):
    """Build feature set 1: ticket/name-derived columns plus pairwise categorical crosses.

    Parameters
    ----------
    data_0 : pandas.DataFrame
        Frame containing at least ``PassengerId``, ``Name``, ``Ticket``, ``Cabin``,
        ``SibSp``, ``Parch`` and ``Embarked``. It is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_0`` where

        * ``Cabin`` is replaced by the first character of its string form,
        * ``Ticket1`` is the first whitespace-separated token of ``Ticket`` when the
          ticket has more than one token, otherwise NaN,
        * ``Ticket2`` is ``Ticket`` with dots, digit runs and spaces removed
          (an empty / whitespace-only result becomes ``'X'``),
        * ``FamilySize`` is ``SibSp + Parch + 1``,
        * ``FirstName`` is the text of ``Name`` before the first comma,
        * ``Counter_Name`` / ``Counter_FirstName`` / ``Counter_Surname`` hold the number
          of rows sharing the same value (``Surname`` being the text between the first
          and second comma of ``Name``),
        * one ``<A>_<B>`` string column is added per crossed pair,
        * ``Name``, ``Surname`` and ``Ticket`` are dropped.

    Notes
    -----
    A ``Name`` without a comma (including a missing name) gets a NaN ``Surname`` and
    therefore a NaN ``Counter_Surname`` instead of raising ``IndexError``.
    """
    data = data_0.copy()
    # str(NaN) is 'nan', so a NaN cabin is encoded as 'n'.
    data['Cabin'] = data['Cabin'].map(lambda x: str(x)[0].strip())

    data['Ticket1'] = data['Ticket'].map(
        lambda x: str(x).split()[0] if len(str(x).split()) > 1 else np.nan
    )
    # Raw strings: '\.' and '\d' are invalid escape sequences in ordinary string literals.
    data['Ticket2'] = (
        data['Ticket']
        .str.replace(r'\.', '', regex=True)
        .str.replace(r'(\d+)', '', regex=True)
        .str.replace(' ', '', regex=True)
        .replace(r'^\s*$', 'X', regex=True)
    )

    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    # Split once; guard the second piece so names without a comma do not raise IndexError.
    name_parts = data['Name'].map(lambda x: str(x).split(','))
    data['FirstName'] = name_parts.map(lambda parts: parts[0])
    data['Surname'] = name_parts.map(lambda parts: parts[1] if len(parts) > 1 else np.nan)

    # Frequency of each value, computed over every row of the frame passed in.
    for col in ['Name', 'FirstName', 'Surname']:
        data['Counter_' + col] = data[col].map(data.groupby(col)['PassengerId'].count().to_dict())

    data = data.drop(columns=['Name', 'Surname', 'Ticket'])

    for left, right in [('Ticket1', 'Cabin'), ('Ticket2', 'Cabin'),
                        ('Cabin', 'Parch'), ('Ticket1', 'Parch'), ('Ticket2', 'Parch'),
                        ('Cabin', 'Embarked'), ('Ticket1', 'Embarked'), ('Ticket2', 'Embarked'),
                        ('Embarked', 'Parch')]:
        data[left + '_' + right] = data[left].astype(str) + '_' + data[right].astype(str)

    return data

def create_extra_features_2(data_0):
    """Build feature set 2: a lighter variant with no ticket-text cleanup and no crosses.

    Parameters
    ----------
    data_0 : pandas.DataFrame
        Frame containing at least ``PassengerId``, ``Name``, ``Ticket``, ``Cabin``,
        ``SibSp`` and ``Parch``. It is copied, never modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data_0`` where

        * ``Cabin`` is replaced by the first character of its string form,
        * ``Ticket`` is replaced by its first whitespace-separated token when the
          ticket has more than one token, otherwise NaN,
        * ``FamilySize`` is ``SibSp + Parch + 1``,
        * ``FirstName`` is the text of ``Name`` before the first comma,
        * ``Counter_Name`` / ``Counter_FirstName`` / ``Counter_Surname`` hold the number
          of rows sharing the same value (``Surname`` being the text between the first
          and second comma of ``Name``),
        * ``Name`` and ``Surname`` are dropped.

    Notes
    -----
    A ``Name`` without a comma (including a missing name) gets a NaN ``Surname`` and
    therefore a NaN ``Counter_Surname`` instead of raising ``IndexError``.
    """
    data = data_0.copy()
    # str(NaN) is 'nan', so a NaN cabin is encoded as 'n'.
    data['Cabin'] = data['Cabin'].map(lambda x: str(x)[0].strip())
    data['Ticket'] = data['Ticket'].map(
        lambda x: str(x).split()[0] if len(str(x).split()) > 1 else np.nan
    )

    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    # Split once; guard the second piece so names without a comma do not raise IndexError.
    name_parts = data['Name'].map(lambda x: str(x).split(','))
    data['FirstName'] = name_parts.map(lambda parts: parts[0])
    data['Surname'] = name_parts.map(lambda parts: parts[1] if len(parts) > 1 else np.nan)

    # Frequency of each value, computed over every row of the frame passed in.
    for col in ['Name', 'FirstName', 'Surname']:
        data['Counter_' + col] = data[col].map(data.groupby(col)['PassengerId'].count().to_dict())

    data = data.drop(columns=['Name', 'Surname'])

    return data


# Stack train rows followed by test rows into one frame with a fresh 0..n-1 index;
# the feature functions are later applied to this combined frame.
all_df = pd.concat([train_data, test_data]).reset_index(drop = True)
print(all_df.shape)

# Train 2 separate models for Sex on full train and pseudolabelled test

In [None]:
# Name of the label column passed to AutoWoE and used for every accuracy computation.
TARGET = 'Survived'

def get_model():
    """Return a new, unfitted AutoWoE instance configured with this notebook's fixed settings."""
    settings = dict(
        monotonic=False,
        vif_th=20.,
        imp_th=0,
        th_const=32,
        force_single_split=True,
        min_bin_size=0.01,
        oof_woe=True,
        n_folds=10,
        n_jobs=4,
        regularized_refit=True,
        verbose=2,
    )
    return AutoWoE(**settings)

def fit_autowoe(data, model_name):
    """Fit a fresh AutoWoE model on ``data`` and print its accuracy on that same data.

    ``PassengerId`` is removed before fitting; the column named by ``TARGET`` is the label.
    ``model_name`` is only used as a prefix in the printed message.
    Returns the fitted model.
    """
    fitted = get_model()
    fit_frame = data.drop('PassengerId', axis = 1)
    fitted.fit(fit_frame, target_name=TARGET)

    # In-sample check only: predictions are made on the rows the model was fitted on.
    train_labels = (fitted.predict_proba(data) > 0.5).astype(int)
    train_accuracy = accuracy_score(data[TARGET], train_labels)
    print('\t{}: ACCURACY ON TRAIN = {:.5f}'.format(model_name, train_accuracy))
    return fitted

def cv_autowoe(data, 
               test_data, 
               n_folds = 5, 
               sex = 'Unknown'):
    """Cross-validate AutoWoE on ``data`` and average the fold models' predictions on ``test_data``.

    Folds come from a shuffled ``StratifiedKFold`` (``random_state=13``) stratified on the
    ``TARGET`` column. ``sex`` is used only in log messages and model names.

    Returns a tuple ``(models, oof_pred, test_pred)``: the list of fitted fold models, the
    out-of-fold predictions aligned with the rows of ``data``, and the mean of the fold
    models' predictions for ``test_data``.
    """
    print('=' * 50)
    print('Start model for sex = {}'.format(sex))

    oof_pred = np.zeros(len(data))
    test_pred = np.zeros(len(test_data))
    fitted_models = []

    labels = data[TARGET]
    splitter = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 13)
    for fold, (fit_idx, val_idx) in enumerate(splitter.split(labels, labels)):
        print('\tStart model sex = {}, fold = {}'.format(sex, fold))
        model_name = 'Model_{}_{}'.format(sex, fold)

        fold_model = fit_autowoe(data.iloc[fit_idx, :], model_name)
        fitted_models.append(fold_model)

        # Score the held-out part and store its predictions at the matching positions.
        held_out = data.iloc[val_idx, :]
        val_pred = fold_model.predict_proba(held_out)
        oof_pred[val_idx] = val_pred
        val_accuracy = accuracy_score(held_out[TARGET], (val_pred > 0.5).astype(int))
        print('\t{}: ACCURACY ON VALID = {:.5f}'.format(model_name, val_accuracy))

        # Each fold contributes 1/n_folds of the final test prediction.
        test_pred += fold_model.predict_proba(test_data) / n_folds
        print('\t' + '*' * 50)

    oof_accuracy = accuracy_score(labels, (oof_pred > 0.5).astype(int))
    print('ACCURACY ON OOF = {:.5f}'.format(oof_accuracy))

    return fitted_models, oof_pred, test_pred

def build_model_split_by_sex(train_data, test_data, n_folds):
    """Run ``cv_autowoe`` separately on male and female train rows and stitch the results.

    Both runs predict on the full ``test_data``; each test row then takes the male
    prediction when its ``Sex`` is ``'male'`` and the female prediction otherwise.
    Train rows whose ``Sex`` is neither ``'male'`` nor ``'female'`` keep an OOF value of 0.

    Returns ``(oof_preds, test_preds)`` aligned with ``train_data`` and ``test_data``.
    """
    is_male = train_data['Sex'] == 'male'
    is_female = train_data['Sex'] == 'female'

    _, male_oof_pred, male_test_pred = cv_autowoe(train_data[is_male], test_data, n_folds, 'male')
    separator = '=' * 50
    print(separator + '\n' + separator)
    _, female_oof_pred, female_test_pred = cv_autowoe(train_data[is_female], test_data, n_folds, 'female')

    # Scatter each group's out-of-fold predictions back to its original row positions.
    oof_preds = np.zeros(len(train_data))
    oof_preds[is_male] = male_oof_pred
    oof_preds[is_female] = female_oof_pred

    test_is_male = test_data['Sex'] == 'male'
    test_preds = np.where(test_is_male, male_test_pred, female_test_pred)
    return oof_preds, test_preds

In [None]:
all_df.head()

In [None]:
%%time

# Compute feature set 1 on the stacked train+test frame, then split it back by position.
gen_feats = create_extra_features_1(all_df)
# NOTE: this rebinds the notebook-level train_data / test_data to the engineered frames.
# len(train_data) is evaluated before the rebinding, and the new train part has that same length.
train_data, test_data = gen_feats[:len(train_data)], gen_feats[len(train_data):]
print(train_data.shape, test_data.shape)
# 10 folds per sex-specific model; yields OOF predictions for train and fold-averaged predictions for test.
oof_preds_1, test_preds_1 = build_model_split_by_sex(train_data, test_data, 10)

In [None]:
print('ACCURACY = {:.5f}'.format(accuracy_score(train_data[TARGET], (oof_preds_1 > 0.5).astype(int))))

In [None]:
# pos_data = np.where(train_data[TARGET] == 1)[0]
# neg_data = np.where(train_data[TARGET] == 0)[0]
# full_target = train_data[TARGET].values

# N = 25000
# res = []
# for _ in range(100): 
#     pos_part = int(np.round(0.34911 * N))
#     neg_part = N - pos_part

#     pos_rnd_idx = np.random.randint(0, len(pos_data), pos_part)
#     neg_rnd_idx = np.random.randint(0, len(neg_data), neg_part)

#     target = np.array(list(full_target[pos_rnd_idx]) + list(full_target[neg_rnd_idx]))
#     preds = np.array(list(oof_preds_1[pos_rnd_idx]) + list(oof_preds_1[neg_rnd_idx]))
#     bound = pd.Series(preds).sort_values(ascending = False).head(pos_part).values[-1]
#     preds = (preds > bound).astype(int)
#     res.append(accuracy_score(target, preds))
    
# print(np.array(res))
# print(np.mean(res))

In [None]:
%%time

# Compute feature set 2 on the same stacked frame, then split it back by position.
gen_feats = create_extra_features_2(all_df)
# NOTE: train_data / test_data are rebound again, replacing the feature-set-1 frames;
# the split point len(train_data) is still the number of train rows.
train_data, test_data = gen_feats[:len(train_data)], gen_feats[len(train_data):]
print(train_data.shape, test_data.shape)
# 10 folds per sex-specific model; yields OOF predictions for train and fold-averaged predictions for test.
oof_preds_2, test_preds_2 = build_model_split_by_sex(train_data, test_data, 10)

In [None]:
print('ACCURACY = {:.5f}'.format(accuracy_score(train_data[TARGET], (oof_preds_2 > 0.5).astype(int))))

In [None]:
# Grid-search the blend weight w (0.00 .. 1.00 in steps of 0.01) that maximises OOF accuracy.
best_score, best_w = -1, None
for w in np.arange(0, 1.01, 0.01):
    # w goes to the feature-set-1 model, the remainder to the feature-set-2 model.
    blended_oof = w * oof_preds_1 + (1 - w) * oof_preds_2
    blended_labels = (blended_oof > 0.5).astype(int)
    score = accuracy_score(train_data[TARGET], blended_labels)
    print('{:.2f} ACCURACY = {:.5f}'.format(w, score))

    # Strict comparison: among tied scores the first (smallest) weight is kept.
    if score > best_score:
        best_score, best_w = score, w

In [None]:
print('BEST W = {:.2f}, BEST ACCURACY = {:.5f}'.format(best_w, best_score))

In [None]:
# Blend the two models' test predictions with the weight selected on the OOF predictions.
preds = best_w * test_preds_1 + (1 - best_w) * test_preds_2

# Create submissions

In [None]:
# bound is the 34911-th largest blended test prediction (last value of the top-34911 slice).
# NOTE(review): 34911 presumably targets a 0.34911 positive share (the same constant appears in
# the commented-out experiment earlier in the notebook) — confirm against the test-set size.
# NOTE(review): the submission cell applies a strict `preds > bound`, so the row equal to bound
# is labelled 0 and at most 34910 rows are labelled 1 — confirm this is intended.
bound = pd.Series(preds).sort_values(ascending = False).head(34911).values[-1]
bound

In [None]:
# Label 1 where the blended prediction is strictly above bound, 0 otherwise.
submission['Survived'] = (preds > bound).astype(int)
# Write the submission file without the DataFrame index column.
submission.to_csv('AutoWoE_submission_combo.csv', index = False)

In [None]:
submission['Survived'].mean()

# Appendix

In [None]:
import joblib
# Save the OOF and test predictions of both feature-set variants as one 4-tuple,
# in the order (oof_preds_1, test_preds_1, oof_preds_2, test_preds_2).
joblib.dump((oof_preds_1, test_preds_1, oof_preds_2, test_preds_2), 'both_preproc_preds.pkl')