In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from ast import literal_eval

# GPU-related environment variables, set in one call. This runs before the
# `catboost` import below.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
    "TF_FORCE_GPU_ALLOW_GROWTH": "true",
})

import catboost

from catboost import Pool
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier, CatBoostRegressor
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Show which CatBoost version this notebook is running against.
print(catboost.__version__)

In [None]:
# Bundle-level table, indexed by bundle `id`.
genders_df = pd.read_csv('./data/bundles_gender.csv', index_col='id')
# User-level table, indexed by `uid`. The `ids` column is expected to hold a
# Python literal (e.g. a list of bundle ids) serialised as text. It is parsed
# with `literal_eval` instead of `eval`, so nothing stored in the CSV can be
# executed as code while loading.
users_df = pd.read_csv('./data/users.csv', index_col='uid', converters={'ids': literal_eval})

In [None]:
# Peek at the first ten user rows.
users_df.head(10)

In [None]:
# Distribution of the `gend` label across users (plotly plotting backend).
users_df['gend'].hist(backend='plotly')

In [None]:
# Summary statistics of the bundle table.
genders_df.describe()

In [None]:
# Overlaid 100-bin histograms of the per-bundle `M` and `F` columns.
genders_df[['M', 'F']].hist(backend='plotly', bins=100, barmode="overlay")

In [None]:
# Summarise the bundles whose `F` value lies in the closed interval
# [0.3325, 0.3375] (both bounds included).
genders_df[genders_df['F'].between(0.3325, 0.3375)].describe()

In [None]:
# Same M/F histograms, restricted to bundles whose `cnt` is above 10.
genders_df.loc[genders_df['cnt'] > 10, ['M', 'F']].hist(
    backend='plotly', bins=100, barmode="overlay"
)

In [None]:
# Bundles with `cnt` above 10 whose `F` value lies in the closed interval
# [0.7475, 0.7525].
genders_df[
    genders_df['F'].between(0.7475, 0.7525)
    & (genders_df['cnt'] > 10)
]

In [None]:
# Same M/F histograms, restricted to bundles whose `cnt` is above 50.
genders_df.loc[genders_df['cnt'] > 50, ['M', 'F']].hist(
    backend='plotly', bins=100, barmode="overlay"
)

# New Features

In [None]:
# Number of entries in each user's `ids` collection.
users_df['apps_count'] = users_df['ids'].apply(len)

In [None]:
# Compare the `apps_count` distribution between the `gend` groups.
users_df.groupby('gend')['apps_count'].describe()

In [None]:
# Render each user's ids as one space-separated string; this column is later
# passed to CatBoost as a text feature.
users_df['ids_txt'] = users_df['ids'].apply(lambda ids: " ".join(map(str, ids)))

In [None]:
# Bundle id -> `F` value taken from the bundle table.
g_dict = genders_df['F'].to_dict()


def _mean_f_share(ids):
    """Return the mean `F` value over the ids that are present in `g_dict`.

    Ids missing from the lookup are skipped. When none of the ids is known
    the result is NaN, returned explicitly instead of via `np.mean([])` and
    its RuntimeWarning.
    """
    # An explicit membership test replaces `filter(None.__ne__, ...)`, which
    # depended on the truthiness of `NotImplemented` (deprecated since
    # Python 3.9).
    known = [g_dict[i] for i in ids if i in g_dict]
    return np.mean(known) if known else np.nan


users_df['F_prob'] = users_df['ids'].apply(_mean_f_share)


In [None]:
# Pearson correlation between `F_prob` and the integer category code of `gend`;
# [0, 1] picks the off-diagonal entry of the 2x2 correlation matrix.
np.corrcoef(
    users_df['F_prob'],
    users_df['gend'].astype('category').cat.codes
)[0,1]

In [None]:
# Overlaid histograms of `F_prob`, coloured by `gend`.
users_df[['F_prob', 'gend']].hist(backend='plotly', color='gend', barmode="overlay")

In [None]:
# Bundle id -> `M` value taken from the bundle table. This rebinds `g_dict`.
g_dict = genders_df['M'].to_dict()


def _mean_m_share(ids):
    """Return the mean `M` value over the ids that are present in `g_dict`.

    Ids missing from the lookup are skipped. When none of the ids is known
    the result is NaN, returned explicitly instead of via `np.mean([])` and
    its RuntimeWarning.
    """
    # An explicit membership test replaces `filter(None.__ne__, ...)`, which
    # depended on the truthiness of `NotImplemented` (deprecated since
    # Python 3.9).
    known = [g_dict[i] for i in ids if i in g_dict]
    return np.mean(known) if known else np.nan


users_df['M_prob'] = users_df['ids'].apply(_mean_m_share)

In [None]:
# Overlaid histograms of `M_prob`, coloured by `gend`.
users_df[['M_prob', 'gend']].hist(backend='plotly', color='gend', barmode="overlay")

# Baseline

In [None]:
# 70/30 train/test split of the users, stratified on `gend` and seeded so the
# partition is reproducible.
train, test = train_test_split(
    users_df, train_size=0.7,
    random_state=0, stratify=users_df['gend'])

In [None]:
# Baseline: score `F_prob` directly against the category codes of `gend`.
# NOTE(review): this is computed over all of `users_df`, not the held-out
# `test` split, so it is not directly comparable with the model scores that
# are reported on `test` further down.
gend_codes = users_df['gend'].astype('category').cat.codes

# `F_prob < 0.5` is the predicted code (True -> 1).
baseline_accuracy = accuracy_score(gend_codes, users_df['F_prob'] < 0.5)
print(f"Accuracy:     {baseline_accuracy}")

# Reported as 1 - AUC, presumably because a high `F_prob` points at the
# category with code 0.
baseline_auc = 1 - roc_auc_score(gend_codes, users_df['F_prob'])
print(f"AUC:     {baseline_auc}")
#(0.740925288445762, 0.7793767183917958)

# Logistic Regression

In [None]:
import itertools

# Count the distinct ids that occur across all users' `ids` collections.
len(set(itertools.chain.from_iterable(users_df['ids'])))

In [None]:
# Fit the binarizer on every user's ids so that train and test share a single
# column space, then encode each split as a sparse indicator matrix.
mlb = MultiLabelBinarizer(sparse_output=True).fit(users_df['ids'])
train_mlb, test_mlb = (mlb.transform(split['ids']) for split in (train, test))

In [None]:
def get_oof_lr(n_folds, x_train, y, x_test, seeds, n_jobs=20, max_iter=10000):
    """Collect out-of-fold logistic-regression probabilities.

    For every seed the training rows are split into `n_folds` stratified,
    shuffled folds. A `LogisticRegression` is fitted on the remaining folds
    and predicts both the held-out fold and the whole of `x_test`.

    Args:
        n_folds: number of folds per seed.
        x_train: training feature matrix, indexable as `x_train[rows, :]`.
        y: labels aligned row-by-row with `x_train`. Converted to an ndarray
            so that the positional fold indices are never interpreted as
            pandas index labels.
        x_test: test feature matrix with the same columns as `x_train`.
        seeds: iterable of seeds; each one drives both the fold shuffle and
            the model's `random_state`.
        n_jobs: forwarded to `LogisticRegression` (default 20, the value
            that used to be hard-coded).
        max_iter: forwarded to `LogisticRegression` (default 10000).

    Returns:
        A tuple `(oof_train, oof_test, models)`:
        `oof_train` has shape (n_train, n_classes) and holds the held-out
        probabilities averaged over seeds; `oof_test` has shape
        (n_test, n_classes) and holds the test probabilities averaged over
        folds and seeds; `models` maps `(seed, fold_index)` to the fitted
        estimator.
    """
    y = np.asarray(y)
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    # Width of predict_proba; derived from the labels instead of assuming 2.
    n_classes = len(np.unique(y))

    oof_train = np.zeros((len(seeds), n_train, n_classes))
    # Every (seed, fold) slot is overwritten below, so no zero-fill is needed.
    oof_test_skf = np.empty((len(seeds), n_folds, n_test, n_classes))
    models = {}
    for iseed, seed in enumerate(seeds):
        kf = StratifiedKFold(
            n_splits=n_folds,
            shuffle=True,
            random_state=seed)
        for i, (tr_i, t_i) in enumerate(kf.split(x_train, y)):
            print(f'\nSeed {seed}, Fold {i}')
            model = LogisticRegression(
                random_state=seed,
                max_iter=max_iter,
                verbose=1,
                n_jobs=n_jobs
            )
            model.fit(x_train[tr_i, :], y[tr_i])
            fold_proba = model.predict_proba(x_train[t_i, :])
            oof_train[iseed, t_i, :] = fold_proba
            # The per-fold AUC uses the second probability column, which is
            # only meaningful for a binary target.
            if n_classes == 2:
                print(f"AUC: {roc_auc_score(y[t_i], fold_proba[:, 1])}")
            oof_test_skf[iseed, i, :, :] = model.predict_proba(x_test)
            models[(seed, i)] = model
    # Average the test predictions over folds first, then over seeds.
    oof_test = oof_test_skf.mean(axis=1).mean(axis=0)
    oof_train = oof_train.mean(axis=0)
    return oof_train, oof_test, models

In [None]:
# Out-of-fold logistic-regression probabilities on the binarized ids:
# 5 stratified folds, repeated for three seeds.
oof_train_lr, oof_test_lr, models_lr = get_oof_lr(
    n_folds=5,
    x_train=train_mlb,
    y=train['gend'].values,
    x_test=test_mlb,
    seeds=[0, 42, 888]
)

In [None]:
# Decode each arg-max column index through a fitted model's own `classes_`,
# the same way the CatBoost cells do. The previous version decoded with a
# reversed label list and then reported 1 - accuracy to undo the swap.
# All fold models are trained on stratified folds of the same labels, so the
# (0, 0) model's class order is taken as representative.
lr_labels = np.take(models_lr[(0, 0)].classes_, oof_test_lr.argmax(axis=1))
print(f"Accuracy:     {accuracy_score(test['gend'].values, lr_labels)}")
print(f"AUC:     {roc_auc_score(test['gend'].astype('category').cat.codes, oof_test_lr[:,1])}")
#(0.8208932240918818, 0.8798990678456793)

# CatBoost Model #1

In [None]:
def fit_model(train_pool, test_pool, **kwargs):
    """Fit a `CatBoostClassifier` on `train_pool`, evaluated on `test_pool`.

    Args:
        train_pool: data the model is trained on.
        test_pool: evaluation set passed to `fit` as `eval_set`; with
            `use_best_model=True` it also selects the iteration that is kept.
        **kwargs: extra constructor arguments for `CatBoostClassifier`. They
            are merged over the defaults below, so a caller may override any
            of them (previously an overlapping key raised a duplicate-keyword
            `TypeError`).

    Returns:
        The value returned by `CatBoostClassifier.fit`.
    """
    params = {
        'task_type': 'GPU',
        'iterations': 10000,
        'eval_metric': 'AUC',
        'od_type': 'Iter',
        'od_wait': 1000,
        'learning_rate': 0.1,
    }
    # Caller-supplied settings take precedence over the defaults.
    params.update(kwargs)
    model = CatBoostClassifier(**params)

    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=1000,
        plot=False,
        use_best_model=True
    )

In [None]:
# Text-processing configuration; `get_oof_cb` hands it to the classifier as
# `text_processing`. Scalar option values are written as strings.
tpo = {
    # A single tokenizer, referred to below by its id 'SenseL'.
    'tokenizers': [
        {
            'tokenizer_id': 'SenseL',
            'separator_type': 'BySense',
            'lowercasing': 'True'
        }
    ],
    # Three word-level dictionaries: single tokens, 2-grams and 3-grams, each
    # with an occurrence lower bound of '10'.
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'token_level_type': 'Word',
            'occurrence_lower_bound': '10'
        },
        {
            'dictionary_id': 'Bigram',
            'token_level_type': 'Word',
            'gram_order': '2',
            'occurrence_lower_bound': '10'
        },
        {
            'dictionary_id': 'Trigram',
            'token_level_type': 'Word',
            'gram_order': '3',
            'occurrence_lower_bound': '10'
        }
    ],
    # Processing for the feature keyed '0' (presumably the first text
    # feature - TODO confirm): one BoW calcer over the 'Word' dictionary and
    # a second BoW calcer over the 'Bigram' and 'Trigram' dictionaries.
    'feature_processing': {
        '0': [
            {
                'tokenizers_names': ['SenseL'],
                'dictionaries_names': ['Word'],
                'feature_calcers': ['BoW']
            },
            {
                'tokenizers_names': ['SenseL'],
                'dictionaries_names': ['Bigram', 'Trigram'],
                'feature_calcers': ['BoW']
            }
        ]
    }
}

In [None]:
def get_oof_cb(n_folds, x_train, y, x_test, text_features, seeds, text_processing=None):
    """Collect out-of-fold CatBoost probabilities.

    For every seed the training rows are split into `n_folds` stratified,
    shuffled folds. A model is fitted through `fit_model` on the remaining
    folds, with the held-out fold as evaluation set, and then predicts both
    that fold and the whole of `x_test`.

    Args:
        n_folds: number of folds per seed.
        x_train: DataFrame of training features (rows are selected with
            `.iloc`).
        y: labels aligned row-by-row with `x_train`. Converted to an ndarray
            so that the positional fold indices are never interpreted as
            pandas index labels.
        x_test: DataFrame of test features; wrapped in one `Pool` that is
            reused for every model.
        text_features: columns each `Pool` should treat as text features.
        seeds: iterable of seeds; each one drives both the fold shuffle and
            the model's `random_seed`.
        text_processing: text-processing options forwarded to the model.
            Defaults to the notebook-level `tpo` dictionary, which is what
            the function always used before this parameter existed.

    Returns:
        A tuple `(oof_train, oof_test, models)`:
        `oof_train` has shape (n_train, n_classes) and holds the held-out
        probabilities averaged over seeds; `oof_test` has shape
        (n_test, n_classes) and holds the test probabilities averaged over
        folds and seeds; `models` maps `(seed, fold_index)` to the fitted
        model.
    """
    if text_processing is None:
        text_processing = tpo
    y = np.asarray(y)
    n_train = x_train.shape[0]
    n_test = x_test.shape[0]
    # Width of predict_proba; derived from the labels instead of assuming 2.
    n_classes = len(np.unique(y))

    oof_train = np.zeros((len(seeds), n_train, n_classes))
    # Every (seed, fold) slot is overwritten below, so no zero-fill is needed.
    oof_test_skf = np.empty((len(seeds), n_folds, n_test, n_classes))
    test_pool = Pool(data=x_test, text_features=text_features)
    models = {}
    for iseed, seed in enumerate(seeds):
        kf = StratifiedKFold(
            n_splits=n_folds,
            shuffle=True,
            random_state=seed)
        for i, (tr_i, t_i) in enumerate(kf.split(x_train, y)):
            print(f'\nSeed {seed}, Fold {i}')
            train_pool = Pool(
                data=x_train.iloc[tr_i, :],
                label=y[tr_i],
                text_features=text_features)
            valid_pool = Pool(
                data=x_train.iloc[t_i, :],
                label=y[t_i],
                text_features=text_features)
            # NOTE(review): the held-out fold is also the eval set used by
            # `fit_model` to choose the best iteration, so the out-of-fold
            # predictions for it may be slightly optimistic.
            model = fit_model(
                train_pool, valid_pool,
                random_seed=seed,
                text_processing=text_processing
            )
            oof_train[iseed, t_i, :] = model.predict_proba(valid_pool)
            oof_test_skf[iseed, i, :, :] = model.predict_proba(test_pool)
            models[(seed, i)] = model
    # Average the test predictions over folds first, then over seeds.
    oof_test = oof_test_skf.mean(axis=1).mean(axis=0)
    oof_train = oof_train.mean(axis=0)
    return oof_train, oof_test, models

In [None]:
# Model #1 features: the joined id text (declared as a text feature) and
# `apps_count`. 5 stratified folds, repeated for three seeds.
# Note: `columns` is rebound with a different feature list for model #2 below.
columns = ['ids_txt', 'apps_count']
oof_train_cb, oof_test_cb, models_cb = get_oof_cb(
    n_folds=5,
    x_train=train[columns],
    y=train['gend'].values,
    x_test=test[columns],
    text_features=['ids_txt'],
    seeds=[0, 42, 888]
)

In [None]:
# (accuracy, AUC) on the test split for the averaged model #1 probabilities.
# Arg-max indices are decoded through the (0, 0) model's `classes_`.
(
    accuracy_score(
        test['gend'].values,
        np.take(models_cb[(0, 0)].classes_, oof_test_cb.argmax(axis=1)),
    ),
    roc_auc_score(test['gend'].astype('category').cat.codes, oof_test_cb[:, 1]),
)
#(0.8219498855725884, 0.8855460541892763)

# CatBoost Model #2

In [None]:
# Work on copies so the stacked columns never touch the original split frames.
train_2, test_2 = train.copy(), test.copy()

# Column 1 of each probability matrix becomes a stacked feature: out-of-fold
# values for the training rows, fold/seed-averaged values for the test rows.
train_2['lr'], train_2['cb'] = oof_train_lr[:, 1], oof_train_cb[:, 1]
test_2['lr'], test_2['cb'] = oof_test_lr[:, 1], oof_test_cb[:, 1]


In [None]:
# Model #2 features: id text, the mean-`F` baseline and the logistic-regression
# out-of-fold probability, plus `apps_count`.
# NOTE(review): the `cb` column added to `train_2`/`test_2` is not part of this
# list - confirm that leaving it out is intentional.
columns = ['ids_txt', 'F_prob', 'lr', 'apps_count']

# The out-of-fold helper defined in this notebook is `get_oof_cb`; the former
# call to `get_oof` referenced a name that is never defined and raised a
# NameError on a fresh kernel.
oof_train_cb_2, oof_test_cb_2, models_cb_2 = get_oof_cb(
    n_folds=5,
    x_train=train_2[columns],
    y=train_2['gend'].values,
    x_test=test_2[columns],
    text_features=['ids_txt'],
    seeds=[0, 42, 888]
)

In [None]:
# (accuracy, AUC) on the test split for the averaged model #2 probabilities.
# Arg-max indices are decoded through the (0, 0) model's `classes_`.
(
    accuracy_score(
        test_2['gend'].values,
        np.take(models_cb_2[(0, 0)].classes_, oof_test_cb_2.argmax(axis=1)),
    ),
    roc_auc_score(test_2['gend'].astype('category').cat.codes, oof_test_cb_2[:, 1]),
)
#(0.8369661602833339, 0.9009988949146348)