In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import catboost

from catboost import Pool
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Show the installed CatBoost version in the cell output.
print(catboost.__version__)

In [None]:
# Data source:
# https://www.kaggle.com/sagol79/stemmed-description-tokens-and-application-genres
#
# Four CSV files share the same `id` index; they are joined side by side
# into a single frame. Only the properties file has its date columns parsed.
parse_dates = ['bundle_released_at', 'updated_at', 'bundle_updated_at']

tokens_df = pd.read_csv('bundles_desc_tokens.csv.gz', index_col='id')
desc_df = pd.read_csv('bundles_desc.csv.gz', index_col='id')
prop_df = pd.read_csv(
    'bundles_prop.csv.gz',
    index_col='id',
    parse_dates=parse_dates,
)
sum_df = pd.read_csv('bundles_summary.csv.gz', index_col='id')

df = (
    tokens_df
    .join(desc_df)
    .join(prop_df)
    .join(sum_df)
)
df.head()

In [None]:
# Back-fill missing dates from each other: a missing release date takes the
# update date (re-parsed as UTC and then stripped of its timezone), and a
# missing update date takes the (already filled) release date.
df['bundle_released_at'] = df['bundle_released_at'].fillna(
    pd.to_datetime(df['bundle_updated_at'], utc=True).dt.tz_convert(None)
)
df['bundle_updated_at'] = df['bundle_updated_at'].fillna(
    df['bundle_released_at']
)

In [None]:
# Histogram of the `store_os` values.
df.loc[:, 'store_os'].hist(backend='plotly')

In [None]:
# Horizontal, normalised histogram of genres, coloured by store.
df.loc[:, ['genre', 'store_os']].hist(
    column='genre',
    backend='plotly',
    orientation='h',
    histnorm='probability',
    color='store_os',
    height=1200,
)

In [None]:
# Normalised histogram of release dates, coloured by store.
df.loc[:, ['bundle_released_at', 'store_os']].hist(
    backend='plotly',
    color='store_os',
    title="Apps release dates",
    histnorm='probability',
)

In [None]:
# Normalised histogram of the `updated_at` column, coloured by store.
df.loc[:, ['updated_at', 'store_os']].hist(
    backend='plotly',
    color='store_os',
    title="Dataset update dates",
    histnorm='probability',
)

In [None]:
# Normalised histogram of the `bundle_updated_at` column, coloured by store.
df.loc[:, ['bundle_updated_at', 'store_os']].hist(
    backend='plotly',
    color='store_os',
    title="Bundle update date",
    histnorm='probability',
)

In [None]:
# Difference, in monthly periods, between the last update and the release.
# Both dates are converted to monthly periods and cast to integers before
# subtracting; the update date is first re-parsed as UTC and made tz-naive.
df['bundle_update_period'] = (
    pd.to_datetime(df['bundle_updated_at'], utc=True)
    .dt.tz_convert(None)
    .dt.to_period('M')
    .astype('int')
    - df['bundle_released_at'].dt.to_period('M').astype('int')
)


In [None]:
# Normalised histogram of positive update periods only, coloured by store.
df.loc[
    df['bundle_update_period'] > 0,
    ['bundle_update_period', 'store_os'],
].dropna().hist(
    backend='plotly',
    color='store_os',
    title="Apps update periods in months",
    histnorm='probability',
)

In [None]:
def get_lengths(df, columns=('tokens', 'description')):
    """Build length features for the given columns.

    For every column ``c`` a ``{c}_len`` column holding ``len(value)`` of each
    row is produced. For every column after the first, two more columns
    compare it with the column listed immediately before it:

    * ``{c}_div``  -- previous column's length divided by this column's length
    * ``{c}_diff`` -- previous column's length minus this column's length

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``columns``; each value must
        support ``len()``.
    columns : sequence of str
        Columns to measure, in comparison order.

    Returns
    -------
    pandas.DataFrame
        New frame indexed like ``df`` holding the columns described above.
    """
    lengths_df = pd.DataFrame(index=df.index)
    prev_len = None
    for c in columns:
        cur_len = f"{c}_len"
        lengths_df[cur_len] = df[c].apply(len)
        if prev_len is not None:
            # Look the length columns up by name: positional lookups break as
            # soon as the derived _div/_diff columns of an earlier pair have
            # been inserted (i.e. for three or more input columns).
            lengths_df[f"{c}_div"] = lengths_df[prev_len] / lengths_df[cur_len]
            lengths_df[f"{c}_diff"] = lengths_df[prev_len] - lengths_df[cur_len]
        prev_len = cur_len
    return lengths_df

# Append the length features to the main frame. Length columns left over from
# a previous run of this cell are dropped first, so re-running the cell does
# not create duplicate column labels (which would turn e.g. df['tokens_len']
# into a DataFrame and break later column selection).
lengths_df = get_lengths(df)
df = pd.concat(
    [df.drop(columns=lengths_df.columns, errors='ignore'), lengths_df],
    axis=1, sort=False, copy=False)

In [None]:
# Histograms of the description and token length features.
df.loc[:, ['description_len', 'tokens_len']].hist(backend='plotly')

In [None]:
# Whole months elapsed between the release date and the moment this cell runs.
# `astype('timedelta64[M]')` is rejected by pandas >= 2.0 (only s/ms/us/ns
# resolutions are supported), so the age is floor-divided by the mean
# Gregorian month instead (30.436875 days, the length NumPy uses for the 'M'
# timedelta unit).
# NOTE: the value depends on today's date, so it changes between runs.
df['released_at_month'] = (
    (pd.Timestamp.today() - df['bundle_released_at'])
    // pd.Timedelta(days=30.436875)
).astype('int')

In [None]:
# Normalised histogram of app age in months, coloured by store.
df.loc[:, ['released_at_month', 'store_os']].hist(
    backend='plotly',
    color='store_os',
    title="Months since release",
    histnorm='probability',
)

In [None]:
# Per-store subsets of the main frame.
android_df = df.loc[df['store_os'] == 'android']
ios_df = df.loc[df['store_os'] == 'ios']

In [None]:
# Columns used for modelling; 'genre' is the target, the rest are features.
columns = [
    'genre',
    'tokens',
    'bundle_update_period',
    'tokens_len',
    'description_len',
    'description_div',
    'description_diff',
    'description',
    'rating',
    'reviews',
    'released_at_month',
]

In [None]:
# Hold-out split of the Android apps: 70 % for training, the rest for
# evaluation, stratified by genre with a fixed random state.
train_df, test_df = train_test_split(
    android_df[columns],
    train_size=0.7,
    stratify=android_df['genre'],
    random_state=0,
)

# Separate the target from the features.
X_train = train_df.drop(['genre'], axis=1)
y_train = train_df['genre']
X_test = test_df.drop(['genre'], axis=1)
y_test = test_df['genre']

# CatBoost pools; 'tokens' and 'description' are declared as text features.
train_pool = Pool(
    data=X_train,
    label=y_train,
    text_features=['tokens', 'description'],
)
test_pool = Pool(
    data=X_test,
    label=y_test,
    text_features=['tokens', 'description'],
)

print('Train dataset shape: {}\n'.format(train_pool.shape))

In [None]:
def fit_model(train_pool, test_pool, **kwargs):
    """Train a ``CatBoostClassifier`` on ``train_pool``, evaluating on ``test_pool``.

    The classifier is built from the defaults below, updated with ``kwargs``.
    Any default can therefore be overridden by the caller (for example
    ``task_type='CPU'`` on a machine without a GPU) instead of raising a
    "multiple values for keyword argument" ``TypeError``.

    Parameters
    ----------
    train_pool : catboost.Pool
        Training data with labels.
    test_pool : catboost.Pool
        Evaluation data passed to ``fit`` as ``eval_set``.
    **kwargs
        Extra or overriding ``CatBoostClassifier`` constructor arguments
        (e.g. ``learning_rate``, ``random_seed``, ``text_processing``).

    Returns
    -------
    The value returned by ``CatBoostClassifier.fit``.
    """
    params = {
        'task_type': 'GPU',
        'iterations': 10000,
        'eval_metric': 'Accuracy',
        'od_type': 'Iter',
        'od_wait': 1000,
    }
    # Caller-supplied settings win over the defaults above.
    params.update(kwargs)
    model = CatBoostClassifier(**params)

    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=1000,
        plot=True,
        use_best_model=True
    )

In [None]:
# Text-processing options handed to CatBoost through `text_processing`.
#   * one tokenizer ('Sense') using the 'BySense' separator type;
#   * three word-level dictionaries: 'Word', 'Bigram' (gram_order 2) and
#     'Trigram' (gram_order 3), each with occurrence_lower_bound '10';
#   * per-feature processing for the keys '0' and '1' -- presumably the two
#     text features in the order they are given to Pool; confirm against the
#     CatBoost documentation.
tpo = {
    'tokenizers': [
        {'tokenizer_id': 'Sense', 'separator_type': 'BySense'},
    ],
    'dictionaries': [
        {
            'dictionary_id': 'Word',
            'token_level_type': 'Word',
            'occurrence_lower_bound': '10',
        },
        {
            'dictionary_id': 'Bigram',
            'token_level_type': 'Word',
            'gram_order': '2',
            'occurrence_lower_bound': '10',
        },
        {
            'dictionary_id': 'Trigram',
            'token_level_type': 'Word',
            'gram_order': '3',
            'occurrence_lower_bound': '10',
        },
    ],
    'feature_processing': {
        # Key '0': bag-of-words over words and over bi-/tri-grams.
        '0': [
            {'tokenizers_names': ['Sense'],
             'dictionaries_names': ['Word'],
             'feature_calcers': ['BoW']},
            {'tokenizers_names': ['Sense'],
             'dictionaries_names': ['Bigram', 'Trigram'],
             'feature_calcers': ['BoW']},
        ],
        # Key '1': same as '0', plus BM25 on the word dictionary.
        '1': [
            {'tokenizers_names': ['Sense'],
             'dictionaries_names': ['Word'],
             'feature_calcers': ['BoW', 'BM25']},
            {'tokenizers_names': ['Sense'],
             'dictionaries_names': ['Bigram', 'Trigram'],
             'feature_calcers': ['BoW']},
        ],
    },
}


In [None]:
# Train on the hold-out split with the custom text-processing options.
model_catboost = fit_model(
    train_pool,
    test_pool,
    text_processing=tpo,
    learning_rate=0.1,
    random_seed=0,
)

In [None]:
# Feature importances of the trained model, sorted ascending by importance
# (ties broken by descending name); the last 40 rows are plotted as bars.
fea_imp = pd.DataFrame({
    'importance': model_catboost.feature_importances_,
    'col': model_catboost.feature_names_,
})
fea_imp = fea_imp.sort_values(
    by=['importance', 'col'],
    ascending=[True, False],
).tail(40)
fea_imp.plot(x='col', y='importance', kind='barh', figsize=(10, 10))

In [None]:
from sklearn.model_selection import StratifiedKFold

def get_oof(n_folds, x_train, y, x_test, text_features, seeds):
    """Collect out-of-fold class probabilities from repeated stratified K-fold.

    For every seed a shuffled ``StratifiedKFold`` split is made. On each fold
    a model is trained with the module-level ``fit_model`` (using the
    module-level ``tpo`` text-processing options), then used to predict the
    held-out rows of ``x_train`` and all rows of ``x_test``.

    Parameters
    ----------
    n_folds : int
        Number of folds per seed.
    x_train : pandas.DataFrame
        Training features (indexed positionally via ``iloc``).
    y : array-like
        Training labels, aligned positionally with ``x_train``.
    x_test : pandas.DataFrame
        Features to score with every fold model.
    text_features : list
        Text feature names passed to every ``Pool``.
    seeds : sequence of int
        One K-fold repetition is run per seed; the seed is used both for the
        split and as the model's ``random_seed``.

    Returns
    -------
    oof_train : numpy.ndarray, shape (n_train, n_classes)
        Out-of-fold probabilities for ``x_train``, averaged over seeds.
    oof_test : numpy.ndarray, shape (n_test, n_classes)
        Probabilities for ``x_test``, averaged over folds and seeds.
    models : dict
        Fitted models keyed by ``(seed, fold_index)``.

    Notes
    -----
    The number of probability columns is derived from the distinct values in
    ``y``. Every fold model must predict that many classes, otherwise the
    assignment into the result arrays raises a broadcasting ``ValueError``.
    """
    # Positional indexing below needs a plain array (a Series with a
    # non-default index would be indexed by label instead).
    y = np.asarray(y)
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    n_classes = len(np.unique(y))

    oof_train = np.zeros((len(seeds), ntrain, n_classes))
    oof_test_skf = np.empty((len(seeds), n_folds, ntest, n_classes))
    # Built once and reused for every fold model.
    test_pool = Pool(data=x_test, text_features=text_features)
    models = {}
    for iseed, seed in enumerate(seeds):
        kf = StratifiedKFold(
            n_splits=n_folds,
            shuffle=True,
            random_state=seed)
        for i, (tr_i, t_i) in enumerate(kf.split(x_train, y)):
            print(f'\nSeed {seed}, Fold {i}')
            train_pool = Pool(
                data=x_train.iloc[tr_i, :], label=y[tr_i],
                text_features=text_features)
            valid_pool = Pool(
                data=x_train.iloc[t_i, :], label=y[t_i],
                text_features=text_features)
            model = fit_model(
                train_pool, valid_pool,
                random_seed=seed,
                text_processing=tpo
            )
            # Predict from the pools so the text features are declared
            # explicitly rather than re-inferred from raw frames.
            oof_train[iseed, t_i, :] = model.predict_proba(valid_pool)
            oof_test_skf[iseed, i, :, :] = model.predict_proba(test_pool)
            models[(seed, i)] = model
    # Average test predictions over folds, then over seeds.
    oof_test = oof_test_skf.mean(axis=1).mean(axis=0)
    # Each training row is held out exactly once per seed; average the seeds.
    oof_train = oof_train.mean(axis=0)
    return oof_train, oof_test, models

In [None]:
# Cross-validated training on the Android apps (5 folds x 3 seeds); the iOS
# apps are scored by every fold model. 'genre' is removed from both feature
# frames because it is the target.
oof_train, oof_test, models = get_oof(
    n_folds=5,
    x_train=android_df[columns].drop(columns=['genre']),
    y=android_df['genre'].values,
    x_test=ios_df[columns].drop(columns=['genre']),
    text_features=['tokens', 'description'],
    seeds=[0, 42, 888],
)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the out-of-fold predictions on the Android apps. The arg-max
# column of each row is mapped to a label through the class list of the
# first fitted model.
accuracy_score(
    y_true=android_df['genre'].values,
    y_pred=np.take(models[(0, 0)].classes_, oof_train.argmax(axis=1)),
)

In [None]:
# Store the predicted (arg-max) genre label for every app: iOS rows come from
# the averaged test predictions, Android rows from the out-of-fold ones.
for store, scores in (('ios', oof_test), ('android', oof_train)):
    df.loc[df['store_os'] == store, 'android_genre'] = np.take(
        models[(0, 0)].classes_, scores.argmax(axis=1))

In [None]:
# Horizontal, normalised histogram of the predicted genres, coloured by store.
df.loc[:, ['android_genre', 'store_os']].hist(
    column='android_genre',
    backend='plotly',
    orientation='h',
    color='store_os',
    height=1000,
    histnorm='probability',
)

In [None]:
# Store the full probability vector of every app as one object per row: iOS
# rows first, then Android rows. Each vector is wrapped in a Series indexed
# like the rows it belongs to, so the assignment aligns by index.
for store, vectors in (('ios', oof_test), ('android', oof_train)):
    idx = df[df['store_os'] == store].index
    df.loc[df['store_os'] == store, 'android_genre_vec'] = pd.Series(
        list(vectors), index=idx)