In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import catboost

from catboost import Pool
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

# Enumerate every file found under the Kaggle input directory so the
# available dataset files are visible in the cell output.
input_files = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_files:
    print(path)

# Record which CatBoost version this notebook ran against.
print(catboost.__version__)

In [None]:
def _load_indexed_csv(path):
    """Read one CSV file into a DataFrame indexed by its ``id`` column."""
    return pd.read_csv(path, index_col='id')


tokens_df = _load_indexed_csv('/kaggle/input/stemmed-description-tokens-and-application-genres/bundles_desc_tokens.csv')
desc_df = _load_indexed_csv('/kaggle/input/stemmed-description-tokens-and-application-genres/bundles_desc.csv')
prop_df = _load_indexed_csv('/kaggle/input/stemmed-description-tokens-and-application-genres/bundles_prop.csv')

# Attach the description and property columns to the token table, matching
# rows on the shared ``id`` index (DataFrame.join defaults to a left join).
df = tokens_df.join(desc_df)
df = df.join(prop_df)
df.head()

In [None]:
# Character length of each description. Missing descriptions (NaN from an empty
# CSV field, or from an id that had no match in the left join) count as length 0
# instead of raising "object of type 'float' has no len()".
df['desc_len'] = df['description'].fillna('').str.len()

In [None]:
# Columns kept for modelling; 'genre' is the label, the rest are features.
columns = ['tokens', 'genre', 'desc_len', 'store_os']
model_df = df[columns]

# Stratified split on the label with a fixed seed: 70% of rows go to train.
train_df, test_df = train_test_split(
    model_df,
    train_size=0.7,
    random_state=0,
    stratify=model_df['genre'],
)

# Separate the label column from the feature columns for each partition.
X_train = train_df.drop(columns=['genre'])
y_train = train_df['genre']
X_test = test_df.drop(columns=['genre'])
y_test = test_df['genre']

In [None]:
def _build_pool(features, labels):
    """Wrap a feature frame and its labels in a CatBoost ``Pool``.

    ``tokens`` is declared as a text feature and ``store_os`` as a
    categorical feature; any remaining columns are left to CatBoost's
    default handling.
    """
    return Pool(
        data=features,
        label=labels,
        text_features=['tokens'],
        cat_features=['store_os'],
    )


train_pool = _build_pool(X_train, y_train)
test_pool = _build_pool(X_test, y_test)

print('Train dataset shape: {}\n'.format(train_pool.shape))

In [None]:
def fit_model(train_pool, test_pool, **kwargs):
    """Train a ``CatBoostClassifier`` on ``train_pool`` and evaluate on ``test_pool``.

    Args:
        train_pool: CatBoost ``Pool`` used for fitting.
        test_pool: CatBoost ``Pool`` passed as ``eval_set``; it drives the
            overfitting detector and best-iteration selection.
        **kwargs: Extra ``CatBoostClassifier`` parameters (for example
            ``text_processing``). A key that matches one of the defaults
            below overrides it, e.g. ``task_type='CPU'`` to train without
            a GPU or ``iterations=200`` for a quick run.

    Returns:
        The fitted model (the return value of ``model.fit``).

    NOTE(review): because ``test_pool`` is used both for early stopping /
    ``use_best_model`` and for the later error analysis, scores measured on
    it are likely optimistic; consider a separate validation pool.
    """
    params = {
        'random_seed': 0,
        'max_ctr_complexity': 1,
        'task_type': 'GPU',
        'iterations': 10000,
        'eval_metric': 'Accuracy',
        # Overfitting detector: stop once the eval metric has not improved
        # for `od_wait` consecutive iterations.
        'od_type': 'Iter',
        'od_wait': 500,
    }
    # Caller-supplied settings win over the defaults. Passing the defaults as
    # literal keyword arguments next to **kwargs would raise
    # "got multiple values for keyword argument" on any overlapping key.
    params.update(kwargs)

    model = CatBoostClassifier(**params)

    return model.fit(
        train_pool,
        eval_set=test_pool,
        verbose=1000,          # log progress every 1000 iterations
        plot=True,             # interactive training chart in the notebook
        use_best_model=True    # keep the iteration that scored best on eval_set
    )


In [None]:
# Text-processing options handed to CatBoost via `text_processing`.
# One dictionary named "Word" with gram_order "1" (single-token entries) ...
_word_dictionary = {
    "dictionary_id": "Word",
    "gram_order": "1",
}

# ... and, by default, every text feature is turned into bag-of-words ("BoW")
# features computed over that dictionary.
_default_processing = {
    "dictionaries_names": ["Word"],
    "feature_calcers": ["BoW"],
}

tpo = {
    "dictionaries": [_word_dictionary],
    "feature_processing": {"default": [_default_processing]},
}

In [None]:
# Train the classifier, forwarding the text-processing options in `tpo`.
model = fit_model(train_pool, test_pool, text_processing=tpo)

In [None]:
# Add the predicted genre next to the true one.
# - np.ravel flattens the result, since CatBoost may return class predictions
#   as an (n, 1) column vector rather than a 1-D array.
# - assign() builds a new frame instead of writing into the object returned by
#   train_test_split, which avoids pandas' SettingWithCopyWarning.
test_df = test_df.assign(predict=np.ravel(model.predict(test_pool)))
test_df.head(20)

In [None]:
# Rows where the predicted genre differs from the true genre.
misclassified = test_df.loc[test_df['genre'].ne(test_df['predict'])]
print(misclassified.shape)
misclassified.head(30)

In [None]:
# Pair each feature name with its importance, order by importance ascending
# (ties broken by name, descending), and keep the last 40 rows, i.e. the
# highest-importance features, so the largest bar is drawn at the top.
fea_imp = (
    pd.DataFrame({
        'importance': model.feature_importances_,
        'col': model.feature_names_,
    })
    .sort_values(by=['importance', 'col'], ascending=[True, False])
    .tail(40)
)
fea_imp.plot.barh(x='col', y='importance', figsize=(10, 10))