This Kaggle notebook is meant to be viewed in conjunction with my article *Categorical embeddings with CatBoost*, which you can find on Towards Data Science. The point of the notebook was not to maximize the competition score, but rather to provide a simple example showing that the use of categorical embeddings can sometimes enhance results.

As the article mentions, the OptimizedRounder class and associated methods were used in many of the public notebooks for this competition. The version I used came from this notebook (https://www.kaggle.com/code/adityaecdrid/8th-place-solution-code/script?scriptVersionId=12171797).

In [None]:
import catboost as cb
from fastai.tabular.all import *
import numpy as np
import os
import pandas as pd
import scipy as sp
import seaborn as sns
from sklearn.model_selection import train_test_split
from typing import Dict, Tuple

In [None]:
# Feature-selection switches; both are passed to get_catboost_pool when the CatBoost pools are built.
# switch to indicate whether or not to feed the original categorical values into the CatBoost regressor
use_categorical = True
# switch to indicate whether or not to feed categorical embeddings into the CatBoost regressor
use_embedded = False

In [None]:
# Root of the competition data; the train/test CSVs and the sentiment JSON folders are read relative to it
input_folder = '../input/petfinder-adoption-prediction'
df_train = pd.read_csv(f'{input_folder}/train/train.csv')
df_test = pd.read_csv(f'{input_folder}/test/test.csv')
df_sample_submission = pd.read_csv(f'{input_folder}/test/sample_submission.csv')
# Drop the AdoptionSpeed column from the sample submission; predicted values are merged back in on PetID
# when the submission file is written
df_sample_submission.drop('AdoptionSpeed', axis=1, inplace=True)

In [None]:
def get_mixed_breed(breed1: int, breed2: int) -> int:
    """
    Returns value to indicate mixed_breed status

        0 - breed is not mixed
        1 - animal is of mixed breed with both breeds specified
        2 - animal is of mixed breed with only one breed specified
        3 - animal is of mixed breed with no breed specified

    This function treats breed id 307 as the "mixed breed" marker and 0 as "not specified".
    The result does not depend on the order of breed1 and breed2, so (0, 307) and (307, 0)
    are both reported as 3.
    """

    mixed_marker = 307
    unspecified = 0
    breeds = (breed1, breed2)
    # Breeds that name an actual breed, i.e. neither the mixed marker nor the "not specified" value
    named = [breed for breed in breeds if breed != mixed_marker and breed != unspecified]

    if mixed_marker not in breeds:
        # Without the mixed marker the animal only counts as mixed when two different breeds are named
        return 1 if len(named) == 2 and breed1 != breed2 else 0

    # At least one slot carries the mixed marker: 2 if the other slot names a breed, otherwise 3
    return 2 if named else 3

# Derive the mixed_breed indicator for every row of both frames from its Breed1/Breed2 pair
df_train['mixed_breed'] = [
    get_mixed_breed(first_breed, second_breed)
    for first_breed, second_breed in zip(df_train.Breed1, df_train.Breed2)
]
df_test['mixed_breed'] = [
    get_mixed_breed(first_breed, second_breed)
    for first_breed, second_breed in zip(df_test.Breed1, df_test.Breed2)
]

In [None]:
def get_sentiment(filename: str) -> pd.Series:
    """
    Retrieves document sentiment and language attributes from given file

    The files contain the results of sentiment analysis from Google's Natural Language API
    They were provided with the dataset
    The method returns a pandas Series of three values: sentiment magnitude, sentiment score and language
    If the file is not found the method returns a Series of three NaN values, so the result always
    has the same length regardless of whether the file exists
    """
    # Local import so the function does not depend on a star import happening to expose json
    import json

    try:
        # JSON is UTF-8 by specification; do not rely on the platform default encoding
        with open(filename, encoding='utf-8') as file:
            data = json.load(file)
    except FileNotFoundError:
        return pd.Series((np.nan, np.nan, np.nan))

    document_sentiment = data['documentSentiment']
    return pd.Series((document_sentiment['magnitude'], document_sentiment['score'], data['language']))
    
# Attach the three sentiment attributes and a description length (number of spaces) to both frames
sentiment_columns = ['description_sentiment_magnitude', 'description_sentiment_score', 'description_language']
for frame, split_name in ((df_train, 'train'), (df_test, 'test')):
    sentiment_folder = f'{input_folder}/{split_name}_sentiment'
    frame[sentiment_columns] = frame.PetID.apply(
        lambda pet_id: get_sentiment(f'{sentiment_folder}/{pet_id}.json')
    )
    frame['description_length'] = frame.Description.str.count(' ')

In [None]:
# Manual Age overrides for three individual training rows, keyed by PetID
age_corrections = {'e3b589e13': 2, 'e77f9e778': 3, '53923463d': 3}
for pet_id, corrected_age in age_corrections.items():
    df_train.loc[df_train.PetID == pet_id, 'Age'] = corrected_age

In [None]:
# Seed configuration: a fixed random_state is only used when reproducible results are requested
reproducible_results = True
random_state = None
if reproducible_results:
    random_state = 42

# Target column and the feature columns, grouped by how they are treated downstream
dependent_var = 'AdoptionSpeed'
categorical = [
    'Breed1', 'Breed2', 'Gender',
    'Color1', 'Color2', 'Color3',
    'State', 'Vaccinated', 'Dewormed', 'Sterilized',
    'mixed_breed', 'Type', 'MaturitySize', 'FurLength', 'Health',
    'description_language',
]
continuous = [
    'Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt',
    'description_length', 'description_sentiment_score', 'description_sentiment_magnitude',
]

# Categorical columns are stored as strings in both frames
for frame in (df_train, df_test):
    frame[categorical] = frame[categorical].astype('str')

In [None]:
def seed_everything(seed):
    """
    Set all seeds required to ensure models return reproducible results

    Seeds Python's random module, NumPy and PyTorch (CPU and all CUDA devices) and
    switches cuDNN to deterministic mode with autotuning disabled.

        seed - integer seed applied to every generator
    """
    # Local imports so the function does not depend on a star import happening to expose these names
    import random
    import torch

    random.seed(seed)
    # PYTHONHASHSEED is read at interpreter start-up, so this only affects processes started from here on
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every CUDA device, including the current one
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # With benchmark enabled cuDNN may select different kernels from run to run, defeating determinism
    torch.backends.cudnn.benchmark = False
    
# Seed every generator up front, but only when reproducible results were requested
if reproducible_results:
    seed_everything(random_state)

In [None]:
# Stratified 80/20 split of df_train on the target column; the label outputs are discarded
train_xs, valid_xs, _, _ = train_test_split(df_train, df_train[dependent_var], test_size=0.2, shuffle=True, stratify=df_train[dependent_var], random_state=random_state)
# Only the index labels of the two parts are kept, as a (train, valid) tuple.
# NOTE(review): presumably these labels coincide with row positions (default index from read_csv) —
# confirm if df_train is ever filtered or re-indexed before this cell
splits = (train_xs.index.tolist(), valid_xs.index.tolist())

In [None]:
# Preprocessing steps handed to TabularPandas
procs = [Categorify, FillMissing, Normalize]
# Copies of the column lists are passed — presumably so the module-level lists cannot be altered; verify.
# The target is declared through CategoryBlock and the train/valid rows come from `splits`.
tabdata = TabularPandas(df=df_train, procs=procs, cat_names=categorical.copy(), cont_names=continuous.copy(), splits=splits, y_names=dependent_var, y_block=CategoryBlock())

In [None]:
# Inspect the embedding sizes returned by get_emb_sz for the processed data (displayed as cell output)
embedding_sizes = get_emb_sz(tabdata)
embedding_sizes

In [None]:
# Inspection only: number of distinct Breed1 values in the training data
df_train['Breed1'].nunique()

In [None]:
# Inspection only: the `??` suffix is IPython syntax for showing an object's source, so this line
# is not valid in a plain Python script (emb_sz_rule is presumably provided by the fastai star import)
emb_sz_rule??

In [None]:
# Inspection only: list the categorical column names held by tabdata
[col for col in tabdata.cat_names]

In [None]:
def get_default_nn_layers(num_embeddings: int, num_continuous: int, num_outputs: int, num_layers: int=2) -> List[int]:
    """
    Suggest a starting layout (neurons per hidden layer) for the network

    The widest layer is two thirds of the input width plus the number of outputs, rounded to a
    multiple of 2**(num_layers-1); every following layer is half the width of the one before it.

    Note, this is a rule of thumb for a starting point only, as suggested by Jeremy Howard from fast.ai
    """

    total_inputs = num_embeddings + num_continuous
    # Rounding to this multiple keeps the repeated halving below on whole numbers
    granularity = 2**(num_layers-1)
    widest = granularity * round((((2 / 3) * total_inputs) + num_outputs) / granularity)

    layer_sizes = [widest]
    for depth in range(1, num_layers):
        layer_sizes.append(int(widest / 2**depth))

    return layer_sizes

# Total width of all embedding vectors: the sum of the second element of every pair returned by get_emb_sz
num_embeddings = sum(n for _, n in get_emb_sz(tabdata))
# One output per distinct value of the target column
num_classes = df_train[dependent_var].nunique()
layers = get_default_nn_layers(num_embeddings, num_continuous=len(continuous), num_outputs=num_classes)
batchsize = 16
# Training batches are shuffled, validation batches keep their order; neither loader drops the last partial batch
train_dl = TabDataLoader(tabdata.train, bs=batchsize, shuffle=True, drop_last=False)
valid_dl = TabDataLoader(tabdata.valid, bs=batchsize, shuffle=False, drop_last=False)
dls = DataLoaders(train_dl, valid_dl)
# ps holds one value per hidden layer (two layers by default above); embed_p is set separately
config = tabular_config(ps=[0.001, 0.01], embed_p=0.04)
# The learner is set up for classification: cross-entropy loss, accuracy metric, num_classes outputs
nn_model = tabular_learner(dls=dls, layers=layers, config=config, loss_func=CrossEntropyLossFlat(), metrics=accuracy, n_out=num_classes)

In [None]:
# Inspection only: display the underlying model of the learner
nn_model.model

In [None]:
# Run the learning-rate finder and feed its result straight into one-cycle training
valley = nn_model.lr_find()
plt.show()
# NOTE(review): what lr_find returns depends on the fastai version (possibly a tuple of suggestions
# rather than a plain float) — confirm that lr_max accepts it in the version in use
nn_model.fit_one_cycle(n_epoch=5, lr_max=valley, wd=0.01)
nn_model.recorder.plot_loss()
plt.show()

In [None]:
def embed_features(learner: tabular_learner, xs: TabularPandas) -> pd.DataFrame:
    """
    Generates fast.ai categorical embeddings

    Tabular learner should be fitted before passing
    Pandas dataframe is returned with only the embeddings columns; the original, non-embedded columns are dropped

        learner - learner whose model.embeds layers are applied, one per categorical column
        xs - data indexable by the learner's categorical column names; callers in this file pass
             both a TabularPandas object and its .xs attribute

    Each categorical column `col` is replaced by columns named `col_0`, `col_1`, ...
    one per embedding dimension.
    """
    # NOTE(review): `tabular_learner` is the factory function used to build the learner, not a class —
    # the annotation is documentation only; confirm the intended type.

    # Keep only the categorical columns, in the learner's order
    xs = xs[learner.dls.cat_names]
    for i, col in enumerate(xs.columns):
        # Assumes the i-th embedding layer belongs to the i-th categorical column — TODO confirm
        embeddings = learner.model.embeds[i]
        # Column values are fed to the layer as int64 indices, so they are assumed to already be category codes
        embedding_data = embeddings(tensor(xs[col], dtype=torch.int64))
        embedding_names = [f'{col}_{j}' for j in range(embedding_data.shape[1])]
        
        # NOTE(review): embedding_data is passed to pandas as returned by the layer; confirm this
        # conversion works for the torch/pandas versions in use (gradient tracking, device)
        df_local = pd.DataFrame(data=embedding_data, index=xs.index, columns=embedding_names)
        # Swap the original column for its embedding columns
        xs = xs.drop(col, axis=1)
        xs = xs.join(df_local)
    
    return xs

# Embed the categorical columns of the training and validation parts with the fitted learner
df_train_embeddings = embed_features(learner=nn_model, xs=tabdata.train.xs)
df_valid_embeddings = embed_features(learner=nn_model, xs=tabdata.valid.xs)
# Names of all embedding columns; read by get_catboost_pool when use_embedded is set
embedded = df_train_embeddings.columns.tolist() 
# Join the embeddings back onto the original rows by index (validation rows also come from df_train)
df_train_combined = df_train_embeddings.merge(right=df_train, left_index=True, right_index=True)
df_valid_combined = df_valid_embeddings.merge(right=df_train, left_index=True, right_index=True)

# Inspection only: show the State column next to its embedding columns for the first rows
state_embeddings = [x for x in df_train_combined.columns if x.startswith('State_')]
df_train_combined[['State']+state_embeddings].head(10)

In [None]:
# Build a TabularPandas for the test frame from the training one and run its processing
# (presumably re-using the transforms fitted on the training data — verify)
tabdata_test = tabdata.new(df_test)
tabdata_test.process()
# NOTE(review): here the TabularPandas object itself is passed, whereas the train/valid calls pass `.xs` — confirm both work
df_test_embeddings = embed_features(learner=nn_model, xs=tabdata_test)
# Join the test embeddings back onto the original test rows by index
df_test_combined = df_test_embeddings.merge(right=df_test, left_index=True, right_index=True)

In [None]:
# Inspection only: bar chart of the target value counts, least frequent value first
df_train['AdoptionSpeed'].value_counts(ascending=True).plot(kind='bar')

In [None]:
def get_catboost_regressor(iterations: int=1000, loss_function: str='RMSE', eval_metric: str='RMSE', ignored_features: List[str]=['PetID'], depth: int=6) -> cb.CatBoostRegressor:
    """
    Build an unfitted CatBoost regressor

    All arguments are forwarded unchanged to cb.CatBoostRegressor; the seed is taken from the
    module-level random_state. The default ignored_features list is passed on as-is, not copied.
    """

    regressor_params = {
        'iterations': iterations,
        'loss_function': loss_function,
        'eval_metric': eval_metric,
        'ignored_features': ignored_features,
        'depth': depth,
        'random_seed': random_state,
    }
    return cb.CatBoostRegressor(**regressor_params)

def get_catboost_pool(df: pd.DataFrame, use_categorical: bool, use_embedded: bool, has_label: bool=True) -> cb.Pool:
    """
    Build the CatBoost data pool for one dataframe

        use_categorical - switch to indicate whether we should use original, discrete categorical values
        use_embedded - switch to indicate whether we should use continuous categorical embedding values
        has_label - pass True for training and validation pools and False for test pool

    The continuous columns and PetID are always included; PetID is always declared categorical.
    """

    # Work on fresh lists so the module-level column lists are never modified
    selected_columns = list(continuous)
    categorical_features = ['PetID']

    if use_categorical:
        selected_columns += categorical
        categorical_features += categorical
    if use_embedded:
        selected_columns += embedded
    selected_columns.append('PetID')

    target = None
    if has_label:
        target = df[dependent_var]

    return cb.Pool(data=df[selected_columns], label=target, cat_features=categorical_features)

In [None]:
# Note that all code in this cell came from this notebook - 
# https://www.kaggle.com/code/adityaecdrid/8th-place-solution-code/script?scriptVersionId=12171797
def histogram(ratings, min_rating=None, max_rating=None):
    """
    Count how often each rating value occurs

    Returns a list with one counter per value from min_rating to max_rating (inclusive);
    bounds that are not given are taken from the ratings themselves.
    """
    lowest = min(ratings) if min_rating is None else min_rating
    highest = max(ratings) if max_rating is None else max_rating

    counts = [0] * int(highest - lowest + 1)
    for rating in ratings:
        # Position in the list is the rating's offset from the lowest rating
        counts[rating - lowest] += 1
    return counts

def rater_confusion_matrix(rater_a, rater_b, min_rating=None, max_rating=None):
    """
    Returns the confusion matrix between rater's ratings

    The result is a square list of lists; entry [i][j] counts the items that rater_a rated
    min_rating + i and rater_b rated min_rating + j. Bounds that are not given are taken
    from the two rating sequences, which may be lists or numpy arrays.

    Raises AssertionError if the two sequences differ in length.
    """
    assert (len(rater_a) == len(rater_b))
    # Take min/max of each rater separately: `rater_a + rater_b` concatenates lists but adds
    # numpy arrays elementwise, which would produce a wrong rating range
    if min_rating is None:
        min_rating = min(min(rater_a), min(rater_b))
    if max_rating is None:
        max_rating = max(max(rater_a), max(rater_b))
    num_ratings = int(max_rating - min_rating + 1)
    conf_mat = [[0] * num_ratings for _ in range(num_ratings)]
    for a, b in zip(rater_a, rater_b):
        conf_mat[a - min_rating][b - min_rating] += 1
    return conf_mat

def quadratic_weighted_kappa(y, y_pred):
    """
    Quadratic weighted kappa between two sets of ratings

    Both inputs are converted to integer arrays; the rating range is taken from the smallest
    and largest value found in either of them. Disagreements are weighted by the squared
    distance between the two ratings, and the result is one minus the ratio of the observed
    to the expected weighted disagreement.
    """
    rater_a = np.array(y, dtype=int)
    rater_b = np.array(y_pred, dtype=int)
    assert (len(rater_a) == len(rater_b))

    min_rating = min(min(rater_a), min(rater_b))
    max_rating = max(max(rater_a), max(rater_b))

    conf_mat = rater_confusion_matrix(rater_a, rater_b,
                                min_rating, max_rating)
    num_ratings = len(conf_mat)
    num_scored_items = float(len(rater_a))

    hist_rater_a = histogram(rater_a, min_rating, max_rating)
    hist_rater_b = histogram(rater_b, min_rating, max_rating)

    # Largest possible squared distance between two ratings; normalises every weight
    max_squared_distance = pow(num_ratings - 1, 2.0)

    observed_disagreement = 0.0
    expected_disagreement = 0.0
    for i, observed_row in enumerate(conf_mat):
        for j, observed_count in enumerate(observed_row):
            # Count expected for this cell if the two raters were independent
            expected_count = (hist_rater_a[i] * hist_rater_b[j]
                              / num_scored_items)
            weight = pow(i - j, 2.0) / max_squared_distance
            observed_disagreement += weight * observed_count / num_scored_items
            expected_disagreement += weight * expected_count / num_scored_items

    return (1.0 - observed_disagreement / expected_disagreement)

class OptimizedRounder(object):
    """
    Turns continuous predictions into the classes 0-4 using four thresholds

    fit() searches for the thresholds that maximise the quadratic weighted kappa against the
    true labels; predict() applies a set of thresholds to new predictions.
    """

    def __init__(self):
        # Holds the result object returned by the optimiser once fit() has run
        self.coef_ = 0

    @staticmethod
    def _bucketize(X, coef):
        """
        Map every value in X to a class 0-4 using the four thresholds in coef

        The conditions are checked in order and the first match wins, so the outcome is
        well defined even when the optimiser proposes thresholds that are not sorted.
        Values matching no condition (including NaN) fall into class 4.
        """
        values = np.asarray(X)
        conditions = [
            values < coef[0],
            (values >= coef[0]) & (values < coef[1]),
            (values >= coef[1]) & (values < coef[2]),
            (values >= coef[2]) & (values < coef[3]),
        ]
        return np.select(conditions, [0, 1, 2, 3], default=4)

    def _kappa_loss(self, coef, X, y):
        """Negative quadratic weighted kappa of X bucketed with coef against y (lower is better)"""
        return -quadratic_weighted_kappa(y, self._bucketize(X, coef))

    def fit(self, X, y):
        """Search for the thresholds that minimise the kappa loss, starting from the class midpoints"""
        initial_coef = [0.5, 1.5, 2.5, 3.5]
        # X and y are forwarded to the loss through `args`
        self.coef_ = sp.optimize.minimize(self._kappa_loss, initial_coef, args=(X, y), method='nelder-mead')

    def predict(self, X, coef=None):
        """
        Return the class (0-4) of every value in X as a list of ints

            coef - four thresholds; when omitted the thresholds found by fit() are used
        """
        if coef is None:
            coef = self.coefficients()
        return [int(n) for n in self._bucketize(X, coef)]

    def coefficients(self):
        """Return the thresholds found by fit() (the 'x' entry of the optimiser result)"""
        return self.coef_['x']

In [None]:
# Build the three CatBoost pools with the feature selection chosen by the two switches
train_pool = get_catboost_pool(df=df_train_combined, use_categorical=use_categorical, use_embedded=use_embedded)
valid_pool = get_catboost_pool(df=df_valid_combined, use_categorical=use_categorical, use_embedded=use_embedded)
test_pool = get_catboost_pool(df=df_test_combined, use_categorical=use_categorical, use_embedded=use_embedded, has_label=False)

model = get_catboost_regressor(iterations=10000)

# The validation pool is passed as eval_set; progress is printed every 1000 iterations
model.fit(X=train_pool, eval_set=valid_pool, verbose=1000)

# Continuous predictions on the training rows, used only to fit the rounding thresholds
predictions = model.predict(train_pool)

# Thresholds are optimised against the training labels (not the validation labels)
optR = OptimizedRounder()
optR.fit(predictions, df_train_combined[dependent_var].values)
coefficients = optR.coefficients()

# Predict the test set and round with the fitted thresholds
test_predictions = model.predict(test_pool)
test_predictions = optR.predict(test_predictions, coefficients)
# Predictions are paired with PetID by index when concatenating; this assumes df_test still has its
# default 0..n-1 index so both pieces line up — TODO confirm
df_predictions = pd.concat([df_test[['PetID']], pd.DataFrame(test_predictions, columns=[dependent_var])], axis=1)
# Merge onto the sample submission on PetID and write the submission file
df_submission = pd.merge(df_sample_submission, df_predictions, on='PetID')
df_submission.to_csv('submission.csv', index=False) 