In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRanker

## Data preprocessing

In [2]:
# Post table; its `rating` column is joined onto the interactions further down.
post_data = pd.read_csv('post_data_rating.csv')
# User table; rows are looked up by `user_id` when building candidates for evaluation.
user_data = pd.read_csv('user_data.csv')

In [26]:
# Interaction log: one row per (timestamp, user_id, post_id, action, target).
all_data = pd.read_csv('all_users.csv')

In [83]:
# Preview the first two rows to check the schema.
all_data.head(2)

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-12-27 10:31:01,488,4620,view,1
1,2021-12-27 10:02:20,488,3608,view,1


In [84]:
# Order rows chronologically. make_split cuts each user's rows by position, so this
# ordering is what puts a user's earlier events in train and later ones in test.
all_data = all_data.sort_values("timestamp").reset_index(drop=True)

The data has been preprocessed per user while preserving class balance.
The function below splits the dataset into train and test sets so that every retained user appears in both splits and the class balance is maintained. Users with fewer than 28 positive or fewer than 28 negative interactions are skipped.

In [85]:
def make_split(all_data, min_per_class=28):
    """
    Split interactions into train/test so that every retained user appears in
    both sets with the same class balance.

    For each user, the positive rows (target == 1) and the negative rows
    (target == 0) are each cut in half by position: the first half goes to
    train, the second half to test. The cut is positional, so the caller should
    pass rows already ordered by timestamp if a chronological split is wanted.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Needs the columns 'user_id', 'target' and 'timestamp'.
    min_per_class : int, default 28
        Users with fewer positives or fewer negatives than this are skipped.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Both sorted by 'timestamp' with a fresh index. Two empty frames with
        the input's columns are returned when no user passes the filter.
    """
    train_parts = []
    test_parts = []

    for user_id, group in all_data.groupby("user_id"):

        # Skip users without enough examples of either class.
        if (group["target"].sum() < min_per_class) or ((group["target"] == 0).sum() < min_per_class):
            continue

        # Separate the two classes so each one is halved independently.
        class_1 = group[group["target"] == 1]
        class_0 = group[group["target"] == 0]

        half_1 = len(class_1) // 2
        half_0 = len(class_0) // 2

        class_1_train = class_1.iloc[:half_1]
        class_1_test = class_1.iloc[half_1:]

        class_0_train = class_0.iloc[:half_0]
        class_0_test = class_0.iloc[half_0:]

        train_parts.append(pd.concat([class_1_train, class_0_train]))
        test_parts.append(pd.concat([class_1_test, class_0_test]))

    # pd.concat raises on an empty list, so handle "no user qualified" explicitly.
    if not train_parts:
        empty = all_data.iloc[0:0].reset_index(drop=True)
        return empty, empty.copy()

    train = pd.concat(train_parts).sort_values("timestamp").reset_index(drop=True)
    test = pd.concat(test_parts).sort_values("timestamp").reset_index(drop=True)

    return train,test


In [None]:
# Build the per-user balanced train/test split from the interaction log.
train,test = make_split(all_data)
# Uncomment to persist the split.
# NOTE(review): a later cell reloads 'train_balanced.csv' / 'test_balanced.csv'; it is not
# visible here whether those files were written before or after the merges below — confirm.
# train.to_csv('train_balanced.csv')
# test.to_csv('test_balanced.csv')

In [None]:
# Attach post attributes, then user attributes, to every interaction row.
# Left joins keep an interaction even when its post or user has no match.
train = (
    train
    .merge(post_data, how='left', on='post_id')
    .merge(user_data, how='left', on='user_id')
)
test = (
    test
    .merge(post_data, how='left', on='post_id')
    .merge(user_data, how='left', on='user_id')
)

train.head(2)


In [3]:
# Reload the persisted split. This overwrites any `train` / `test` built in the cells above.
train = pd.read_csv('train_balanced.csv')
test=pd.read_csv('test_balanced.csv')

### Get random users 

In [5]:
def get_random_users(df,n=100,seed=42):
    """
    Draw a reproducible random sample of distinct user ids.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'user_id' column.
    n : int, default 100
        Number of users requested. Capped at the number of distinct users, so
        an oversized request returns every user (in random order).
    seed : int, default 42
        Seed for numpy's default_rng.

    Returns
    -------
    numpy.ndarray
        The sampled user ids, without repetition.
    """
    rng = np.random.default_rng(seed=seed)
    unique_users = df['user_id'].unique()
    # Sampling without replacement cannot return more ids than exist; cap the
    # request so it does not raise ValueError.
    sample_size = min(n, len(unique_users))
    return rng.choice(unique_users, size=sample_size, replace=False)

In [6]:
# Size of the user subsample used for quick experiments.
N_USERS = 1_000 # number of users (for quick test)

# Fixed seed so the same users are drawn on every run.
sample_users = get_random_users(train,n=N_USERS,seed=42)

In [64]:
# Restrict both splits to the sampled users. `.copy()` gives independent frames, so
# columns added to them later are not written onto a slice of `train` / `test`
# (avoids pandas' SettingWithCopyWarning).
train_data = train[train['user_id'].isin(sample_users)].copy()
test_data = test[test['user_id'].isin(sample_users)].copy()

In [None]:
# train_data = train
# test_data = test

## Features

### Time features

In [31]:
# Calendar features (day of week, hour) derived from the interaction timestamp.
def make_time_features(data_set):
    """
    Return a copy of `data_set` with parsed timestamps and two calendar columns.

    Parameters
    ----------
    data_set : pandas.DataFrame
        Must contain a 'timestamp' column that pd.to_datetime can parse.

    Returns
    -------
    pandas.DataFrame
        Copy of the input where 'timestamp' is datetime-typed and the columns
        'day_of_week' (0 = Monday) and 'hour' (0-23) are added. The input frame
        itself is not modified.
    """
    # Work on a copy so the caller's frame (often a filtered slice) stays untouched.
    data_set = data_set.copy()
    data_set['timestamp'] = pd.to_datetime(data_set['timestamp'])
    data_set['day_of_week'] = data_set['timestamp'].dt.dayofweek
    data_set['hour'] = data_set['timestamp'].dt.hour
    return data_set


In [None]:
# Add 'day_of_week' and 'hour' to both splits.
test_data = make_time_features(test_data)
train_data = make_time_features(train_data)

### Post rating feature

In [6]:
# Join each post's `rating` onto the interactions (left join: interactions without a
# matching post keep a missing rating).
train_data = train_data.merge(post_data[['post_id','rating']],how='left',on='post_id')
test_data = test_data.merge(post_data[['post_id','rating']],how='left',on='post_id')

### Text embeddings

In [22]:
# Post table whose `text` column is embedded below.
posts_info = pd.read_csv('post_data.csv')

In [None]:
from transformers import AutoTokenizer
from transformers import BertModel  
from transformers import RobertaModel  
from transformers import DistilBertModel  

def get_model(model_name):
    """
    Load a pretrained tokenizer/encoder pair by short name.

    `model_name` must be one of 'bert', 'roberta' or 'distilbert'; anything
    else fails the assertion. Returns a `(tokenizer, model)` tuple, both loaded
    from the same checkpoint.
    """
    assert model_name in ['bert', 'roberta', 'distilbert']

    # short name -> (model class, checkpoint identifier)
    registry = {
        'bert': (BertModel, 'bert-base-cased'),
        'roberta': (RobertaModel, 'roberta-base'),
        'distilbert': (DistilBertModel, 'distilbert-base-cased'),
    }
    model_cls, checkpoint = registry[model_name]

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    return tokenizer, model

In [None]:
# Load the tokenizer and encoder registered under 'distilbert' in get_model.
tokenizer, model = get_model('distilbert')

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

class PostDataset(Dataset):
    """Dataset that tokenizes all texts once in the constructor and serves them by index."""

    def __init__(self, texts, tokenizer):
        super().__init__()

        encode_options = dict(
            add_special_tokens=True,
            return_token_type_ids=False,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )
        # The whole corpus is encoded here; __getitem__ only indexes into the result.
        self.texts = tokenizer.batch_encode_plus(texts, **encode_options)
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts['input_ids'])

    def __getitem__(self, idx):
        encoded = self.texts
        return {field: encoded[field][idx] for field in ('input_ids', 'attention_mask')}
    
    
# Tokenize every post text once (PostDataset encodes the whole list in its constructor).
dataset = PostDataset(posts_info['text'].values.tolist(), tokenizer)

# Collator that pads the items of each batch to a common length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# shuffle=False keeps batches in post order, so the embeddings line up row-for-row with posts_info.
loader = DataLoader(dataset, batch_size=32, collate_fn=data_collator, pin_memory=True, shuffle=False)

In [None]:
import torch
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """
    Encode every batch from `loader` and return the first-token embeddings.

    Parameters
    ----------
    model : torch.nn.Module
        Encoder whose output exposes 'last_hidden_state'. It is switched to
        eval mode, and batches are moved to the device its parameters live on.
    loader : iterable
        Yields batches with 'input_ids' and 'attention_mask' tensors.

    Returns
    -------
    torch.Tensor
        CPU tensor with one row per input item, in loader order.
    """
    model.eval()

    # Use the model's own device rather than a notebook-level `device` variable,
    # which may not be defined yet when this function is called.
    model_device = next(model.parameters()).device

    total_embeddings = []

    for batch in tqdm(loader):
        batch = {key: batch[key].to(model_device) for key in ['attention_mask', 'input_ids']}

        # Position 0 of the last hidden state is the first token of each sequence
        # (presumably the classification token for BERT-family tokenizers).
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Collect on CPU so accelerator memory is released after each batch.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [None]:
# Prefer the first GPU when available, otherwise run on CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
# get_device_name() raises on machines without CUDA, so only query it for GPU runs.
if device.type == 'cuda':
    print(torch.cuda.get_device_name())
model = model.to(device)

In [None]:
# One embedding row per post, converted to a NumPy array for scikit-learn.
embeddings = get_embeddings_labels(model, loader).numpy()
embeddings

In [None]:
from sklearn.decomposition import PCA

# Centre each embedding dimension on its own mean. A bare `.mean()` collapses the
# whole matrix to a single scalar, which does not centre the columns.
centered = embeddings - embeddings.mean(axis=0)
# Fixed seed keeps the projection reproducible if scikit-learn picks a randomized solver.
pca = PCA(n_components=50, random_state=0)
pca_decomp = pca.fit_transform(centered)

In [None]:
from sklearn.cluster import KMeans

n_clusters = 15

# Cluster the PCA-reduced post embeddings; the fixed seed keeps assignments repeatable.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(pca_decomp)

# Hard cluster id for every post.
posts_info['TextCluster'] = kmeans.labels_

# One distance column per cluster centre.
dists_columns = [f'DistanceToCluster_{i}' for i in range(n_clusters)]
dists_df = pd.DataFrame(kmeans.transform(pca_decomp), columns=dists_columns)

dists_df.head()

In [None]:
# Put the cluster-distance columns next to the post metadata (aligned on the row
# index) and drop the raw text column.
posts_info = pd.concat([posts_info, dists_df], axis=1).drop(columns=["text"])
posts_info

In [None]:
# Move the encoder back to host memory before it is deleted in the next cell.
model.cpu()

In [None]:
# Drop the references to everything that was only needed to build the text features.
# Encoder and tokenizer:
del model
del tokenizer

# Tokenized corpus and its loader:
del dataset
del loader

# Raw embeddings and the PCA intermediates:
del embeddings
del centered
del pca
del pca_decomp

In [None]:
import gc

# Force a collection pass now that the large objects above are unreferenced.
gc.collect()

In [None]:
# posts_info.to_csv('post_embeddings.csv')

In [None]:
# Join the per-post text features (cluster id and cluster distances) onto both splits.
# NOTE(review): any non-key column that exists on both sides gets _x/_y suffixes — confirm
# that posts_info shares only `post_id` with these frames.
train_data = train_data.merge(posts_info,on='post_id',how='left')
test_data = test_data.merge(posts_info,on='post_id',how='left')

## Train

Select feature columns

In [12]:
# features = ['gender','age','day_of_week','hour','topic','city','os','rating',]
# Columns fed to the ranker. The time features ('day_of_week', 'hour') from the
# commented-out list above are currently excluded.
features = ['gender','age','topic','country','city','os','rating']
# Subset of `features` that CatBoost should treat as categorical.
cat_features = ['topic','city','country','os',]
# NOTE(review): not passed to any Pool in the cells shown here, and no 'text_embedding'
# column is created above — confirm whether this is still needed.
embedding_features = ['text_embedding']

In [13]:
# Rows that share a group id (user_id) must be contiguous for CatBoostRanker, hence the sort.
train_data = train_data.sort_values(by='user_id') # sorting for CatBoostRanker
test_data = test_data.sort_values(by='user_id')

In [14]:
from catboost import Pool

def _ranking_pool(frame):
    """Wrap one split in a CatBoost Pool, using user_id as the ranking group."""
    return Pool(
        data=frame[features],
        label=frame['target'],
        group_id=frame['user_id'],
        cat_features=cat_features,
    )


train_pool = _ranking_pool(train_data)
test_pool = _ranking_pool(test_data)


In [None]:
# Ranker configuration, collected in one dict so every setting is visible at a glance.
ranker_params = dict(
    # objective and metrics
    loss_function='YetiRank',
    eval_metric='NDCG:top=10',
    custom_metric=['PrecisionAt:top=5', 'RecallAt:top=5', 'MAP:top=5'],
    # training budget
    iterations=1000,
    early_stopping_rounds=300,
    # execution
    task_type='GPU',
    thread_count=10,
    random_seed=42,
    verbose=100,
)

rank_model = CatBoostRanker(**ranker_params)

In [None]:
# del rank_model

In [None]:
# Fit on the train pool, validate on the test pool, and keep the best-scoring iteration.
rank_model.fit(train_pool, eval_set=test_pool, use_best_model=True)

In [22]:
# Best value reached on the validation set for each tracked metric.
rank_model.get_best_score()['validation']

{'NDCG:top=10;type=Base': 0.6049860733654226,
 'PFound': 0.9023667764396479,
 'RecallAt:top=5': 0.11906980450233327,
 'MAP:top=5': 0.47033586052817367,
 'PrecisionAt:top=5': 0.6022888439985761}

## Save model

In [23]:
# Persist the validated model in CatBoost's binary format.
rank_model.save_model('rank_model',format='cbm')

In [None]:
# Parameters the ranker was configured with, reused for the full-data refit below.
# NOTE(review): no parameter search is visible in this notebook, so "best" here
# appears to mean the hand-set configuration — confirm.
best_params = rank_model.get_params()
best_params

Train on the full dataset

In [None]:
# Reload the saved ranker from disk.
rank_model = CatBoostRanker().load_model('rank_model')

In [None]:
# Stack both splits and keep each user's rows contiguous for the group-aware Pool.
full_data = pd.concat([train_data, test_data], axis=0).sort_values(by='user_id')

# Feature matrix, labels and group ids for the final fit.
X_full, y_full, group_full = full_data[features], full_data['target'], full_data['user_id']

In [None]:

# Pool over train + test combined, grouped by user.
full_pool = Pool(
    data=X_full,
    label=y_full,
    group_id=group_full,
    cat_features=cat_features
)

In [None]:
# Refit with the same configuration on all available data.
# No eval_set is passed here, so this fit has no validation set to monitor.
final_model = CatBoostRanker(**best_params)
final_model.fit(full_pool)

In [None]:
# Persist the model refit on train + test. Saving `rank_model` here would only
# write the validation-stage model again under the new file name.
final_model.save_model('rank_model_full',format='cbm')

## Hitrate test

In [11]:
# Evaluate the model trained on the train split only (file 'rank_model'), not the full-data refit.
rank_model = CatBoostRanker().load_model('rank_model')

In [12]:
def get_predict(df,model,n=5):
    """
    Score every candidate row with `model` and return the top-`n` post ids.

    `df` must contain 'post_id' and every column listed in
    `model.feature_names_`; it is not modified. The result is a plain list of
    post ids ordered from highest to lowest score.
    """
    feature_cols = model.feature_names_
    scored = df.assign(score=model.predict(df[feature_cols]))
    ranked = scored.sort_values(by='score', ascending=False)
    return ranked['post_id'].iloc[:n].tolist()

In [13]:
def get_candidates(user_data,post_data,uid,history):
    """
    Build the candidate frame for one user.

    Candidates are all rows of `post_data` whose post the user has not liked
    (target == 1) in `history`, each paired with the user's row from
    `user_data`.
    """
    is_user_like = (history['user_id'] == uid) & (history['target'] == 1)
    already_liked = history.loc[is_user_like, 'post_id'].tolist()

    # Keep only posts the user has not liked in the history.
    unseen_posts = post_data[~post_data['post_id'].isin(already_liked)]
    user_features = user_data[user_data['user_id'] == uid]

    # Cross join: one output row per (candidate post, user row) pair.
    return unseen_posts.merge(user_features, how='cross')

In [14]:
from tqdm import tqdm

def evaluate_hitrate(model,train,test):
    """
    Replay each user's liked test interactions and count how often the liked
    post appears in the model's top recommendations (get_predict default, n=5).

    Relies on the notebook-level `user_data` and `post_data` frames and on the
    `get_candidates` / `get_predict` helpers.

    Parameters
    ----------
    model : fitted ranker
        Passed through to get_predict.
    train : pandas.DataFrame
        Interaction history; posts a user liked there are removed from that
        user's candidates.
    test : pandas.DataFrame
        Needs 'user_id', 'post_id', 'target', 'day_of_week' and 'hour'.

    Returns
    -------
    float
        Mean number of hits per test user. The total hit count is printed.
        NOTE(review): this is not normalised by the number of liked
        interactions, so it can exceed 1 — confirm this is the intended metric.
    """
    users_hitrate = []
    # user list
    users = test["user_id"].unique()
    for user in tqdm(users,desc='Evaluating users'):
        # Candidate posts for this user (already-liked posts from `train` removed).
        df = get_candidates(user_data=user_data,post_data=post_data,uid=user,history=train)
        user_records = test[(test["user_id"] == user) & (test['target'] == 1)][['post_id','day_of_week','hour']]

        user_hits = 0
        for _,row in user_records.iterrows():
            # Simulate the request time: each time feature comes from its own column.
            df['hour'] = row['hour']
            df['day_of_week'] = row['day_of_week']
            post_id = row['post_id']
            recs = get_predict(df=df,model=model)
            # evaluate hitrate@5
            if post_id in recs:
                user_hits += 1

        users_hitrate.append(user_hits)
    # Average hit count per user.
    hit_rate = np.mean(users_hitrate)
    print(sum(users_hitrate))
    return hit_rate

In [31]:
# Draw a separate, larger user sample (different seed) for the hit-rate check.
# NOTE(review): the requested 50_000 may exceed the number of distinct users in `train` — confirm.
sample_users = get_random_users(train,n=50_000,seed=475)
# NOTE(review): evaluate_hitrate reads 'day_of_week' and 'hour' from this frame; the cells
# above only add them to test_data, so confirm that `test` already contains them.
quick_test = test[test['user_id'].isin(sample_users)]

In [None]:
# Use the full `train` interactions as history. `quick_test` is drawn with a different
# seed and size than the sample behind `train_data`, so `train_data` would hold no history
# for most of these users and their already-liked posts would stay in the candidates.
rank_hit_rate = evaluate_hitrate(model=rank_model,train=train,test=quick_test)
print(rank_hit_rate)