### Подключение к базе и таблицы с юзерами и постами

In [1]:
from sqlalchemy import create_engine, text


import os

# SECURITY: the connection URL carries a username and password. Supply it through
# the STARTML_DATABASE_URL environment variable so the secret does not have to
# live in the notebook. The literal is kept only as a fallback so existing runs
# keep working. NOTE(review): drop the fallback once the variable is set wherever
# this notebook is executed.
DATABASE_URL = os.environ.get(
    "STARTML_DATABASE_URL",
    "postgresql://robot-startml-ro:pheiph0hahj1Vaif@postgres.lab.karpov.courses:6432/startml",
)

engine = create_engine(DATABASE_URL)

# One shared connection for all reads in the notebook. stream_results=True asks
# the driver to stream rows instead of buffering the full result set, which
# matters for the multi-million-row query issued later.
connection = engine.connect().execution_options(stream_results=True)

In [2]:
### Посты и темы
### На этот раз сделаем эмбеддинги с помощью моделей на основе Bert

import pandas as pd


# Pull the whole post table through the shared connection; the preview below
# shows its post_id, text and topic columns.
posts_info = pd.read_sql(sql=text("SELECT * FROM public.post_text_df"), con=connection)

# Quick look at the first rows.
posts_info.head()

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [3]:
### Импортируем предобученные модели с HuggingFace

from transformers import AutoTokenizer
from transformers import BertModel  # https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel
from transformers import RobertaModel  # https://huggingface.co/docs/transformers/model_doc/roberta#transformers.RobertaModel
from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model(model_name):
    """
    Load a pretrained tokenizer and encoder for one supported model family.

    Args:
        model_name: one of 'bert', 'roberta' or 'distilbert'.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the same checkpoint.

    Raises:
        AssertionError: if ``model_name`` is not one of the supported names.
    """
    assert model_name in ['bert', 'roberta', 'distilbert']  # input validation

    # Single lookup table: family name -> (checkpoint id, model class).
    registry = {
        'bert': ('bert-base-cased', BertModel),  # https://huggingface.co/bert-base-cased
        'roberta': ('roberta-base', RobertaModel),  # https://huggingface.co/roberta-base
        'distilbert': ('distilbert-base-cased', DistilBertModel),  # https://huggingface.co/distilbert-base-cased
    }
    checkpoint, model_cls = registry[model_name]

    # The tokenizer is loaded first, then the model weights.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = model_cls.from_pretrained(checkpoint)
    return tokenizer, model

In [4]:
# Load the DistilBERT tokenizer/encoder pair ('bert' and 'roberta' are the other accepted names).
tokenizer, model = get_model(model_name='distilbert')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/465 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/263M [00:00<?, ?B/s]

In [5]:
### Напишем класс датасета для постов

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


class PostDataset(Dataset):
    """
    Map-style dataset over post texts that are tokenized once, up front.

    All texts are encoded in a single call when the dataset is constructed;
    indexing then just slices the stored encodings.
    """

    def __init__(self, texts, tokenizer):
        """
        Args:
            texts: list of raw post strings to encode.
            tokenizer: object exposing ``batch_encode_plus``; it is also kept
                on the instance as ``self.tokenizer``.
        """
        super().__init__()

        # NOTE(review): padding=True here pads every text to the longest encoded
        # sequence of the whole input list, so a per-batch padding collator used
        # downstream has nothing left to pad - confirm this is intended.
        encode_options = dict(
            add_special_tokens=True,
            return_token_type_ids=False,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )

        self.tokenizer = tokenizer
        self.texts = tokenizer.batch_encode_plus(texts, **encode_options)

    def __len__(self):
        """Number of encoded texts."""
        return len(self.texts['input_ids'])

    def __getitem__(self, idx):
        """Return the 'input_ids' and 'attention_mask' entries for item ``idx``."""
        return {field: self.texts[field][idx] for field in ('input_ids', 'attention_mask')}

In [6]:
### Build the post dataset, a data collator that pads batches automatically,
### and the DataLoader that feeds the model.

dataset = PostDataset(texts=posts_info['text'].values.tolist(), tokenizer=tokenizer)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# shuffle=False keeps batches in dataset order, i.e. in the row order of posts_info.
loader = DataLoader(
    dataset=dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator,
    pin_memory=True,
)

In [7]:
### Создадим функцию получения эмбеддингов
import torch
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader):
    """
    Run ``model`` over every batch of ``loader`` and collect one embedding per row.

    For each batch the model is called with ``attention_mask`` and ``input_ids``
    and the slice ``last_hidden_state[:, 0, :]`` is kept, i.e. the hidden state
    at index 0 of the second axis (presumably the first token of each sequence).
    Despite the function name, only embeddings are returned - no labels.

    Args:
        model: torch module whose output supports ``['last_hidden_state']``.
        loader: iterable of batches, each a mapping containing tensors under
            'attention_mask' and 'input_ids'.

    Returns:
        A CPU tensor with the per-batch embeddings concatenated along dim 0,
        in the order the loader produced them.

    Note:
        An empty loader leaves nothing to concatenate, so ``torch.cat`` fails.
    """
    model.eval()

    # Follow the device the model already lives on instead of reading a
    # notebook-level `device` global, which is only defined in a later cell.
    # A parameter-less module falls back to the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    for batch in tqdm(loader):
        batch = {key: batch[key].to(target_device) for key in ['attention_mask', 'input_ids']}

        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Move results off the accelerator right away so they do not pile up there.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [8]:
# Prefer the first CUDA device and fall back to the CPU when none is available.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# get_device_name() needs a CUDA device and raises on a CPU-only runtime, so it
# is only queried when the CUDA branch was actually selected above.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))

model = model.to(device)

cuda:0
Tesla T4


In [9]:
# Encode every post, then convert the returned CPU tensor to a NumPy array.
embeddings = get_embeddings_labels(model=model, loader=loader).numpy()

embeddings

  0%|          | 0/220 [00:00<?, ?it/s]You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|██████████| 220/220 [01:48<00:00,  2.03it/s]


array([[ 3.6315086e-01,  4.8937496e-02, -2.6408118e-01, ...,
        -1.4159346e-01,  1.5918216e-02,  9.1982896e-05],
       [ 2.3641640e-01, -1.5950108e-01, -3.2779828e-01, ...,
        -2.8993604e-01,  1.1936528e-01, -1.6235473e-03],
       [ 3.7519148e-01, -1.1394388e-01, -2.4054705e-01, ...,
        -3.3891949e-01,  5.8694065e-02, -2.1265799e-02],
       ...,
       [ 3.4038273e-01,  6.6492192e-02, -1.6318429e-01, ...,
        -8.6562753e-02,  2.0340374e-01,  3.2090571e-02],
       [ 4.3209219e-01,  1.1091532e-02, -1.1730607e-01, ...,
         7.5401559e-02,  1.0273975e-01,  1.5274222e-02],
       [ 3.0427766e-01, -7.6215670e-02, -6.7758739e-02, ...,
        -5.4348916e-02,  2.4438348e-01, -1.4148588e-02]], dtype=float32)

In [10]:
### Кластеризуем тексты

from sklearn.decomposition import PCA

# Centre each embedding dimension on its own mean (axis=0). A bare `.mean()`
# would be one scalar for the whole matrix: it shifts every value by the same
# amount and leaves the individual columns uncentred.
centered = embeddings - embeddings.mean(axis=0, keepdims=True)

# PCA may choose a randomized SVD solver for an input of this size; fixing
# random_state keeps the projection, and the clusters built on it, repeatable.
pca = PCA(n_components=50, random_state=0)
pca_decomp = pca.fit_transform(centered)

In [11]:
from sklearn.cluster import KMeans

n_clusters = 15

# Fit k-means on the PCA-reduced embeddings with a fixed seed.
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans.fit(pca_decomp)

# Hard cluster assignment per post, stored next to the post metadata.
posts_info['TextCluster'] = kmeans.labels_

# One distance feature per cluster, taken from kmeans.transform.
dists_columns = ['DistanceToCluster_' + str(i) for i in range(n_clusters)]

dists_df = pd.DataFrame(kmeans.transform(pca_decomp), columns=dists_columns)

dists_df.head()



Unnamed: 0,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,1.814031,3.408664,3.369235,3.444153,3.007193,2.830589,3.414067,3.620403,3.387786,2.251045,2.379187,3.668078,3.474052,1.901512,3.462158
1,2.187007,3.326662,3.323498,2.983316,2.852729,2.548178,3.222261,3.357583,3.368556,2.245466,2.333275,3.470835,3.249569,1.427362,3.137871
2,1.856511,3.357605,3.265581,2.971859,3.042614,2.880629,3.287361,3.35989,3.495842,3.059945,2.40271,3.454373,3.396241,1.681691,3.134001
3,2.465487,3.740721,3.511484,3.719928,3.281064,3.37263,3.694228,3.795827,3.748154,3.414405,2.824813,3.155125,4.067282,2.437433,3.794484
4,1.475957,2.812194,3.036488,2.646454,2.646883,2.139604,2.859638,3.052041,2.805004,2.957282,2.04922,3.172234,3.245495,2.118897,2.778889


In [12]:
# Append the distance features column-wise, then discard the raw text column.
# NOTE(review): posts_info is overwritten, so running this cell twice would add
# the distance columns again and then fail on the already-removed "text" column.
posts_info = pd.concat([posts_info, dists_df], axis=1).drop(columns=["text"])

posts_info

Unnamed: 0,post_id,topic,TextCluster,DistanceToCluster_0,DistanceToCluster_1,DistanceToCluster_2,DistanceToCluster_3,DistanceToCluster_4,DistanceToCluster_5,DistanceToCluster_6,DistanceToCluster_7,DistanceToCluster_8,DistanceToCluster_9,DistanceToCluster_10,DistanceToCluster_11,DistanceToCluster_12,DistanceToCluster_13,DistanceToCluster_14
0,1,business,0,1.814031,3.408664,3.369235,3.444153,3.007193,2.830589,3.414067,3.620403,3.387786,2.251045,2.379187,3.668078,3.474052,1.901512,3.462158
1,2,business,13,2.187007,3.326662,3.323498,2.983316,2.852729,2.548178,3.222261,3.357583,3.368556,2.245466,2.333275,3.470835,3.249569,1.427362,3.137871
2,3,business,13,1.856511,3.357605,3.265581,2.971859,3.042614,2.880629,3.287361,3.359890,3.495842,3.059945,2.402710,3.454373,3.396241,1.681691,3.134001
3,4,business,13,2.465487,3.740721,3.511484,3.719928,3.281064,3.372630,3.694228,3.795827,3.748154,3.414405,2.824813,3.155125,4.067282,2.437433,3.794484
4,5,business,0,1.475957,2.812194,3.036488,2.646454,2.646883,2.139604,2.859638,3.052041,2.805004,2.957282,2.049220,3.172234,3.245495,2.118897,2.778889
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,movie,1,2.991282,1.276353,3.134545,3.053045,2.791668,2.329026,2.046031,2.964862,1.797697,3.362825,2.739899,2.135261,3.397066,3.020623,1.815795
7019,7316,movie,1,2.940053,0.928139,2.931023,3.199624,2.479943,2.228951,1.805972,2.613651,1.448903,3.199755,2.451044,1.955735,3.385375,3.036096,1.839652
7020,7317,movie,1,3.175848,1.498381,2.837746,3.156679,2.500653,2.440658,2.214021,2.394461,2.009284,3.412001,2.812884,2.391287,3.464171,3.266215,1.982772
7021,7318,movie,8,3.183270,1.488344,3.431957,3.218362,3.085255,2.308771,1.897867,3.317999,1.042558,3.450446,2.994181,1.784949,3.411267,3.304142,1.526337


In [13]:
### Free memory

# Move the weights off the GPU before dropping the last reference to the model.
model.cpu()

del model
del tokenizer

del dataset
del loader

del embeddings
del centered
del pca
del pca_decomp

# Deleting tensors only returns their blocks to PyTorch's caching allocator.
# Release the cache as well so the GPU-based CatBoost training later in the
# notebook can use that memory. This does nothing if CUDA was never initialised.
torch.cuda.empty_cache()

In [14]:
import gc

# Force a collection pass now that the large objects have been deleted; the
# value displayed for this cell is gc.collect()'s return value.
gc.collect()

0

In [15]:
# Reuse the engine created at the top of the notebook rather than repeating the
# connection URL (and the password inside it) a second time.
# if_exists='replace' recreates public.posts_info_features_dl on every run, and
# the DataFrame index is written too because `index` is left at its default.
posts_info.to_sql(
    "posts_info_features_dl",
    con=engine,
    schema="public",
    if_exists='replace'
)

23

## Теперь приступаем к обработке действий

In [17]:
### Fetch up to 10 million feed records, filtered in the database to 'view' actions only
# The query derives `hour` and `month` from `timestamp`, joins public.feed_data
# to public.user_data on user_id and keeps only rows where action = 'view'.
# NOTE(review): LIMIT is applied without an ORDER BY, so which rows come back is
# up to the database - confirm this sampling is acceptable.


feed_data = pd.read_sql(
    text("""
    SELECT
        cast(extract(hour from timestamp) as int) as hour,
        cast(extract(month from timestamp) as int) as month,
        post_id,
        gender,
        age,
        country,
        city,
        exp_group,
        os,
        source,
        target
    FROM public.feed_data JOIN public.user_data ON public.feed_data.user_id = public.user_data.user_id
    WHERE action = 'view'
    LIMIT 10000000
    """),
    con=connection
)

feed_data.head()

Unnamed: 0,hour,month,post_id,gender,age,country,city,exp_group,os,source,target
0,20,12,684,1,15,Russia,Barnaul,3,Android,ads,0
1,20,12,841,1,15,Russia,Barnaul,3,Android,ads,0
2,20,12,6696,1,15,Russia,Barnaul,3,Android,ads,0
3,20,12,2796,1,15,Russia,Barnaul,3,Android,ads,0
4,9,12,4009,1,15,Russia,Barnaul,3,Android,ads,0


In [19]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [20]:
from catboost import CatBoostClassifier, Pool
from tqdm import tqdm


# Columns CatBoost is told to treat as categorical.
object_cols = [
    'topic',
    'TextCluster',
    'gender',
    'country',
    'city',
    'exp_group',
    'hour',
    'month',
    'os',
    'source',
]

catboost = CatBoostClassifier(
    task_type="GPU",
    thread_count=-1,
    random_seed=42,
    iterations=200,
    learning_rate=1,
    depth=2,
)

# Attach the post features to every view; post_id is only the join key and is
# dropped right after the merge.
# NOTE(review): feed_data is overwritten here, so this cell cannot be re-run
# without reloading feed_data first (post_id is gone after the first run).
feed_data = feed_data.merge(posts_info, how='left', on='post_id').drop(columns=['post_id'])

catboost.fit(
    X=feed_data.drop(columns=['target']),
    y=feed_data['target'],
    cat_features=object_cols,
)

catboost.save_model('catboost_model.cbm', format="cbm")

0:	learn: 0.3628994	total: 558ms	remaining: 1m 51s
1:	learn: 0.3555012	total: 1.07s	remaining: 1m 46s
2:	learn: 0.3546694	total: 1.44s	remaining: 1m 34s
3:	learn: 0.3539535	total: 1.96s	remaining: 1m 35s
4:	learn: 0.3535691	total: 2.47s	remaining: 1m 36s
5:	learn: 0.3533970	total: 2.85s	remaining: 1m 32s
6:	learn: 0.3533135	total: 3.22s	remaining: 1m 28s
7:	learn: 0.3531112	total: 3.64s	remaining: 1m 27s
8:	learn: 0.3520599	total: 4.03s	remaining: 1m 25s
9:	learn: 0.3519760	total: 4.42s	remaining: 1m 24s
10:	learn: 0.3516844	total: 4.8s	remaining: 1m 22s
11:	learn: 0.3509160	total: 5.17s	remaining: 1m 20s
12:	learn: 0.3508605	total: 5.69s	remaining: 1m 21s
13:	learn: 0.3504085	total: 6.07s	remaining: 1m 20s
14:	learn: 0.3503214	total: 6.43s	remaining: 1m 19s
15:	learn: 0.3500834	total: 6.79s	remaining: 1m 18s
16:	learn: 0.3499659	total: 7.32s	remaining: 1m 18s
17:	learn: 0.3497556	total: 7.7s	remaining: 1m 17s
18:	learn: 0.3496659	total: 8.1s	remaining: 1m 17s
19:	learn: 0.3493986	tota

In [23]:
### Замерим качество работы модели на ROC-AUC
### С эмбеддингами на базе TF-IDF ROC-AUC на трейне был 0.667, теперь 0.679. +1.8% прироста

from sklearn.metrics import roc_auc_score

# Scores the same feed_data rows that were passed to catboost.fit, so this is a
# training-set ROC-AUC (as the printed label says), not a held-out estimate.
# predict_proba(...)[:, 1] takes the second probability column - presumably the
# positive class (target == 1); TODO confirm.
print(f"Качество на трейне: {roc_auc_score(feed_data['target'], catboost.predict_proba(feed_data.drop(['target'], axis=1))[:, 1])}")

Качество на трейне: 0.679689351854054
