### The code below trains the CatBoost model, which will later be used to recommend posts to users

In [2]:
from transformers import BertTokenizer, BertModel, DataCollatorWithPadding, AutoTokenizer
import torch
import pandas as pd
import numpy as np
from torch.utils.data import DataLoader
from tqdm import tqdm
from datasets import Dataset
from torch.utils.data import TensorDataset
from torch.utils.data import Subset

from catboost import CatBoostClassifier, Pool

In [3]:
# Load the raw dataset from CSV into a DataFrame
DATA_PATH = 'data_500K_all.csv'
data = pd.read_csv(DATA_PATH)

In [4]:
# Keep only the first N_ROWS rows of the loaded data
N_ROWS = 400000
data = data.iloc[:N_ROWS]

In [5]:
# Load the per-post embedding features for topic and text
# (the CSV holds `post_id` plus PCA components `pca_0` ... `pca_14`)
EMB_PATH = 'posts_emb_topic_text.csv'
emb = pd.read_csv(EMB_PATH)
(emb.shape, emb.columns)

((7023, 17),
 Index(['Unnamed: 0', 'post_id', 'pca_0', 'pca_1', 'pca_2', 'pca_3', 'pca_4',
        'pca_5', 'pca_6', 'pca_7', 'pca_8', 'pca_9', 'pca_10', 'pca_11',
        'pca_12', 'pca_13', 'pca_14'],
       dtype='object'))

In [6]:
# Discard the unnamed column that came along with the CSV
emb = emb.drop(columns=['Unnamed: 0'])

In [11]:
# Fit a TF-IDF vectorizer on the post texts; it is used to derive new text features.
# NOTE(review): `df` is not assigned in any cell shown above this one (only `data`
# and `emb` are loaded) — confirm where it comes from before a Restart & Run All.
from sklearn.feature_extraction.text import TfidfVectorizer

post_texts = df['text']
tfidf = TfidfVectorizer()
tfidf.fit(post_texts)

In [12]:
# Transform the text data into a sparse (documents x vocabulary terms) TF-IDF matrix
tfidf_matrix = tfidf.transform(df['text'])

# Get feature names (kept for inspection; the mean below does not need them)
names = tfidf.get_feature_names_out()

# Mean TF-IDF weight per document, averaged over the whole vocabulary (zeros included),
# which is what the previous per-row `DataFrame(...).mean()` computed.
# It is taken directly on the sparse matrix: the old loop densified every row into a
# vocabulary-sized DataFrame and grew `mean_tfidf` one `.loc[i]` at a time, which is
# prohibitively slow and memory-hungry on a large frame.
row_means = np.asarray(tfidf_matrix.mean(axis=1)).ravel()

# Index the result like `df` so a later column-wise concat lines up row by row
# even if `df` does not carry a default 0..n-1 index.
mean_tfidf = pd.DataFrame({'mean_tfidf': row_means}, index=df.index)

In [14]:
# Append the mean TF-IDF column to the feature frame under the name 'Mean_TFIDF'
df = pd.concat([df, mean_tfidf], axis=1).rename(columns={'mean_tfidf': 'Mean_TFIDF'})

In [None]:
# TODO: merge emb and df here (the merge is not implemented in this cell), then delete topic, text and action

In [None]:
# Remove the columns that are not passed to the model as features
dropped_columns = ['action', 'topic', 'text']
df = df.drop(columns=dropped_columns)

Since the data is structured chronologically, we can leverage this temporal aspect to split the data into training and testing sets. Specifically, we'll use the earlier data for training and the latest data for testing.

In [13]:
# Parse the timestamp column into datetimes so rows can be ordered chronologically
parsed_timestamps = pd.to_datetime(df["timestamp"])
df["timestamp"] = parsed_timestamps

In [15]:
# Chronological hold-out split: the most recent N_TEST_ROWS rows form the test set,
# everything before them is used for training.
N_TEST_ROWS = 200

dfm = df.sort_values("timestamp")
train, test = dfm.iloc[:-N_TEST_ROWS].copy(), dfm.iloc[-N_TEST_ROWS:].copy()

# Separate the features from the 'target' label for each part
X_train, y_train = train.drop(columns='target'), train['target']
X_test, y_test = test.drop(columns='target'), test['target']

# Categorical columns, identified during EDA, and their positions within X_train
cat_cols = ['country', 'city', 'source', 'os']
cat_features = list(map(X_train.columns.get_loc, cat_cols))

In [16]:
from catboost import CatBoostClassifier, Pool

In [19]:
# Train the CatBoost classifier; the held-out rows are passed as eval_set so the
# per-iteration log reports the test metric next to the training one.
model_params = {"learning_rate": 0.005, "depth": 10}
catboost = CatBoostClassifier(**model_params)
catboost.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_test, y_test),
    verbose=True,
)

0:	learn: 0.6889167	test: 0.6891603	best: 0.6891603 (0)	total: 591ms	remaining: 9m 50s
1:	learn: 0.6847293	test: 0.6854512	best: 0.6854512 (1)	total: 1.21s	remaining: 10m 3s
2:	learn: 0.6806206	test: 0.6815517	best: 0.6815517 (2)	total: 1.71s	remaining: 9m 26s
3:	learn: 0.6765406	test: 0.6778060	best: 0.6778060 (3)	total: 2.22s	remaining: 9m 13s
4:	learn: 0.6725233	test: 0.6742712	best: 0.6742712 (4)	total: 2.54s	remaining: 8m 25s
5:	learn: 0.6685694	test: 0.6705575	best: 0.6705575 (5)	total: 2.75s	remaining: 7m 35s
6:	learn: 0.6646355	test: 0.6669618	best: 0.6669618 (6)	total: 3.13s	remaining: 7m 23s
7:	learn: 0.6607578	test: 0.6635613	best: 0.6635613 (7)	total: 3.55s	remaining: 7m 20s
8:	learn: 0.6569115	test: 0.6599057	best: 0.6599057 (8)	total: 3.98s	remaining: 7m 18s
9:	learn: 0.6530197	test: 0.6564605	best: 0.6564605 (9)	total: 4.52s	remaining: 7m 27s
10:	learn: 0.6492687	test: 0.6530354	best: 0.6530354 (10)	total: 5.06s	remaining: 7m 35s
11:	learn: 0.6455826	test: 0.6496044	best

93:	learn: 0.4514684	test: 0.4745109	best: 0.4745109 (93)	total: 38.2s	remaining: 6m 7s
94:	learn: 0.4500300	test: 0.4732841	best: 0.4732841 (94)	total: 38.5s	remaining: 6m 6s
95:	learn: 0.4486189	test: 0.4721242	best: 0.4721242 (95)	total: 38.9s	remaining: 6m 6s
96:	learn: 0.4472123	test: 0.4709120	best: 0.4709120 (96)	total: 39.3s	remaining: 6m 5s
97:	learn: 0.4458345	test: 0.4698368	best: 0.4698368 (97)	total: 39.8s	remaining: 6m 6s
98:	learn: 0.4445075	test: 0.4687022	best: 0.4687022 (98)	total: 40.2s	remaining: 6m 6s
99:	learn: 0.4431854	test: 0.4676077	best: 0.4676077 (99)	total: 40.7s	remaining: 6m 6s
100:	learn: 0.4418868	test: 0.4664818	best: 0.4664818 (100)	total: 41s	remaining: 6m 4s
101:	learn: 0.4406152	test: 0.4653956	best: 0.4653956 (101)	total: 41.3s	remaining: 6m 3s
102:	learn: 0.4393540	test: 0.4643343	best: 0.4643343 (102)	total: 41.7s	remaining: 6m 3s
103:	learn: 0.4380640	test: 0.4631283	best: 0.4631283 (103)	total: 42.2s	remaining: 6m 3s
104:	learn: 0.4367821	test

184:	learn: 0.3729072	test: 0.4097655	best: 0.4097655 (184)	total: 1m 13s	remaining: 5m 23s
185:	learn: 0.3724340	test: 0.4093549	best: 0.4093549 (185)	total: 1m 13s	remaining: 5m 23s
186:	learn: 0.3719450	test: 0.4090143	best: 0.4090143 (186)	total: 1m 14s	remaining: 5m 23s
187:	learn: 0.3714454	test: 0.4085800	best: 0.4085800 (187)	total: 1m 14s	remaining: 5m 23s
188:	learn: 0.3709922	test: 0.4082427	best: 0.4082427 (188)	total: 1m 15s	remaining: 5m 23s
189:	learn: 0.3705342	test: 0.4078715	best: 0.4078715 (189)	total: 1m 15s	remaining: 5m 23s
190:	learn: 0.3700569	test: 0.4075541	best: 0.4075541 (190)	total: 1m 16s	remaining: 5m 23s
191:	learn: 0.3696119	test: 0.4071984	best: 0.4071984 (191)	total: 1m 16s	remaining: 5m 22s
192:	learn: 0.3691701	test: 0.4068417	best: 0.4068417 (192)	total: 1m 16s	remaining: 5m 20s
193:	learn: 0.3687271	test: 0.4065275	best: 0.4065275 (193)	total: 1m 17s	remaining: 5m 21s
194:	learn: 0.3682720	test: 0.4062288	best: 0.4062288 (194)	total: 1m 17s	remain

274:	learn: 0.3453028	test: 0.3889309	best: 0.3889309 (274)	total: 1m 49s	remaining: 4m 49s
275:	learn: 0.3451226	test: 0.3887992	best: 0.3887992 (275)	total: 1m 50s	remaining: 4m 49s
276:	learn: 0.3449506	test: 0.3887022	best: 0.3887022 (276)	total: 1m 50s	remaining: 4m 49s
277:	learn: 0.3447543	test: 0.3885754	best: 0.3885754 (277)	total: 1m 51s	remaining: 4m 49s
278:	learn: 0.3445888	test: 0.3884458	best: 0.3884458 (278)	total: 1m 51s	remaining: 4m 48s
279:	learn: 0.3444387	test: 0.3883325	best: 0.3883325 (279)	total: 1m 52s	remaining: 4m 48s
280:	learn: 0.3442330	test: 0.3882126	best: 0.3882126 (280)	total: 1m 52s	remaining: 4m 48s
281:	learn: 0.3440247	test: 0.3880978	best: 0.3880978 (281)	total: 1m 53s	remaining: 4m 48s
282:	learn: 0.3438647	test: 0.3879944	best: 0.3879944 (282)	total: 1m 53s	remaining: 4m 48s
283:	learn: 0.3436683	test: 0.3878493	best: 0.3878493 (283)	total: 1m 54s	remaining: 4m 48s
284:	learn: 0.3434925	test: 0.3877169	best: 0.3877169 (284)	total: 1m 54s	remain

364:	learn: 0.3341858	test: 0.3814336	best: 0.3814336 (364)	total: 2m 31s	remaining: 4m 23s
365:	learn: 0.3341091	test: 0.3813998	best: 0.3813998 (365)	total: 2m 31s	remaining: 4m 23s
366:	learn: 0.3340483	test: 0.3813552	best: 0.3813552 (366)	total: 2m 32s	remaining: 4m 22s
367:	learn: 0.3339724	test: 0.3812994	best: 0.3812994 (367)	total: 2m 32s	remaining: 4m 22s
368:	learn: 0.3339100	test: 0.3812543	best: 0.3812543 (368)	total: 2m 33s	remaining: 4m 21s
369:	learn: 0.3338343	test: 0.3812184	best: 0.3812184 (369)	total: 2m 33s	remaining: 4m 21s
370:	learn: 0.3337551	test: 0.3811875	best: 0.3811875 (370)	total: 2m 33s	remaining: 4m 21s
371:	learn: 0.3336922	test: 0.3811439	best: 0.3811439 (371)	total: 2m 34s	remaining: 4m 20s
372:	learn: 0.3336054	test: 0.3810706	best: 0.3810706 (372)	total: 2m 34s	remaining: 4m 19s
373:	learn: 0.3335383	test: 0.3810313	best: 0.3810313 (373)	total: 2m 34s	remaining: 4m 19s
374:	learn: 0.3334560	test: 0.3809713	best: 0.3809713 (374)	total: 2m 35s	remain

454:	learn: 0.3291526	test: 0.3782994	best: 0.3782994 (454)	total: 3m 12s	remaining: 3m 51s
455:	learn: 0.3291256	test: 0.3782804	best: 0.3782804 (455)	total: 3m 13s	remaining: 3m 50s
456:	learn: 0.3290810	test: 0.3782373	best: 0.3782373 (456)	total: 3m 13s	remaining: 3m 50s
457:	learn: 0.3290608	test: 0.3782301	best: 0.3782301 (457)	total: 3m 14s	remaining: 3m 49s
458:	learn: 0.3290211	test: 0.3781912	best: 0.3781912 (458)	total: 3m 14s	remaining: 3m 49s
459:	learn: 0.3289893	test: 0.3781809	best: 0.3781809 (459)	total: 3m 15s	remaining: 3m 49s
460:	learn: 0.3289413	test: 0.3781413	best: 0.3781413 (460)	total: 3m 15s	remaining: 3m 48s
461:	learn: 0.3289108	test: 0.3781465	best: 0.3781413 (460)	total: 3m 16s	remaining: 3m 48s
462:	learn: 0.3288759	test: 0.3781418	best: 0.3781413 (460)	total: 3m 16s	remaining: 3m 48s
463:	learn: 0.3288444	test: 0.3781466	best: 0.3781413 (460)	total: 3m 17s	remaining: 3m 48s
464:	learn: 0.3288181	test: 0.3781326	best: 0.3781326 (464)	total: 3m 17s	remain

544:	learn: 0.3267281	test: 0.3774895	best: 0.3774720 (541)	total: 3m 56s	remaining: 3m 17s
545:	learn: 0.3266957	test: 0.3774488	best: 0.3774488 (545)	total: 3m 57s	remaining: 3m 17s
546:	learn: 0.3266634	test: 0.3774234	best: 0.3774234 (546)	total: 3m 57s	remaining: 3m 16s
547:	learn: 0.3266535	test: 0.3774262	best: 0.3774234 (546)	total: 3m 57s	remaining: 3m 16s
548:	learn: 0.3266196	test: 0.3774131	best: 0.3774131 (548)	total: 3m 58s	remaining: 3m 15s
549:	learn: 0.3265876	test: 0.3773935	best: 0.3773935 (549)	total: 3m 59s	remaining: 3m 15s
550:	learn: 0.3265640	test: 0.3773840	best: 0.3773840 (550)	total: 3m 59s	remaining: 3m 15s
551:	learn: 0.3265548	test: 0.3773791	best: 0.3773791 (551)	total: 3m 59s	remaining: 3m 14s
552:	learn: 0.3265353	test: 0.3773717	best: 0.3773717 (552)	total: 4m	remaining: 3m 14s
553:	learn: 0.3265138	test: 0.3773616	best: 0.3773616 (553)	total: 4m 1s	remaining: 3m 14s
554:	learn: 0.3265035	test: 0.3773598	best: 0.3773598 (554)	total: 4m 1s	remaining: 3

634:	learn: 0.3250719	test: 0.3766982	best: 0.3766982 (634)	total: 4m 38s	remaining: 2m 39s
635:	learn: 0.3250538	test: 0.3767545	best: 0.3766982 (634)	total: 4m 38s	remaining: 2m 39s
636:	learn: 0.3250476	test: 0.3767549	best: 0.3766982 (634)	total: 4m 38s	remaining: 2m 38s
637:	learn: 0.3250327	test: 0.3767596	best: 0.3766982 (634)	total: 4m 39s	remaining: 2m 38s
638:	learn: 0.3250156	test: 0.3767411	best: 0.3766982 (634)	total: 4m 39s	remaining: 2m 38s
639:	learn: 0.3249969	test: 0.3767107	best: 0.3766982 (634)	total: 4m 40s	remaining: 2m 37s
640:	learn: 0.3249824	test: 0.3766995	best: 0.3766982 (634)	total: 4m 40s	remaining: 2m 37s
641:	learn: 0.3249642	test: 0.3767233	best: 0.3766982 (634)	total: 4m 41s	remaining: 2m 36s
642:	learn: 0.3249463	test: 0.3766901	best: 0.3766901 (642)	total: 4m 41s	remaining: 2m 36s
643:	learn: 0.3249356	test: 0.3766916	best: 0.3766901 (642)	total: 4m 42s	remaining: 2m 36s
644:	learn: 0.3249212	test: 0.3766861	best: 0.3766861 (644)	total: 4m 42s	remain

724:	learn: 0.3239853	test: 0.3761406	best: 0.3761406 (724)	total: 5m 19s	remaining: 2m 1s
725:	learn: 0.3239721	test: 0.3761277	best: 0.3761277 (725)	total: 5m 19s	remaining: 2m
726:	learn: 0.3239599	test: 0.3761196	best: 0.3761196 (726)	total: 5m 20s	remaining: 2m
727:	learn: 0.3239458	test: 0.3761453	best: 0.3761196 (726)	total: 5m 20s	remaining: 1m 59s
728:	learn: 0.3239282	test: 0.3761141	best: 0.3761141 (728)	total: 5m 21s	remaining: 1m 59s
729:	learn: 0.3239114	test: 0.3761088	best: 0.3761088 (729)	total: 5m 22s	remaining: 1m 59s
730:	learn: 0.3239020	test: 0.3761133	best: 0.3761088 (729)	total: 5m 22s	remaining: 1m 58s
731:	learn: 0.3238928	test: 0.3761114	best: 0.3761088 (729)	total: 5m 23s	remaining: 1m 58s
732:	learn: 0.3238853	test: 0.3761134	best: 0.3761088 (729)	total: 5m 23s	remaining: 1m 57s
733:	learn: 0.3238773	test: 0.3761178	best: 0.3761088 (729)	total: 5m 24s	remaining: 1m 57s
734:	learn: 0.3238682	test: 0.3761152	best: 0.3761088 (729)	total: 5m 24s	remaining: 1m 5

814:	learn: 0.3229898	test: 0.3756370	best: 0.3756243 (813)	total: 6m 4s	remaining: 1m 22s
815:	learn: 0.3229841	test: 0.3756380	best: 0.3756243 (813)	total: 6m 5s	remaining: 1m 22s
816:	learn: 0.3229739	test: 0.3756299	best: 0.3756243 (813)	total: 6m 5s	remaining: 1m 21s
817:	learn: 0.3229645	test: 0.3756240	best: 0.3756240 (817)	total: 6m 6s	remaining: 1m 21s
818:	learn: 0.3229545	test: 0.3756351	best: 0.3756240 (817)	total: 6m 6s	remaining: 1m 21s
819:	learn: 0.3229455	test: 0.3756269	best: 0.3756240 (817)	total: 6m 7s	remaining: 1m 20s
820:	learn: 0.3229313	test: 0.3756318	best: 0.3756240 (817)	total: 6m 7s	remaining: 1m 20s
821:	learn: 0.3229214	test: 0.3756582	best: 0.3756240 (817)	total: 6m 8s	remaining: 1m 19s
822:	learn: 0.3229163	test: 0.3756561	best: 0.3756240 (817)	total: 6m 9s	remaining: 1m 19s
823:	learn: 0.3229075	test: 0.3756713	best: 0.3756240 (817)	total: 6m 9s	remaining: 1m 18s
824:	learn: 0.3228996	test: 0.3756967	best: 0.3756240 (817)	total: 6m 10s	remaining: 1m 18

<catboost.core.CatBoostClassifier at 0x7fc1f89cd510>

In [20]:
# Persist the trained model to disk in CatBoost's binary "cbm" format
MODEL_PATH = 'model'
catboost.save_model(MODEL_PATH, format="cbm")