# План системы:
- кластеризуем юзеров с помощью DBSCAN и добавляем номер кластера к признакам
- берем рандомных N юзеров (предварительно 1000)
- пробуем вытащить данные с постами которые они смотрели и лайкали
- конвертируем текст и OHE топики
- сплитуем по времени и обучаем модель

In [None]:
import pandas as pd
import datetime as dt
import pickle
from catboost import CatBoostClassifier
from sqlalchemy import create_engine
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.cluster import DBSCAN, KMeans
from sklearn.metrics import f1_score
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report



In [None]:
#Загрузка признаков для модели
import pandas as pd
from sqlalchemy import create_engine


def batch_load_sql(query: str) -> pd.DataFrame:
    """Run *query* and return the whole result as one DataFrame.

    The result is read in chunks of ``CHUNKSIZE`` rows over a connection
    opened with ``stream_results=True`` and the chunks are concatenated at
    the end. The connection is closed and the engine disposed even when
    reading fails part-way through.

    Args:
        query: SQL text handed to ``pd.read_sql``.

    Returns:
        All fetched rows with a fresh ``RangeIndex``; an empty DataFrame
        when no chunk was produced.
    """
    CHUNKSIZE = 200000
    engine = create_engine(
        """connection"""
    )
    try:
        # The context manager closes the connection on success and on error.
        with engine.connect().execution_options(stream_results=True) as conn:
            chunks = list(pd.read_sql(query, conn, chunksize=CHUNKSIZE))
    finally:
        # A new engine is created on every call, so release its pool here.
        engine.dispose()
    if not chunks:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)


In [None]:
#Загрузка модели и скачивание:

import os


def get_model_path(path: str) -> str:
    """Return the model location for the current environment.

    When the ``IS_LMS`` environment variable equals ``"1"`` the fixed LMS
    path is returned; in every other case *path* is returned unchanged.
    """
    running_in_lms = os.environ.get("IS_LMS") == "1"
    return '/workdir/user_input/model' if running_in_lms else path

def load_models():
    """Load the CatBoost classifier from the path chosen by ``get_model_path``.

    Returns:
        The ``CatBoostClassifier`` instance after ``load_model`` was called.
    """
    from catboost import CatBoostClassifier

    # No training parameters are passed here; per the original note the
    # dump already contains everything the model needs.
    model = CatBoostClassifier()
    model.load_model(get_model_path("catboost_model"))
    return model


In [None]:
# Connect to the database and download the user and post tables in full.
conn_uri = "connection_uri"

user_data = pd.read_sql("SELECT * FROM public.user_data;", conn_uri)
post_text = pd.read_sql("SELECT * FROM public.post_text_df;", conn_uri)


In [None]:
user_data

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads
...,...,...,...,...,...,...,...,...
163200,168548,0,36,Russia,Kaliningrad,4,Android,organic
163201,168549,0,18,Russia,Tula,2,Android,organic
163202,168550,1,41,Russia,Yekaterinburg,4,Android,organic
163203,168551,0,38,Russia,Moscow,3,iOS,organic


In [None]:
post_text

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


# Кластеризуем юзеров

In [None]:
# Label-encode the two columns with many unique values.
columns_labenc = ['country', 'city']

for column_name in columns_labenc:
    # A fresh encoder per column; only the last one stays bound to lab_enc.
    lab_enc = LabelEncoder()
    encoded_values = lab_enc.fit_transform(user_data[column_name])
    user_data[column_name] = encoded_values
user_data

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,7,651,3,Android,ads
1,201,0,37,7,2,0,Android,ads
2,202,1,17,7,2931,4,Android,ads
3,203,0,18,7,1953,1,iOS,ads
4,204,0,36,7,125,3,Android,ads
...,...,...,...,...,...,...,...,...
163200,168548,0,36,7,1124,4,Android,organic
163201,168549,0,18,7,3287,2,Android,organic
163202,168550,1,41,7,3698,4,Android,organic
163203,168551,0,38,7,1953,3,iOS,organic


In [None]:
# One-hot encode the low-cardinality categorical user features.
user_object = ['gender', 'exp_group', 'os', 'source']

# drop='first' omits the first category of every feature.
enc_user = OneHotEncoder(drop='first')
transform = enc_user.fit_transform(user_data[user_object]).toarray()
user_ohe = pd.DataFrame(transform, columns=enc_user.get_feature_names_out())
# Remove the raw columns, then append the encoded ones (joined on the index).
user_data = user_data.drop(columns=user_object).join(user_ohe)
user_data

Unnamed: 0,user_id,age,country,city,gender_1,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic
0,200,34,7,651,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,201,37,7,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,202,17,7,2931,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,203,18,7,1953,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,204,36,7,125,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,36,7,1124,0.0,0.0,0.0,0.0,1.0,0.0,1.0
163201,168549,18,7,3287,0.0,0.0,1.0,0.0,0.0,0.0,1.0
163202,168550,41,7,3698,1.0,0.0,0.0,0.0,1.0,0.0,1.0
163203,168551,38,7,1953,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [None]:
# Standardize age and the label-encoded country/city columns.
scaled_columns = ['age', 'country', 'city']
stan_scal = StandardScaler()
stand_column = stan_scal.fit_transform(user_data[scaled_columns])
user_data[scaled_columns] = stand_column
user_data

Unnamed: 0,user_id,age,country,city,gender_1,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic
0,200,0.664568,0.067952,-1.386484,1.0,0.0,0.0,1.0,0.0,0.0,0.0
1,201,0.957562,0.067952,-2.032648,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,202,-0.995730,0.067952,0.883552,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,203,-0.898065,0.067952,-0.090174,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,204,0.859897,0.067952,-1.910186,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0.859897,0.067952,-0.915552,0.0,0.0,0.0,0.0,1.0,0.0,1.0
163201,168549,-0.898065,0.067952,1.237996,0.0,0.0,1.0,0.0,0.0,0.0,1.0
163202,168550,1.348220,0.067952,1.647200,1.0,0.0,0.0,0.0,1.0,0.0,1.0
163203,168551,1.055226,0.067952,-0.090174,0.0,0.0,0.0,1.0,0.0,1.0,1.0


In [None]:
# Cluster users with DBSCAN on every column except the identifier.
user_features = user_data.drop(columns=['user_id'])
dbscan = DBSCAN(eps=0.2, n_jobs=-1)
db_clusters = dbscan.fit_predict(user_features)
db_clusters

array([  0,   1,   2, ..., 479, 488, 479])

In [None]:
pd.unique(db_clusters).shape #смотрим количество кластеров

(763,)

In [None]:
# Attach the DBSCAN labels to the users (the Series is aligned on the index).
user_dbscan = pd.Series(db_clusters, name='DBSCAN_cluster')
user_data = user_data.assign(DBSCAN_cluster=user_dbscan)
user_data

Unnamed: 0,user_id,age,country,city,gender_1,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,DBSCAN_cluster
0,200,0.664568,0.067952,-1.386484,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,201,0.957562,0.067952,-2.032648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,202,-0.995730,0.067952,0.883552,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3,203,-0.898065,0.067952,-0.090174,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3
4,204,0.859897,0.067952,-1.910186,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0.859897,0.067952,-0.915552,0.0,0.0,0.0,0.0,1.0,0.0,1.0,483
163201,168549,-0.898065,0.067952,1.237996,0.0,0.0,1.0,0.0,0.0,0.0,1.0,489
163202,168550,1.348220,0.067952,1.647200,1.0,0.0,0.0,0.0,1.0,0.0,1.0,479
163203,168551,1.055226,0.067952,-0.090174,0.0,0.0,0.0,1.0,0.0,1.0,1.0,488


In [None]:
(user_data[user_data['DBSCAN_cluster'] == -1]).shape

(14108, 12)

# Конвертируем таблицу с постами

In [None]:
post_text

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business
...,...,...,...
7018,7315,"OK, I would not normally watch a Farrelly brot...",movie
7019,7316,I give this movie 2 stars purely because of it...,movie
7020,7317,I cant believe this film was allowed to be mad...,movie
7021,7318,The version I saw of this film was the Blockbu...,movie


In [None]:
pt = post_text.copy()

In [None]:
pd.unique(pt['topic'])

array(['business', 'covid', 'entertainment', 'sport', 'politics', 'tech',
       'movie'], dtype=object)

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

# Загрузка стоп-слов и пунктуации
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('omw-1.4')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Lower-case *text*, tokenize it and drop stop words and punctuation.

    Returns the remaining tokens joined with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(text.lower()):
        if token in stop_words or token in punctuation:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

[nltk_data] Downloading package stopwords to /home/karpov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/karpov/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/karpov/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/karpov/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /home/karpov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
pt['text'] = pt['text'].apply(preprocess_text)

In [None]:
# Lemmatize every token of every post text.
# The column is replaced in one assignment: the previous per-row chained
# assignment (pt['text'][i] = ...) raised SettingWithCopyWarning and is not
# guaranteed to write into pt.
pt['text'] = pt['text'].apply(
    lambda text: ' '.join(lemmatizer.lemmatize(word) for word in word_tokenize(text))
)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pt['text'][i] = ' '.join(lemmatized_words)


In [None]:
topic_names = pd.unique(pt['topic']).tolist()
topic_names

['business', 'covid', 'entertainment', 'sport', 'politics', 'tech', 'movie']

In [None]:
# Build TF-IDF features and reduce them with PCA, one topic at a time.
# Both the vectorizer and the PCA are refit for every topic, so the
# component columns of different topics come from different fits.
tfidf = TfidfVectorizer()
pca = PCA(n_components=5)
pt_clone = pt.copy()
topic_frames = []
for topics in topic_names:
    # Select the topic's rows once and reuse the selection below.
    topic_rows = pt_clone[pt_clone['topic'] == topics]
    raw = tfidf.fit_transform(topic_rows['text'])  # TF-IDF matrix of this topic
    tfidf_df = pd.DataFrame(raw.toarray(), columns=tfidf.get_feature_names_out(), index=topic_rows.index)
    tfidf_centred = tfidf_df.subtract(tfidf_df.mean())  # centre the TF-IDF columns
    pca_matrix = pca.fit_transform(tfidf_centred)
    pca_df = pd.DataFrame(pca_matrix, columns=pca.get_feature_names_out().tolist(), index=topic_rows.index)
    # Join the components to this topic's rows only, instead of writing
    # NaN-filled columns into the whole frame on every iteration.
    topic_frames.append(topic_rows.join(pca_df))
# Concatenate once, after the loop, in topic_names order.
pt_1 = pd.concat(topic_frames, ignore_index=True)

In [None]:
# Label-encode the topic into a separate numeric column; the text column stays.
lab_enc = LabelEncoder()

pt_1['topic_copy'] = lab_enc.fit_transform(pt_1['topic'])
pt_1

Unnamed: 0,post_id,text,topic,pca0,pca1,pca2,pca3,pca4,topic_copy
0,1,uk economy facing major risk uk manufacturing ...,business,-0.251639,0.163960,0.071754,-0.103709,0.037717,0
1,2,aid climate top davos agenda climate change fi...,business,0.002299,-0.036814,-0.074391,0.069905,0.020913,0
2,3,asian quake hit european share share europe le...,business,-0.065608,-0.036552,-0.033421,0.123101,-0.036437,0
3,4,india power share jump debut share india large...,business,0.044904,-0.069898,-0.038411,0.031289,-0.076950,0
4,5,lacroix label bought u firm luxury good group ...,business,0.051058,-0.069827,-0.019382,-0.071333,-0.049758,0
...,...,...,...,...,...,...,...,...,...
7018,7315,ok would normally watch farrelly brother movie...,movie,0.156940,0.007568,0.040797,0.034853,0.011007,3
7019,7316,give movie 2 star purely slightly liberal plot...,movie,0.091994,-0.009262,0.026543,-0.010390,0.022783,3
7020,7317,cant believe film allowed made people drug bea...,movie,-0.061933,-0.067855,0.032780,-0.010957,0.025628,3
7021,7318,version saw film blockbuster rental similar ti...,movie,-0.072014,-0.033861,-0.007452,0.044782,-0.008183,3


In [None]:
# Add a KMeans cluster label computed from the PCA component columns.
pca_columns = pca.get_feature_names_out().tolist()
km = KMeans(n_clusters=7, random_state = 13)

pt_1['KMeans_cluster'] = km.fit_predict(pt_1[pca_columns])
pt_1

Unnamed: 0,post_id,text,topic,pca0,pca1,pca2,pca3,pca4,topic_copy,KMeans_cluster
0,1,uk economy facing major risk uk manufacturing ...,business,-0.251639,0.163960,0.071754,-0.103709,0.037717,0,4
1,2,aid climate top davos agenda climate change fi...,business,0.002299,-0.036814,-0.074391,0.069905,0.020913,0,5
2,3,asian quake hit european share share europe le...,business,-0.065608,-0.036552,-0.033421,0.123101,-0.036437,0,5
3,4,india power share jump debut share india large...,business,0.044904,-0.069898,-0.038411,0.031289,-0.076950,0,1
4,5,lacroix label bought u firm luxury good group ...,business,0.051058,-0.069827,-0.019382,-0.071333,-0.049758,0,1
...,...,...,...,...,...,...,...,...,...,...
7018,7315,ok would normally watch farrelly brother movie...,movie,0.156940,0.007568,0.040797,0.034853,0.011007,3,1
7019,7316,give movie 2 star purely slightly liberal plot...,movie,0.091994,-0.009262,0.026543,-0.010390,0.022783,3,1
7020,7317,cant believe film allowed made people drug bea...,movie,-0.061933,-0.067855,0.032780,-0.010957,0.025628,3,0
7021,7318,version saw film blockbuster rental similar ti...,movie,-0.072014,-0.033861,-0.007452,0.044782,-0.008183,3,2


# Подготавливаем данные для обучения модели

In [None]:
# Pick the users to train on: up to n users from every DBSCAN cluster label
# (the label -1 is treated like any other label here).
user_first = []
n = 50
for cluster in pd.unique(user_data['DBSCAN_cluster']):
    # Filter the cluster once instead of once per branch and once per test.
    cluster_user_ids = user_data.loc[user_data['DBSCAN_cluster'] == cluster, 'user_id']
    if cluster_user_ids.shape[0] >= n:
        cluster_user_ids = cluster_user_ids.sample(n, random_state = 13)
    user_first.append(cluster_user_ids.to_list())
# Flatten in one pass; sum(list_of_lists, []) copies the result on every step.
user_1 = [user_id for cluster_ids in user_first for user_id in cluster_ids]
# Ids are joined with "','" because the query wraps the string in quotes.
user_sql_string = "','".join(['{:.0f}'.format(idu) for idu in user_1])

In [None]:
# Download the feed rows of the sampled users only.
feed_query = f'''SELECT *
    FROM public.feed_data
    WHERE user_id IN ('{user_sql_string}');'''
feed_data = batch_load_sql(feed_query)

feed_data

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-10-14 11:52:09,33445,3405,view,0
1,2021-10-14 11:52:25,33445,3381,view,1
2,2021-10-14 11:52:53,33445,3381,like,0
3,2021-10-14 11:52:55,33445,6867,view,1
4,2021-10-14 11:53:51,33445,6867,like,0
...,...,...,...,...,...
4897589,2021-12-09 23:12:48,98716,4152,view,1
4897590,2021-12-09 23:13:25,98716,4152,like,0
4897591,2021-12-09 23:13:27,98716,3590,view,1
4897592,2021-12-09 23:15:46,98716,3590,like,0


In [None]:
pd.unique(feed_data['user_id']).shape

(10061,)

In [None]:
# Keep every row whose action is not 'like', then drop the action column.
is_not_like = feed_data['action'] != 'like'
feed_data = feed_data.loc[is_not_like].drop(columns='action')
feed_data

Unnamed: 0,timestamp,user_id,post_id,target
0,2021-10-14 11:52:09,33445,3405,0
1,2021-10-14 11:52:25,33445,3381,1
3,2021-10-14 11:52:55,33445,6867,1
5,2021-10-14 11:53:53,33445,614,0
6,2021-10-14 11:56:24,33445,5402,1
...,...,...,...,...
4897587,2021-12-09 23:11:21,98716,3709,0
4897588,2021-12-09 23:11:38,98716,5448,0
4897589,2021-12-09 23:12:48,98716,4152,1
4897591,2021-12-09 23:13:27,98716,3590,1


In [None]:
feed_data[feed_data['user_id'] == 28411]

Unnamed: 0,timestamp,user_id,post_id,target
816131,2021-10-08 09:27:44,28411,4800,0
816132,2021-10-08 09:29:56,28411,4002,0
816133,2021-10-08 09:32:00,28411,829,1
816135,2021-10-08 09:33:18,28411,211,0
816136,2021-10-08 09:35:09,28411,1915,1
...,...,...,...,...
816598,2021-12-28 10:10:04,28411,5295,1
816600,2021-12-28 10:12:13,28411,6551,0
816601,2021-12-28 10:14:51,28411,5494,0
816602,2021-12-28 10:16:01,28411,2514,0


In [None]:
feed_data[feed_data['user_id'] == 100636].sort_values(by='timestamp')['target']

Series([], Name: target, dtype: int64)

In [None]:
# Build the train and test sets by time: every user's rows are sorted by
# timestamp and split without shuffling (test_size=0.25), so the most recent
# part of each user's history goes to the test set.
train_parts = []
test_parts = []
# groupby(sort=False) visits users in order of first appearance and hands
# over each user's rows directly, instead of re-filtering the whole frame
# twice per user.
for _, user_rows in feed_data.groupby('user_id', sort=False):
    user_rows = user_rows.sort_values(by='timestamp')
    user_train, user_test = train_test_split(user_rows, test_size = 0.25, shuffle = False)
    train_parts.append(user_train)
    test_parts.append(user_test)

# Concatenate once after the loop rather than growing four frames per user.
empty_rows = feed_data.iloc[:0]
train_rows = pd.concat(train_parts, ignore_index=True) if train_parts else empty_rows
test_rows = pd.concat(test_parts, ignore_index=True) if test_parts else empty_rows

X_train = train_rows.drop('target', axis = 1)
X_test = test_rows.drop('target', axis = 1)
# Targets stay single-column DataFrames (column 'target') so the later
# y_train.values.ravel() / y_test.values.ravel() calls keep working.
y_train = train_rows[['target']]
y_test = test_rows[['target']]


X_train

Unnamed: 0,timestamp,user_id,post_id
0,2021-10-14 11:52:09,33445,3405
1,2021-10-14 11:52:25,33445,3381
2,2021-10-14 11:52:55,33445,6867
3,2021-10-14 11:53:53,33445,614
4,2021-10-14 11:56:24,33445,5402
...,...,...,...
3187820,2021-11-16 11:14:19,98716,524
3187821,2021-11-16 11:16:02,98716,3712
3187822,2021-11-16 11:16:41,98716,5295
3187823,2021-11-16 11:18:45,98716,2575


In [None]:
X_train

Unnamed: 0,timestamp,user_id,post_id
0,2021-10-14 11:52:09,33445,3405
1,2021-10-14 11:52:25,33445,3381
2,2021-10-14 11:52:55,33445,6867
3,2021-10-14 11:53:53,33445,614
4,2021-10-14 11:56:24,33445,5402
...,...,...,...
3187820,2021-11-16 11:14:19,98716,524
3187821,2021-11-16 11:16:02,98716,3712
3187822,2021-11-16 11:16:41,98716,5295
3187823,2021-11-16 11:18:45,98716,2575


In [None]:
X_test

Unnamed: 0,timestamp,user_id,post_id
0,2021-11-24 13:42:56,33445,1905
1,2021-11-24 13:45:43,33445,1457
2,2021-11-24 13:47:33,33445,513
3,2021-11-24 13:49:24,33445,1305
4,2021-12-09 19:50:56,33445,6751
...,...,...,...
1067623,2021-12-09 23:11:21,98716,3709
1067624,2021-12-09 23:11:38,98716,5448
1067625,2021-12-09 23:12:48,98716,4152
1067626,2021-12-09 23:13:27,98716,3590


In [None]:
X_train_1 = X_train.copy()
X_test_1 = X_test.copy()

In [None]:
# Join user and post features onto every training interaction.
# Left joins keep exactly one output row per interaction, in the original
# order, so the features stay aligned row-by-row with y_train. The default
# inner join gave no such guarantee: the printed result was grouped by post
# rather than in X_train order.
# validate='many_to_one' fails loudly if user_id / post_id is duplicated on
# the right side. Interactions without a matching user or post keep NaN
# features instead of being dropped.
X_train_1 = (
    X_train_1
    .merge(user_data, on='user_id', how='left', validate='many_to_one')
    .merge(pt_1, on='post_id', how='left', validate='many_to_one')
    .drop(['timestamp', 'user_id', 'post_id', 'text', 'topic'], axis =1)
)
X_train_1

Unnamed: 0,index_x,age,country,city,gender_1,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,DBSCAN_cluster,index_y,pca0,pca1,pca2,pca3,pca4,topic_copy,KMeans_cluster
0,33224,0.176245,-4.299089,-0.263414,0.0,0.0,0.0,0.0,0.0,1.0,0.0,84,1545,-0.013072,-0.007488,-0.010693,0.007374,-0.005505,1,2
1,38871,-0.019084,-1.387729,0.538068,1.0,0.0,0.0,0.0,0.0,1.0,0.0,368,1545,-0.013072,-0.007488,-0.010693,0.007374,-0.005505,1,2
2,100673,-0.507407,-4.299089,-0.116061,0.0,1.0,0.0,0.0,0.0,0.0,0.0,74,1545,-0.013072,-0.007488,-0.010693,0.007374,-0.005505,1,2
3,73795,-0.605072,0.067952,0.054192,1.0,0.0,0.0,0.0,0.0,1.0,0.0,9,1545,-0.013072,-0.007488,-0.010693,0.007374,-0.005505,1,2
4,33433,1.934208,0.067952,-1.807636,0.0,0.0,0.0,1.0,0.0,1.0,0.0,364,1545,-0.013072,-0.007488,-0.010693,0.007374,-0.005505,1,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3187820,38557,1.152891,2.251473,-0.424706,1.0,0.0,1.0,0.0,0.0,0.0,0.0,237,1215,0.034640,-0.049552,-0.038590,-0.038636,0.001849,1,2
3187821,11890,-0.702736,2.251473,-1.801662,1.0,1.0,0.0,0.0,0.0,1.0,0.0,429,1215,0.034640,-0.049552,-0.038590,-0.038636,0.001849,1,2
3187822,100491,-0.605072,0.067952,-0.090174,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,1215,0.034640,-0.049552,-0.038590,-0.038636,0.001849,1,2
3187823,88376,-0.409742,2.251473,-0.267396,1.0,0.0,0.0,0.0,0.0,1.0,0.0,43,1215,0.034640,-0.049552,-0.038590,-0.038636,0.001849,1,2


In [None]:
# Join user and post features onto every test interaction.
# Left joins keep exactly one output row per interaction, in the original
# order, so the features stay aligned row-by-row with y_test. The default
# inner join gave no such guarantee: the printed result was grouped by post
# rather than in X_test order.
# validate='many_to_one' fails loudly if user_id / post_id is duplicated on
# the right side. Interactions without a matching user or post keep NaN
# features instead of being dropped.
X_test_1 = (
    X_test_1
    .merge(user_data, on='user_id', how='left', validate='many_to_one')
    .merge(pt_1, on='post_id', how='left', validate='many_to_one')
    .drop(['timestamp', 'user_id', 'post_id', 'text', 'topic'], axis =1)
)
X_test_1

Unnamed: 0,index_x,age,country,city,gender_1,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,DBSCAN_cluster,index_y,pca0,pca1,pca2,pca3,pca4,topic_copy,KMeans_cluster
0,33224,0.176245,-4.299089,-0.263414,0.0,0.0,0.0,0.0,0.0,1.0,0.0,84,3192,0.022543,-0.014015,0.224762,-0.019606,0.022849,5,6
1,100688,-0.702736,0.067952,-0.090174,0.0,0.0,1.0,0.0,0.0,0.0,0.0,14,3192,0.022543,-0.014015,0.224762,-0.019606,0.022849,5,6
2,80583,-0.605072,-1.387729,-1.952002,1.0,0.0,1.0,0.0,0.0,0.0,0.0,130,3192,0.022543,-0.014015,0.224762,-0.019606,0.022849,5,6
3,161775,-0.995730,2.251473,0.221458,1.0,0.0,0.0,0.0,1.0,1.0,1.0,544,3192,0.022543,-0.014015,0.224762,-0.019606,0.022849,5,6
4,38954,0.078580,-4.299089,-1.076844,1.0,0.0,0.0,1.0,0.0,0.0,0.0,336,3192,0.022543,-0.014015,0.224762,-0.019606,0.022849,5,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1067623,25282,-1.093394,-4.299089,-0.241510,1.0,0.0,0.0,0.0,0.0,1.0,0.0,25,2814,-0.154135,0.073276,-0.034273,-0.030901,0.125640,5,4
1067624,33035,0.859897,0.067952,-0.090174,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2,2814,-0.154135,0.073276,-0.034273,-0.030901,0.125640,5,4
1067625,19452,3.203847,0.067952,-0.090174,1.0,0.0,0.0,1.0,0.0,1.0,0.0,291,2814,-0.154135,0.073276,-0.034273,-0.030901,0.125640,5,4
1067626,115204,-0.214413,0.067952,0.999045,0.0,1.0,0.0,0.0,0.0,1.0,1.0,476,2814,-0.154135,0.073276,-0.034273,-0.030901,0.125640,5,4


# Обучение модели

In [None]:
from catboost import CatBoostClassifier

# Train the CatBoost classifier on the merged user/post features.
# NOTE(review): the test split is also passed as eval_set. If CatBoost picks
# its best iteration from eval_set, the test metrics below are optimistic;
# confirm, and consider a separate validation split.
# NOTE(review): the printed log reports the best test PRAUC at iteration 26
# while the learn metric keeps rising to iteration 999, which looks like
# overfitting; early stopping or a smaller depth may be worth trying.
cb_params = {
    'learning_rate': 0.1,
    'eval_metric': 'PRAUC',
    'depth': 15,
    'loss_function': 'Logloss',
    'auto_class_weights': 'Balanced',
    'random_state': 13,
    'thread_count': -1,
    'verbose': 100,
}
cb = CatBoostClassifier(**cb_params)

# Flatten the single-column targets to 1-D arrays.
train_labels = y_train.values.ravel()
test_labels = y_test.values.ravel()
cb.fit(X_train_1, train_labels, eval_set=(X_test_1, test_labels))

preds_train_cb = cb.predict(X_train_1)
preds_test_cb = cb.predict(X_test_1)

0:	learn: 0.5270357	test: 0.5344454	best: 0.5344454 (0)	total: 2.13s	remaining: 35m 29s
100:	learn: 0.7303631	test: 0.5350879	best: 0.5363360 (26)	total: 4m 53s	remaining: 43m 29s
200:	learn: 0.7820111	test: 0.5346966	best: 0.5363360 (26)	total: 9m 52s	remaining: 39m 13s
300:	learn: 0.8164630	test: 0.5345947	best: 0.5363360 (26)	total: 14m 55s	remaining: 34m 40s
400:	learn: 0.8479127	test: 0.5347527	best: 0.5363360 (26)	total: 20m 2s	remaining: 29m 56s
500:	learn: 0.8720782	test: 0.5349008	best: 0.5363360 (26)	total: 25m 9s	remaining: 25m 3s
600:	learn: 0.8908922	test: 0.5347762	best: 0.5363360 (26)	total: 30m 17s	remaining: 20m 6s
700:	learn: 0.9067696	test: 0.5347345	best: 0.5363360 (26)	total: 35m 27s	remaining: 15m 7s
800:	learn: 0.9198213	test: 0.5347590	best: 0.5363360 (26)	total: 40m 35s	remaining: 10m 5s
900:	learn: 0.9301722	test: 0.5349937	best: 0.5363360 (26)	total: 45m 42s	remaining: 5m 1s
999:	learn: 0.9383826	test: 0.5352349	best: 0.5363360 (26)	total: 50m 49s	remaining: 

In [None]:
cb.feature_importances_

array([ 6.05531337,  5.35215369,  6.68017322,  5.98737392,  0.55953366,
        1.64271133,  1.54390049,  2.84452513,  4.54563212,  4.21291715,
        2.93680248,  7.17820326,  8.00663703,  8.41173739,  9.75937124,
        9.49497089, 10.12324845,  4.66479517])

In [None]:
f1_score(y_train, preds_train_cb, average='weighted')

0.6334460171041699

In [None]:
f1_score(y_test, preds_test_cb, average='weighted')

0.5864692082976147

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_train, preds_train_cb))
print(classification_report(y_test, preds_test_cb))

              precision    recall  f1-score   support

         0.0       0.90      0.56      0.69   2721872
         1.0       0.19      0.61      0.30    465953

    accuracy                           0.57   3187825
   macro avg       0.54      0.59      0.49   3187825
weighted avg       0.79      0.57      0.63   3187825

              precision    recall  f1-score   support

         0.0       0.84      0.54      0.65    891440
         1.0       0.17      0.46      0.24    176188

    accuracy                           0.53   1067628
   macro avg       0.50      0.50      0.45   1067628
weighted avg       0.72      0.53      0.59   1067628



In [None]:
cb.save_model('catboost_model_50_lms', format="cbm")

In [None]:
# Reload the dump to check that it can be read back.
cb_from_file = CatBoostClassifier()  # no training parameters are passed; per the original note the dump contains everything

# Load the file name written by cb.save_model('catboost_model_50_lms', ...);
# the previous name "catboost_model_50" pointed at a different file.
cb_from_file.load_model("catboost_model_50_lms")

<catboost.core.CatBoostClassifier at 0x7f2d99b3e7f0>

# Выдача рекомендации

In [None]:
user_data

Unnamed: 0,user_id,age,country,city,gender_1,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,DBSCAN_cluster
0,200,0.664568,0.067952,-1.386484,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0
1,201,0.957562,0.067952,-2.032648,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,202,-0.995730,0.067952,0.883552,1.0,0.0,0.0,0.0,1.0,0.0,0.0,2
3,203,-0.898065,0.067952,-0.090174,0.0,1.0,0.0,0.0,0.0,1.0,0.0,3
4,204,0.859897,0.067952,-1.910186,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...
163200,168548,0.859897,0.067952,-0.915552,0.0,0.0,0.0,0.0,1.0,0.0,1.0,483
163201,168549,-0.898065,0.067952,1.237996,0.0,0.0,1.0,0.0,0.0,0.0,1.0,489
163202,168550,1.348220,0.067952,1.647200,1.0,0.0,0.0,0.0,1.0,0.0,1.0,479
163203,168551,1.055226,0.067952,-0.090174,0.0,0.0,0.0,1.0,0.0,1.0,1.0,488


In [None]:
sum(pd.unique(feed_data['user_id']) == 205)

0

In [None]:
user_solo = pd.read_sql(
    "SELECT * FROM public.feed_data WHERE user_id=205;",
    conn_uri)
user_solo

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-11-04 15:44:21,205,2145,view,0
1,2021-10-23 17:52:25,205,1975,view,0
2,2021-10-23 17:53:32,205,1892,view,0
3,2021-10-23 17:55:30,205,2892,view,0
4,2021-10-23 17:57:06,205,1120,view,0
...,...,...,...,...,...
580,2021-12-09 16:32:02,205,4012,view,0
581,2021-12-09 16:33:12,205,1529,view,0
582,2021-12-09 16:33:54,205,1258,view,0
583,2021-12-09 16:35:59,205,1232,view,0


In [None]:
user_solo = user_solo[user_solo['action'] != 'like']
user_solo

Unnamed: 0,timestamp,user_id,post_id,action,target
0,2021-11-04 15:44:21,205,2145,view,0
1,2021-10-23 17:52:25,205,1975,view,0
2,2021-10-23 17:53:32,205,1892,view,0
3,2021-10-23 17:55:30,205,2892,view,0
4,2021-10-23 17:57:06,205,1120,view,0
...,...,...,...,...,...
580,2021-12-09 16:32:02,205,4012,view,0
581,2021-12-09 16:33:12,205,1529,view,0
582,2021-12-09 16:33:54,205,1258,view,0
583,2021-12-09 16:35:59,205,1232,view,0


In [None]:
user_solo = user_solo.drop(['timestamp', 'action'], axis = 1)
user_solo

Unnamed: 0,user_id,post_id,target
0,205,2145,0
1,205,1975,0
2,205,1892,0
3,205,2892,0
4,205,1120,0
...,...,...,...
580,205,4012,0
581,205,1529,0
582,205,1258,0
583,205,1232,0


In [None]:
model_data = user_data[user_data['user_id'] == 205]
model_data

Unnamed: 0,user_id,age,country,city,gender_1,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,DBSCAN_cluster
5,205,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4


In [None]:
pt_1['user_id'] = 205
pt_1

Unnamed: 0,post_id,topic,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,KMeans_cluster,user_id
0,1,0,-0.251642,0.163853,0.072783,-0.103348,0.038431,0.039640,-0.027934,-0.030589,0.043932,0.007402,4,205
1,2,0,0.002296,-0.036745,-0.074268,0.070673,0.019397,0.058854,-0.034495,-0.031961,0.000018,0.029150,6,205
2,3,0,-0.065610,-0.036582,-0.034108,0.118855,-0.034580,0.008190,0.002489,-0.005484,-0.088283,-0.021325,3,205
3,4,0,0.044912,-0.069945,-0.039010,0.029774,-0.076927,0.080845,0.045720,-0.054479,0.050570,0.016131,0,205
4,5,0,0.051057,-0.069763,-0.019808,-0.069909,-0.051727,0.036525,-0.010122,-0.013774,-0.079563,-0.011773,0,205
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,3,0.156939,0.007611,0.040936,0.037421,0.011744,-0.117037,0.055479,-0.017180,-0.019924,-0.037735,0,205
7019,7316,3,0.091999,-0.009334,0.026558,-0.011269,0.021285,0.024736,-0.004473,0.011119,0.052619,-0.011274,0,205
7020,7317,3,-0.061937,-0.067832,0.032938,-0.011840,0.024682,-0.030830,-0.008504,0.005428,0.008392,-0.069710,1,205
7021,7318,3,-0.072014,-0.033861,-0.007409,0.045923,-0.005512,0.001955,-0.024997,-0.045560,-0.021797,0.049792,3,205


In [None]:
# Build one feature row per post for the selected user.
if model_data.empty:
    # With no user row every user feature would be NaN; fail loudly instead.
    raise ValueError("model_data is empty: the requested user is missing from user_data")
# The key and join type are explicit: a left join keeps one row per post in
# pt_1 order, which the later positional assignment of predictions to pt_1
# relies on. validate='many_to_one' rejects duplicated user rows.
test_model = (
    pt_1
    .merge(model_data, on='user_id', how='left', validate='many_to_one')
    .drop(['user_id', 'post_id'], axis =1)
)
# Select and order the columns exactly as in the training frame.
test_model = test_model.reindex(columns=X_train_1.columns.tolist())
test_model

Unnamed: 0,age,country,city,gender_1,exp_group_1,exp_group_2,exp_group_3,exp_group_4,os_iOS,source_organic,...,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,KMeans_cluster
0,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.163853,0.072783,-0.103348,0.038431,0.039640,-0.027934,-0.030589,0.043932,0.007402,4
1,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.036745,-0.074268,0.070673,0.019397,0.058854,-0.034495,-0.031961,0.000018,0.029150,6
2,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.036582,-0.034108,0.118855,-0.034580,0.008190,0.002489,-0.005484,-0.088283,-0.021325,3
3,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.069945,-0.039010,0.029774,-0.076927,0.080845,0.045720,-0.054479,0.050570,0.016131,0
4,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.069763,-0.019808,-0.069909,-0.051727,0.036525,-0.010122,-0.013774,-0.079563,-0.011773,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.007611,0.040936,0.037421,0.011744,-0.117037,0.055479,-0.017180,-0.019924,-0.037735,0
7019,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.009334,0.026558,-0.011269,0.021285,0.024736,-0.004473,0.011119,0.052619,-0.011274,0
7020,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.067832,0.032938,-0.011840,0.024682,-0.030830,-0.008504,0.005428,0.008392,-0.069710,1
7021,0.469239,0.067952,-1.312808,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,-0.033861,-0.007409,0.045923,-0.005512,0.001955,-0.024997,-0.045560,-0.021797,0.049792,3


In [None]:
predict_test = cb.predict(test_model)  #catboost predict


pt_1['predict'] = predict_test
post_pred = pt_1[pt_1['predict']== 1]['post_id'].tolist()
len(post_pred)

2585

In [None]:
real_post = user_solo[user_solo['target']== 1]['post_id'].tolist()
len(sorted(real_post))

23

In [None]:
(list(set(post_pred) & set(real_post)))

[3201, 5795, 4774, 3083, 2606, 6706, 3475, 3641, 5339, 2623]

In [None]:
# Probability of the positive class for every post.
best_things = cb.predict_proba(test_model)[:, 1]

pt_1['predict'] = best_things
# Sort descending so head(20) returns the 20 most probable posts; the default
# ascending sort returned the 20 least probable ones.
post_pred_proba = pt_1.sort_values(by='predict', ascending=False)['post_id'].head(20)
post_pred_proba

948     2817
5344    5563
4310    4494
1227    2338
2348     582
4580    4769
2498     731
2376     605
4465    4654
2370     598
2502     736
4122    4434
3821    2117
49        52
2991    1672
2691     935
3804    2098
1198    3077
2650     892
2344     570
Name: post_id, dtype: int64

In [None]:
(list(set(post_pred_proba) & set(real_post)))

[]

In [None]:
best_things

array([0.57470036, 0.56140348, 0.36492715, ..., 0.49934379, 0.46998637,
       0.3723271 ])

In [None]:
(list(set(post_pred_proba) & set(user_solo[user_solo['target'] != 1]['post_id'].tolist())))

[2098, 935]

In [None]:
pt_1

Unnamed: 0,post_id,topic,pca0,pca1,pca2,pca3,pca4,pca5,pca6,pca7,pca8,pca9,KMeans_cluster,user_id,predict
0,1,0,-0.251642,0.163853,0.072783,-0.103348,0.038431,0.039640,-0.027934,-0.030589,0.043932,0.007402,4,205,0.574700
1,2,0,0.002296,-0.036745,-0.074268,0.070673,0.019397,0.058854,-0.034495,-0.031961,0.000018,0.029150,6,205,0.561403
2,3,0,-0.065610,-0.036582,-0.034108,0.118855,-0.034580,0.008190,0.002489,-0.005484,-0.088283,-0.021325,3,205,0.364927
3,4,0,0.044912,-0.069945,-0.039010,0.029774,-0.076927,0.080845,0.045720,-0.054479,0.050570,0.016131,0,205,0.382781
4,5,0,0.051057,-0.069763,-0.019808,-0.069909,-0.051727,0.036525,-0.010122,-0.013774,-0.079563,-0.011773,0,205,0.366615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,7315,3,0.156939,0.007611,0.040936,0.037421,0.011744,-0.117037,0.055479,-0.017180,-0.019924,-0.037735,0,205,0.645307
7019,7316,3,0.091999,-0.009334,0.026558,-0.011269,0.021285,0.024736,-0.004473,0.011119,0.052619,-0.011274,0,205,0.520452
7020,7317,3,-0.061937,-0.067832,0.032938,-0.011840,0.024682,-0.030830,-0.008504,0.005428,0.008392,-0.069710,1,205,0.499344
7021,7318,3,-0.072014,-0.033861,-0.007409,0.045923,-0.005512,0.001955,-0.024997,-0.045560,-0.021797,0.049792,3,205,0.469986


Модель попала в отлайканные посты юзера, которого ранее она не видела, отсюда можно сделать предположение, что наша выборка юзеров репрезентативна.

На тесте модель показывает неплохой результат по несбалансированным группам таргета.

Дополнительно можно закодировать признаки как юзеров, так и текстов постов с помощью нейронных сетей.