In [1]:
# Инициализируем окружение
import os, joblib, s3fs
import pandas as pd
import numpy  as np

pd.options.display.max_columns = 100
pd.options.display.max_rows = 100 

from dotenv import load_dotenv, find_dotenv
load_dotenv()

PROJECT_ROOT    = f"{find_dotenv()[0:-5]}"
DATA_DIR        = f"{PROJECT_ROOT}/data"
RANDOM_STATE    = 42
S3_DIR          = f"{os.environ['S3_BUCKET_NAME']}/Diplom"

s3 = s3fs.core.S3FileSystem(
    endpoint_url=os.environ['AWS_ENDPOINT_URL'],
    key=os.environ['AWS_ACCESS_KEY_ID'],
    secret=os.environ['AWS_SECRET_ACCESS_KEY'], cache_regions=True
)


In [36]:
SRC_FILES = {
    "cats_src" : f"{S3_DIR}/source_data/category_tree.csv",
    "props_src":[f"{S3_DIR}/source_data/item_properties_part1.csv",
                    f"{S3_DIR}/source_data/item_properties_part2.csv"],
    "event_src": f"{S3_DIR}/source_data/events.csv"
}
INFER_FILES = {
    "cats_dst" : f"{S3_DIR}/infer_data/category_tree.parquet",
    "item_cat" : f"{S3_DIR}/infer_data/item_categories.parquet",
    "item_prop": f"{S3_DIR}/infer_data/item_properties.parquet",
    "available": f"{S3_DIR}/infer_data/item_availability.parquet",
    "event_dst": f"{S3_DIR}/infer_data/events.parquet",
    "eventlast": f"{S3_DIR}/infer_data/last_events.parquet",
    "ranked"   : f"{S3_DIR}/infer_data/ranked_candidades.parquet",
}
PROD_FILES = {
    "top_pop"  : f"{S3_DIR}/recommendations/top_popular.parquet",
    "similar"  : f"{S3_DIR}/recommendations/similar_items.parquet",
    "final"    : f"{S3_DIR}/recommendations/final_recommendations.parquet"
}
MODEL_FILES = {
    "als_parms": f"{S3_DIR}/model/als_params.pkl",
    "cb_parms" : f"{S3_DIR}/model/cb_params.pkl",
    "cb_model" : f"{S3_DIR}/model/cb_model.pkl"
}
# параметры конфигурации DAG-а
CONFIG      = {
    "EVENT_HISTORY_WEEKS": 26,  # сохранять events только за последние подгода
    "EVENT_POPULAR_WEEKS": 12,  # глубина истории (в неделях) для определения популярных товаров
    "EVENT_TARGET_WEEKS" : 2,   # количество недель для target-периода (в режиме переобучения моделей)
    "EVENT_CUT_OFF_WEEKS": 4,   # количество недель для inference-периода (в режиме расчета рекомендаций)
}



In [3]:
import io
from   s3fs   import S3FileSystem

def pd_info(df: pd.DataFrame):
    with io.StringIO() as output:
        df.info(show_counts=True, buf=output)
        return output.getvalue()

def load_csv_files(s3: S3FileSystem, path_list: list):
    try:
        df = pd.DataFrame()
        for path in path_list:
            with s3.open(path, mode='r') as fd:
                df = pd.concat([df, pd.read_csv(fd)], axis=0, ignore_index=True)
        return df
    except:
        print(path)
        return None

def delete_s3_files(s3: S3FileSystem, files: dict):
    try:
        for k,path in files.items():
            s3.rm(path)
            print(f"rm {path}")
    except:
        print(path)
        pass
    return

def load_parquet_file(s3: S3FileSystem, path: str):
    try:
        df = pd.DataFrame()
        with s3.open(path, mode='rb') as fd:
            df = pd.read_parquet(fd)
        return df
    except:
        #logging.error(f"Error loading file: {path}")
        return None

def save_to_parquet(df, s3: S3FileSystem, path: str, verbose=True):
    with s3.open(path, mode='wb') as fd:
        df.to_parquet(fd)
    #if verbose:  logging.info(f"\n{path}\n{df.info(show_counts=True)}")
    return

def save_to_pkl(obj, s3: S3FileSystem, path: str):
    with s3.open(path, mode='wb') as fd:
        joblib.dump (obj, fd)
    return

def load_pkl_file(s3: S3FileSystem, path: str):
    try:
        with s3.open(path, mode='rb') as fd:
            obj = joblib.load(fd)
        return obj
    except:
        #logging.error(f"Error loading file: {path}")
        return None


In [4]:
import sys
def ShowAttrs (obj, allattrs=False, outfile=sys.stdout):
    for attr in dir(obj):
        par = "()"  if callable(getattr(obj, attr))  else  ""
        if allattrs  or  not attr.startswith('__') :
            print (f'{attr}{par}:\n{type(getattr(obj, attr))}\n{getattr(obj, attr).__doc__}\n', file=outfile)

In [5]:
fr = load_parquet_file(s3, PROD_FILES['final'])
fr.set_index("visitorid", inplace=True)
fr.loc[155,'itemid'].to_list()[:10]

[50928, 373637, 372610, 437305, 321343, 157024, 420271, 186888, 224690, 294799]

In [6]:
max_rank = fr.groupby('visitorid').agg(maxrank=('rank','max'))
max_rank.sort_values(by='maxrank').head()

Unnamed: 0_level_0,maxrank
visitorid,Unnamed: 1_level_1
717032,22
689826,22
370111,22
1241066,22
1329994,22


In [7]:
sim = load_parquet_file(s3, PROD_FILES['similar'])
sim.reset_index().groupby("itemid").agg(maxrank=('sim_rank','max')).sort_values(by='maxrank',ascending=False)
#sim.head()

Unnamed: 0_level_0,maxrank
itemid,Unnamed: 1_level_1
466861,14
19,14
25,14
147,14
250,14
...,...
827,14
856,14
909,14
982,14


In [8]:
als_params = load_pkl_file(s3, MODEL_FILES['als_parms'])
als_params

{'alpha': 100.0, 'factors': 100, 'iterations': 15, 'regularization': 0.005}

In [9]:
cb_params = load_pkl_file(s3, MODEL_FILES['cb_parms'])
cb_params

{'depth': 6, 'learning_rate': 0.18, 'l2_leaf_reg': 1, 'iterations': 10}

In [10]:
events = load_parquet_file(s3, f"{S3_DIR}/data/events.parquet")
events.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2099173 entries, 0 to 2099172
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   timestamp      2099173 non-null  int64  
 1   visitorid      2099173 non-null  int64  
 2   event          2099173 non-null  int8   
 3   itemid         2099173 non-null  int64  
 4   transactionid  18244 non-null    float64
 5   categoryid     2099173 non-null  int64  
 6   available      2099173 non-null  int8   
 7   root           2099173 non-null  int64  
dtypes: float64(1), int64(5), int8(2)
memory usage: 100.1 MB


In [11]:
print(pd_info(events))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2099173 entries, 0 to 2099172
Data columns (total 8 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   timestamp      2099173 non-null  int64  
 1   visitorid      2099173 non-null  int64  
 2   event          2099173 non-null  int8   
 3   itemid         2099173 non-null  int64  
 4   transactionid  18244 non-null    float64
 5   categoryid     2099173 non-null  int64  
 6   available      2099173 non-null  int8   
 7   root           2099173 non-null  int64  
dtypes: float64(1), int64(5), int8(2)
memory usage: 100.1 MB



In [12]:
events['visitorid'].nunique()

1076473

In [90]:
import scipy
from   sklearn.preprocessing import MinMaxScaler
from   implicit.als          import AlternatingLeastSquares
from   catboost              import CatBoostClassifier, Pool
from   threadpoolctl         import threadpool_limits
threadpool_limits(1, "blas")
RANDOM_STATE = 42

def get_registered_items(timestamp, items_ctgr):
    ''' Возвращает набор itemid, для которых определена корректная категория на заданный момент времени '''
    return  set(items_ctgr.query("timestamp <= @timestamp")['itemid'])

def get_unavailable_items(timestamp, items_avail):
    ''' Возвращает набор itemid, для которых установлен признак available==0 на заданный момент времени '''
    tmp = items_avail.query("timestamp <= @timestamp").drop_duplicates(subset=['itemid'], keep='first')
    return  set(tmp.query("value == '0'")['itemid'])

def get_available_items(timestamp, items_ctgr, items_avail):
    ''' Возвращает набор itemid, доступных на заданный момент времени '''
    return  get_registered_items(timestamp, items_ctgr) - get_unavailable_items(timestamp, items_avail)

def get_item_availability(timestamp, items_ctgr, items_avail):
    ''' Возвращает признак доступности товаров, актуальный на заданный момент времени '''
    lst = get_registered_items(timestamp, items_ctgr)
    tmp = items_avail.query("timestamp <= @timestamp and itemid in @lst") \
                     .drop_duplicates(subset=['itemid'], keep='first').reset_index(drop=True)
    return  tmp[['itemid','value']]

def get_item_properties(timestamp, items):
    ''' Возвращает свойства товаров/товара, актуальные на заданный момент времени '''
    tmp = items.query("timestamp <= @timestamp") \
               .drop_duplicates(subset=['itemid','property'], keep='first').reset_index(drop=True)
    return  tmp[['itemid','property','value_code']]

def calc_item_rating(df):
    ''' Формирует вектор рейтинга взаимодействий: наличие просмотров + добавления в корзину + покупки*2 '''
    return ((df[0] > 0) + df[1] + df[2]*2).astype(np.int16)

def user_item_matrix(events_set, users, items):
    ''' Формирует и возвращает матрицу взаимодействий в dense и sparse формате '''
    user_item = events_set.query("visitorid in @users  and  itemid in @items") \
                          .groupby(['visitorid','itemid'])['event'].value_counts().unstack(fill_value=0).reset_index()
    
    # формируем рейтинг взаимодействий
    user_item['rating'] = 0  if user_item.shape[0]==0  else calc_item_rating(user_item)
    return user_item, scipy.sparse.csr_matrix(
        (user_item['rating'], (user_item['visitorid'], user_item['itemid'])), shape=(users.max()+1,items.max()+1)
    )



In [14]:
items      = load_parquet_file(s3, INFER_FILES['item_prop'])
items_ctgr = load_parquet_file(s3, INFER_FILES['item_cat'])
items_avail= load_parquet_file(s3, INFER_FILES['available'])
items.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17980427 entries, 0 to 17980426
Data columns (total 4 columns):
 #   Column      Non-Null Count     Dtype
---  ------      --------------     -----
 0   timestamp   17980427 non-null  int64
 1   itemid      17980427 non-null  int64
 2   property    17980427 non-null  int64
 3   value_code  17980427 non-null  int64
dtypes: int64(4)
memory usage: 548.7 MB


In [15]:
inference_date = items['timestamp'].max()
pd.to_datetime(inference_date, unit='ms')

Timestamp('2015-09-13 00:00:00')

In [16]:
last_events = events.query("timestamp >= @inference_date").sort_values(by='timestamp',ascending=False) \
                             .groupby(['visitorid','itemid']).head(3) \
                             .groupby('visitorid').head(10)           \
                             .groupby('visitorid')['itemid'].agg(list)
last_events.info(show_counts=True)

<class 'pandas.core.series.Series'>
Index: 41332 entries, 48 to 1407549
Series name: itemid
Non-Null Count  Dtype 
--------------  ----- 
41332 non-null  object
dtypes: object(1)
memory usage: 645.8+ KB


In [17]:
last_events.head()

visitorid
48                                              [215123]
54     [38965, 249114, 442228, 442228, 319680, 388096...
83                                               [20095]
86                                      [150060, 150060]
115                                             [428921]
Name: itemid, dtype: object

In [27]:
last_events.to_frame().merge(max_rank, left_index=True, right_index=True).sort_values(by='maxrank').head()

Unnamed: 0_level_0,itemid,maxrank
visitorid,Unnamed: 1_level_1,Unnamed: 2_level_1
1227495,[174986],24
108978,[359577],24
131663,[226162],26
925292,"[111413, 138785]",27
181952,[8474],27


In [28]:
last_events.reset_index().query("visitorid not in @fr.index")

Unnamed: 0,visitorid,itemid
0,48,[215123]
1,54,"[38965, 249114, 442228, 442228, 319680, 388096..."
2,83,[20095]
3,86,"[150060, 150060]"
4,115,[428921]
...,...,...
41327,1407374,"[164443, 419156]"
41328,1407382,[436641]
41329,1407422,"[354153, 354153]"
41330,1407528,"[195083, 355595, 195083]"


In [23]:
last_events.at[717032] = list(['vasia'] + last_events.at[717032])
last_events.info(show_counts=True)

<class 'pandas.core.series.Series'>
Index: 41333 entries, 48 to 717032
Series name: itemid
Non-Null Count  Dtype 
--------------  ----- 
41333 non-null  object
dtypes: object(1)
memory usage: 1.6+ MB


In [24]:
last_events.at[717032]

['vasia', 9997, 77777777]

Для DAG-a

In [65]:
retrain = False

# ---------------------------------------
# Определяем опорные даты для расчетов
# ---------------------------------------
MS_PER_DAY     = 24*60*60*1000
max_event_time = events['timestamp'].max()
infer_time     = ((max_event_time-1) // MS_PER_DAY +1) * MS_PER_DAY
infer_date     = pd.to_datetime(infer_time, unit='ms')
#logging.info(f"Calculated inference date: {infer_date}")

# для ограничения размера матрицы взаимодействий, в режиме расчета рекомендаций возьмем только 
# пользователей, проявивших активность за последние недели (~60 тыс уникальных пользователей в неделю)
cut_off_time   = 0 if retrain  else infer_time - 7*CONFIG['EVENT_CUT_OFF_WEEKS']*MS_PER_DAY
#if not retrain:  logging.info(f"Calculated cut_off time: {pd.to_datetime(cut_off_time, unit='ms')}")

# в режиме переобучения моделей необходимо определить точку для разделения выборки на train/target
target_time    = infer_time if not retrain  else (
                    infer_time - 7*CONFIG['EVENT_TARGET_WEEKS']*MS_PER_DAY
)
# расчитываем время отсечения событий для определения АКТУАЛЬНЫХ топ-100 товаров
top_pop_time   = infer_time - 7*CONFIG['EVENT_POPULAR_WEEKS']*MS_PER_DAY

pd.to_datetime(infer_time, unit='ms'), pd.to_datetime(cut_off_time, unit='ms'), pd.to_datetime(target_time, unit='ms'), pd.to_datetime(top_pop_time, unit='ms')


(Timestamp('2015-09-18 00:00:00'),
 Timestamp('2015-08-21 00:00:00'),
 Timestamp('2015-09-18 00:00:00'),
 Timestamp('2015-06-26 00:00:00'))

In [66]:
# ---------------------------------------
# Определяем топ-100 популярных товаров
# ---------------------------------------
top_popular = events.query("timestamp >= @top_pop_time")                               \
                    .groupby(['itemid'])['event'].value_counts().unstack(fill_value=0) \
                    .sort_values(by=2,ascending=False).head(100)

# считаем и масштабируем рейтинг популярности
top_popular['rating']    = calc_item_rating(top_popular)
top_popular['pop_score'] = MinMaxScaler().fit_transform(top_popular['rating'].to_frame())

# сортируем по скорингу
top_popular = top_popular[['rating','pop_score']].sort_values(by='rating',ascending=False).reset_index()
top_popular.head(3)


event,itemid,rating,pop_score
0,461686,472,1.0
1,213834,200,0.404814
2,320130,174,0.347921


In [67]:
# пользователи, которым будем давать персональные рекомендации
if  not retrain:
    hot_users = events.query("timestamp >= @cut_off_time")['visitorid'].unique()
else:
    users1    = events.query("timestamp <  @target_time")['visitorid'].unique()
    users2    = events.query("timestamp >= @target_time")['visitorid'].unique()
    hot_users = events.query("visitorid in @users1 and visitorid in @users2")['visitorid'].unique()
    del users1, users2
hot_users.shape[0]

242080

In [68]:
# для рекомендаций (но не для обучения!) будем использовать только доступные товары
if  not retrain:
    av_items = np.array(list(get_available_items (infer_time, items_ctgr, items_avail)))
else:
    av_items = np.array(list(get_registered_items(target_time, items_ctgr)))
av_items.shape[0]

54010

In [69]:
als_users = events.query("timestamp < @target_time")['visitorid'].unique()
als_users.shape[0]

1076473

In [70]:
set(hot_users) - set(als_users)

set()

In [91]:
# строим матрицу взаимодействий:  hot_users x av_items
user_item, user_item_sparse = user_item_matrix(
    events.query("timestamp < @target_time"), als_users, av_items
)
user_item_sparse

<Compressed Sparse Row sparse matrix of dtype 'int16'
	with 643971 stored elements and shape (1407580, 466859)>

In [92]:
# валидационная матрица взаимодействий (в режиме расчета рекомендаций - ПУСТАЯ)
user_item_val, user_item_val_sparse = user_item_matrix(
    events.query("timestamp >= @target_time"), hot_users, av_items
)
user_item_val_sparse

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 0 stored elements and shape (1407572, 466859)>

In [86]:
av_items.max()

466858

In [None]:
MS_PER_DAY     = 24*60*60*1000
target_time =  inference_date - 7* 4 *MS_PER_DAY
users1 = events.query("timestamp < @target_time")['visitorid'].unique()
users2 = events.query("timestamp>= @target_time")['visitorid'].unique()
hot_users = events.query("visitorid in @users1 and visitorid in @users2")['visitorid'].unique()
hot_users.shape


(19964,)

In [52]:
WEEKS_TO_GET   = 4
MS_PER_DAY     = 24*60*60*1000
max_event_time = events['timestamp'].max()
infer_date     = pd.to_datetime(((max_event_time-1) // MS_PER_DAY +1) * MS_PER_DAY, unit='ms')
#cut_off_time   = ((max_event_time-1) // MS_PER_DAY +1 - WEEKS_TO_GET*0) * MS_PER_DAY
cut_off_time   = infer_date.value//1000000 - WEEKS_TO_GET*7* MS_PER_DAY

hot_users = events.query("timestamp >= @cut_off_time")['visitorid'].unique()
hot_users.shape

(242080,)

In [33]:
av_items = get_available_items(max_event_time-1, items_ctgr, items_avail)
len(av_items)

54010

In [54]:
infer_date

Timestamp('2015-09-18 00:00:00')

In [34]:
MONTHS_TO_GET  = 6
cut_off_full   = ((max_event_time-1) // MS_PER_DAY +1 - MONTHS_TO_GET*28) * MS_PER_DAY
user_item, user_item_matrix_full = user_item_matrix(events,
    events.query("timestamp >= @cut_off_full")['visitorid'].unique(), av_items
)
user_item_matrix_full

<Compressed Sparse Row sparse matrix of dtype 'int16'
	with 643971 stored elements and shape (1407569, 466859)>

In [56]:
user_item.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 643971 entries, 0 to 643970
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   visitorid  643971 non-null  int64
 1   itemid     643971 non-null  int64
 2   0          643971 non-null  int64
 3   1          643971 non-null  int64
 4   2          643971 non-null  int64
 5   rating     643971 non-null  int16
dtypes: int16(1), int64(5)
memory usage: 25.8 MB


In [57]:
hot_users = events.query("timestamp >= @cut_off_time and visitorid in @user_item['visitorid'].unique()")['visitorid'].unique()
hot_users.shape

(113697,)

In [58]:
user_item['visitorid'].nunique()

409564

In [59]:
user_item['itemid'].nunique()

39508

In [60]:
als_model = AlternatingLeastSquares(random_state=RANDOM_STATE, **als_params)
als_model.fit(user_item_matrix_full, show_progress=False)


In [61]:
# вычисляем коллаборативные рекомендации
RECS_PER_USER = 15
als_recommendations = als_model.recommend(
    hot_users, 
    user_item_matrix_full[hot_users], 
    filter_already_liked_items=True, N=RECS_PER_USER
)

personal_als  = pd.DataFrame({
    'itemid'   : als_recommendations[0].ravel(),
    'als_score': als_recommendations[1].ravel()
}, index=pd.MultiIndex.from_product([hot_users, range(RECS_PER_USER)], names=['visitorid', 'als_rank'])
).reset_index()

del als_recommendations
personal_als.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1705455 entries, 0 to 1705454
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   visitorid  int64  
 1   als_rank   int64  
 2   itemid     int32  
 3   als_score  float32
dtypes: float32(1), int32(1), int64(2)
memory usage: 39.0 MB


In [62]:
personal_als.nunique()

visitorid    113697
als_rank         15
itemid         9196
als_score    972419
dtype: int64

In [63]:
SIMS_PER_ITEM = 20
#sim_items = als_model.similar_items(user_item['itemid'].unique(), N=SIMS_PER_ITEM)
sim_items = als_model.similar_items(personal_als['itemid'].unique(), N=SIMS_PER_ITEM)

als_similar = pd.DataFrame({
    'sim_itemid': sim_items[0].ravel(),
    'sim_score' : sim_items[1].ravel()
}, index=pd.MultiIndex.from_product(
    #[user_item['itemid'].unique(), range(SIMS_PER_ITEM)], names=['itemid', 'sim_rank']
    [personal_als['itemid'].unique(), range(SIMS_PER_ITEM)], names=['itemid', 'sim_rank']
)).reset_index()
del sim_items

als_similar.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183920 entries, 0 to 183919
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   itemid      183920 non-null  int32  
 1   sim_rank    183920 non-null  int64  
 2   sim_itemid  183920 non-null  int32  
 3   sim_score   183920 non-null  float32
dtypes: float32(1), int32(2), int64(1)
memory usage: 3.5 MB


In [64]:
als_similar.nunique()

itemid          9196
sim_rank          20
sim_itemid     28655
sim_score     157615
dtype: int64

In [65]:
set(als_similar['itemid']) - set(als_similar['sim_itemid'])

set()

In [66]:
# объединяем информацию из personal_als, als_similar и top_popular в привязке к visitorid

candidades = personal_als[['visitorid','itemid']].merge(
    als_similar, how='left', on='itemid'
).groupby(['visitorid','sim_itemid']).agg(
    sim_score=('sim_score','max')
).reset_index().rename(
    columns={'sim_itemid': 'itemid'} 
).merge(
    personal_als[['visitorid','itemid','als_score']], how='outer', on=['visitorid','itemid']
)
candidades.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14418217 entries, 0 to 14418216
Data columns (total 4 columns):
 #   Column     Non-Null Count     Dtype  
---  ------     --------------     -----  
 0   visitorid  14418217 non-null  int64  
 1   itemid     14418217 non-null  int32  
 2   sim_score  14418217 non-null  float32
 3   als_score  1705455 non-null   float32
dtypes: float32(2), int32(1), int64(1)
memory usage: 275.0 MB


In [67]:
candidades.nunique()

visitorid    113697
itemid        28655
sim_score    126818
als_score    972419
dtype: int64

In [68]:
item_props = get_item_properties(max_event_time-1, items)
item_props.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11166726 entries, 0 to 11166725
Data columns (total 3 columns):
 #   Column      Non-Null Count     Dtype
---  ------      --------------     -----
 0   itemid      11166726 non-null  int64
 1   property    11166726 non-null  int64
 2   value_code  11166726 non-null  int64
dtypes: int64(3)
memory usage: 255.6 MB


In [69]:
item_props.nunique()

itemid        416916
property        1102
value_code    448345
dtype: int64

In [70]:
prop_vals = item_props.groupby('property').agg(nvalues=('value_code','max')).reset_index()
categorical_props = prop_vals.query("nvalues < 10")['property'].unique()
categorical_props.shape

(601,)

In [71]:
tmp1 = item_props.query("itemid in @candidades['itemid'].unique() and property in @categorical_props")
tmp1.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 192432 entries, 265 to 11166675
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype
---  ------      --------------   -----
 0   itemid      192432 non-null  int64
 1   property    192432 non-null  int64
 2   value_code  192432 non-null  int64
dtypes: int64(3)
memory usage: 5.9 MB


In [72]:
tmp1.nunique()

itemid        28655
property        470
value_code       10
dtype: int64

In [73]:
user_item_prop = candidades[['visitorid','itemid']].merge(
    tmp1, how='left', on='itemid'
)
user_item_prop.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 102343966 entries, 0 to 102343965
Data columns (total 4 columns):
 #   Column      Non-Null Count      Dtype
---  ------      --------------      -----
 0   visitorid   102343966 non-null  int64
 1   itemid      102343966 non-null  int32
 2   property    102343966 non-null  int64
 3   value_code  102343966 non-null  int64
dtypes: int32(1), int64(3)
memory usage: 2.7 GB


In [74]:
user_item_prop.nunique()

visitorid     113697
itemid         28655
property         470
value_code        10
dtype: int64

In [75]:
user_prop_score = user_item_prop.groupby(['visitorid','property']).agg(
    prop_score=('itemid','nunique')
).reset_index()
user_prop_score.nunique()

visitorid     113697
property         470
prop_score       300
dtype: int64

In [76]:
user_prop_score .info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5421590 entries, 0 to 5421589
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype
---  ------      --------------    -----
 0   visitorid   5421590 non-null  int64
 1   property    5421590 non-null  int64
 2   prop_score  5421590 non-null  int64
dtypes: int64(3)
memory usage: 124.1 MB


In [77]:
user_item_prop_score = user_item_prop.merge(
    user_prop_score, how='left', on=['visitorid','property']
).groupby(['visitorid','itemid']).agg(
    prop_score=('prop_score','mean')
).fillna(0).reset_index()
user_item_prop_score.info(show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14418217 entries, 0 to 14418216
Data columns (total 3 columns):
 #   Column      Non-Null Count     Dtype  
---  ------      --------------     -----  
 0   visitorid   14418217 non-null  int64  
 1   itemid      14418217 non-null  int32  
 2   prop_score  14418217 non-null  float64
dtypes: float64(1), int32(1), int64(1)
memory usage: 275.0 MB


In [78]:
user_item_prop_score.nunique()

visitorid     113697
itemid         28655
prop_score     19079
dtype: int64

In [79]:
def normalize_col_by_col(df: pd.DataFrame, col: str, by_col: str):
    from sklearn.preprocessing import normalize
    tmp      = df.sort_values(by=by_col)
    tmp[col] = tmp.groupby(by_col)[col].apply(
        lambda x: normalize(x.values.reshape(-1,1), norm='l1', axis=0, copy=True, return_norm=True)[0].ravel()
    ).explode(col).values.astype('float32')
    df[col]  = tmp[col]
normalize_col_by_col(user_item_prop_score, 'prop_score', 'visitorid')


In [81]:
# считаем hit_score
user_item_hit_score = events.query("visitorid in @hot_users").groupby(
    ['visitorid','itemid']
).agg(
    hit_score=('event','nunique')
).reset_index().merge(
    candidades[['visitorid','itemid']], how='right', on=['visitorid','itemid']
).fillna(0)

user_item_hit_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14418217 entries, 0 to 14418216
Data columns (total 3 columns):
 #   Column     Dtype  
---  ------     -----  
 0   visitorid  int64  
 1   itemid     int64  
 2   hit_score  float64
dtypes: float64(1), int64(2)
memory usage: 330.0 MB


In [82]:
# добавляем признаки пользователя - "стаж" и активность
user_features = events.query("visitorid in @hot_users").groupby("visitorid").agg(
    stage  =('timestamp', lambda x: (infer_date - pd.to_datetime(x.min(),unit='ms')).days +1),
    nclicks=('timestamp', 'count'),
    nbuys  =('transactionid', 'count')
).reset_index()
user_features["click_per_day"] = user_features["nclicks"] / user_features["stage"]
user_features["buy_per_click"] = user_features["nbuys"]   / user_features["nclicks"]
user_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113697 entries, 0 to 113696
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   visitorid      113697 non-null  int64  
 1   stage          113697 non-null  int64  
 2   nclicks        113697 non-null  int64  
 3   nbuys          113697 non-null  int64  
 4   click_per_day  113697 non-null  float64
 5   buy_per_click  113697 non-null  float64
dtypes: float64(2), int64(4)
memory usage: 5.2 MB


In [83]:
# вносим сформированные дополнительные признаки в candidades_for_train
candidades = candidades.merge(
    user_item_prop_score, how='left', on=['visitorid','itemid']
).merge(
    user_item_hit_score,  how='left', on=['visitorid','itemid']
).merge(
    user_features[['visitorid','stage','click_per_day','buy_per_click']], how='left', on=['visitorid']
)
candidades.info(show_counts=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14418217 entries, 0 to 14418216
Data columns (total 9 columns):
 #   Column         Non-Null Count     Dtype  
---  ------         --------------     -----  
 0   visitorid      14418217 non-null  int64  
 1   itemid         14418217 non-null  int32  
 2   sim_score      14418217 non-null  float32
 3   als_score      1705455 non-null   float32
 4   prop_score     14418217 non-null  float32
 5   hit_score      14418217 non-null  float64
 6   stage          14418217 non-null  int64  
 7   click_per_day  14418217 non-null  float64
 8   buy_per_click  14418217 non-null  float64
dtypes: float32(3), float64(3), int32(1), int64(2)
memory usage: 770.0 MB


In [84]:
candidades.nunique()

visitorid        113697
itemid            28655
sim_score        126818
als_score        972419
prop_score       735236
hit_score             4
stage               131
click_per_day      2198
buy_per_click       234
dtype: int64

In [3]:
import subprocess
my_ip = subprocess.run(['curl', 'ifconfig.co/'], capture_output=True, text=True).stdout.strip()
my_ip

'89.169.168.158'

In [1]:
import requests
def post_request(url, params):
    headers = {'Content-type': 'application/json', 'Accept': 'text/plain'}

    resp = requests.post(url, headers=headers, params=params)

    if resp.status_code == 200:
        recs = resp.json()
    else:
        recs = {"result": "ERROR", "code": f"{resp.status_code}"}
        print(f"Error status code: {resp.status_code} received from {url} ({params})")
    return recs    


In [4]:
recs = post_request(f"http://{my_ip}:8090/reload", "{}")
recs

{'result': 'OK'}

In [1]:
import psutil
psutil.virtual_memory()

svmem(total=33651257344, available=31932575744, percent=5.1, used=1274212352, free=31504547840, active=283656192, inactive=1534652416, buffers=101191680, cached=771305472, shared=1187840, slab=169693184)