In [1]:
import numpy as np
import pandas as pd
import copy
import datetime as dt

In [2]:
# from google.colab import drive
# drive.mount('/content/gdrive')

## Reading and preprocessing data

In [3]:
# Load the August action log and compact its columns:
#   * clientid / itemid -> uint32
#   * action_type       -> bool ('view' -> False, 'to_cart' -> True)
#   * timestamp         -> proleptic Gregorian ordinal of the calendar date, uint32
# NOTE(review): any action_type value outside the mapping becomes NaN, and NaN
# casts to True under astype('bool') — confirm the source only holds these two values.
user_actions = (
    pd.read_parquet('../data/user_actions_august.parquet')
    .astype({'clientid': 'uint32', 'itemid': 'uint32'})
    .assign(
        action_type=lambda frame: frame['action_type'].map({'view': 0, 'to_cart': 1}).astype('bool'),
        timestamp=lambda frame: frame['timestamp'].dt.date.apply(dt.datetime.toordinal).astype('uint32'),
    )
)

user_actions.head()

In [5]:
# Load the test and train target tables from CSV.
test_target = pd.read_csv("../data/target_test.csv")
train_target = pd.read_csv("../data/target_train.csv")

In [6]:
# Downcast the id columns and the label of the test target in one pass,
# then print the resulting schema / memory usage.
test_target = test_target.astype({
    'clientid': 'uint32',
    'itemid': 'uint32',
    'jointitemid': 'uint32',
    'label': 'int8',
})
test_target.info()

In [7]:
# Downcast the id columns and the label of the train target in one pass,
# then print the resulting schema / memory usage.
train_target = train_target.astype({
    'clientid': 'uint32',
    'itemid': 'uint32',
    'jointitemid': 'uint32',
    'label': 'int8',
})
train_target.info()

## Generating simple features

In [8]:
# Popularity
def popularity(data_action, data, action):
    """Attach the total number of events of one type for each ``jointitemid``.

    Parameters
    ----------
    data_action : DataFrame with an ``itemid`` column and a boolean
        ``action_type`` column (True rows are counted for ``'to_cart'``,
        False rows for anything else).
    data : DataFrame holding a ``jointitemid`` column.
    action : ``'to_cart'`` selects the True rows; every other value selects
        the False rows. It also names the output column ``f'{action}_cnt'``.

    Returns
    -------
    ``data`` left-merged with the per-item counts. No ``on`` is given, so the
    merge keys are all column names shared by both frames. Every NaN in the
    merged frame (not only the new column) is replaced by 0.
    """
    is_cart = data_action['action_type']
    selected = is_cart if action == 'to_cart' else ~is_cart
    per_item = data_action.loc[selected, 'itemid'].value_counts()
    counts = per_item.rename_axis('jointitemid').reset_index(name=f'{action}_cnt')
    return data.merge(counts, how="left").fillna(0)

# CTR
def ctr(data, data_action):
    """Attach a ``ctr`` column: to_cart count divided by view count per item.

    Note the argument order: the target frame comes first here.

    Parameters
    ----------
    data : DataFrame holding a ``jointitemid`` column.
    data_action : DataFrame with ``itemid`` and ``action_type`` columns;
        rows equal to False are views, rows equal to True are cart adds.

    Returns
    -------
    ``data`` left-merged with the per-item ratio (merge keys are the columns
    shared by both frames). Only items with at least one view get a ratio;
    viewed items without cart adds, and items absent from the view counts,
    end up with 0 because every NaN in the merged frame is filled with 0.
    """
    events = data_action[['itemid', 'action_type']]
    views = events.loc[events['action_type'] == False, 'itemid'].value_counts()
    carts = events.loc[events['action_type'] == True, 'itemid'].value_counts()

    # Keep exactly the viewed items; carts missing for an item become NaN
    ratio = carts.reindex(views.index) / views
    per_item = ratio.rename_axis('jointitemid').reset_index(name='ctr')
    return data.merge(per_item, how='left').fillna(0)

# Date of the first view
def date_first_view(data_action, data):
    """Attach ``novelty_cnt``: the smallest view ``timestamp`` of each ``jointitemid``.

    Parameters
    ----------
    data_action : DataFrame with ``itemid``, ``timestamp`` and a boolean
        ``action_type`` column; only rows where ``action_type`` is False are used.
    data : DataFrame holding a ``jointitemid`` column.

    Returns
    -------
    ``data`` left-merged with the per-item minimum timestamp (merge keys are
    the columns shared by both frames). Every NaN in the merged frame is
    replaced by 0, so items without views get ``novelty_cnt == 0``.
    """
    views = data_action.loc[~data_action['action_type']]
    earliest = views.groupby('itemid', sort=False)['timestamp'].min()
    first_seen = earliest.rename_axis('jointitemid').reset_index(name='novelty_cnt')
    return data.merge(first_seen, how="left").fillna(0)

# Average number of cart adds / views per day
def day_avg_popularity(data_action, data, action):
    """Attach the mean daily event count of one type for each ``jointitemid``.

    The mean is taken over the days on which the item has at least one event
    of the selected type (days without events are not part of the grouping).

    Parameters
    ----------
    data_action : DataFrame with ``itemid``, ``timestamp`` and a boolean
        ``action_type`` column.
    data : DataFrame holding a ``jointitemid`` column.
    action : ``'to_cart'`` selects the True rows; every other value selects
        the False rows. It also names the output column
        ``f'{action}_day_avg_cnt'``.

    Returns
    -------
    ``data`` left-merged with the per-item average (merge keys are the columns
    shared by both frames); every NaN in the merged frame is replaced by 0.
    """
    is_cart = data_action['action_type']
    selected = data_action[is_cart if action == 'to_cart' else ~is_cart]

    # One row per (item, day) with the number of events on that day
    per_day = (
        selected.groupby(['itemid', 'timestamp'])['itemid']
        .count()
        .rename_axis(['jointitemid', 'timestamp'])
        .reset_index(name='count_day')
    )

    # Collapse to one row per item holding the mean of its daily counts
    per_item = (
        per_day.groupby('jointitemid', as_index=False)['count_day']
        .mean()
        .rename(columns={'count_day': f'{action}_day_avg_cnt'})
    )
    return data.merge(per_item, how="left").fillna(0)

# Number of views on the last day
def last_day_views(data_action, data):
    """Attach ``last_day_views_cnt``: views of each ``jointitemid`` on the last view day.

    The "last day" is the largest ``timestamp`` among view rows
    (``action_type`` is False). The previous implementation selected the rows
    where ``action_type`` is True, i.e. it counted cart adds, not views.

    Parameters
    ----------
    data_action : DataFrame with ``itemid``, ``timestamp`` and a boolean
        ``action_type`` column.
    data : DataFrame holding a ``jointitemid`` column.

    Returns
    -------
    ``data`` left-merged with the per-item counts (merge keys are the columns
    shared by both frames); every NaN in the merged frame is replaced by 0.
    """
    # Views are the rows where action_type is False
    views = data_action[~data_action['action_type']]

    # Series.max() instead of built-in max(): it does not raise when there are no view rows
    last_day = views['timestamp'].max()

    per_item = views[views['timestamp'] == last_day].groupby('itemid').size()
    counts = per_item.rename_axis('jointitemid').reset_index(name='last_day_views_cnt')
    return data.merge(counts, how='left').fillna(0)

# Number of cart adds on the last day
def cart_add_last_day(data_action, data):
    """Attach ``last_day_to_cart_cnt``: cart adds of each ``jointitemid`` on the last cart day.

    The "last day" is the largest ``timestamp`` among rows whose
    ``action_type`` is True.

    Parameters
    ----------
    data_action : DataFrame with ``itemid``, ``timestamp`` and a boolean
        ``action_type`` column.
    data : DataFrame holding a ``jointitemid`` column.

    Returns
    -------
    ``data`` left-merged with the per-item counts (merge keys are the columns
    shared by both frames); every NaN in the merged frame is replaced by 0.
    """
    cart_rows = data_action[data_action['action_type']]
    latest_day = max(cart_rows['timestamp'])
    on_latest_day = cart_rows.loc[cart_rows['timestamp'] == latest_day]
    per_item = on_latest_day.groupby('itemid').size()
    counts = per_item.rename_axis('jointitemid').reset_index(name='last_day_to_cart_cnt')
    return data.merge(counts, how="left").fillna(0)

def _last_day_to_daily_avg_ratio(data_action, data, to_cart, column):
    """Attach ``column`` = (events on the last day) / (mean events per active day).

    Private helper shared by the view and to_cart ratio features.

    Parameters
    ----------
    data_action : DataFrame with ``itemid``, ``timestamp`` and a boolean
        ``action_type`` column.
    data : DataFrame holding a ``jointitemid`` column.
    to_cart : True to use the rows where ``action_type`` is True, False to use
        the rows where it is False.
    column : name of the output column.

    Returns
    -------
    ``data`` left-merged on ``jointitemid`` with one ratio per item, so the
    row count of ``data`` is preserved. Every NaN in the merged frame is
    replaced by 0, which also covers items without events on the last day.
    """
    is_cart = data_action['action_type']
    events = data_action.loc[is_cart if to_cart else ~is_cart, ['itemid', 'timestamp']]

    # Events per (item, day), then the per-item mean over days that have events
    daily = events.groupby(['itemid', 'timestamp']).size()
    daily_avg = daily.groupby(level='itemid').mean()

    # The "last day" is the latest timestamp among events of this type
    last_day = events['timestamp'].max()
    last_cnt = events.loc[events['timestamp'] == last_day].groupby('itemid').size()

    ratio = last_cnt / daily_avg.reindex(last_cnt.index)
    per_item = ratio.rename_axis('jointitemid').reset_index(name=column)

    # Drop a stale copy of the column so calling the feature twice is harmless
    base = data.drop(columns=[column], errors='ignore')
    return base.merge(per_item, on='jointitemid', how='left').fillna(0)


# Ratio of the number of views on the last day to the average number of views per day
def relation_last_mean_view(data_action, data):
    """Attach ``relation_ldv_dap``: last-day views / average daily views per ``jointitemid``.

    This function used to be named ``relation_last_mean`` and was shadowed by
    the to_cart definition below, so it could never be called. It also keyed
    the ratio by the target's ``itemid`` instead of ``jointitemid``.
    """
    return _last_day_to_daily_avg_ratio(data_action, data, False, 'relation_ldv_dap')


# Ratio of the number of cart adds on the last day to the average number of cart adds per day
def relation_last_mean(data_action, data):
    """Attach ``lastDay/dayAvg_cnt``: last-day cart adds / average daily cart adds per ``jointitemid``.

    The ratio is keyed by ``jointitemid``; the previous version labelled the
    rows with the target's ``itemid``, which attached ratios to the wrong
    items and could duplicate rows in the merge.
    """
    return _last_day_to_daily_avg_ratio(data_action, data, True, 'lastDay/dayAvg_cnt')


def generate_popularity_features(user_actions, target):
    """Run every popularity feature builder in sequence on ``target``.

    Parameters
    ----------
    user_actions : action log passed as ``data_action`` to each builder.
    target : DataFrame holding a ``jointitemid`` column.

    Returns
    -------
    ``target`` extended with one column per feature.
    """
    res = popularity(user_actions, target, 'view')
    res = popularity(user_actions, res, 'to_cart')
    # NOTE(review): the CTR feature was disabled in the original pipeline and is
    # left disabled here; the matching call would be ctr(res, user_actions).
    res = date_first_view(user_actions, res)
    res = day_avg_popularity(user_actions, res, 'to_cart')
    res = day_avg_popularity(user_actions, res, 'view')
    res = last_day_views(user_actions, res)
    res = cart_add_last_day(user_actions, res)
    # Previously the same (to_cart) function was called twice
    res = relation_last_mean_view(user_actions, res)
    res = relation_last_mean(user_actions, res)
    return res

In [9]:
# Preview the CTR feature computed for the training target.
ctr(train_target, user_actions).head()

In [10]:
%%time
# Build the popularity features for the training target and preview them.
train = generate_popularity_features(user_actions, train_target)
train.head()

In [11]:
%%time
# Build the popularity features for the test target and preview them.
test = generate_popularity_features(user_actions, test_target)
test.head()

### Calculation of the least-squares slope (angular coefficient) of daily counts

In [12]:
# Slope of the straight line fitted through the points (day index, daily count),
# where x is the day index and y is the number of events on that day.
# The line is fitted by ordinary least squares. This function computes the
# coefficient for a single item.
def get_coef(df, action):
    """Return the least-squares slope of ``count_<action>`` against ``day``.

    Parameters
    ----------
    df : DataFrame with a ``day`` column and a ``'count_' + action`` column.
    action : suffix selecting the count column.

    Returns
    -------
    The slope ``(n*Sxy - Sx*Sy) / (n*Sxx - Sx**2)`` with ``n = len(df)``, or 0
    when the denominator is 0 (for example a single point or identical days).
    The number of points was previously hard-coded to 7, which is only
    correct for frames with exactly seven rows.
    """
    x = df.day.values
    y = df['count_' + action].values
    n = len(x)
    denominator = n * (x ** 2).sum() - x.sum() ** 2
    if denominator == 0:
        return 0
    return (n * (x * y).sum() - x.sum() * y.sum()) / denominator

In [13]:
def count_view_to_cart_7day(user_actions, action):
    """Build a dense (item, day) table of event counts for the last 7 days.

    The window covers the 7 timestamps ending at ``user_actions['timestamp'].max()``.
    ``day`` runs from 0 (first day of the window) to 6 (the latest day).

    The day index is anchored to the start of the window. It was previously
    anchored to the earliest timestamp that happened to have a matching event,
    which shifted all counts towards day 0 whenever the first window days had
    no events of the requested type.

    Parameters
    ----------
    user_actions : DataFrame with ``clientid``, ``itemid``, ``timestamp`` and
        ``action_type`` columns.
    action : ``'view'`` counts rows where ``action_type`` equals False; any
        other value counts rows where it equals True. It also names the output
        column ``'count_' + action``.

    Returns
    -------
    DataFrame with columns ``itemid``, ``day`` and ``'count_' + action``: seven
    rows for every distinct ``itemid`` in ``user_actions``, with 0 for days
    without matching events.
    """
    window = 7  # number of days in the look-back window (day indices 0..6)

    # Work on an int64 copy so the subtractions cannot wrap for unsigned timestamps
    timestamps = user_actions['timestamp'].astype('int64')
    first_day = timestamps.max() - (window - 1)

    wanted_type = action != 'view'
    condition = (timestamps >= first_day) & (user_actions['action_type'] == wanted_type)

    # Two helper tables: all_items - every item, days - the day index
    # (assign() returns a new frame, so no column is set on a slice)
    all_items = user_actions[['itemid']].drop_duplicates().assign(key=1)
    days = pd.DataFrame({'day': list(range(window)), 'key': 1})

    # Cross join through the constant key: every item gets the full list of days
    item_day_df = all_items.merge(days, on='key').drop(columns=['key'])

    cnts_df = (
        user_actions[condition]
        .groupby(['itemid', 'timestamp'])['clientid']
        .agg(['count'])
        .reset_index()
    )

    # Day index relative to the start of the window, not to the first observed event
    cnts_df['day'] = cnts_df['timestamp'].astype('int64') - first_day

    item_day_df = (
        item_day_df
        .merge(cnts_df.drop(columns=['timestamp']), on=['itemid', 'day'], how='left')
        .fillna(0)
    )
    item_day_df = item_day_df.rename(columns={'count': 'count_' + action})
    return item_day_df

In [14]:
%%time
# Per-item daily view counts over the last 7 days.
item_day_view = count_view_to_cart_7day(user_actions, 'view')

In [15]:
%%time
# Per-item daily to_cart counts over the last 7 days.
item_day_to_cart = count_view_to_cart_7day(user_actions, 'to_cart')

In [16]:
def mnk_coef(item_day, action):
    """Compute the least-squares slope of the daily counts for every item.

    Parameters
    ----------
    item_day : DataFrame with an ``itemid`` column plus the columns that
        ``get_coef`` reads for the given ``action``.
    action : passed through to ``get_coef``; also names the output column
        ``'k_' + action``.

    Returns
    -------
    DataFrame with one row per ``itemid`` group and the columns ``itemid``
    and ``'k_' + action``.
    """
    slope_by_item = {
        item: get_coef(rows, action)
        for item, rows in item_day.groupby('itemid')
    }
    return pd.DataFrame({
        'itemid': list(slope_by_item.keys()),
        'k_' + action: list(slope_by_item.values()),
    })

In [17]:
%%time
# Slopes of the daily to_cart counts. The table built earlier is named
# item_day_to_cart; the previous name item_day_cart is never defined.
k_to_cart = mnk_coef(item_day_to_cart, 'to_cart')

In [18]:
%%time
# Slopes of the daily view counts.
k_view = mnk_coef(item_day_view , 'view')

In [19]:
# Preview the per-item view slopes.
k_view.head()

In [20]:
# Rebind train_target / test_target to the "*_with_scores" CSV files.
# NOTE(review): these paths start with 'data/', while every other read and
# write in this notebook uses '../data/'. Files with the same names are also
# written by the last cell of this notebook. Confirm where these inputs come
# from, and whether the path is intended, before relying on Restart & Run All.
train_target = pd.read_csv('data/train_with_scores.csv')
test_target = pd.read_csv('data/test_with_scores.csv')

In [21]:
# Attach the view and to_cart slopes by itemid; rows without a slope get 0.
# The to_cart slope table is named k_to_cart; the previous name k_tocart is never defined.
train = pd.merge(train_target, k_view, on='itemid', how='left').drop_duplicates()
train = pd.merge(train, k_to_cart, on='itemid', how='left').drop_duplicates()
train = train.fillna(0)

In [22]:
# Attach the view and to_cart slopes by itemid; rows without a slope get 0.
# The to_cart slope table is named k_to_cart; the previous name k_tocart is never defined.
test = pd.merge(test_target, k_view, on='itemid', how='left').drop_duplicates()
test = pd.merge(test, k_to_cart, on='itemid', how='left').drop_duplicates()
test = test.fillna(0)

## Write to the file

In [23]:
# Save the final test and train tables as CSV, without the index column.
test.to_csv("../data/test_with_scores.csv", index=False)
train.to_csv("../data/train_with_scores.csv", index=False)