In [4]:
import pandas as pd
from gensim.models import Word2Vec, KeyedVectors
from tqdm.notebook import tqdm
from collections import defaultdict

In [17]:
# Log read from cart_log.csv; later cells use its session_id, spend_time,
# kind_1, value_1 and n_items columns.
log_df2 = pd.read_csv('../output/cart_log.csv')
# Product master; later cells use its JAN, category_id and category_name columns.
product_master_df = pd.read_csv('../datasets/product_master.csv')

In [19]:
# A session is kept only when its largest spend_time is above this threshold
# (units are not visible in this notebook -- presumably seconds; TODO confirm).
MIN_MAX_SPEND_TIME = 599

# Largest spend_time observed per session, as a flat two-column frame.
max_time_df = log_df2.groupby('session_id', as_index=False)['spend_time'].max()
max_time_df = max_time_df.loc[max_time_df['spend_time'] > MIN_MAX_SPEND_TIME]

# Persist the selected sessions (without the index column) for reuse elsewhere.
max_time_df.to_csv('../output/train_target_session_id.csv', index=False)

target_sessions = list(max_time_df['session_id'].unique())
print(len(target_sessions))

# Restrict the log to the selected sessions.
in_target_session = log_df2['session_id'].isin(target_sessions)
log_df2 = log_df2.loc[in_target_session]

405085


In [24]:
def get_item_record(df, cols=None):
    """Return the product rows of an event log that carry a positive item count.

    A row is kept when ``kind_1`` equals '商品' (Japanese for "product") and
    ``n_items`` is greater than zero.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log containing at least ``kind_1``, ``n_items`` and every
        column named in ``cols``.
    cols : list of str, optional
        Columns to keep. Defaults to ``['session_id', 'value_1', 'n_items']``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the selected rows and columns, keeping the
        original index labels. Because it is a real copy, callers can add or
        overwrite columns on it without a SettingWithCopyWarning and without
        touching ``df``.
    """
    # A None sentinel avoids sharing one mutable default list across calls.
    if cols is None:
        cols = ['session_id', 'value_1', 'n_items']

    is_item_row = (df['kind_1'] == '商品') & (df['n_items'] > 0)
    # Select rows and columns in one .loc call instead of chained indexing.
    return df.loc[is_item_row, cols].copy()

# Product events (default columns: session_id, value_1, n_items) from the
# filtered log.
item_event_df = get_item_record(log_df2)
item_event_df.head()

Unnamed: 0,session_id,value_1,n_items
1140,105,4901670110210,1
1141,105,4522646718089,1
1142,105,4901422153502,1
1144,105,4562224315774,1
1146,105,4974824122551,1


In [63]:
def join_category_id(df, product_master_df):
    """Attach category information to item events by joining on the product code.

    ``df['value_1']`` is cast to int64 and left-joined against
    ``product_master_df['JAN']``. The join keys (``value_1`` and ``JAN``) are
    dropped from the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Item events with a ``value_1`` column convertible to int64.
        The frame passed in is NOT modified.
    product_master_df : pandas.DataFrame
        Product master with ``JAN``, ``category_id`` and ``category_name``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (more if ``JAN`` is duplicated in the master),
        with ``category_id`` / ``category_name`` appended. Rows without a
        match in the master get NaN in both columns.
    """
    # Cast on a derived frame so the caller's DataFrame keeps its own dtype.
    events = df.assign(value_1=df['value_1'].astype('int64'))
    category_lookup = product_master_df[['JAN', 'category_id', 'category_name']]
    joined = pd.merge(events, category_lookup, left_on='value_1', right_on='JAN', how='left')
    return joined.drop(['value_1', 'JAN'], axis=1)

category_event_df = join_category_id(item_event_df, product_master_df)

# Events whose product is missing from the master come back with a NaN
# category_id; map those to the sentinel -1, then convert int -> str so each
# id becomes a plain token such as "360" (not "360.0").
# Assign the result back to the column explicitly: calling
# fillna(..., inplace=True) on `frame.column` is chained assignment, which is
# deprecated and has no effect under pandas Copy-on-Write.
category_event_df['category_id'] = (
    category_event_df['category_id']
    .fillna(-1)
    .astype('int')
    .astype('str')
)
category_event_df.head()

Unnamed: 0,session_id,n_items,category_id,category_name
0,105,1,360,台所消耗__ラップ・ホイル
1,105,1,360,台所消耗__ラップ・ホイル
2,105,1,360,台所消耗__ラップ・ホイル
3,105,1,360,台所消耗__ラップ・ホイル
4,105,1,360,台所消耗__ラップ・ホイル


In [64]:
# Build one "sentence" per session: the list of category-id tokens of that
# session's item events, in row order. groupby yields the sessions in sorted
# session_id order.
sentences = [
    list(session_rows['category_id'])
    for _, session_rows in category_event_df.groupby('session_id')
]

In [67]:
# Word2Vec hyperparameters, gathered in one place so they are easy to review.
w2v_params = dict(
    sg=1,         # skip-gram rather than CBOW
    size=64,      # embedding dimension (gensim < 4.0 keyword; 4.x renamed it to vector_size)
    window=10,    # context window
    min_count=1,  # keep every token, however rare
    sample=0,     # no down-sampling of frequent tokens
)
model = Word2Vec(sentences, **w2v_params)

# Only the vectors are saved, in binary word2vec format (despite the ".pt" suffix).
model.wv.save_word2vec_format("../output/vec.pt", binary=True)


In [68]:
# Drop the training corpus name now that the model has been trained and saved,
# so the list can be garbage-collected.
del sentences

In [109]:
# Reload the saved vectors so this cell does not depend on `model` being alive.
wv = KeyedVectors.load_word2vec_format('../output/vec.pt', binary=True)
print('most_similar')

target_category = [38, 110, 113, 114, 134, 171, 172, 173, 376, 435, 467, 537, 539, 629, 768]
# The vocabulary tokens are category ids stored as strings.
target_category_str = list(map(str, target_category))

# Only these two master columns are needed to describe a neighbouring category.
category_lookup = product_master_df[['category_name', 'category_id']]
dfs = []

for category_id in target_category_str:
    # most_similar returns (token, similarity) pairs; only the token is kept.
    similar_ids = [int(token) for token, _similarity in wv.most_similar(category_id)]
    neighbours = category_lookup[category_lookup['category_id'].isin(similar_ids)].drop_duplicates()
    # Tag each neighbour with the target category it was found for.
    dfs.append(neighbours.assign(similar_category_id='similar_' + category_id))

# Long format: one row per (neighbour category, target category) pair.
df = pd.concat(dfs)
df.head()

most_similar


Unnamed: 0,category_name,category_id,similar_category_id
2282,アイスクリーム__プレミアム,39,similar_38
2793,アイスクリーム__マルチ,40,similar_38
5811,冷凍食品__お弁当用,311,similar_38
5816,冷凍食品__おかず,310,similar_38
15161,冷凍食品__冷凍野菜,314,similar_38


In [112]:
# Names of neighbouring categories that are not themselves target categories.
is_target = df['category_id'].isin(target_category)
list(df.loc[~is_target, 'category_name'].unique())

['アイスクリーム__プレミアム',
 'アイスクリーム__マルチ',
 '冷凍食品__お弁当用',
 '冷凍食品__おかず',
 '冷凍食品__冷凍野菜',
 '冷凍食品__氷',
 '冷凍食品__麺類',
 '冷凍食品__スナック',
 '冷凍食品__米飯',
 'ホールセールパン__食パン(ホールセールパン)',
 '野菜__香辛・つま物類',
 '生魚__魚惣菜',
 '和菓子__半生・焼菓子',
 'チョコ・ビスクラ__ビスクラ',
 'スナック・キャンディー__グミ',
 'スナック・キャンディー__キャンディ',
 '豚肉__生姜焼き',
 '塩干__乾物',
 'デザート__ヨーグルト',
 '日配飲料__牛乳',
 '玉子__玉子',
 '嗜好品__ジャム・蜂蜜',
 '乳製品__バター･マーガリン',
 '和菓子__ゼリー',
 '日配飲料__豆乳',
 '和日配__納豆',
 'ホールセールパン__食卓パン(ホールセールパン)',
 '漬物__漬物',
 'チョコ・ビスクラ__子供',
 '嗜好品__コーヒー・紅茶',
 '嗜好品__茶',
 '加工肉__チルドソーセージ',
 'ハードリカー__スピリッツ',
 'ハードリカー__ワイン',
 '和酒__焼酎',
 'ハードリカー__洋酒',
 '加工__おつまみ(豚)',
 '加工__おつまみ(鶏)',
 '医薬品__ドリンク剤',
 '和日配__御節',
 '健康飲料__エナジー・機能性（健康）',
 '健康飲料__野菜・健康果汁（大型）',
 '健康飲料__コンク・希釈',
 '缶飲料__コーヒー（ボトル缶）',
 '米__精米(単一銘柄米)',
 '鶏肉__副産物(鶏肉)',
 '珍味__珍味',
 '和日配__豆腐',
 '珍味__豆菓子',
 '珍味__小袋',
 '小型PET__無糖茶（小型PET）',
 '水・炭酸水__大型PET（水）',
 '大型PET__紅茶（大型PET）',
 '大型PET__炭酸（大型PET）',
 '米__無洗米(単一銘柄)',
 '小型PET__紅茶（小型PET）',
 '大型PET__果汁（大型PET）',
 '大型PET__スポーツ・機能性（大型PET）',
 '大型PET__乳性（大型PET）',
 '大型PET__コーヒー（大型PET）',

In [107]:
# Long -> wide indicator matrix: one row per neighbour category_id, one column
# per "similar_<target>" label; 1 where the category is a neighbour of that
# target, 0 elsewhere. category_name is not carried over by the pivot.
# NOTE: this rebinds `df`, so the cell that builds the long-format frame must
# be re-run before running this cell a second time.
df = (
    df.assign(value=1)
    .pivot(index="category_id", columns="similar_category_id", values="value")
    .reset_index()
    .fillna(0)
)
df.head()

similar_category_id,category_id,similar_110,similar_113,similar_114,similar_134,similar_171,similar_172,similar_173,similar_376,similar_38,similar_435,similar_467,similar_537,similar_539,similar_629,similar_768
0,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,40,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2,111,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,112,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,114,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [108]:
# Write the indicator matrix without the row index.
df.to_csv('../output/similar_categories.csv', index=False)