In [1]:
import numpy as np
import pandas as pd
import os
import random
random.seed(1)

from tqdm import tqdm_notebook as tqdm

import lightgbm as lgb

In [2]:
input_path = '../datasets/'

In [3]:
# Load the competition input tables from `input_path`.
# The raw cart-log read below is disabled; a cart log is read from
# '../output/cart_log.csv' in a later cell instead.
# log_df = pd.read_csv(os.path.join(input_path, 'carlog.csv'))
meta_df = pd.read_csv(os.path.join(input_path, 'meta.csv'))  # its `session_id` column defines the full session list
test_df = pd.read_csv(os.path.join(input_path, 'test.csv'))  # its `session_id` column marks the test sessions
product_master_df = pd.read_csv(os.path.join(input_path, 'product_master.csv'))  # supplies the JAN -> 部門CD mapping
# The two tables below are not referenced in the visible part of this notebook.
user_master_df = pd.read_csv(os.path.join(input_path, 'user_master.csv'))
sample_submission_df = pd.read_csv(os.path.join(input_path, 'atmaCup#9__sample_submission.csv'))

# 正解データの作成

In [4]:
# Cart log read from the output directory rather than from `input_path`.
# NOTE(review): presumably produced by an earlier preprocessing notebook that
# added the `is_payment` and `predict` columns -- confirm.
log_df2 = pd.read_csv('../output/cart_log.csv')
log_df2.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,session_id,register_number,date,hour,spend_time,display_action_id,user_id,value_1,name_1,kind_1,number_1,value_2,name_2,kind_2,number_2,unit_price,n_items,is_payment,predict
0,0,2003,2019-02-14,4,0,136,bE94Gct4vGiKM7z2Y79cjk,,,,,,,,,,0,0,0
1,0,2003,2019-02-14,4,32,209,bE94Gct4vGiKM7z2Y79cjk,4522646719765.0,おかかおにぎり　５９円,商品,1.0,,,,,59.0,1,0,0
2,0,2003,2019-02-14,4,37,209,bE94Gct4vGiKM7z2Y79cjk,4522646695663.0,梅（種なし）おにぎり５９,商品,1.0,,,,,59.0,1,0,0
3,0,2003,2019-02-14,4,52,209,bE94Gct4vGiKM7z2Y79cjk,4522646716641.0,紅鮭おにぎり７９円,商品,1.0,,,,,79.0,1,0,0
4,0,2003,2019-02-14,4,59,209,bE94Gct4vGiKM7z2Y79cjk,4522646695670.0,明太子おにぎり　７９円,商品,1.0,,,,,79.0,1,0,0


In [5]:
# Mode switch used in two places in this notebook:
#   * get_target_category_record: when True, only rows with predict == 0 are kept.
#   * the final save cell: selects the file name for the training targets.
is_predict = False


## 購入イベントのあったセッションのみに絞る

In [6]:
def get_paid_event(df):
    """Keep only the rows belonging to sessions that contain a payment row.

    A session counts as "paid" when at least one of its rows has
    ``is_payment == 1``. Every row of such a session is returned, not only
    the payment row. The total and paid session counts are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Log with at least ``session_id`` and ``is_payment`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` whose ``session_id`` is a paid session; the original
        index labels are preserved.
    """
    session_ids = df['session_id']
    print('all sessions: ', session_ids.unique().size)

    has_payment = df['is_payment'] == 1
    paid_session_ids = list(session_ids[has_payment].unique())
    print('num sessions paid: ', len(paid_session_ids))

    in_paid_session = session_ids.isin(paid_session_ids)
    return df[in_paid_session]

paid_event_df = get_paid_event(log_df2)

all sessions:  720175
num sessions paid:  618462


## 商品レコードのみに絞る

In [7]:
def get_item_record(df, cols=['session_id', 'value_1', 'n_items', 'predict']):
    """Select the product rows of the log and keep only the requested columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Log with a ``kind_1`` column plus every column named in ``cols``.
    cols : list of str
        Columns to keep. The default list is only read, never modified.

    Returns
    -------
    pandas.DataFrame
        Rows where ``kind_1 == '商品'``, restricted to ``cols``.
    """
    is_item_row = df['kind_1'] == '商品'
    item_rows = df[is_item_row]
    return item_rows[cols]

paid_item_event_df = get_item_record(paid_event_df)
paid_item_event_df.head()

Unnamed: 0,session_id,value_1,n_items,predict
1,0,4522646719765,1,0
2,0,4522646695663,1,0
3,0,4522646716641,1,0
4,0,4522646695670,1,0
5,0,4522646720440,1,0


## 商品マスタと紐づけてcategory_idの取得

In [8]:
def join_category_id(df, product_master_df):
    """Attach the department code (``部門CD``) to each item row via its JAN code.

    ``value_1`` is cast to ``int64`` and left-joined against the ``JAN``
    column of the product master. Rows without a match keep a missing
    ``部門CD``. The join keys ``value_1`` and ``JAN`` are dropped from the
    result.

    The input frame is not modified: the cast is applied to a new frame, so
    passing a filtered slice neither changes upstream data nor triggers a
    SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame
        Item rows with a ``value_1`` column castable to ``int64``.
    product_master_df : pandas.DataFrame
        Product master with ``JAN`` and ``部門CD`` columns.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``value_1``, plus a ``部門CD`` column.

    Raises
    ------
    ValueError
        Raised by pandas if ``value_1`` holds missing or non-finite values
        that cannot be cast to ``int64``.
    """
    # assign() returns a new frame and keeps the column position of value_1.
    items = df.assign(value_1=df['value_1'].astype('int64'))
    merged = pd.merge(
        items,
        product_master_df[['JAN', '部門CD']],
        left_on='value_1',
        right_on='JAN',
        how='left',
    )
    return merged.drop(['value_1', 'JAN'], axis=1)

category_event_df = join_category_id(paid_item_event_df, product_master_df)
category_event_df.head()

Unnamed: 0,session_id,n_items,predict,部門CD
0,0,1,0,49.0
1,0,1,0,49.0
2,0,1,0,49.0
3,0,1,0,49.0
4,0,1,0,49.0


## 予測対象の行のみに絞る

In [9]:
def get_target_category_record(df, is_predict=True):
    """Optionally restrict to rows with ``predict == 0``, then drop ``predict``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``predict`` column.
    is_predict : bool, default True
        When true, keep only the rows whose ``predict`` value is 0 (i.e. only
        the information that is visible); when false, keep all rows.

    Returns
    -------
    pandas.DataFrame
        The selected rows without the ``predict`` column.
    """
    rows = df[df['predict'] == 0] if is_predict else df
    return rows.drop(columns=['predict'])

target_row_event_df = get_target_category_record(category_event_df, is_predict=is_predict)
target_row_event_df.head()

Unnamed: 0,session_id,n_items,部門CD
0,0,1,49.0
1,0,1,49.0
2,0,1,49.0
3,0,1,49.0
4,0,1,49.0


## 購入したか集計

In [10]:
def calc_is_payment(df):
    """Collapse item rows into a 0/1 purchase flag per (session, department).

    ``n_items`` is summed within each ``(session_id, 部門CD)`` pair. A positive
    total becomes 1, a negative total becomes 0, and a zero total stays 0.
    Note that pandas' groupby drops rows whose grouping key is missing by
    default, so rows with a missing ``部門CD`` do not appear in the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``session_id``, ``部門CD`` and ``n_items`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (session_id, 部門CD) with ``n_items`` holding the flag.
    """
    group_keys = ['session_id', '部門CD']
    totals = df.groupby(group_keys)[['n_items']].sum().reset_index()
    # sign() maps totals to -1 / 0 / +1; clipping at 0 then turns -1 into 0.
    totals['n_items'] = np.sign(totals['n_items']).clip(lower=0)
    return totals
target_item_sum_df = calc_is_payment(target_row_event_df)
target_item_sum_df.head()

Unnamed: 0,session_id,部門CD,n_items
0,0,49.0,1
1,0,185.0,0
2,1,9.0,1
3,1,47.0,1
4,1,87.0,1


## Pivot

In [11]:
def pivot_df(df):
    """Reshape long (session, department, flag) rows into one row per session.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``session_id``, ``部門CD`` and ``n_items`` columns, holding
        at most one row per (session_id, 部門CD) pair.

    Returns
    -------
    pandas.DataFrame
        A ``session_id`` column followed by one column per ``部門CD`` value;
        combinations absent from ``df`` are missing (NaN).
    """
    wide = df.pivot(index="session_id", columns="部門CD", values="n_items")
    return wide.reset_index()

train_payment_Y = pivot_df(target_item_sum_df)
train_payment_Y.head()

部門CD,session_id,1.0,2.0,3.0,4.0,5.0,7.0,9.0,10.0,13.0,...,225.0,226.0,227.0,228.0,229.0,230.0,231.0,232.0,233.0,234.0
0,0,,,,,,,,,,...,,,,,,,,,,
1,1,,,,,,,1.0,,,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,1.0,,,,,...,,,,,,1.0,,,,
4,4,,,,,,,,,,...,,,,,,,,,,


## Join meta.csv

In [12]:
def join_meta_df(df, meta_df):
    """Expand the target table to every session listed in ``meta_df``.

    The ``session_id`` column of ``meta_df`` is left-joined with ``df``, so
    sessions absent from ``df`` still get a row. All missing values in the
    joined frame are then replaced with 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-session table with a ``session_id`` column.
    meta_df : pandas.DataFrame
        Table whose ``session_id`` column lists the sessions to keep.

    Returns
    -------
    pandas.DataFrame
        One row per ``meta_df`` row, in ``meta_df`` order, with no NaN left.
    """
    all_sessions = meta_df[['session_id']]
    return all_sessions.merge(df, on='session_id', how='left').fillna(0)

all_sessions_Y = join_meta_df(train_payment_Y, meta_df)
all_sessions_Y.head()

Unnamed: 0,session_id,1.0,2.0,3.0,4.0,5.0,7.0,9.0,10.0,13.0,...,225.0,226.0,227.0,228.0,229.0,230.0,231.0,232.0,233.0,234.0
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## カラム名を文字列型に

In [13]:
def col_name_to_str(df):
    """Rename numeric column labels to integer strings (e.g. ``49.0`` -> ``'49'``).

    The ``session_id`` label is kept unchanged; every other label is passed
    through ``int`` and then ``str``. The columns of ``df`` are relabelled in
    place and the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose labels, other than ``session_id``, are convertible by
        ``int``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with string column labels.
    """
    df.columns = [
        label if label == 'session_id' else str(int(label))
        for label in df.columns
    ]
    return df

all_sessions_Y = col_name_to_str(all_sessions_Y)
all_sessions_Y.head()

Unnamed: 0,session_id,1,2,3,4,5,7,9,10,13,...,225,226,227,228,229,230,231,232,233,234
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
all_sessions_Y[all_sessions_Y.session_id==7]

Unnamed: 0,session_id,1,2,3,4,5,7,9,10,13,...,225,226,227,228,229,230,231,232,233,234
7,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Train Test Split

In [15]:
def train_test_split(df, test_df):
    """Split the target table by membership in the test session list.

    Parameters
    ----------
    df : pandas.DataFrame
        Per-session table with a ``session_id`` column.
    test_df : pandas.DataFrame
        Table whose ``session_id`` column lists the test sessions.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_Y, test_Y)``: rows of ``df`` whose ``session_id`` is not /
        is in ``test_df``. Original index labels are preserved.
    """
    # test_Y is not actually needed downstream, but is returned anyway.
    in_test = df['session_id'].isin(test_df['session_id'].tolist())
    test_Y = df[in_test]
    train_Y = df[~in_test]
    # The two parts must cover df exactly.
    assert len(df) == len(test_Y) + len(train_Y)
    return train_Y, test_Y

train_Y, test_Y = train_test_split(all_sessions_Y, test_df)
display(train_Y.head())
display(test_Y.head())

Unnamed: 0,session_id,1,2,3,4,5,7,9,10,13,...,225,226,227,228,229,230,231,232,233,234
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0,session_id,1,2,3,4,5,7,9,10,13,...,225,226,227,228,229,230,231,232,233,234
663721,663721,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
663725,663725,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
663737,663737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
663745,663745,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
663747,663747,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Persist the targets. Only the training file name depends on the mode;
# the test targets always go to the same path.
train_target_path = (
    '../output/all_given_train_depart_target_Y.csv'
    if is_predict
    else '../output/all_train_depart_target_Y.csv'
)
train_Y.to_csv(train_target_path, index=None)
test_Y.to_csv('../output/test_depart_target_Y.csv', index=None)