# 필요 모듈 import & 데이터 zip 파일 압축 풀기

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import zipfile # zip 파일 풀기

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


import os
# Extract every zip archive found under the Kaggle input directory into the
# current working directory, and remember each archive's base name (the file
# name with the '.zip' suffix removed) in `files`.
# NOTE(review): later cells pick tables by position in `files`; os.walk does
# not promise a particular ordering — confirm the order on the target machine.
files = []
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        archive_path = os.path.join(dirname, filename)
        # Skip anything that is not a zip archive instead of crashing on it.
        if not zipfile.is_zipfile(archive_path):
            continue
        files.append(filename.split('.zip')[0])
        with zipfile.ZipFile(archive_path, "r") as z:
            z.extractall()

# 데이터 불러오기

In [None]:
# Display the archive base names collected while extracting the zip files.
files

In [None]:
# Load the extracted CSV tables; each table is picked by its position in
# `files`. Compact integer dtypes are requested for the order tables.
order_products_dtypes = {
    'order_id': np.int32,
    'product_id': np.int32,
    'add_to_cart_order': np.int16,
    'reordered': np.int16,
}
orders_dtypes = {
    'order_id': np.int32,
    'user_id': np.int32,
    'order_number': np.int16,
    'order_dow': np.int16,
    'order_hour_of_day': np.int16,
}

departments = pd.read_csv('./' + files[0])
order_products_train = pd.read_csv('./' + files[2], dtype=order_products_dtypes)
order_products_prior = pd.read_csv('./' + files[3], dtype=order_products_dtypes)
orders = pd.read_csv('./' + files[4], dtype=orders_dtypes)
products = pd.read_csv('./' + files[5])
aisles = pd.read_csv('./' + files[6])

# 쿼리데이터 만들기
- products_detail : products, aisles, departments 합친 것
- order_prior/train : order_products_prior/train 에 user_id 추가한 것
- train/test_users : 학습/테스트 에 사용되는 유저 목록
- order_prior_train : train 유저의 과거 주문들
- order_prior_train_all : order_prior_train + order_train
- order_prior_detail : order_prior + orders **by order_id**

In [None]:
# Product catalogue enriched with its aisle and department columns.
products_detail = (
    products
    .merge(aisles, on='aisle_id')
    .merge(departments, on='department_id')
)

# Attach the ordering user's id to every prior / train order line.
order_user = orders[['order_id', 'user_id']]
order_prior = order_products_prior.merge(order_user, on='order_id')
order_train = order_products_train.merge(order_user, on='order_id')

# User ids of the orders whose eval_set is 'train' / 'test' (left unsorted).
train_users = orders.loc[orders['eval_set'] == 'train', 'user_id'].values
test_users = orders.loc[orders['eval_set'] == 'test', 'user_id'].values

# Prior order lines that belong to train users only.
order_prior_train = order_prior[order_prior['user_id'].isin(train_users)]

In [None]:
## Goal: compute per-product averages over all past orders.
# Join every prior order line with the order-level columns of its order;
# 'user_id' and 'eval_set' are dropped from `orders` before the join.
order_level_columns = orders.drop(columns=['user_id', 'eval_set'])
order_prior_detail = order_prior.merge(order_level_columns, on='order_id')
order_prior_detail

In [None]:
# Sum of the 'reordered' flags per product over all prior order lines.
# The column is selected before aggregating so only 'reordered' is summed
# instead of every column of the large frame.
product_reorder = order_prior_detail.groupby('product_id')[['reordered']].sum()
# Compare the number of products seen in prior orders with the catalogue size.
len(product_reorder), len(products_detail)

In [None]:
# Products that never appear in any prior order.
ordered_product = product_reorder.index
all_product = products_detail.product_id
never_ordered_mask = ~all_product.isin(ordered_product)
not_ordered_product = all_product[never_ordered_mask].tolist()
not_ordered_product_detail = products_detail[never_ordered_mask]

# Were any of those products bought in the train (most recent) orders?
# Author's observation: yes.
display(order_train[order_train['product_id'].isin(not_ordered_product)])

In [None]:
# Per-product mean cart position, day of week and hour of day.
# The three columns are selected before aggregating so the mean is not
# computed for every column of the large frame and then thrown away.
product_mean = (
    order_prior_detail
    .groupby('product_id')[['add_to_cart_order', 'order_dow', 'order_hour_of_day']]
    .mean()
    .reset_index()
)
product_mean.head()

In [None]:
# How often are items reordered? Restricted to order lines whose order has a
# non-null days_since_prior_order, then expressed as a share of those lines.
has_prior_gap = order_prior_detail['days_since_prior_order'].notna()
a = order_prior_detail[has_prior_gap].groupby('reordered')['product_id'].count()
a / a.sum()

# 관점 Ⅰ 기존에 샀던 물건들 중에서 재구매를 할까? 
- 기존에 샀던 물건들에 대한 특성이 필요함
    - 재구매율
    - 과거 구매 물품 중 구매 횟수
    - 물건 구매 주기
    - 물건 구매 평균 시간
- 기존에 샀던 물건들 끼리의 유사도는?
    - 유사도
    - 유사한 것 끼리 중복해서 사는 정도가 있었나?
        - 만약, 초코쿠키를 샀었는데 다음엔 초코칩쿠키를 샀다던지

### 함수화 하기 이전에, user_id==1 인 경우를 먼저 파악해보자

In [None]:
# Prior order lines of user 1, joined with the order-level columns and sorted
# by order_number, then by add_to_cart_order.
order_columns = ['order_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
user1_orders = (
    order_prior_train[order_prior_train['user_id'] == 1]
    .merge(orders[order_columns], on='order_id')
    .sort_values(by=['order_number', 'add_to_cart_order'])
)

user1_orders.head()

In [None]:
# Distinct products user 1 bought in prior orders.
user1_prior_products = user1_orders.product_id.unique()
# Number of distinct prior orders placed by user 1.
user1_orders_count = user1_orders.order_id.nunique()

# Sum of the 'reordered' flags per product, with the product name attached.
# Only 'reordered' is aggregated instead of summing every column.
user1_by_product = user1_orders.groupby('product_id')
user1_products = user1_by_product[['reordered']].sum().reset_index()
user1_products = pd.merge(user1_products, products_detail[['product_id', 'product_name']], on='product_id')

# Reorder ratio: reorders divided by (number of prior orders - 1).
user1_products['reordered_ratio'] = user1_products['reordered'] / (user1_orders_count - 1)
# How many times the product was bought: reorders plus the first purchase.
user1_products['order_count'] = user1_products['reordered'] + 1
# Share of this product among all order lines of the user.
user1_products['order_ratio'] = user1_products['order_count'] / len(user1_orders)

# Mean days_since_prior_order (positive values only) over the lines of each
# product; left as NaN for products that appear on a single line.
user1_product_ids = user1_products['product_id']
user1_line_counts = user1_product_ids.map(user1_by_product.size())
user1_positive_gaps = user1_orders[user1_orders['days_since_prior_order'] > 0]
user1_mean_gap = user1_positive_gaps.groupby('product_id')['days_since_prior_order'].mean()
user1_products['product_buy_term'] = user1_product_ids.map(user1_mean_gap).where(user1_line_counts > 1)
# Mean hour of day at which the product was bought.
user1_products['product_buy_time'] = user1_product_ids.map(user1_by_product['order_hour_of_day'].mean())

user1_products.head()

In [None]:
# Products user 1 bought in the final (train) order.
user1_buy_final = order_train[order_train['user_id']==1].product_id.values

# Text keys used for name similarity: the lower-cased product name joined
# with the aisle and/or department name. The vectorised `.str.lower()`
# replaces a per-row lambda and leaves missing names as NaN instead of raising.
products_detail['product_name_lower'] = products_detail['product_name'].str.lower()
lower_name = products_detail['product_name_lower']
products_detail['name+aisle'] = lower_name + ' ' + products_detail['aisle']
products_detail['name+department'] = lower_name + ' ' + products_detail['department']
products_detail['name+department+aisle'] = lower_name + ' ' + products_detail['department'] + ' ' + products_detail['aisle']

# Names of the products user 1 bought in the past
# user1_prior_products_name = products_detail[products_detail['product_id'].isin(user1_prior_products)]['name+department']

In [None]:
# Pairwise name similarity among the products user 1 bought before.
# The text key ('name+department') is selected here because no live statement
# in the notebook defines `user1_prior_products_name` before this point.
user1_prior_detail = products_detail[products_detail['product_id'].isin(user1_prior_products)]
user1_prior_products_name = user1_prior_detail['name+department']

# min_df=1 keeps every term that occurs in at least one document, which is
# every term of the fitted vocabulary.
# NOTE(review): recent scikit-learn releases reject an integer min_df of 0 —
# confirm against the installed version.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
product_mat = count_vect.fit_transform(user1_prior_products_name)

# Square cosine-similarity matrix labelled by product name on both axes.
product_sim = cosine_similarity(product_mat, product_mat)
product_similarity = pd.DataFrame(product_sim, index=user1_prior_detail['product_name'],
                                  columns=user1_prior_detail['product_name'])

In [None]:
# For every product, the most similar other product bought by user 1 and its
# similarity score. After the descending sort, position 0 is expected to be
# the product itself, so position 1 is taken.
# `.iloc[1]` makes the positional lookup explicit; a plain `[1]` on a
# label-indexed Series relies on a positional fallback.
# NOTE(review): pandas deprecates that fallback — confirm for the installed version.
user1_ranked = {
    name: product_similarity[name].sort_values(ascending=False)
    for name in user1_products['product_name']
}
user1_products['best_similar'] = user1_products['product_name'].map(lambda x: user1_ranked[x].index[1])
user1_products['best_similarity'] = user1_products['product_name'].map(lambda x: user1_ranked[x].iloc[1])
user1_products

- 상품 구매의 흐름 데이터도 넣어서 좀 더 확인해볼 가치가 있음

# 관점 Ⅱ 기존에 샀던 물건들 바탕으로 새로운 물건을 산다면 어떤 물건을 살 것인가?
- 재구매 가능성이 높은 물건과 유사한 물건을 사지 않았을까?
    - product name의 유사도 파악

In [None]:
# 새로운 물건이 있나요? 
user1_buy_new = []
for prod in user1_buy_final:
    if prod not in user1_products.product_id.values:
        print(prod)
        user1_buy_new.append(prod)

In [None]:
# 결과적으로 유저 1이 산 목록들 전부 취합
user1_buy_all = user1_prior_products.tolist() + user1_buy_new

In [None]:
# Pairwise name similarity among every product user 1 bought (prior + new).
# The catalogue is filtered once and reused for the text key and both axes.
user1_all_detail = products_detail[products_detail['product_id'].isin(user1_buy_all)]

# min_df=1 keeps every term that occurs in at least one document, which is
# every term of the fitted vocabulary.
# NOTE(review): recent scikit-learn releases reject an integer min_df of 0 —
# confirm against the installed version.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
product_mat = count_vect.fit_transform(user1_all_detail['name+department'])

# Square cosine-similarity matrix labelled by product name on both axes.
product_sim = cosine_similarity(product_mat, product_mat)
product_similarity = pd.DataFrame(product_sim, index=user1_all_detail['product_name'],
                                  columns=user1_all_detail['product_name'])

In [None]:
# Names of the products that are new in user 1's final order.
user1_buy_new_name = products_detail[products_detail['product_id'].isin(user1_buy_new)]['product_name'].values


In [None]:
# Number of products whose similarity to the first newly bought product is
# above 0.5, minus one for the product itself.
new_product_names = products_detail[products_detail['product_id'].isin(user1_buy_new)]['product_name'].values
above_threshold = product_similarity[new_product_names].values > 0.5
above_threshold.sum(axis=0)[0] - 1  # author's observation: two products score above 0.5

# 관점들을 전부 함수화

In [None]:
#관점 1
def make_user_df(user_id):
    """Build a per-product feature table from one train user's prior orders.

    Reads the notebook-level frames ``order_prior_train``, ``orders`` and
    ``products_detail`` (the latter must already carry 'name+department').

    Args:
        user_id: id of the user whose prior order lines are summarised.

    Returns:
        DataFrame with one row per product the user bought before and the
        columns product_id, reordered, product_name, reordered_ratio,
        order_count, order_ratio, product_buy_term, product_buy_time,
        best1_similar and best1_similarity.

    Raises:
        Whatever ``CountVectorizer.fit_transform`` raises when the user has
        no prior products to vectorise.
    """
    user_orders = order_prior_train[order_prior_train['user_id'] == user_id]
    user_orders = pd.merge(
        user_orders,
        orders[['order_id', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']],
        on='order_id',
    )
    user_orders = user_orders.sort_values(by=['order_number', 'add_to_cart_order'])

    # Distinct products (by id) and number of distinct prior orders.
    user_prior_products = user_orders.product_id.unique()
    user_orders_count = user_orders.order_id.nunique()

    # Sum of the 'reordered' flags per product; only that column is aggregated.
    per_product = user_orders.groupby('product_id')
    user_products = per_product[['reordered']].sum().reset_index()
    user_products = pd.merge(user_products, products_detail[['product_id', 'product_name']], on='product_id')

    # Reorder ratio: reorders / (orders - 1). With a single order there is no
    # opportunity to reorder, so the ratio is undefined rather than a
    # division by zero.
    if user_orders_count > 1:
        user_products['reordered_ratio'] = user_products['reordered'] / (user_orders_count - 1)
    else:
        user_products['reordered_ratio'] = np.nan
    # Times bought: reorders plus the first purchase.
    user_products['order_count'] = user_products['reordered'] + 1
    # Share of this product among all order lines of the user.
    user_products['order_ratio'] = user_products['order_count'] / len(user_orders)

    # Mean positive days_since_prior_order over the product's order lines;
    # NaN for products that appear on a single line. Grouped aggregates
    # replace one full scan of user_orders per product.
    product_ids = user_products['product_id']
    line_counts = product_ids.map(per_product.size())
    positive_gaps = user_orders[user_orders['days_since_prior_order'] > 0]
    mean_gap = positive_gaps.groupby('product_id')['days_since_prior_order'].mean()
    user_products['product_buy_term'] = product_ids.map(mean_gap).where(line_counts > 1)
    # Mean hour of day at which the product was bought.
    user_products['product_buy_time'] = product_ids.map(per_product['order_hour_of_day'].mean())

    # Lower-cased name + department of every previously bought product.
    prior_detail = products_detail[products_detail['product_id'].isin(user_prior_products)]
    user_prior_products_name = prior_detail['name+department']

    # Cosine similarity between the uni/bi-gram count vectors of those texts.
    # min_df=1 keeps every term of the fitted vocabulary.
    # NOTE(review): recent scikit-learn releases reject an integer min_df of 0
    # — confirm against the installed version.
    count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
    product_mat = count_vect.fit_transform(user_prior_products_name)
    product_sim = cosine_similarity(product_mat, product_mat)
    product_similarity = pd.DataFrame(product_sim, index=prior_detail['product_name'],
                                      columns=prior_detail['product_name'])

    # After the descending sort, position 0 is expected to be the product
    # itself, so position 1 is its closest neighbour. With a single product
    # there is no neighbour and the product itself is reported.
    position = 1 if len(user_prior_products_name) > 1 else 0
    ranked = {
        name: product_similarity[name].sort_values(ascending=False)
        for name in user_products['product_name']
    }
    # `.index[...]` / `.iloc[...]` keep both lookups explicitly positional.
    user_products['best1_similar'] = user_products['product_name'].map(lambda x: ranked[x].index[position])
    user_products['best1_similarity'] = user_products['product_name'].map(lambda x: ranked[x].iloc[position])

    return user_products

In [None]:
# 관점2
def why_new_product(user_id):
    """Relate a user's newly bought products to everything the user bought.

    Reads the notebook-level frames ``order_prior``, ``order_train`` and
    ``products_detail`` (the latter must already carry 'name+department').

    Args:
        user_id: id of the user to inspect.

    Returns:
        A tuple ``(similarity, user_buy_new)``. ``user_buy_new`` lists the
        product ids of the train order that are absent from the prior orders.
        ``similarity`` holds the name-similarity columns of those products
        against all products the user bought (prior + new).
    """
    prior_products = order_prior[order_prior['user_id'] == user_id].product_id.unique()
    final_products = order_train[order_train['user_id'] == user_id].product_id.values

    # Hash-based membership instead of scanning the array for every product.
    prior_set = set(prior_products.tolist())
    user_buy_new = [prod for prod in final_products if prod not in prior_set]

    user_buy_all = prior_products.tolist() + user_buy_new
    all_detail = products_detail[products_detail['product_id'].isin(user_buy_all)]

    # min_df=1 keeps every term of the fitted vocabulary.
    # NOTE(review): recent scikit-learn releases reject an integer min_df of 0
    # — confirm against the installed version.
    count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
    product_mat = count_vect.fit_transform(all_detail['name+department'])

    product_sim = cosine_similarity(product_mat, product_mat)
    product_similarity = pd.DataFrame(product_sim, index=all_detail['product_name'],
                                      columns=all_detail['product_name'])

    new_names = products_detail[products_detail['product_id'].isin(user_buy_new)]['product_name']
    return product_similarity[new_names], user_buy_new

In [None]:
why_new_product(1)[1]

## '재구매율' 은 어느정도로 영향을 줄까?

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [None]:
# Train-order products and prior-order feature table for user 10.
# The `user1_` prefix is kept because later cells read `user1_df` and
# `user1_buy_new` by these names.
user1_buy_new = order_train[order_train['user_id']==10].product_id.values
user1_df = make_user_df(10)

# The closing parenthesis was missing here, which made the cell a SyntaxError.
print(user1_buy_new)
user1_df.product_id.values

In [None]:
# Label: 1 when the product is part of the user's train order, else 0.
user1_df['final_reordered'] = user1_df['product_id'].isin(user1_buy_new).astype(int)
x = user1_df[['reordered_ratio']]
y = user1_df['final_reordered']

# Fit on the single 'reordered_ratio' feature and score on the same rows
# (in-sample evaluation).
model = RandomForestClassifier()
model.fit(x,y)

# Predict once and reuse. The metrics take (y_true, y_pred); passing them the
# other way round swaps the reported recall and precision.
predicted = model.predict(x)
print(f'Accuracy : {accuracy_score(y, predicted)}')
print(f'F1-score : {f1_score(y, predicted)}')
print(f'Recall : {recall_score(y, predicted)}')
print(f'Precision : {precision_score(y, predicted)}')

In [None]:
# Show the in-sample predictions next to the true labels.
model.predict(x), y.values

In [None]:
def reordered_ratio_effect(user_id, random_state=None):
    """Measure how well 'reordered_ratio' alone explains one user's final order.

    Builds the user's product table with ``make_user_df``, labels each product
    by whether it occurs in the user's train order, fits a random forest on
    the single 'reordered_ratio' feature and scores it on the same rows
    (in-sample). The four metrics are printed.

    Args:
        user_id: id of the user to evaluate.
        random_state: optional seed forwarded to ``RandomForestClassifier``
            for reproducible fits; ``None`` keeps the unseeded behaviour.

    Returns:
        Tuple ``(accuracy, f1)`` of the in-sample predictions.
    """
    user_buy_new = order_train[order_train['user_id'] == user_id].product_id.values
    user_df = make_user_df(user_id)

    # Label: 1 when the product is part of the train order, else 0.
    user_df['final_reordered'] = user_df['product_id'].isin(user_buy_new).astype(int)
    x = user_df[['reordered_ratio']]
    y = user_df['final_reordered']

    model = RandomForestClassifier(random_state=random_state)
    model.fit(x, y)

    # Predict once and reuse. The metrics take (y_true, y_pred); passing them
    # the other way round swaps the reported recall and precision.
    predicted = model.predict(x)
    acc = accuracy_score(y, predicted)
    f1 = f1_score(y, predicted)

    print(f'Accuracy : {acc}')
    print(f'F1-score : {f1}')
    print(f'Recall : {recall_score(y, predicted)}')
    print(f'Precision : {precision_score(y, predicted)}')

    return acc, f1

In [None]:
import warnings
# Install a global 'ignore' filter so later warnings are not displayed.
# NOTE(review): this also hides warnings from pandas and scikit-learn.
warnings.filterwarnings('ignore')

In [None]:
# Number of prior order lines (non-null product_id values) per user.
order_prior.groupby('user_id').count()[['product_id']]

In [None]:
# Build a per-user summary table to help pick a sample of users:
#   - number of orders per user
#   - average number of purchased items per order

# Prior order lines per train user.
tmp = order_prior.groupby('user_id')['product_id'].count().reset_index()
tmp = tmp[tmp['user_id'].isin(train_users)]

# Rows of `orders` per train user.
train_user = orders.groupby('user_id')['order_id'].count().reset_index()
train_user = train_user[train_user['user_id'].isin(train_users)].reset_index(drop=True)

train_user = train_user.merge(tmp, on='user_id')
train_user.columns = ['user_id','주문 횟수', '구매 물건 수']
train_user['평균 구매 물건 수'] = train_user['구매 물건 수'] / train_user['주문 횟수']
train_user.head()

In [None]:
# Spot-check the evaluation on a few hand-picked users.
test = []
for user in [259, 358, 513, 540]:
    try:
        acc, f1 = reordered_ratio_effect(user)
    except Exception as err:
        # Report the cause instead of hiding it, then stop at the first failure.
        print(f'user_{user} 에서 에러 발생: {err!r}')
        break
    # Collect into `test`, the list created for this check; the previous
    # target `acc_f1` is not defined before this cell runs.
    test.append([acc, f1])
print('finished')

In [None]:
# Evaluate the 'reordered_ratio' feature on the first 500 train users.
acc_f1 = []
sample_users = train_users[:500]
for i, user in enumerate(sample_users):
    try:
        acc, f1 = reordered_ratio_effect(user)
    except Exception as err:
        # `except Exception` lets Ctrl-C interrupt the long loop; the cause is
        # reported and the loop stops at the first failing user.
        print(f'user_{user} 에서 에러 발생: {err!r}')
        break
    acc_f1.append([acc, f1])
    if (i+1) % 100 == 0:
        # Progress relative to the actual sample size.
        print(round((i+1)/len(sample_users)*100, 2),'% 진행')
print('finished')

In [None]:
# Accuracy / F1 of the 'reordered_ratio' feature for the evaluated users.
user500 = pd.DataFrame(acc_f1, columns = ['ACC', 'F1'])
# The evaluation loop stops at the first failing user, so the results cover
# the first len(acc_f1) sampled users; slicing to that length keeps ids and
# scores aligned instead of raising a length mismatch.
user500['user_id'] = train_users[:len(acc_f1)]
user500 = user500[['user_id', 'ACC','F1']]
user500.describe()

In [None]:
# Attach the per-user order statistics to the ACC / F1 results.
# NOTE(review): this rebinds `train_user`, so re-running the cell merges the
# already-merged table again — confirm that is intended.
train_user = pd.merge(user500, train_user, on='user_id')

In [None]:
# Users with accuracy 1 but F1-score 0.
k = train_user[(train_user['F1']==0) & (train_user['ACC']==1)].user_id.values
for i in k:
    # Products bought in the final (train) order.
    new_buy = order_train[order_train['user_id']==i].product_id.values

    # Prior purchase history of this user. It must be read from the table
    # just built for user `i`, not from the earlier single-user `user1_df`.
    user_df = make_user_df(i)
    prior_buy = user_df.product_id

    # Number of previously bought products that reappear in the final order.
    print(prior_buy.isin(new_buy).sum())

In [None]:
# Show the users ordered from highest to lowest F1.
train_user.sort_values(by='F1',ascending=False)