# Basket Size 예측

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read every input table from ../data in a single pass. The generator is
# consumed left to right, so the files are read in the same order as the
# names on the left-hand side.
products, aisles, departments, orders, prior, train = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/products.csv",
        "../data/aisles.csv",
        "../data/departments.csv",
        "../data/orders.csv",
        "../data/order_products__prior.csv",
        "../data/order_products__train.csv",
    )
)

In [3]:
# Attach the order-level columns from `orders` to every row of `prior`
# (inner join on the shared order_id key).
priorXorders = pd.merge(prior, orders, on='order_id')
# Preview the first ten joined rows.
priorXorders.head(10)

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0
5,2,17794,6,1,202279,prior,3,5,9,8.0
6,2,40141,7,1,202279,prior,3,5,9,8.0
7,2,1819,8,1,202279,prior,3,5,9,8.0
8,2,43668,9,0,202279,prior,3,5,9,8.0
9,3,33754,1,1,205970,prior,16,5,17,12.0


## prior의 basket size

In [4]:
# Basket size of an order = number of rows `prior` holds for that order_id.
# The result is a one-column frame indexed by order_id.
prior_order_basketsize = (
    prior.groupby('order_id')
    .size()
    .to_frame('basketsize')
)
prior_order_basketsize.head(10)

Unnamed: 0_level_0,basketsize
order_id,Unnamed: 1_level_1
2,9
3,8
4,13
5,26
6,3
7,2
8,1
9,15
10,15
11,5


## train의 basket size

In [5]:
# Basket size of an order = number of rows `train` holds for that order_id.
# The result is a one-column frame indexed by order_id.
train_order_basketsize = (
    train.groupby('order_id')
    .size()
    .to_frame('basketsize')
)
train_order_basketsize.head(10)

Unnamed: 0_level_0,basketsize
order_id,Unnamed: 1_level_1
1,8
36,8
38,9
96,7
98,49
112,11
170,17
218,5
226,13
349,11


In [6]:
# Stack the prior and train basket sizes into one order_id-indexed frame.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; the defaults (keep the existing index, no sorting)
# match what append did.
order_basketsize = pd.concat([prior_order_basketsize, train_order_basketsize])

# order와 basketsize를 합침

In [7]:
# Look up each order's basket size by matching the order_id column of
# `orders` against the order_id index of `order_basketsize`. This is a left
# join, so orders without a basket-size entry are kept with NaN.
order_plus_basketsize = orders.join(
    order_basketsize,
    on='order_id',
    how='left',
)
order_plus_basketsize.head(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,basketsize
0,2539329,1,prior,1,2,8,,5.0
1,2398795,1,prior,2,3,7,15.0,6.0
2,473747,1,prior,3,3,12,21.0,5.0
3,2254736,1,prior,4,4,7,29.0,5.0
4,431534,1,prior,5,4,15,28.0,8.0
5,3367565,1,prior,6,2,7,19.0,4.0
6,550135,1,prior,7,1,9,20.0,5.0
7,3108588,1,prior,8,1,14,14.0,6.0
8,2295261,1,prior,9,1,16,0.0,6.0
9,2550362,1,prior,10,4,8,30.0,9.0


# 예측을 위해서 prior의 데이터만 추출

In [8]:
# Keep only the rows whose eval_set is 'prior'; the per-user averages used
# for prediction are computed from these.
prior_basket_Info = order_plus_basketsize.loc[
    order_plus_basketsize['eval_set'].eq('prior')
]
prior_basket_Info.head(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,basketsize
0,2539329,1,prior,1,2,8,,5.0
1,2398795,1,prior,2,3,7,15.0,6.0
2,473747,1,prior,3,3,12,21.0,5.0
3,2254736,1,prior,4,4,7,29.0,5.0
4,431534,1,prior,5,4,15,28.0,8.0
5,3367565,1,prior,6,2,7,19.0,4.0
6,550135,1,prior,7,1,9,20.0,5.0
7,3108588,1,prior,8,1,14,14.0,6.0
8,2295261,1,prior,9,1,16,0.0,6.0
9,2550362,1,prior,10,4,8,30.0,9.0


# 평가를 위해서 Train의 데이터만 추출

In [9]:
# Keep only the rows whose eval_set is 'train'; they provide the actual
# basket sizes the predictions are scored against.
# user_id is deliberately left as a regular column (no set_index): the
# per-user merge further down keys on it. A bare `set_index('user_id')` call
# whose result is discarded would only build and throw away a copy.
train_basket_Info = order_plus_basketsize[order_plus_basketsize['eval_set'] == 'train']
train_basket_Info.head(10)

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,basketsize
10,1187899,1,train,11,4,8,14.0,11.0
25,1492625,2,train,15,1,11,30.0,31.0
49,2196797,5,train,5,0,11,6.0,9.0
74,525192,7,train,21,2,11,6.0,9.0
78,880375,8,train,4,1,14,10.0,18.0
82,1094988,9,train,4,6,10,30.0,22.0
88,1822501,10,train,6,0,19,30.0,4.0
115,1827621,13,train,13,0,21,8.0,5.0
129,2316178,14,train,14,2,19,11.0,11.0
200,2180313,17,train,41,3,10,30.0,6.0


# 1. 평균값을 이용한 basket size 예측
* user_id별로 평균 basket size로 basket size를 예측
* 평가방법 - RMSE를 이용
* RMSE : 5.84

In [10]:
# Per-user mean basket size over the prior orders, used as the prediction.
# reset_index() turns user_id into an ordinary column, so the merge key is
# unambiguous (a frame where 'user_id' is both the index name and a column
# label is rejected by recent pandas as an ambiguous key).
average_basketsize = (
    prior_basket_Info.groupby('user_id')['basketsize']
    .mean()
    .rename('user_avg_basketsize')
    .reset_index()
)

# Pair each user's prediction with that user's train order; the inner join
# keeps only users present on both sides.
Average_Basketsize = pd.merge(average_basketsize, train_basket_Info, how='inner', on='user_id')
del average_basketsize  # intermediate frame is no longer needed

# Drop the order attributes that are not used for scoring. `columns=` is
# spelled out because the positional axis argument of drop() was removed in
# pandas 2.0.
Average_Basketsize = Average_Basketsize.drop(
    columns=['eval_set', 'order_number', 'order_dow', 'order_hour_of_day', 'days_since_prior_order']
)
Average_Basketsize = Average_Basketsize.set_index('user_id')
Average_Basketsize.head(10)

Unnamed: 0_level_0,user_avg_basketsize,order_id,basketsize
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,5.9,1187899,11.0
2,13.928571,1492625,31.0
5,9.25,2196797,9.0
7,10.3,525192,9.0
8,16.333333,880375,18.0
9,25.333333,1094988,22.0
10,28.6,1822501,4.0
13,6.75,1827621,5.0
14,16.153846,2316178,11.0
17,7.35,2180313,6.0


In [11]:
import math

# Score the per-user-average baseline: MSE / RMSE between the predicted
# (user_avg_basketsize) and actual (basketsize) values.
# The arithmetic is vectorized instead of looping over rows in Python;
# working on the raw NumPy arrays keeps NaN propagation identical to a
# plain running sum (Series.sum() would silently skip NaN).
errors = (
    Average_Basketsize['user_avg_basketsize'].to_numpy()
    - Average_Basketsize['basketsize'].to_numpy()
)
N = len(Average_Basketsize.index)
# Plain Python division on purpose: an empty frame raises ZeroDivisionError
# instead of quietly producing NaN.
mse = float(np.square(errors).sum()) / N
rmse = math.sqrt(mse)

print('mse : ',mse)
print('rmse : ',rmse)

mse :  34.109794474
rmse :  5.8403591048797825


# 2. 우에에에에에ㅔㅇ