In [2]:
import sys
import pandas as pd
import datetime as dt
import numpy as np

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules 
from mlxtend.frequent_patterns import fpgrowth

In [3]:
def get_parent_directory():
    """Return the parent of the directory stored in ``sys.path[0]``.

    Both backslash and forward-slash separators are recognised in the input,
    so the function works for Windows-style and POSIX-style paths alike.
    (Splitting on the backslash only yields an empty string for any path
    written with forward slashes.)

    Returns:
        str: The parent directory written with forward slashes and without a
        trailing slash. An empty string is returned when ``sys.path[0]``
        contains no separator at all.
    """
    # Normalise Windows separators first so a single code path serves every OS.
    normalized = sys.path[0].replace('\\', '/')
    # Drop the last path component; rpartition yields '' when there is no '/'.
    parent = normalized.rpartition('/')[0]
    return parent.rstrip('/')

In [2]:
# Load the pickled transaction table from <parent>/data.
# NOTE: unpickling can execute code embedded in the file - load trusted files only.
source_path = get_parent_directory() + '/data/transaction_and_features.pkl'
obj = pd.read_pickle(source_path)

# Reduce the purchase timestamp to a plain calendar date.
obj['purch_date'] = pd.to_datetime(obj['purch_date']).dt.date

# Keep only the rows flagged as purchases.
is_purchase_row = obj['is_purchase'] == 1
obj = obj[is_purchase_row]

In [3]:
# Time-based split: the most recent `test_months` "months" form the test set.
test_months = 6
max_date = obj['purch_date'].max()

# A month is approximated as 4 weeks, so 6 months become 24 weeks (168 days),
# which is slightly shorter than 6 calendar months.
split_date = max_date - pd.Timedelta(weeks=test_months * 4)

# Two explicit comparisons (rather than a negated mask) so that rows for which
# neither comparison is true end up in neither subset.
purchase_dates = obj['purch_date']
train = obj[purchase_dates < split_date]
test = obj[purchase_dates >= split_date]

In [4]:
# Median and mean of price and quantity per item, computed on the train rows.
items_sts = (
    train.groupby('item')
    .agg(
        price_median=('price', 'median'),
        price_mean=('price', 'mean'),
        quantity_median=('quantity', 'median'),
        quantity_mean=('quantity', 'mean'),
    )
    .reset_index()
)

items_sts.head()

Unnamed: 0,item,price_median,price_mean,quantity_median,quantity_mean
0,18296,335.0,345.530612,2.47,2.525918
1,26681,3731.0,3731.0,2.47,2.47
2,32173,1533.0,1555.875,2.47,2.64125
3,32539,3779.0,3928.8,2.47,3.383333
4,38071,277.0,320.315315,2.47,2.618108


In [5]:
# Median and mean of price and quantity per category, computed on the train rows.
category_sts = (
    train.groupby('category')
    .agg(
        price_median=('price', 'median'),
        price_mean=('price', 'mean'),
        quantity_median=('quantity', 'median'),
        quantity_mean=('quantity', 'mean'),
    )
    .reset_index()
)

category_sts.head()

Unnamed: 0,category,price_median,price_mean,quantity_median,quantity_mean
0,петлевой ковролин,355.0,355.284444,15.074,18.352593
1,террасные доски,512.0,646.762102,6.58,14.908008
2,1 ламповые,803.0,877.364372,2.47,3.676377
3,2 ламповые,1512.0,1546.273128,2.47,2.829097
4,3 ламповые,1572.0,1829.419608,2.47,2.717137


In [10]:
# Number of train rows that the 0.00001 fraction corresponds to (the same
# fraction is passed as min_support to fpgrowth below). fpgrowth itself is run
# on baskets rather than on raw rows, so this is only a rough reference value.
train.shape[0] * 0.00001

37.12751

### Результат #1 на уровне item по id_check_unique

In [11]:
# Build one basket (list of items) per check.
tb = train[['item','id_check_unique', 'price', 'quantity', 'turnover', 'is_purchase', 'purch_date']]
items_per_check = tb.groupby('id_check_unique')['item'].apply(list)

# Keep only the checks that contain more than one item row.
df1 = [basket for basket in items_per_check if len(basket) > 1]

# Encode the baskets as a sparse matrix with one column per distinct item.
te = TransactionEncoder()
fitted = te.fit(df1)
te_ary = fitted.transform(df1, sparse=True)

# Column labels are converted to strings.
df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
df.columns = df.columns.map(str)

In [12]:
# Frequent itemsets of at most two items, reported with the column names.
rules1 = fpgrowth(
    df,
    min_support=0.00001,
    max_len=2,
    use_colnames=True,
)

In [13]:
# Derive association rules from the item-level itemsets, keeping lift >= 1.
rules1_metrics = association_rules(rules1, metric="lift", min_threshold=1)

# Each side of a rule is a collection; only its first element is kept
# (itemsets were mined with max_len=2, so presumably each side holds one item).
for side in ("antecedents", "consequents"):
    rules1_metrics[side] = rules1_metrics[side].apply(lambda x: list(x)[0]).astype("unicode")

# Both join keys must be integers. Note that items_sts is modified in place here.
items_sts['item'] = items_sts['item'].astype(int)
rules1_metrics['antecedents'] = rules1_metrics['antecedents'].astype(int)

# Attach the price/quantity statistics of the antecedent item.
output_columns = [
    'antecedents', 'consequents', 'antecedent support',
    'consequent support', 'support', 'confidence', 'lift', 'leverage',
    'conviction', 'zhangs_metric', 'price_median', 'price_mean',
    'quantity_median', 'quantity_mean',
]
rules1_metrics = rules1_metrics.merge(items_sts, how='left', left_on='antecedents', right_on='item')
rules1_metrics = rules1_metrics[output_columns]

rules1_metrics.to_csv(get_parent_directory()+'/data/rules1_lift_6m.csv')

### Результат #2 на уровне category по id_check_unique

In [17]:
# Build one basket (list of categories) per check.
# Duplicate (category, check) rows are NOT removed, so a check that contains
# the same category twice still counts as a basket of length 2.
tb = train[['category', 'id_check_unique']]
categories_per_check = tb.groupby('id_check_unique')['category'].apply(list)

# Keep only the checks that contain more than one category row.
df2 = [basket for basket in categories_per_check if len(basket) > 1]

# Encode the baskets as a sparse matrix with one column per distinct category.
te = TransactionEncoder()
fitted = te.fit(df2)
te_ary = fitted.transform(df2, sparse=True)

# Column labels are converted to strings.
df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
df.columns = df.columns.map(str)

# Frequent itemsets of at most two categories, reported with the column names.
rules2 = fpgrowth(
    df,
    min_support=0.00001,
    max_len=2,
    use_colnames=True,
)

In [None]:
# Derive association rules from the category-level itemsets, keeping lift >= 1.
rules2_metrics = association_rules(rules2, metric="lift", min_threshold=1)

# Each side of a rule is a collection; only its first element is kept
# (itemsets were mined with max_len=2, so presumably each side holds one category).
for side in ("antecedents", "consequents"):
    rules2_metrics[side] = rules2_metrics[side].apply(lambda x: list(x)[0]).astype("unicode")

# Both join keys are cast to str. Note that category_sts is modified in place here.
category_sts['category'] = category_sts['category'].astype(str)
rules2_metrics['antecedents'] = rules2_metrics['antecedents'].astype(str)

# Attach the price/quantity statistics of the antecedent category.
output_columns = [
    'antecedents', 'consequents', 'antecedent support',
    'consequent support', 'support', 'confidence', 'lift', 'leverage',
    'conviction', 'zhangs_metric', 'price_median', 'price_mean',
    'quantity_median', 'quantity_mean',
]
rules2_metrics = rules2_metrics.merge(category_sts, how='left', left_on='antecedents', right_on='category')
rules2_metrics = rules2_metrics[output_columns]

rules2_metrics.to_csv(get_parent_directory()+'/data/rules2_lift_6m.csv')

### Результат #3 на уровне item по id_check_unique, объединённому с предыдущим чеком (lastnext)

In [23]:
# Build item baskets in which every check is extended with the items of the
# previous check of the same customer (`id`) in the same `channel`.
tb = train[['id', 'item','id_check_unique', 'price', 'quantity', 'turnover', 'is_purchase', 'purch_date', 'channel']]

# One row per (id, check, date, channel) holding the list of purchased items.
df3 = tb.groupby(['id', 'id_check_unique', 'purch_date', 'channel'])['item'].apply(list).reset_index(name='basket')

# Order the checks chronologically inside each (id, channel) so that shift(1)
# yields the identifier of the preceding check (NaN for the first one).
df3 = df3.sort_values(by=['id', 'channel', 'purch_date',  'id_check_unique'])
df3['lagged_values'] = df3.groupby(['id', 'channel'])['id_check_unique'].shift(1)

# Self-merge to fetch the basket of the preceding check.
# NOTE(review): if one id_check_unique occurs in several (id, purch_date,
# channel) groups, this merge duplicates rows - confirm the key is unique.
df3 = df3.merge(df3[['id_check_unique', 'basket']], how='left', left_on='lagged_values', right_on='id_check_unique')

# Rows without a preceding check have a missing value instead of a list.
# The isinstance test replaces the identity check against `np.NaN`: that alias
# was removed in NumPy 2.0, and identity comparison with NaN is fragile.
df3['basket_y'] = [x if isinstance(x, list) else [] for x in df3['basket_y']]
df3['merged_basket'] = df3.basket_x + df3.basket_y

# Keep only merged baskets with more than one entry, as a plain list of lists.
df3 = [basket for basket in df3['merged_basket'] if len(basket) > 1]

In [24]:
# Encode the merged item baskets as a sparse matrix, one column per distinct item.
te = TransactionEncoder()
fitted = te.fit(df3)
te_ary = fitted.transform(df3, sparse=True)

# Column labels are converted to strings.
df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_)
df.columns = df.columns.map(str)

In [25]:
# Frequent itemsets of at most two items, reported with the column names.
rules3 = fpgrowth(
    df,
    min_support=0.00001,
    max_len=2,
    use_colnames=True,
)

In [26]:
# Derive association rules from the merged-basket item itemsets, keeping lift >= 1.
rules3_metrics = association_rules(rules3, metric="lift", min_threshold=1)

# Each side of a rule is a collection; only its first element is kept
# (itemsets were mined with max_len=2, so presumably each side holds one item).
rules3_metrics["antecedents"] = rules3_metrics["antecedents"].apply(lambda x: list(x)[0]).astype("unicode")
rules3_metrics["consequents"] = rules3_metrics["consequents"].apply(lambda x: list(x)[0]).astype("unicode")

rules3_metrics['antecedents'] = rules3_metrics['antecedents'].astype(int)

# Cast the join key of the statistics table here instead of relying on the
# in-place cast performed by the rules1 cell: this keeps the cell runnable on
# its own and prevents an int-vs-object key mismatch in the merge.
items_sts_int_key = items_sts.astype({'item': int})

# Attach the price/quantity statistics of the antecedent item.
rules3_metrics = rules3_metrics.merge(items_sts_int_key, how='left', right_on='item', left_on='antecedents')
rules3_metrics = rules3_metrics[['antecedents', 'consequents', 'antecedent support',
       'consequent support', 'support', 'confidence', 'lift', 'leverage',
       'conviction', 'zhangs_metric',  'price_median', 'price_mean',
       'quantity_median', 'quantity_mean']]

rules3_metrics.to_csv(get_parent_directory()+'/data/rules3_lift_6m.csv')

### Результат #4 на уровне category по id_check_unique, объединённому с предыдущим чеком (lastnext)

In [27]:
# Build category baskets in which every check is extended with the categories
# of the previous check of the same customer (`id`) in the same `channel`.
tb = train[['id', 'category','id_check_unique', 'price', 'quantity', 'turnover', 'is_purchase', 'purch_date', 'channel']]

# One row per (id, check, date, channel) holding the list of purchased categories.
df4 = tb.groupby(['id', 'id_check_unique', 'purch_date', 'channel'])['category'].apply(list).reset_index(name='basket')

# Order the checks chronologically inside each (id, channel) so that shift(1)
# yields the identifier of the preceding check (NaN for the first one).
df4 = df4.sort_values(by=['id', 'channel', 'purch_date',  'id_check_unique'])
df4['lagged_values'] = df4.groupby(['id', 'channel'])['id_check_unique'].shift(1)

# Self-merge to fetch the basket of the preceding check.
# NOTE(review): if one id_check_unique occurs in several (id, purch_date,
# channel) groups, this merge duplicates rows - confirm the key is unique.
df4 = df4.merge(df4[['id_check_unique', 'basket']], how='left', left_on='lagged_values', right_on='id_check_unique')

# Rows without a preceding check have a missing value instead of a list.
# The isinstance test replaces the identity check against `np.NaN`: that alias
# was removed in NumPy 2.0, and identity comparison with NaN is fragile.
df4['basket_y'] = [x if isinstance(x, list) else [] for x in df4['basket_y']]
df4['merged_basket'] = df4.basket_x + df4.basket_y

# Keep only merged baskets with more than one entry, as a plain list of lists.
df4 = [basket for basket in df4['merged_basket'] if len(basket) > 1]

# Encode the baskets as a sparse matrix with one column per distinct category.
te = TransactionEncoder() 
fitted = te.fit(df4)
te_ary = fitted.transform(df4, sparse=True) 

# Column labels are converted to strings.
df = pd.DataFrame.sparse.from_spmatrix(te_ary, columns=te.columns_) 
df.columns = [str(i) for i in df.columns]

In [28]:
# Frequent itemsets of at most two categories, reported with the column names.
rules4 = fpgrowth(
    df,
    min_support=0.00001,
    max_len=2,
    use_colnames=True,
)

In [35]:
# Derive association rules from the merged-basket category itemsets, keeping lift >= 1.
rules4_metrics = association_rules(rules4, metric="lift", min_threshold=1)

# Each side of a rule is a collection; only its first element is kept
# (itemsets were mined with max_len=2, so presumably each side holds one category).
rules4_metrics["antecedents"] = rules4_metrics["antecedents"].apply(lambda x: list(x)[0]).astype("unicode")
rules4_metrics["consequents"] = rules4_metrics["consequents"].apply(lambda x: list(x)[0]).astype("unicode")

rules4_metrics['antecedents'] = rules4_metrics['antecedents'].astype(str)

# Cast the join key of the statistics table here instead of relying on the
# in-place cast performed by the rules2 cell: this keeps the cell runnable on
# its own and guarantees that both merge keys are strings.
category_sts_str_key = category_sts.astype({'category': str})

# Attach the price/quantity statistics of the antecedent category.
rules4_metrics = rules4_metrics.merge(category_sts_str_key, how='left', right_on='category', left_on='antecedents')
rules4_metrics = rules4_metrics[['antecedents', 'consequents', 'antecedent support',
       'consequent support', 'support', 'confidence', 'lift', 'leverage',
       'conviction', 'zhangs_metric',  'price_median', 'price_mean',
       'quantity_median', 'quantity_mean']]

rules4_metrics.to_csv(get_parent_directory()+'/data/rules4_lift_6m.csv')