In [1]:
import pandas as pd
import numpy

In [2]:
# Column dtypes handed to pd.read_csv.
# The builtin `str` / `float` are used instead of `numpy.str` / `numpy.float`:
# those numpy names were plain aliases of the builtins and were removed in
# NumPy 1.24, so referencing them raises AttributeError on current releases.
dtype = {
    '交易id': str,
    '資料日期': str,
    '資料時間': str,
    '餐別帶': str,
    '縣市別': str,
    '店舖代號': numpy.uint32,
    '主商圈': str,
    '品號-品名稱': str,
    '群號-群名稱': str,
    '單品名稱': str,
    '銷售數量': numpy.uint16,
    '銷售單價': float,
    '交易金額': float,
}

In [43]:
# Columns read from the CSV; every other column in the file is ignored.
USE_COLUMNS = [
    '交易id',
    '資料日期',
    '資料時間',
    '餐別帶',
    '縣市別',
    '店舖代號',
    '主商圈',
    '品號-品名稱',
    '群號-群名稱',
    '單品名稱',
    '銷售數量',
    '銷售單價',
    '交易金額',
]

# Maps the name of the combined date-time column to the source columns it is
# built from (passed to read_csv as `parse_dates`).
PARSE_DATES = {'資料日期與時間': ['資料日期', '資料時間']}

# Columns treated as attributes of a transaction.
TRANSACTION_ATTRS = [
    '餐別帶',
    '資料日期與時間',
    '縣市別',
    '店舖代號',
    '主商圈',
]

# Columns treated as attributes of an item.
ITEM_ATTRS = [
    '品號-品名稱',
    '群號-群名稱',
    '銷售單價',
]

In [9]:
# Load the first 50,000 rows of the sales export, reading only USE_COLUMNS
# with the dtypes declared in `dtype`.
# parse_dates=PARSE_DATES asks pandas to combine the listed date and time
# columns into one parsed column named after the dict key.
# NOTE(review): index_col=1 is positional, so which column becomes the index
# depends on how pandas orders columns after usecols/parse_dates are applied.
# Presumably this is meant to be the transaction id (the index-based helpers
# further down appear to assume that) — confirm.
datas = pd.read_csv('customer_data(utf-8).csv',
                   index_col=1,
                   nrows=50000,
                   usecols=USE_COLUMNS,
                   dtype=dtype,
                   parse_dates=PARSE_DATES,
        )

In [10]:
# List the distinct values that appear in the '餐別帶' column.
datas['餐別帶'].unique()

array(['晚餐時間帶', '下午茶時間帶', '早餐時間帶', '午餐時間帶', '一般時間帶'], dtype=object)

In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
datas.describe()

Unnamed: 0,店舖代號,銷售數量,銷售單價,交易金額
count,50000.0,50000.0,49992.0,50000.0
mean,8434.70708,1.25904,38.140362,43.63634
std,4712.359107,1.268736,58.41183,82.358639
min,2076.0,1.0,0.0,0.0
25%,3688.0,1.0,10.0,12.0
50%,9166.0,1.0,25.0,25.0
75%,13012.0,1.0,42.0,49.0
max,16626.0,96.0,3000.0,9000.0


In [43]:
def get_transaction_dict(df):
    """Return a frame with one row per distinct '交易id' and no data columns.

    Only the '交易id' column is kept before grouping, so nothing is left to
    aggregate: the result carries the distinct ids as its index. Despite the
    function name, the return value is a DataFrame, not a dict.
    """
    id_only = df.filter(items=['交易id'])
    return id_only.groupby(['交易id']).first()

In [11]:
def get_items_dict(df):
    """Build a ``{item name: {column: value}}`` lookup keyed by '單品名稱'.

    The transaction-level and quantity/amount columns are discarded; for each
    item name, every remaining column takes the first non-null value seen for
    that item.
    """
    non_item_columns = ['餐別帶', '資料日期與時間', '縣市別', '店舖代號', '主商圈', '銷售數量', '交易金額']
    by_item = df.set_index('單品名稱').drop(columns=non_item_columns)
    first_seen = by_item.groupby(by_item.index).first()
    return first_seen.to_dict('index')

In [50]:
class TransactionTransformer:
    """Derive per-transaction and per-item lookups from a line-item DataFrame.

    The frame is expected to hold one row per item sold, carrying a
    transaction id, an item name, a transaction amount and optional
    attribute columns.

    Parameters
    ----------
    transaction_id_name : name of the transaction-id column (or index level).
    item_name : name of the item-name column.
    transaction_amount_name : name of the amount column, summed per transaction.
    transaction_attrs : columns describing a transaction (first value per
        transaction is kept). Defaults to no attributes.
    item_attrs : columns describing an item (first value per item is kept).
        Defaults to no attributes.
    """

    def __init__(self, transaction_id_name, item_name, transaction_amount_name, transaction_attrs=None, item_attrs=None):
        self.transaction_id_name = transaction_id_name
        self.item_name = item_name
        # Copy into fresh lists: a mutable default (or a caller's list) must
        # never be shared between instances.
        self.transaction_attrs = list(transaction_attrs) if transaction_attrs is not None else []
        self.item_attrs = list(item_attrs) if item_attrs is not None else []
        self.transaction_amount_name = transaction_amount_name

    def to_dict(self, df, filter_cols, group_by, aggregation_option):
        """Group ``df`` by ``group_by`` and return ``{group key: record}``.

        Only ``filter_cols`` are kept before grouping. ``aggregation_option``
        maps column name -> aggregation, as accepted by ``GroupBy.agg``.
        Each record also receives the group key under the ``group_by`` name.
        """
        grouped = df.filter(filter_cols).groupby([group_by])
        records = grouped.agg(aggregation_option).to_dict('index')
        for key, record in records.items():
            record[group_by] = key
        return records

    def get_transaction_dict(self, df):
        """Return ``{transaction id: record}``.

        Each record holds the first value of every transaction attribute and
        the summed transaction amount.
        """
        filter_columns = [self.transaction_id_name, self.transaction_amount_name] + self.transaction_attrs
        aggr_option = {key: 'first' for key in self.transaction_attrs}
        aggr_option[self.transaction_amount_name] = 'sum'
        return self.to_dict(df, filter_columns, self.transaction_id_name, aggr_option)

    def get_item_dict(self, df):
        """Return ``{item name: record}`` with the first value of every item attribute."""
        filter_columns = [self.item_name] + self.item_attrs
        aggr_option = {key: 'first' for key in self.item_attrs}
        return self.to_dict(df, filter_columns, self.item_name, aggr_option)

    def transform(self, df):
        """Compute the transaction and item lookups for ``df``.

        NOTE(review): the method looks unfinished — both lookups are built
        and then discarded, and nothing is returned (None).
        """
        transaction_dict = self.get_transaction_dict(df)
        item_dict = self.get_item_dict(df)

    def get_transaction_length(self, df):
        """Return the number of distinct index values of ``df``.

        This equals the number of transactions only when ``df`` is indexed by
        transaction id.
        """
        return len(df.index.unique())

    def get_nodes(self, df):
        """Return ``{item name: record}`` to be used as graph nodes.

        Each record holds the first non-null value of every item attribute,
        plus the item name itself under ``self.item_name``.
        """
        item_df = df.set_index(self.item_name)
        item_df = item_df.filter(self.item_attrs)
        # The original grouped an undefined name `items` here (NameError).
        item_dict = item_df.groupby(item_df.index).first().to_dict('index')
        for key, value in item_dict.items():
            value[self.item_name] = key
        return item_dict

    def get_edges(self, df, nodes, support, weight_func):
        """Placeholder for edge construction.

        NOTE(review): unfinished — the frame is narrowed to the item and
        transaction-id columns, but ``nodes``, ``support`` and ``weight_func``
        are never used and nothing is returned (None).
        """
        df = df.filter([self.item_name, self.transaction_id_name])

    def _multi_item_mask(self, df):
        """Return a boolean array marking rows whose transaction id occurs more than once.

        The id is read from the column named ``self.transaction_id_name`` when
        present, otherwise from the index level of that name.

        Raises
        ------
        KeyError : if the name is neither a column nor an index level.
        """
        name = self.transaction_id_name
        if name in df.columns:
            # to_numpy() makes the mask positional, so a non-unique index
            # cannot cause alignment problems when it is applied.
            return df[name].duplicated(keep=False).to_numpy()
        if name in df.index.names:
            return df.index.get_level_values(name).duplicated(keep=False)
        raise KeyError(name)

    def get_copurchase_df(self, df):
        """Return the rows of ``df`` that belong to transactions with two or more rows.

        Works whether the transaction id is a column or an index level; the
        original dropped rows by index label and so only worked in the latter
        case. Rows with a missing id are treated like any other id value
        (kept only if another row also has a missing id).
        """
        return df[self._multi_item_mask(df)]

    def get_graph_view(self, df, support, weight_func):
        """Prepare the inputs of a co-purchase graph.

        NOTE(review): unfinished — rows with missing values are dropped, then
        the co-purchase rows and the nodes are computed, but ``support`` and
        ``weight_func`` are unused and nothing is returned (None).
        """
        df = df.dropna()
        copurchase_df = self.get_copurchase_df(df)
        nodes = self.get_nodes(df)

In [51]:
# Configure the transformer with this dataset's column names: transaction id,
# item name, transaction amount, then the transaction and item attribute lists.
transformer = TransactionTransformer('交易id', '單品名稱', '交易金額', TRANSACTION_ATTRS, ITEM_ATTRS)

In [52]:
# Build the per-item attribute lookup for the loaded sample; the full dict is
# displayed as the cell output.
transformer.get_item_dict(datas)

{'一口爆漿餡餅（５入）': {'品號-品名稱': '24-冷凍食品',
  '群號-群名稱': '241-冷凍調理',
  '銷售單價': 28.0,
  '單品名稱': '一口爆漿餡餅（５入）'},
 '一度贊爌肉麵（碗）': {'品號-品名稱': '43-泡麵',
  '群號-群名稱': '433-大碗麵',
  '銷售單價': 53.0,
  '單品名稱': '一度贊爌肉麵（碗）'},
 '一度贊爌肉麵（袋）': {'品號-品名稱': '43-泡麵',
  '群號-群名稱': '435-袋麵',
  '銷售單價': 47.0,
  '單品名稱': '一度贊爌肉麵（袋）'},
 '一度贊麻辣牛肉麵（碗）': {'品號-品名稱': '43-泡麵',
  '群號-群名稱': '433-大碗麵',
  '銷售單價': 53.0,
  '單品名稱': '一度贊麻辣牛肉麵（碗）'},
 '一日水果１００％蘋果汁': {'品號-品名稱': '29-冷藏飲料',
  '群號-群名稱': '297-冷藏常溫飲料',
  '銷售單價': 20.0,
  '單品名稱': '一日水果１００％蘋果汁'},
 '一日蔬果１００％紫色蔬果汁': {'品號-品名稱': '29-冷藏飲料',
  '群號-群名稱': '297-冷藏常溫飲料',
  '銷售單價': 20.0,
  '單品名稱': '一日蔬果１００％紫色蔬果汁'},
 '一日蔬果１００％蔬果汁': {'品號-品名稱': '29-冷藏飲料',
  '群號-群名稱': '297-冷藏常溫飲料',
  '銷售單價': 20.0,
  '單品名稱': '一日蔬果１００％蔬果汁'},
 '一條根精油貼布（薑黃）': {'品號-品名稱': '68-保健衛生',
  '群號-群名稱': '683-保健用品',
  '銷售單價': 139.0,
  '單品名稱': '一條根精油貼布（薑黃）'},
 '一產保險費': {'品號-品名稱': '00-傳統代收',
  '群號-群名稱': '05-產險',
  '銷售單價': 0.0,
  '單品名稱': '一產保險費'},
 '一番榨生啤酒－５００': {'品號-品名稱': '61-啤酒',
  '群號-群名稱': '612-進口啤酒',
  '銷售單價': 60.0,
  '單品名稱': '一番榨