In [40]:
import pandas as pd
import numpy

In [41]:
# Per-column dtypes handed to pd.read_csv (keys are CSV header names).
# The builtin `str` / `float` are used instead of `numpy.str` / `numpy.float`:
# those names were plain aliases of the builtins, were deprecated in NumPy 1.20
# and removed in NumPy 1.24, so referencing them raises AttributeError there.
dtype = {
    '交易id': str,
    # The date and time columns are read as text; they are combined and parsed
    # by the `parse_dates` option of the read_csv call.
    '資料日期': str,
    '資料時間': str,
    '餐別帶': str,
    '縣市別': str,
    '店舖代號': numpy.uint32,
    '主商圈': str,
    '品號-品名稱': str,
    '群號-群名稱': str,
    '單品名稱': str,
    '銷售數量': numpy.uint16,
    '銷售單價': float,
    '交易金額': float
}

In [42]:
# CSV columns to load (passed to read_csv as `usecols`).
USE_COLUMNS = [
    '交易id',
    '資料日期',
    '資料時間',
    '餐別帶',
    '縣市別',
    '店舖代號',
    '主商圈',
    '品號-品名稱',
    '群號-群名稱',
    '單品名稱',
    '銷售數量',
    '銷售單價',
    '交易金額',
]

# `parse_dates` spec for read_csv: the two source columns listed here are
# combined into one datetime column named by the key.
PARSE_DATES = {'資料日期與時間': ['資料日期', '資料時間']}

# Columns copied onto each transaction record (first value per transaction).
TRANSACTION_ATTRS = [
    '餐別帶',
    '資料日期與時間',
    '縣市別',
    '店舖代號',
    '主商圈',
]

# Columns copied onto each item record (first value per item name).
ITEM_ATTRS = [
    '品號-品名稱',
    '群號-群名稱',
    '銷售單價',
]

In [43]:
# Load a 100,000-row sample of the sales CSV, restricted to USE_COLUMNS and
# typed according to `dtype`.
datas = pd.read_csv(
    'customer_data(utf-8).csv',
    usecols=USE_COLUMNS,
    dtype=dtype,
    # Combines the date and time columns into the single datetime column
    # named in PARSE_DATES.
    parse_dates=PARSE_DATES,
    # NOTE(review): the positional index 1 appears to resolve to '交易id'
    # because the combined datetime column is placed first; the sample record
    # shown later in the notebook is consistent with that. Confirm before
    # changing USE_COLUMNS or PARSE_DATES.
    index_col=1,
    nrows=100000,
)

In [44]:
class TransactionTransformer:
    """Turn a flat line-item table into nested per-transaction records.

    The input frame holds one row per purchased item. `transform` groups the
    rows by transaction id and returns one dict per transaction, each carrying
    the transaction-level attributes, the summed amount, the transaction id and
    an 'items' list with one dict per input row.

    Parameters
    ----------
    transaction_id_name : str
        Name of the transaction id. It may be a regular column or the name of
        the frame's index (level).
    item_name : str
        Column holding the item name; used as the key of the item lookup.
    transaction_amount_name : str
        Column holding the per-row amount. It is summed per transaction and
        also stored on each item as 'amount'.
    transaction_attrs : list of str, optional
        Columns copied onto each transaction (first value within the group).
    item_attrs : list of str, optional
        Columns copied onto each item (first value within the item group).
    """

    def __init__(self, transaction_id_name, item_name, transaction_amount_name, transaction_attrs=None, item_attrs=None):
        self.transaction_id_name = transaction_id_name
        self.item_name = item_name
        # Fresh lists per instance: a mutable default ([]) would be shared by
        # every instance created without these arguments.
        self.transaction_attrs = list(transaction_attrs) if transaction_attrs is not None else []
        self.item_attrs = list(item_attrs) if item_attrs is not None else []
        self.transaction_amount_name = transaction_amount_name

    def to_dict(self, df, filter_cols, group_by, aggregation_option):
        """Group `df` by `group_by` and return ``{group key: record dict}``.

        Parameters
        ----------
        df : pandas.DataFrame
            Source rows.
        filter_cols : list of str
            Column labels to keep before grouping. Labels that are not columns
            (for example a name that lives in the index) are ignored.
        group_by : str
            Column or index-level name to group on.
        aggregation_option : dict
            Mapping of column name to aggregation, as accepted by
            ``GroupBy.agg``. May be empty.

        Returns
        -------
        dict
            One entry per group; each record holds the aggregated columns plus
            the group key stored under `group_by`.
        """
        grouped = df.filter(items=filter_cols).groupby(group_by)
        if aggregation_option:
            records = grouped.agg(aggregation_option).to_dict('index')
        else:
            # Nothing to aggregate: still emit one (empty) record per group
            # rather than handing an empty spec to GroupBy.agg.
            records = {key: {} for key in grouped.groups}
        for key, record in records.items():
            record[group_by] = key
        return records

    def get_transaction_dict(self, df):
        """Return ``{transaction id: record}`` with attributes and summed amount."""
        filter_columns = [self.transaction_id_name, self.transaction_amount_name] + self.transaction_attrs
        aggr_option = {key: 'first' for key in self.transaction_attrs}
        aggr_option[self.transaction_amount_name] = 'sum'
        return self.to_dict(df, filter_columns, self.transaction_id_name, aggr_option)

    def get_item_dict(self, df):
        """Return ``{item name: record}`` with the first-seen item attributes."""
        filter_columns = [self.item_name] + self.item_attrs
        aggr_option = {key: 'first' for key in self.item_attrs}
        return self.to_dict(df, filter_columns, self.item_name, aggr_option)

    def transform(self, df):
        """Build the list of nested transaction records.

        Rows containing any missing value are dropped first. Each returned
        dict holds the transaction attributes, the summed amount, the
        transaction id and an 'items' list; every item is a copy of the item
        record with the row's own amount added under 'amount'.

        Parameters
        ----------
        df : pandas.DataFrame
            One row per purchased item.

        Returns
        -------
        list of dict
            One dict per transaction.
        """
        df = df.dropna()
        transaction_dict = self.get_transaction_dict(df)
        item_dict = self.get_item_dict(df)
        for record in transaction_dict.values():
            record['items'] = []

        # The id may be a column or the index; matching rows by index alone
        # would leave every 'items' list empty when the id is a column.
        if self.transaction_id_name in df.columns:
            transaction_ids = df[self.transaction_id_name]
        else:
            transaction_ids = df.index.get_level_values(self.transaction_id_name)

        # Walk only the three values needed per row instead of building a
        # Series for every row with iterrows().
        rows = zip(transaction_ids, df[self.item_name], df[self.transaction_amount_name])
        for transaction_id, item_name, amount in rows:
            transaction = transaction_dict.get(transaction_id)
            item = item_dict.get(item_name)
            if transaction is None or item is None:
                continue
            # Copy so that each line item carries its own amount.
            line_item = dict(item)
            line_item['amount'] = amount
            transaction['items'].append(line_item)
        return list(transaction_dict.values())

In [45]:
# Configure the transformer with the column roles used in this dataset.
transformer = TransactionTransformer(
    transaction_id_name='交易id',
    item_name='單品名稱',
    transaction_amount_name='交易金額',
    transaction_attrs=TRANSACTION_ATTRS,
    item_attrs=ITEM_ATTRS,
)

In [46]:
# Convert the loaded rows into a list of transaction dicts, each with an
# 'items' list (see TransactionTransformer.transform).
transactions = transformer.transform(datas)

In [47]:
import pymongo

In [48]:
from pymongo import MongoClient

# Connect to the MongoDB server on this machine (default port) and select the
# 'pn' database.
client = MongoClient(host='localhost', port=27017)
db = client['pn']

In [49]:
# Display the database handle to confirm which server/database is selected.
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'pn')

In [50]:
# Display the handle of the 'transactions' collection that will receive the records.
db.transactions

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'pn'), 'transactions')

In [51]:
# Inspect one transformed record before writing anything to the database.
transactions[0]

{'餐別帶': '一般時間帶',
 '資料日期與時間': Timestamp('2017-12-01 00:03:07'),
 '縣市別': '台中市',
 '店舖代號': 3047,
 '主商圈': '住宅型',
 '交易金額': 125.0,
 '交易id': '00324420171201000307000118769702',
 'items': [{'品號-品名稱': '58-香煙',
   '群號-群名稱': '585-進口濃煙',
   '銷售單價': 125.0,
   '單品名稱': '倫敦登喜路精裁１０毫克香菸',
   'amount': 125.0}]}

In [52]:
# Bulk-insert every transaction dict into the 'transactions' collection.
# NOTE(review): per the PyMongo docs, insert_many adds an '_id' to each dict in
# place and returns an InsertManyResult (the ids are on `.inserted_ids`), so
# re-running this cell with the same `transactions` list presumably fails with
# duplicate-key errors — confirm, and rebuild `transactions` before re-inserting.
ids = db.transactions.insert_many(transactions)