In [2]:
import pandas as pd
import numpy

In [3]:
dtype = {
    '交易id': numpy.str,
    '資料日期': numpy.str,
    '資料時間': numpy.str,
    '餐別帶': numpy.str,
    '縣市別': numpy.str,
    '店舖代號': numpy.uint32,
    '主商圈': numpy.str,
    '品號-品名稱': numpy.str,
    '群號-群名稱': numpy.str,
    '單品名稱': numpy.str,
    '銷售數量': numpy.uint16,
    '銷售單價': numpy.float,
    '交易金額': numpy.float
}
USE_COLUMNS = ['交易id', '資料日期', '資料時間', '餐別帶', '縣市別', '店舖代號', '主商圈', '品號-品名稱',
       '群號-群名稱', '單品名稱', '銷售數量', '銷售單價', '交易金額']
PARSE_DATES = {
    '資料日期與時間': [
        '資料日期',
        '資料時間'
    ]
}

In [4]:
file = pd.read_csv('customer_data(utf-8).csv',
                   index_col=1,
                   nrows=100000,
                   usecols=USE_COLUMNS,
                   dtype=dtype,
                   parse_dates=PARSE_DATES,
        )

In [5]:
file = file.dropna()

In [6]:
file.shape

(99981, 11)

In [7]:
def get_copurchase_df(df):
    count_size = df.index.unique()
    size_df = df.groupby('交易id').size()
    size_df = size_df[size_df < 2]
    return df.drop(list(size_df.index), axis=0)

In [8]:
data = get_copurchase_df(file)

In [9]:
data.head()

Unnamed: 0_level_0,資料日期與時間,餐別帶,縣市別,店舖代號,主商圈,品號-品名稱,群號-群名稱,單品名稱,銷售數量,銷售單價,交易金額
交易id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
01637920171201184556000112534201,2017-12-01 18:45:56,晚餐時間帶,台中市,11148,住宅型,72-家庭雜貨,722-家雜用品,（新）銷售用購物袋１８號袋,1,1.0,1.0
01614720171201163042000114405402,2017-12-01 16:30:42,下午茶時間帶,台中市,10697,住宅型,29-冷藏飲料,296-冷藏奶茶,飲冰室茶集綠奶茶,1,25.0,25.0
01384020171201130558000119295301,2017-12-01 13:05:58,午餐時間帶,彰化縣,13840,文教型,12-調理麵,122-熱食麵２配,沙茶牛肉炒麵,1,69.0,69.0
01475120171201153506000111697202,2017-12-01 15:35:06,下午茶時間帶,新竹市,4151,工業型,29-冷藏飲料,291-冷藏茶飲料,冷泡茶台茶１２號紅茶,1,25.0,21.0
01564720171201180924000113751602,2017-12-01 18:09:24,晚餐時間帶,台中市,3860,住宅型,34-健康飲料,343-水,多喝水２Ｌ,2,35.0,49.0


In [10]:
def aggregate(df):
    dic = {}
    for index, row in df.iterrows():
        if index not in dic:
            dic[index] = []
        dic[index].append({
            'name': row['單品名稱'],
            'amount': row['銷售單價']
        })
    dic = { index : value for index, value in dic.items() if len(value) > 1}
    return dic

In [11]:
purchase_list = aggregate(data)

In [12]:
from pymongo import MongoClient

In [13]:
client = MongoClient('localhost', 27017)
db = client['pn']

In [14]:
purchase_list = list(db['transactions'].find({'items.1': { '$exists': True } }, projection=['items']))

In [131]:
purchase_list[1]

{'_id': ObjectId('5ccfa1068e090479028fdd8b'),
 'items': [{'品號-品名稱': '19-吐司蛋糕',
   '群號-群名稱': '192-蛋糕',
   '銷售單價': 35.0,
   '單品名稱': '雞蛋牛奶捲',
   'amount': 70.0},
  {'品號-品名稱': '00-傳統代收',
   '群號-群名稱': '18-代碼繳費',
   '銷售單價': 0.0,
   '單品名稱': '代碼繳費１',
   'amount': 0.0}]}

In [16]:
len(purchase_list)

23288

# Network Analysis

In [160]:
from itertools import filterfalse
def convert(purchase_list, support):
    def find_edges_in_list(itemsets):
        from itertools import combinations
        result = []
        return combinations(itemsets, 2)
    result = {}
    nodes = set()
    for transaction in purchase_list:
        itemsets = transaction['items']
        if len(itemsets) > 1:
            for edge_dict_tuple in find_edges_in_list(itemsets):
                edge = tuple([dic['單品名稱'] for dic in edge_dict_tuple])
                weight = sum([dic['amount'] for dic in edge_dict_tuple])
                if edge in result:
                    result[edge]['count'] += 1
                    result[edge]['weight'] += weight
                else:
                    result[edge] = {}
                    result[edge]['count'] = 1
                    result[edge]['weight'] = weight
    for key in list(result.keys()):
        if result[key]['count']/len(purchase_list) < support:
            del result[key]
    for items in result.keys():
        for item in items:
            if item not in nodes:
                nodes.add(item)
    return (nodes, result)

In [215]:
nodes, edges_dict = convert(purchase_list, 0.0003)

In [216]:
print('Node number: {}\nEdge number: {}'.format(len(nodes), len(edges_dict)))

Node number: 231
Edge number: 571


In [217]:
import igraph

In [218]:
g = igraph.Graph()

In [219]:
for node in nodes:
    g.add_vertex(node)

In [220]:
len(g.vs)

231

In [221]:
for edge, attrs in edges_dict.items():
    weight = attrs['weight'] * attrs['count'] if attrs['weight'] > 0 else 1
    g.add_edge(edge[0], edge[1], weight=weight)

In [222]:
g.simplify(combine_edges={ "weight": "sum" })
g.to_undirected(mode='mutual', combine_edges={ "weight": "sum" })

In [223]:
communities = g.community_fastgreedy('weight')

In [224]:
cluster = communities.as_clustering()

In [225]:
len(cluster)

35

In [188]:
for comm in cluster:
    comm_name = [g.vs[index]['name'] for index in comm]
    print(" ".join(comm_name))
    print("==================")

（新）銷售用購物袋１８號袋 伯朗咖啡 蘋果冰茶 御茶園特上奶茶 保力達蠻牛維他命Ｂ飲料 麥香錫蘭奶茶 全家紅豆麵包 奶油雞排歐姆蛋燴飯 鹼性離子水 衛生冰 麥香奶茶ＴＰ３００ 茶裏王日式綠茶 台鹽海洋鹼性離子水 金牌台灣啤酒 冷泡茶冷萃綠茶無糖 全家克林姆麵包 蔥爆牛肉燴飯 可口可樂ＰＥＴ 純喫茶鮮柚綠茶６５０ｍｌ 寶礦力水得 茶裏王英式紅茶 雲絲頓紅１０毫克香煙 金蘋果調味乳蘋果風味 金牌台啤罐裝（６入） 義美奶茶 藍山咖啡 純喫茶紅茶 （新）４５號銷售用購物袋 冷泡茶冰釀烏龍 飲冰室茶集綠奶茶 飲冰室茶集紅奶茶 統一布丁（雞蛋口味） 科學麵 泡沫綠茶ＴＰ３００ 純喫茶無糖綠茶６５０ｍｌ 麥香綠茶ＴＰ３００ 義美錫蘭紅茶 麥香阿薩姆奶茶 紫米紅豆湯圓 麥香紅茶ＴＰ３００ 金牌台灣啤酒５００ＭＬ 原萃日式綠茶 ＦＩＮ深海健康補給飲料 皇家奶茶 貝納頌經典拿鐵 海苔肉鬆沙拉麵包 舒跑運動飲料ＰＥＴ 天然水２．２Ｌ 泡沫紅茶ＴＰ３００ 日式蒜香燒豚飯 葡萄冰茶 長壽黃硬盒香煙 純喫茶綠茶６５０ｍｌ 咖啡廣場奶香特調咖啡 天然水 芒果冰茶 七星１０毫克硬盒香煙 純喫茶紅茶６５０ｍｌ
熱美式中杯 養樂多 頂級鮮奶優格－莓果穀物脆片 蕃薯（２５元） 香蕉單入 肉鬆起酥麵包 特濃咖啡拿鐵 蝦皮取件Ｃ 蕃薯（３０元） 雪花蛋糕 麥香奶茶（罐裝） 經典原味熱狗 大口奶油蕈菇起司雞排飯糰 肉鬆飯糰 光泉無加糖鮮豆漿 熱拿鐵大杯 全家熱狗麵包 茶葉蛋（銷售用） 重黑巧克力蛋糕 爆濃起司熱狗 蕃薯（２０元） 番薯（３５元） 蕃薯（１５元） 統一陽光無糖高纖豆漿 ＬＣＡ活菌原味發酵乳 光泉米漿 熱美式小杯 簡單點原味優酪乳 特濃黑可可 熱美式大杯 熱拿鐵中杯 綠豆沙牛乳 黑胡椒熱狗 林鳳營全脂鮮乳 全家花生夾心土司（２入） 鮮奶茶 義美古早傳統無糖豆奶 商店街取件 光泉無加糖黑豆漿 義美全脂鮮乳２３６ｍｌ 熱拿鐵小杯 簡單點無加糖優酪乳 統一大布丁（雞蛋口味） ＦＰ取件 ＥＣ代收一 鮪魚飯糰 特濃抹茶拿鐵 大口法香烤雞飯糰 冰拿鐵大杯
午后時光鐵觀音奶茶 午后時光王室大吉嶺奶茶 午后時光皇家伯爵奶茶
冰鎮檸檬紅茶６５０ｍｌ 冰鎮芭樂綠茶６５０ｍｌ
Ｃｒｅａｍ－Ｏ黑巧克力三明治餅 促銷券０６ 伊藤園蘋果紅茶 頑皮滷蛋－原味
戰禍邪神第１１章 戰禍邪神第１２章
關東煮本舖拉麵 手工高麗菜捲 究

In [226]:
cluster.modularity

0.6625472802880235

In [227]:
items = []
for index, value in enumerate(g.betweenness(weights='weight')):
    if value > 0:
        items.append({ 'name': g.vs[index]['name'], 'betweeness': value })
items.sort(key=lambda x: x['betweeness'], reverse=True)

In [191]:
items[0:10]

[{'name': '（新）銷售用購物袋１８號袋', 'betweeness': 5390.0},
 {'name': '茶葉蛋（銷售用）', 'betweeness': 4178.0},
 {'name': '統一大布丁（雞蛋口味）', 'betweeness': 3035.0},
 {'name': '促銷券０６', 'betweeness': 987.0},
 {'name': '代收折價卷', 'betweeness': 680.0},
 {'name': '熱拿鐵中杯', 'betweeness': 564.0},
 {'name': '肉鬆飯糰', 'betweeness': 440.0},
 {'name': '蝦皮取件Ｃ', 'betweeness': 344.0},
 {'name': '經典原味熱狗', 'betweeness': 343.0},
 {'name': '鮪魚飯糰', 'betweeness': 338.0}]

In [192]:
g.degree()

[54,
 1,
 1,
 5,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 15,
 15,
 1,
 10,
 1,
 1,
 4,
 1,
 1,
 1,
 10,
 5,
 5,
 1,
 1,
 1,
 5,
 2,
 1,
 1,
 5,
 1,
 2,
 1,
 1,
 5,
 2,
 3,
 5,
 1,
 12,
 4,
 1,
 1,
 1,
 3,
 1,
 1,
 1,
 3,
 15,
 3,
 33,
 6,
 1,
 1,
 1,
 2,
 3,
 5,
 1,
 2,
 1,
 1,
 1,
 1,
 2,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 13,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 14,
 7,
 2,
 1,
 3,
 1,
 1,
 1,
 3,
 2,
 2,
 5,
 1,
 1,
 1,
 1,
 1,
 2,
 2,
 1,
 1,
 2,
 4,
 5,
 12,
 1,
 1,
 2,
 1,
 8,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 2,
 15,
 3,
 3,
 1,
 2,
 4,
 2,
 13,
 2,
 1,
 1,
 2,
 1,
 4,
 2,
 1,
 1,
 1,
 1,
 1,
 1,
 11,
 14,
 1,
 1,
 1,
 1,
 1,
 12,
 1,
 2,
 2,
 2,
 2,
 1,
 1,
 3,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 3,
 1,
 1,
 2,
 1,
 1,
 1,
 1,
 16,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 2,
 1,
 15,
 1,
 1,
 1,
 14,
 17,
 1,
 5,
 1,
 1,
 4,
 3,
 2,
 4,
 1,
 1,
 3,
 2,
 2,
 3,
 2,
 1,
 1,
 15,
 13,
 1,
 9,
 1,
 8,
 2,
 1,
 1,
 4,
 7,
 1,
 1]

In [193]:
for index, vertex in enumerate(g.vs):
    vertex.update_attributes({ 'community': cluster.membership[index], 'id': index })

In [194]:
def normalizer(max_degree):
    max_value = max_degree
    min_value = 1
    def normalize(value):
        return (value - min_value) / max_value + 1
    return normalize

In [195]:
import json
def to_json(graph, cluster):
    for index, vertex in enumerate(graph.vs):
        vertex.update_attributes({ 'community': cluster.membership[index], 'id': index })
    norm= normalizer(graph.maxdegree())
    nodes = []
    edges = []
    for edge in graph.es:
        edge_attr = {}
        edge_attr['from'], edge_attr['to'] = edge.tuple
        edge_attr['weight'] = edge['weight']
        edges.append(edge_attr)
    for node in graph.vs:
        node_attr = {}
        node_attr = { key: node[key] for key in node.attributes()}
        node_attr['degree'] = node.degree()
        nodes.append(node_attr)
    return json.dumps({
        'nodes': nodes,
        'edges': edges,
    }, indent=4)

In [72]:
data = to_json(g, cluster)

In [74]:
with open('default.json', 'w', encoding='utf-8') as file:
    file.write(data)

In [1]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd  
from apyori import apriori 

In [228]:
records = []  
for i in range(0, len(purchase_list)):  
    records.append([str(purchase_list[i]['items'][j]['單品名稱']) for j in range(0, len(purchase_list[i]))])

In [229]:
association_rules = apriori(records, min_support=0.0003, min_confidence=0.0001, min_lift=1, min_length=2)  
association_results = list(association_rules) 

In [204]:
print(association_results)

[RelationRecord(items=frozenset({'中信外１５', '代收手續費１５'}), support=0.002232909653040192, ordered_statistics=[OrderedStatistic(items_base=frozenset({'中信外１５'}), items_add=frozenset({'代收手續費１５'}), confidence=0.8666666666666666, lift=109.09693693693694), OrderedStatistic(items_base=frozenset({'代收手續費１５'}), items_add=frozenset({'中信外１５'}), confidence=0.2810810810810811, lift=109.09693693693694)]), RelationRecord(items=frozenset({'台灣自來水', '中華電信'}), support=0.002447612504294057, ordered_statistics=[OrderedStatistic(items_base=frozenset({'中華電信'}), items_add=frozenset({'台灣自來水'}), confidence=0.2446351931330472, lift=21.417535254445124), OrderedStatistic(items_base=frozenset({'台灣自來水'}), items_add=frozenset({'中華電信'}), confidence=0.21428571428571427, lift=21.417535254445124)]), RelationRecord(items=frozenset({'遠傳電信', '中華電信'}), support=0.000644108553761594, ordered_statistics=[OrderedStatistic(items_base=frozenset({'中華電信'}), items_add=frozenset({'遠傳電信'}), confidence=0.06437768240343347, lift=12.18884120171

In [230]:
for num in range(0,35):
    item_name = [g.vs[index]['name'] for index in cluster[num]]
    info = 0
    count = 0

    for rule in association_results:
        pair = rule[0] 
        items = [x for x in pair]
        for item in items:
            if item in item_name:
                info += rule[2][0][2]
                count += 1

    density = info / count
    utility = density*(count/(count+1))
    print(utility)
    print(item_name)

0.045569253320576146
['（新）銷售用購物袋１８號袋', '伯朗咖啡', '蘋果冰茶', '御茶園特上奶茶', '保力達蠻牛維他命Ｂ飲料', '麥香錫蘭奶茶', '全家紅豆麵包', '奶油雞排歐姆蛋燴飯', '鹼性離子水', '衛生冰', '麥香奶茶ＴＰ３００', '茶裏王日式綠茶', '台鹽海洋鹼性離子水', '金牌台灣啤酒', '冷泡茶冷萃綠茶無糖', '全家克林姆麵包', '蔥爆牛肉燴飯', '可口可樂ＰＥＴ', '純喫茶鮮柚綠茶６５０ｍｌ', '寶礦力水得', '茶裏王英式紅茶', '雲絲頓紅１０毫克香煙', '金蘋果調味乳蘋果風味', '金牌台啤罐裝（６入）', '義美奶茶', '藍山咖啡', '純喫茶紅茶', '（新）４５號銷售用購物袋', '冷泡茶冰釀烏龍', '飲冰室茶集綠奶茶', '飲冰室茶集紅奶茶', '統一布丁（雞蛋口味）', '科學麵', '泡沫綠茶ＴＰ３００', '純喫茶無糖綠茶６５０ｍｌ', '麥香綠茶ＴＰ３００', '義美錫蘭紅茶', '麥香阿薩姆奶茶', '紫米紅豆湯圓', '麥香紅茶ＴＰ３００', '金牌台灣啤酒５００ＭＬ', '原萃日式綠茶', 'ＦＩＮ深海健康補給飲料', '皇家奶茶', '貝納頌經典拿鐵', '海苔肉鬆沙拉麵包', '舒跑運動飲料ＰＥＴ', '天然水２．２Ｌ', '泡沫紅茶ＴＰ３００', '日式蒜香燒豚飯', '葡萄冰茶', '長壽黃硬盒香煙', '純喫茶綠茶６５０ｍｌ', '咖啡廣場奶香特調咖啡', '天然水', '芒果冰茶', '七星１０毫克硬盒香煙', '純喫茶紅茶６５０ｍｌ']
0.09182471314506341
['熱美式中杯', '養樂多', '頂級鮮奶優格－莓果穀物脆片', '蕃薯（２５元）', '香蕉單入', '肉鬆起酥麵包', '特濃咖啡拿鐵', '蝦皮取件Ｃ', '蕃薯（３０元）', '雪花蛋糕', '麥香奶茶（罐裝）', '經典原味熱狗', '大口奶油蕈菇起司雞排飯糰', '肉鬆飯糰', '光泉無加糖鮮豆漿', '熱拿鐵大杯', '全家熱狗麵包', '茶葉蛋（銷售用）', '重黑巧克力蛋糕', '爆濃起司熱狗', '蕃薯（２０元）', '番薯（３５元）', '蕃薯（１５元）', '統一陽光無糖高纖豆漿', 'ＬＣＡ活菌原味發酵乳', '光泉米漿', '熱美式小杯',