In [1]:
import pandas as pd
import numpy

In [2]:
import codecs
import os
with codecs.open('customer_data.csv', 'r') as file:
    BLOCKSIZE = 512
    if not os.path.exists('customer_data(utf-8).csv'):
        with codecs.open('customer_data(utf-8).csv', "w", "utf-8") as targetFile:
            while True:
                contents = file.read(BLOCKSIZE)
                if not contents:
                    break
                targetFile.write(contents)

FileNotFoundError: [Errno 2] No such file or directory: 'customer_data.csv'

In [2]:
dtype = {
    '交易id': numpy.str,
    '資料日期': numpy.str,
    '資料時間': numpy.str,
    '餐別帶': numpy.str,
    '縣市別': numpy.str,
    '店舖代號': numpy.uint32,
    '主商圈': numpy.str,
    '品號-品名稱': numpy.str,
    '群號-群名稱': numpy.str,
    '單品名稱': numpy.str,
    '銷售數量': numpy.uint16,
    '銷售單價': numpy.float,
    '交易金額': numpy.float
}
USE_COLUMNS = ['交易id', '資料日期', '資料時間', '餐別帶', '縣市別', '店舖代號', '主商圈', '品號-品名稱',
       '群號-群名稱', '單品名稱', '銷售數量', '銷售單價', '交易金額']
PARSE_DATES = {
    '資料日期與時間': [
        '資料日期',
        '資料時間'
    ]
}

In [3]:
file = pd.read_csv('customer_data(utf-8).csv',
                   index_col=1,
                   nrows=100000,
                   usecols=USE_COLUMNS,
                   dtype=dtype,
                   parse_dates=PARSE_DATES,
        )

In [4]:
file = file.dropna()

In [5]:
file.shape

(99981, 11)

In [6]:
def get_copurchase_df(df):
    count_size = df.index.unique()
    size_df = df.groupby('交易id').size()
    size_df = size_df[size_df < 2]
    return df.drop(list(size_df.index), axis=0)

In [7]:
data = get_copurchase_df(file)

In [8]:
data.head()

Unnamed: 0_level_0,資料日期與時間,餐別帶,縣市別,店舖代號,主商圈,品號-品名稱,群號-群名稱,單品名稱,銷售數量,銷售單價,交易金額
交易id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
01637920171201184556000112534201,2017-12-01 18:45:56,晚餐時間帶,台中市,11148,住宅型,72-家庭雜貨,722-家雜用品,（新）銷售用購物袋１８號袋,1,1.0,1.0
01614720171201163042000114405402,2017-12-01 16:30:42,下午茶時間帶,台中市,10697,住宅型,29-冷藏飲料,296-冷藏奶茶,飲冰室茶集綠奶茶,1,25.0,25.0
01384020171201130558000119295301,2017-12-01 13:05:58,午餐時間帶,彰化縣,13840,文教型,12-調理麵,122-熱食麵２配,沙茶牛肉炒麵,1,69.0,69.0
01475120171201153506000111697202,2017-12-01 15:35:06,下午茶時間帶,新竹市,4151,工業型,29-冷藏飲料,291-冷藏茶飲料,冷泡茶台茶１２號紅茶,1,25.0,21.0
01564720171201180924000113751602,2017-12-01 18:09:24,晚餐時間帶,台中市,3860,住宅型,34-健康飲料,343-水,多喝水２Ｌ,2,35.0,49.0


In [9]:
def aggregate(df):
    dic = {}
    for index, row in df.iterrows():
        if index not in dic:
            dic[index] = []
        dic[index].append({
            'name': row['單品名稱'],
            'amount': row['銷售單價']
        })
    dic = { index : value for index, value in dic.items() if len(value) > 1}
    return dic

In [10]:
purchase_list = aggregate(data)

In [11]:
len(purchase_list)

23288

# Network Analysis

In [12]:
from itertools import filterfalse
def convert(purchase_list, support):
    def find_edges_in_list(itemsets):
        from itertools import combinations
        result = []
        return combinations(itemsets, 2)
    result = {}
    nodes = set()
    for key, itemsets in purchase_list.items():
        if len(itemsets) > 1:
            for edge_dict_tuple in find_edges_in_list(itemsets):
                edge = tuple([dic['name'] for dic in edge_dict_tuple])
                weight = sum([dic['amount'] for dic in edge_dict_tuple])
                if edge in result:
                    result[edge]['count'] += 1
                else:
                    result[edge] = {}
                    result[edge]['count'] = 1
                    result[edge]['weight'] = weight
    for key in list(result.keys()):
        if result[key]['count'] < support:
            del result[key]
    for items in result.keys():
        for item in items:
            if item not in nodes:
                nodes.add(item)
    return (nodes, result)

In [15]:
nodes, edges_dict = convert(purchase_list, 8)

In [16]:
print('Node number: {}\nEdge number: {}'.format(len(nodes), len(edges_dict)))

Node number: 194
Edge number: 453


In [17]:
import igraph

In [18]:
g = igraph.Graph()

In [19]:
for node in nodes:
    g.add_vertex(node)

In [20]:
len(g.vs)

194

In [21]:
for edge, attrs in edges_dict.items():
    weight = attrs['weight'] * attrs['count'] if attrs['weight'] > 0 else 1
    g.add_edge(edge[0], edge[1], weight=weight)

In [22]:
g.simplify(combine_edges={ "weight": "sum" })

<igraph.Graph at 0x115a7c408>

In [23]:
communities = g.community_fastgreedy('weight')

In [24]:
cluster = communities.as_clustering()

In [25]:
len(cluster)

36

In [26]:
for comm in cluster:
    comm_name = [ g.vs[index]['name'] for index in comm]
    print(" ".join(comm_name))
    print("==================")

泡沫紅茶ＴＰ３００ 鹼性離子水 麥香阿薩姆奶茶 日式蒜香燒豚飯 （新）銷售用購物袋１８號袋 純喫茶紅茶 冷泡茶冰釀烏龍 （新）４５號銷售用購物袋 麥香奶茶ＴＰ３００ 台鹽海洋鹼性離子水 紫米紅豆湯圓 泡沫綠茶ＴＰ３００ 原萃日式綠茶 金蘋果調味乳蘋果風味 純喫茶綠茶６５０ｍｌ 全家克林姆麵包 伯朗咖啡 飲冰室茶集紅奶茶 純喫茶無糖綠茶６５０ｍｌ 蔥爆牛肉燴飯 義美奶茶 咖啡廣場奶香特調咖啡 冷泡茶冷萃綠茶無糖 麥香錫蘭奶茶 麥香綠茶ＴＰ３００ 奶油雞排歐姆蛋燴飯 金牌台灣啤酒 可口可樂ＰＥＴ 金牌台灣啤酒５００ＭＬ 雲絲頓紅１０毫克香煙 統一大布丁（雞蛋口味） 天然水 飲冰室茶集綠奶茶 天然水２．２Ｌ 茶裏王日式綠茶 金牌台啤罐裝（６入） 麥香紅茶ＴＰ３００ 藍山咖啡 七星１０毫克硬盒香煙 衛生冰
伊藤園蘋果紅茶 頑皮滷蛋－原味 促銷券０６ Ｃｒｅａｍ－Ｏ黑巧克力三明治餅
雅虎線上寄件 店到店雅虎拍賣線上手續費 露天寄件Ｗ 蝦皮寄件Ｗ 商店街寄件Ｗ
特濃黑可可 商店街取件 鮪魚飯糰 簡單點原味優酪乳 光泉米漿 爆濃起司熱狗 鮮奶茶 簡單點無加糖優酪乳 茶葉蛋（銷售用） 頂級鮮奶優格－莓果穀物脆片 ＬＣＡ活菌原味發酵乳 蕃薯（２０元） 蕃薯（１５元） 經典原味熱狗 林鳳營全脂鮮乳 肉鬆起酥麵包 蕃薯（３０元） 黑胡椒熱狗 特濃咖啡拿鐵 肉鬆飯糰 熱拿鐵小杯 香蕉單入 蝦皮取件Ｃ 光泉無加糖鮮豆漿 蕃薯（２５元） 統一陽光無糖高纖豆漿 雪花蛋糕 養樂多 義美古早傳統無糖豆奶 全家熱狗麵包
蝦皮寄件Ｆ 商店街寄件Ｆ
中信外１５ 玉山淘寶款 合庫代１５ 代收手續費１５ 雅虎拍賣繳費
讚岐烏龍麵 海鮮魚卵棒 黃金厚切魚板 鮮香菇 究極味付蛋 王子麵 日式黑輪 特級花枝丸 白玉蘿蔔 手工高麗菜捲 旗魚黑輪 關東煮本舖冬粉 千層玉子燒 筊白筍 蟹肉糰子 黃金魚豆腐 杏鮑菇 野菜多多魚餅 關東煮本舖拉麵
大口法香烤雞飯糰 大口奶油蕈菇起司雞排飯糰 特濃抹茶拿鐵 綠豆沙牛乳 熱美式中杯 熱拿鐵中杯
欣中天然氣 代收手續費４ 花旗信用卡 遠傳電信 台灣大哥大 國民年金代 台灣自來水 台新信用卡 地方查核稅款 台灣電力 勞工退休金 健保費代收 聯邦信用卡 勞保費代收 欣林瓦斯費 國泰世華卡 台中二段停 玉山信用卡 中華電信 中信有線代
代收手續費１３ 渣打一般３
葡萄冰茶 蘋果冰茶 芒果

In [27]:
cluster.modularity

0.8211309100930249

In [28]:
items = []
for index, value in enumerate(g.betweenness(weights='weight')):
    if value > 0:
        items.append({ 'name': g.vs[index]['name'], 'betweeness': value })
items.sort(key=lambda x: x['betweeness'], reverse=True)

In [29]:
items[0:10]

[{'name': '（新）銷售用購物袋１８號袋', 'betweeness': 2790.0},
 {'name': '茶葉蛋（銷售用）', 'betweeness': 2564.0},
 {'name': '麥香奶茶ＴＰ３００', 'betweeness': 2011.0},
 {'name': '促銷券０６', 'betweeness': 726.0},
 {'name': '代收折價卷', 'betweeness': 506.0},
 {'name': '熱拿鐵中杯', 'betweeness': 419.0},
 {'name': '經典原味熱狗', 'betweeness': 255.0},
 {'name': '肉鬆飯糰', 'betweeness': 248.0},
 {'name': '鮪魚飯糰', 'betweeness': 173.0},
 {'name': '黑胡椒熱狗', 'betweeness': 172.0}]

In [30]:
for index, vertex in enumerate(g.vs):
    vertex.update_attributes({ 'community': cluster.membership[index], 'id': index })

In [31]:
for i in g.es:
    print(i)

igraph.Edge(<igraph.Graph object at 0x115a7c408>, 0, {'weight': 200.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 1, {'weight': 300.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 2, {'weight': 6264.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 3, {'weight': 1500.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 4, {'weight': 1.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 5, {'weight': 841.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 6, {'weight': 380.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 7, {'weight': 600.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 8, {'weight': 2.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 9, {'weight': 1110.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 10, {'weight': 645.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 11, {'weight': 986.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 12, {'weight': 720.0})
igraph.Edge(<igraph.Graph object at 0x115a7c408>, 13, {'weight

In [32]:
import json
def to_json(graph, cluster):
    for index, vertex in enumerate(graph.vs):
        vertex.update_attributes({ 'community': cluster.membership[index], 'id': index })
    nodes = []
    edges = []
    for edge in graph.es:
        edge_attr = {}
        edge_attr['from'], edge_attr['to'] = edge.tuple
        edge_attr['weight'] = edge['weight']
        edges.append(edge_attr)
    for node in graph.vs:
        node_attr = {}
        node_attr = { key: node[key] for key in node.attributes()}
        nodes.append(node_attr)
    return json.dumps({
        'nodes': nodes,
        'edges': edges,
    }, indent=4)

In [33]:
data = to_json(g, cluster)

In [34]:
with open('data.json', 'w', encoding='utf-8') as file:
    file.write(data)