In [2]:
import pandas as pd
import numpy

In [3]:
dtype = {
    '交易id': numpy.str,
    '資料日期': numpy.str,
    '資料時間': numpy.str,
    '餐別帶': numpy.str,
    '縣市別': numpy.str,
    '店舖代號': numpy.uint32,
    '主商圈': numpy.str,
    '品號-品名稱': numpy.str,
    '群號-群名稱': numpy.str,
    '單品名稱': numpy.str,
    '銷售數量': numpy.uint16,
    '銷售單價': numpy.float,
    '交易金額': numpy.float
}
USE_COLUMNS = ['交易id', '資料日期', '資料時間', '餐別帶', '縣市別', '店舖代號', '主商圈', '品號-品名稱',
       '群號-群名稱', '單品名稱', '銷售數量', '銷售單價', '交易金額']
PARSE_DATES = {
    '資料日期與時間': [
        '資料日期',
        '資料時間'
    ]
}

In [4]:
file = pd.read_csv('customer_data(utf-8).csv',
                   index_col=1,
                   nrows=100000,
                   usecols=USE_COLUMNS,
                   dtype=dtype,
                   parse_dates=PARSE_DATES,
        )

In [5]:
file = file.dropna()

In [6]:
file.shape

(99981, 11)

In [7]:
def get_copurchase_df(df):
    count_size = df.index.unique()
    size_df = df.groupby('交易id').size()
    size_df = size_df[size_df < 2]
    return df.drop(list(size_df.index), axis=0)

In [8]:
data = get_copurchase_df(file)

In [9]:
data.head()

Unnamed: 0_level_0,資料日期與時間,餐別帶,縣市別,店舖代號,主商圈,品號-品名稱,群號-群名稱,單品名稱,銷售數量,銷售單價,交易金額
交易id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
01637920171201184556000112534201,2017-12-01 18:45:56,晚餐時間帶,台中市,11148,住宅型,72-家庭雜貨,722-家雜用品,（新）銷售用購物袋１８號袋,1,1.0,1.0
01614720171201163042000114405402,2017-12-01 16:30:42,下午茶時間帶,台中市,10697,住宅型,29-冷藏飲料,296-冷藏奶茶,飲冰室茶集綠奶茶,1,25.0,25.0
01384020171201130558000119295301,2017-12-01 13:05:58,午餐時間帶,彰化縣,13840,文教型,12-調理麵,122-熱食麵２配,沙茶牛肉炒麵,1,69.0,69.0
01475120171201153506000111697202,2017-12-01 15:35:06,下午茶時間帶,新竹市,4151,工業型,29-冷藏飲料,291-冷藏茶飲料,冷泡茶台茶１２號紅茶,1,25.0,21.0
01564720171201180924000113751602,2017-12-01 18:09:24,晚餐時間帶,台中市,3860,住宅型,34-健康飲料,343-水,多喝水２Ｌ,2,35.0,49.0


In [10]:
def aggregate(df):
    dic = {}
    for index, row in df.iterrows():
        if index not in dic:
            dic[index] = []
        dic[index].append({
            'name': row['單品名稱'],
            'amount': row['銷售單價']
        })
    dic = { index : value for index, value in dic.items() if len(value) > 1}
    return dic

In [11]:
purchase_list = aggregate(data)

In [47]:
from pymongo import MongoClient

In [48]:
client = MongoClient('localhost', 27017)
db = client['pn']

In [58]:
purchase_list = list(db['transactions'].find({'items.1': { '$exists': True } }, projection=['items']))

In [59]:
purchase_list[0]

{'_id': ObjectId('5cc94a6608c78d51c024704f'),
 'items': [{'品號-品名稱': '31-茶飲料',
   '群號-群名稱': '311-紅茶',
   '銷售單價': 10.0,
   '單品名稱': '麥香紅茶ＴＰ３００',
   'amount': 10.0},
  {'品號-品名稱': '58-香煙',
   '群號-群名稱': '584-進口淡煙',
   '銷售單價': 95.0,
   '單品名稱': '萬寶路金軟包活性碳濾嘴香菸',
   'amount': 95.0}]}

# Network Analysis

In [60]:
from itertools import filterfalse
def convert(purchase_list, support):
    def find_edges_in_list(itemsets):
        from itertools import combinations
        result = []
        return combinations(itemsets, 2)
    result = {}
    nodes = set()
    for transaction in purchase_list:
        itemsets = transaction['items']
        if len(itemsets) > 1:
            for edge_dict_tuple in find_edges_in_list(itemsets):
                edge = tuple([dic['單品名稱'] for dic in edge_dict_tuple])
                weight = sum([dic['amount'] for dic in edge_dict_tuple])
                if edge in result:
                    result[edge]['count'] += 1
                else:
                    result[edge] = {}
                    result[edge]['count'] = 1
                    result[edge]['weight'] = weight
    for key in list(result.keys()):
        if result[key]['count'] < support:
            del result[key]
    for items in result.keys():
        for item in items:
            if item not in nodes:
                nodes.add(item)
    return (nodes, result)

In [64]:
nodes, edges_dict = convert(purchase_list, 8)

In [65]:
print('Node number: {}\nEdge number: {}'.format(len(nodes), len(edges_dict)))

Node number: 194
Edge number: 453


In [33]:
import igraph

In [34]:
g = igraph.Graph()

In [35]:
for node in nodes:
    g.add_vertex(node)

In [36]:
len(g.vs)

44

In [37]:
for edge, attrs in edges_dict.items():
    weight = attrs['weight'] * attrs['count'] if attrs['weight'] > 0 else 1
    g.add_edge(edge[0], edge[1], weight=weight)

In [38]:
g.simplify(combine_edges={ "weight": "sum" })
g.to_undirected(mode='mutual', combine_edges={ "weight": "sum" })

In [39]:
communities = g.community_fastgreedy('weight')

In [40]:
cluster = communities.as_clustering()

In [41]:
len(cluster)

20

In [42]:
for comm in cluster:
    comm_name = [ g.vs[index]['name'] for index in comm]
    print(" ".join(comm_name))
    print("==================")

蕃薯（１５元） 蕃薯（２０元）
蝦皮取件Ｃ 商店街取件
高鐵手續費 高鐵取票
空瓶回收（銷售用） 紅標料理米酒
健保費代收 代收手續費４ 國民年金代
台中裁罰單 代收手續費６
伊藤園蘋果紅茶 促銷券０６
蝦皮寄件Ｆ 商店街寄件Ｆ
通行繳費 代收手續費５
戰禍邪神第１１章 戰禍邪神第１２章
麥香奶茶ＴＰ３００ 麥香紅茶ＴＰ３００
寶物交易代 代收手續費２５
店到店雅虎拍賣手續費 雅虎拍賣寄件
ＦＰ店到店 店到店ＦＰ手續費
蘋果日報 自由時報
代收手續費１３ 渣打一般３
代收手續費１５ 玉山淘寶款 雅虎拍賣繳費 中信外１５ 合庫代１５
台灣自來水 中華電信
代收折價卷 奶香綠茶３３０ＭＬ
台鐵取票 台鐵手續費


In [43]:
cluster.modularity

0.6157882031074606

In [44]:
items = []
for index, value in enumerate(g.betweenness(weights='weight')):
    if value > 0:
        items.append({ 'name': g.vs[index]['name'], 'betweeness': value })
items.sort(key=lambda x: x['betweeness'], reverse=True)

In [45]:
items[0:10]

[{'name': '代收手續費１５', 'betweeness': 6.0}, {'name': '代收手續費４', 'betweeness': 1.0}]

In [46]:
g.degree()

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 2,
 1,
 1,
 1,
 4,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1]

In [75]:
for index, vertex in enumerate(g.vs):
    vertex.update_attributes({ 'community': cluster.membership[index], 'id': index })

In [87]:
def normalizer(max_degree):
    max_value = max_degree
    min_value = 1
    def normalize(value):
        return (value - min_value) / max_value + 1
    return normalize

In [94]:
import json
def to_json(graph, cluster):
    for index, vertex in enumerate(graph.vs):
        vertex.update_attributes({ 'community': cluster.membership[index], 'id': index })
    norm= normalizer(graph.maxdegree())
    nodes = []
    edges = []
    for edge in graph.es:
        edge_attr = {}
        edge_attr['from'], edge_attr['to'] = edge.tuple
        edge_attr['weight'] = edge['weight']
        edges.append(edge_attr)
    for node in graph.vs:
        node_attr = {}
        node_attr = { key: node[key] for key in node.attributes()}
        node_attr['degree'] = node.degree()
        nodes.append(node_attr)
    return json.dumps({
        'nodes': nodes,
        'edges': edges,
    }, indent=4)

In [95]:
data = to_json(g, cluster)

In [96]:
with open('data.json', 'w', encoding='utf-8') as file:
    file.write(data)