In [1]:
%matplotlib inline

In [2]:
import pandas as pd
import collections 
import numpy as np
import json

In [None]:
# Load the raw mail-tracking export. The file carries no header row and is
# Big5-encoded; column names are assigned in a later cell.
df = pd.read_csv(
    "TTS1.csv",
    header=None,
    encoding="big5",
)

- TT	TT1	郵件狀態代碼	X(2) | Status_code
- TT	TT2	掛號號碼	X(20) | Mail_num
- TT	TT3	處理日期	X(10) | Mail_date
- TT	TT4	處理時間	X(8) | Mail_time
- TT	TT5	處理局號	X(6) | OP_office // operation office
- TT	TT6	其它	X(42) | other


In [4]:
# Name the six positional columns after the fields of the record layout:
# status code, registered-mail number, processing date, processing time,
# processing office, and a free-form remainder.
df.columns = [
    "Status_code",
    "Mail_num",
    "Mail_date",
    "Mail_time",
    "OP_office",
    "other",
]

# 定義：狀態碼為 **點**

In [5]:
# Distinct status codes, in order of first appearance; each one is a graph node.
nodes = list(df["Status_code"].unique())

In [6]:
# Sorted, de-duplicated first characters of the status codes.
head_nodes = sorted({code[0] for code in nodes})

In [7]:
# Map each leading character to an integer group id (its rank in head_nodes).
dic_nodes = {head: group for group, head in enumerate(head_nodes)}

In [8]:
def label_node(x):
    """Return the group id registered in ``dic_nodes`` for the first character of ``x``."""
    return dic_nodes[x[0]]

In [51]:
# Nodes made from status codes only.
# Note: data_nodes is rebuilt from the Code_OP values in a later cell.
data_nodes = [
    {"id": status, "group": label_node(status)}
    for status in nodes
]

# 定義：**線**，狀態的改變
- 要依據狀態、郵件號碼及時間去決定**線** 的連接
- 時間的轉換，請參考 [pandas.to_datetime()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html)

In [11]:
# Join the date and time columns into one "<date> <time>" string per row.
df["Mail_datetime"] = df["Mail_date"] + " " + df["Mail_time"]

In [12]:
# Parse the combined strings into pandas datetimes, replacing the text column.
df["Mail_datetime"] = pd.to_datetime(df["Mail_datetime"])

```
all_mail["58668700100170"] = [ ("Y4", "2018-01-01 09:49:04"),
                           ("I4", "2018-01-01 14:11:51"), ... ] 

```

In [57]:
# Composite node id "<status code>-<office>"; the office value is stringified first.
office_as_text = df["OP_office"].map(str)
df["Code_OP"] = df["Status_code"] + "-" + office_as_text

In [80]:
# One node per distinct status/office pair. The group is still derived from
# the first character of the id, i.e. the first character of the status code.
data_nodes = [
    {"id": code_op, "group": label_node(code_op)}
    for code_op in df["Code_OP"].unique()
]

In [82]:
# Group every tracking event by mail number:
#   all_mail[mail_num] -> [(Code_OP, Mail_datetime), ...]
# Events are visited in chronological order so that consecutive entries of a
# history are real state transitions; relying on the CSV's row order would
# produce wrong links whenever the file is not already time-sorted. Merge sort
# is stable, so events sharing a timestamp keep their original file order.
all_mail = {}
events_by_time = df.sort_values("Mail_datetime", kind="mergesort")
for raw_mail_num, code_op, event_time in zip(
    events_by_time["Mail_num"],
    events_by_time["Code_OP"],
    events_by_time["Mail_datetime"],
):
    # Mail numbers are stripped of surrounding whitespace before use as keys.
    mail_code = raw_mail_num.strip()
    all_mail.setdefault(mail_code, []).append((code_op, event_time))

# The dict itself answers membership queries; this set is kept only so that
# any cell still referring to all_mail_key keeps working.
all_mail_key = set(all_mail)

In [83]:
# Number of distinct mail items.
len(all_mail)

5982

In [84]:
# Inspect the recorded (Code_OP, Mail_datetime) history of a single mail item.
all_mail['00120000014101061010']

[('Y4-500584', Timestamp('2018-01-02 07:50:39')),
 ('H4-500584', Timestamp('2018-01-02 14:06:59')),
 ('Z2-500584', Timestamp('2018-01-02 16:34:28'))]

In [85]:
def convert_2_edge(mail_status):
    """Turn one mail item's event history into a list of transitions.

    Parameters
    ----------
    mail_status : list of tuple
        Events in the order they should be linked; only the first element
        of each tuple (the node id) is used.

    Returns
    -------
    list of tuple
        One ``(source, target)`` pair per pair of consecutive events, so a
        history of ``n`` events gives ``n - 1`` pairs (none for ``n < 2``).
    """
    codes = [event[0] for event in mail_status]
    # Pair every code with its successor.
    return list(zip(codes, codes[1:]))
# Example: a history Y4, H4, Z2 yields the transitions
#   "Y4" -> "H4"
#   "H4" -> "Z2"

# Spot-check the conversion on one mail item's history.
mail_status = all_mail['96410700000070']
convert_2_edge(mail_status)

[('Y4-540028', 'Y4-540028'),
 ('Y4-540028', 'I4-540028'),
 ('I4-540028', 'I4-540028')]

In [86]:
# Flatten the transitions of every mail item into one list (duplicates kept,
# so they can be counted afterwards). Items with a single event have no
# transition and are skipped.
all_edges = []
for events in all_mail.values():
    if len(events) > 1:
        all_edges.extend(convert_2_edge(events))

In [88]:
# Occurrence count of each of the 1000 most frequent (source, target) transitions.
value_list = dict(collections.Counter(all_edges).most_common(1000))

# Range of the counts; default=0 keeps these defined when there are no edges.
list_max = max(value_list.values(), default=0)
list_min = min(value_list.values(), default=0)
list_diff = float(list_max - list_min)

# Scale factor for the link value: results span k/3 .. k/3 + 2k over the
# observed count range.
k = 20


def normal_val(x):
    """Linearly rescale a raw transition count ``x`` to an integer link value.

    ``list_min`` maps to ``int(k / 3)`` and ``list_max`` to
    ``int(k / 3 + 2 * k)``. When every count is identical (or there are no
    edges) the range is zero; the baseline ``int(k / 3)`` is returned instead
    of dividing by zero.
    """
    if list_diff == 0:
        return int(k / 3)
    return int(k / 3 + k * 2 * (x - list_min) / list_diff)

In [89]:
# Probe the scaling function with a raw count of 1700.
normal_val(1700)

380

In [90]:
# One link record per transition among the 1000 most frequent ones; "value"
# is the rescaled occurrence count.
data_edges = [
    {"source": src, "target": dst, "value": normal_val(count)}
    for (src, dst), count in collections.Counter(all_edges).most_common(1000)
]

In [91]:
# Final payload: a node list plus a link list.
all_data = {
    "nodes": data_nodes,
    "links": data_edges,
}

In [93]:
# Serialize the graph to disk. The context manager guarantees the file is
# flushed and closed even if serialization or the write fails.
with open("data_4_d3.json", "w") as json_file:
    json_file.write(json.dumps(all_data, indent=2))

131313