In [1]:
%matplotlib inline

In [2]:
import pandas as pd

In [230]:
# Load the raw status records: the CSV has no header row and is Big5-encoded.
df = pd.read_csv(
    "mid_set/TTS1.csv",
    header=None,
    encoding="big5",
)

- TT	TT1	郵件狀態代碼	X(2) | Status_code
- TT	TT2	掛號號碼	X(20) | Mail_num
- TT	TT3	處理日期	X(10) | Mail_date
- TT	TT4	處理時間	X(8) | Mail_time
- TT	TT5	處理局號	X(6) | OP_office // operation office
- TT	TT6	其它	X(42) | other


In [231]:
# Name the six positional columns after fields TT1-TT6 of the record layout.
df.columns = [
    "Status_code",  # TT1
    "Mail_num",     # TT2
    "Mail_date",    # TT3
    "Mail_time",    # TT4
    "OP_office",    # TT5
    "other",        # TT6
]

# 定義：狀態碼為 **點**

In [232]:
# Every distinct status code becomes one graph node.
nodes = list(df["Status_code"].unique())

In [233]:
# Distinct first characters of the status codes, in sorted order.
head_nodes = sorted({status[0] for status in nodes})

In [234]:
# Map each leading character to its position in head_nodes (used as the group id).
dic_nodes = {prefix: position for position, prefix in enumerate(head_nodes)}

In [235]:
def label_node(x):
    """Return the group index of status code ``x``.

    The group is looked up in ``dic_nodes`` by the first character of ``x``;
    a first character that is not a key of ``dic_nodes`` raises ``KeyError``.
    """
    return dic_nodes[x[0]]

In [236]:
# Spot check: show the group index assigned to one sample status code.
label_node("T4")

6

In [237]:
# One record per status code: its id plus the group derived from its first character.
data_nodes = [
    {"id": status, "group": label_node(status)}
    for status in nodes
]

# 定義：**線**，狀態的改變
- 要依據狀態、郵件號碼及時間去決定**線** 的連接
- 時間的轉換，請參考 [pandas.to_datetime()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html)

In [238]:
# Join the date and time columns into one "date time" string per row.
df["Mail_datetime"] = df["Mail_date"] + " " + df["Mail_time"]

In [239]:
# Parse the combined strings into pandas timestamps, replacing the column.
df["Mail_datetime"] = pd.to_datetime(df["Mail_datetime"])

目標資料結構：以掛號號碼為鍵，值為該郵件的 (狀態碼, 處理時間) 清單，例如：

```
all_mail["58668700100170"] = [ ("Y4", "2018-01-01 09:49:04"),
                               ("I4", "2018-01-01 14:11:51"), ... ]
```

In [None]:
# Group the status events by registered-mail number:
#   all_mail[mail number] -> [(Status_code, Mail_datetime), ...]
# Iterating the three columns in parallel avoids building a Series per row,
# which is what iterrows() does.
all_mail = {}
for mail_num, status_code, mail_datetime in zip(
        df.Mail_num, df.Status_code, df.Mail_datetime):
    # Mail numbers are stripped of surrounding whitespace before being used as keys.
    all_mail.setdefault(mail_num.strip(), []).append((status_code, mail_datetime))

# Edges are built from consecutive events, so each history must be in time
# order rather than in whatever order the rows appear in the CSV.
# list.sort is stable: events sharing a timestamp keep their file order.
for events in all_mail.values():
    events.sort(key=lambda event: event[1])

# Kept so that any later cell relying on this name still works.
all_mail_key = set(all_mail)

In [None]:
# Number of distinct mail numbers collected.
len(all_mail)

In [120]:
# Inspect the recorded (status, timestamp) history of one mail number.
all_mail['00120000014101061010']

[('Y4', Timestamp('2018-01-02 07:50:39')),
 ('H4', Timestamp('2018-01-02 14:06:59')),
 ('Z2', Timestamp('2018-01-02 16:34:28'))]

In [121]:
def convert_2_edge(mail_status):
    """Turn one mail's status history into consecutive status transitions.

    mail_status: list of tuples whose first element is the status code.
    Returns a list of (from_status, to_status) tuples, one per adjacent pair;
    histories with fewer than two entries give an empty list.
    """
    return [
        (current[0], following[0])
        for current, following in zip(mail_status, mail_status[1:])
    ]
# "Y4" -> "H4", 
# "H4" -> "Z2"

# Try the helper on the status history of one mail number and show its edges.
mail_status = all_mail['96410700000070']
convert_2_edge(mail_status)

[('Y4', 'Y4'), ('Y4', 'I4'), ('I4', 'I4')]

In [122]:
# Flatten the transitions of every mail item that has more than one recorded
# status into a single list of (from_status, to_status) edges.
all_edges = [
    transition
    for history in all_mail.values()
    if len(history) > 1
    for transition in convert_2_edge(history)
]

In [226]:
import collections 
import numpy as np

In [227]:
# Occurrence count of each (source, target) transition, limited to the 1000
# most frequent ones.
value_list = dict(collections.Counter(all_edges).most_common(1000))

# Spread of the retained counts, used to rescale a count into a link value.
list_max = np.max(list(value_list.values()))
list_min = np.min(list(value_list.values()))
list_diff = float(list_max - list_min)

# Scale factor: the rarest retained edge maps to k/3, the most frequent to k/3 + 2*k.
k = 20


def normal_val(x):
    """Rescale a transition count ``x`` linearly into an integer link value.

    ``list_min`` maps to ``int(k/3)`` and ``list_max`` to ``int(k/3 + 2*k)``.
    When every retained count is identical there is no spread to scale by,
    so the base value ``int(k/3)`` is returned instead of dividing by zero.
    """
    if list_diff == 0:
        return int(k/3)
    return int(k/3+k*2*(x-list_min)/list_diff)

In [228]:
# Spot check: scaled link value for a transition count of 1700.
normal_val(1700)

36

In [222]:
# One link record per retained transition (1000 most frequent), with its
# count rescaled by normal_val.
data_edges = [
    {"source": pair[0], "target": pair[1], "value": normal_val(count)}
    for pair, count in collections.Counter(all_edges).most_common(1000)
]

In [223]:
# Bundle nodes and links into the single object that is written to JSON.
all_data = dict(nodes=data_nodes, links=data_edges)

In [224]:
import json

In [225]:
# Serialize first, so a failure in json.dumps cannot leave a truncated file,
# then write inside a context manager so the handle is always flushed and closed.
payload = json.dumps(all_data, indent=2)
with open("data_4_d3.json", 'w') as json_file:
    json_file.write(payload)

6390

## test on op_office

In [47]:
# Trial run on the first 1000 rows only:
#   mail_op[mail number] -> [(Status_code, OP_office, Mail_datetime), ...]
mail_op = {}
for _, record in df.head(1000).iterrows():
    key = record.Mail_num.strip()
    event = (record.Status_code, record.OP_office, record.Mail_datetime)
    mail_op.setdefault(key, []).append(event)

In [48]:
def OP_seq(mail_seq):
    """Describe each consecutive pair of events of one mail item.

    mail_seq: list of tuples whose first element is the status code and whose
    second element is the office.
    Returns a list of ("<from_status>-<to_status>", from_office, to_office)
    tuples, one per adjacent pair; fewer than two events give an empty list.
    """
    transitions = []
    for before, after in zip(mail_seq, mail_seq[1:]):
        label = "%s-%s" % (before[0], after[0])
        transitions.append((label, before[1], after[1]))
    return transitions

In [49]:
# Transition list for every mail item that has more than one recorded event.
mail_op_seq = [
    OP_seq(events)
    for events in mail_op.values()
    if len(events) > 1
]

In [50]:
# Display the per-mail transition lists built from the first 1000 rows.
mail_op_seq

[[('X2-H7', 830584, 830584)],
 [('Y4-I4', 830584, 830584)],
 [('Y4-Y4', 540028, 540028),
  ('Y4-I4', 540028, 540028),
  ('I4-I4', 540028, 540028)],
 [('Y4-Y4', 730013, 730013), ('Y4-I4', 730013, 730013)],
 [('Y4-I4', 360029, 360029)],
 [('Z4-Y4', 91814, 100587), ('Y4-I4', 100587, 100587)],
 [('Z2-P4', 500600, 500600)],
 [('Y4-I4', 100189, 100189)],
 [('Y4-I4', 600009, 600009)],
 [('H4-W2', 500027, 500027)],
 [('W2-I4', 260582, 260582)],
 [('W2-I4', 640582, 640582)],
 [('H4-W2', 500027, 500027)],
 [('Y4-I4', 330585, 330585)],
 [('Y4-H4', 300038, 300038)],
 [('I4-I4', 320008, 320008)],
 [('Y4-I4', 640026, 640026)],
 [('Y4-I4', 100601, 100601)],
 [('H4-W2', 500027, 500027)],
 [('H4-W2', 500027, 500027)],
 [('H4-W2', 500027, 500027)],
 [('Z2-P4', 500600, 500600)],
 [('H4-W2', 500027, 500027)],
 [('W2-I4', 100603, 100603)],
 [('H4-W2', 500027, 500027)],
 [('H4-W2', 500027, 500027)],
 [('Y4-I4', 100189, 100189)],
 [('Y4-I4', 970582, 970582)],
 [('Y4-I4', 830584, 830584)],
 [('Y4-H4', 220032,