In [1]:
%matplotlib inline

In [2]:
import pandas as pd
from pathlib import Path
import collections 
import numpy as np
import json
import dask.dataframe as dd
import time
import zipfile as zfile
import threading
import datetime as dt

In [3]:
# 取得壓縮檔內的檔名

df_fn = []
zf = zfile.ZipFile('../POSTDB.zip')
for fname in zfile.ZipFile.namelist(zf):
    df_fn.append(fname.encode('cp437').decode('big5'))
df_fn

['CC.csv',
 'CC.SQL',
 'GPS.SQL',
 'GPSS1.CSV',
 'TT.SQL',
 'TTS1.csv',
 '中華郵政大數據競賽資料欄位規格.xlsx',
 '中華郵政大數據競賽資料欄位規格_新.xlsx',
 'ORACLE相關/',
 'ORACLE相關/ACC_Oracle.SQL',
 'ORACLE相關/CC_Oracle.SQL',
 'ORACLE相關/createDB.txt',
 'ORACLE相關/GPS_Oracle.SQL',
 'ORACLE相關/sqlldr.exe',
 'ORACLE相關/TTS1.CTL',
 'ORACLE相關/TT_Oracle.SQL',
 '培訓暨說明會 - 簡報檔/',
 '培訓暨說明會 - 簡報檔/1.競賽辦法說明.pdf',
 '培訓暨說明會 - 簡報檔/2.競賽資料說明-1.pdf',
 '培訓暨說明會 - 簡報檔/2.競賽資料說明-2(中華郵政大數據競賽資料欄位規格).xlsx',
 '培訓暨說明會 - 簡報檔/3.郵務知識培訓-1.pdf',
 '培訓暨說明會 - 簡報檔/3.郵務知識培訓-2.pdf',
 '培訓暨說明會 - 簡報檔/4.微軟Power BI分析軟體介紹.pdf',
 '培訓暨說明會 - 簡報檔/5.競賽命題及方向之工研院楊奇達經理建議.pdf',
 '培訓暨說明會 - 簡報檔/5.競賽命題及方向之高端訓博士建議.pdf',
 'ACC.SQL',
 'ACCS1.csv']

In [4]:
# df = pd.read_csv("mid_set/TTS1.csv", header=None, encoding="big5")
zf = zfile.ZipFile('../POSTDB.zip')
df = pd.read_csv(zf.open('TTS1.csv'), # 檔案位置
                 header=None, # 首行是否有欄位名稱
                 encoding="big5", # 編碼
                 dtype = "str", # 資料類型
                 nrows = 200000000, #讀取行數
                 names = ["Status_code", "Mail_num", "Mail_date", "Mail_time", "OP_office", "other"] # 欄位名稱
                )

In [5]:
mail_counts = df.groupby(['Mail_num']).size().reset_index(name='counts').sort_values(['counts'], ascending=False)
mail_cnts = mail_counts.groupby(['counts']).size().reset_index(name='cnts').sort_values(['cnts'], ascending=False)
mail_cnts

Unnamed: 0,counts,cnts
3,4,26054547
2,3,6362504
4,5,4033064
0,1,3705017
1,2,2432054
5,6,1540988
7,8,1373221
6,7,1032139
8,9,919168
9,10,345810


#### TT1 , X(2) , 郵件狀態代碼，TT2 , X(20) , 掛號號碼，TT3 , X(10) , 處理日期，TT4 , X(8) , 處理時間，TT5 , X(6) , 處理局號，TT6 , X(42) , 其它

In [93]:
nodes = list(df.Status_code.unique())

In [94]:
head_nodes = sorted(list(set([node[0] for node in nodes])))

In [95]:
dic_nodes = dict(zip(head_nodes, range(len(head_nodes))))

In [96]:
label_node = lambda x: dic_nodes[x[0]]

In [97]:
data_nodes = []
for node in nodes:
    data = {"id":node, "group":label_node(node)}
    data_nodes.append(data)

In [98]:
df['Mail_datetime'] = pd.to_datetime(df.Mail_date + " " + df.Mail_time)

In [99]:
len(df)

10000000

In [78]:
df.Mail_num.unique()

array(['00000000000000      ', '58668700100170      ',
       '59928400100170      ', ..., '15388440206518      ',
       '15388540206518      ', '15388640206518      '], dtype=object)

In [100]:
start_time = time.time()

all_mail = {}
for idx, row in df.iterrows():
    # print(idx, row.Status_code, row.Mail_num.strip(), row.Mail_datetime)
    
    if not row.Mail_num.strip() in all_mail:
        all_mail[row.Mail_num.strip()] = []
        
    all_mail[row.Mail_num.strip()].append((row.Status_code, row.Mail_datetime))
    
time_diff = time.time() - start_time
print("處理 ",len(df), "筆資料，組成 ", len(all_mail.keys()), " 筆資料，共執行：", time_diff, "秒")

處理  10000000 筆資料，組成  4419944  筆資料，共執行： 931.2079753875732 秒


In [101]:
len(all_mail.keys())

4419944

In [102]:
def convert_2_edge(mail_status):
    edges = []
    for idx in range(len(mail_status) - 1):
        edges.append((mail_status[idx][0], mail_status[idx + 1][0]))
        
    return edges

# mail_status = all_mail["02620770702618"]
# convert_2_edge(mail_status)

In [103]:
all_edges = []
for mail_code in all_mail:
    status_num = len(all_mail[mail_code])
    
    if(status_num) > 1:
        mail_status = all_mail[mail_code]
        all_edges.extend(convert_2_edge(mail_status))

In [104]:
value_list = {}
for edge, value in collections.Counter(all_edges).most_common(1000):
    value_list[edge] = value

In [105]:
list_max = np.max(list(value_list.values()))
list_min = np.min(list(value_list.values()))

In [106]:
list_diff = float(list_max - list_min)

In [107]:
k = 20
normal_val = lambda x: int(k/3+k*2*(x-list_min)/list_diff)

In [108]:
data_edges = []
for edge, value in collections.Counter(all_edges).most_common(10000):
    edge = {"source": edge[0], "target":edge[1], "value":normal_val(value)}
    data_edges.append(edge)

In [109]:
all_data = {"nodes":data_nodes, "links":data_edges}

In [110]:
open("data_4_d3.json",'w').write(json.dumps(all_data, indent = 2))

52547