In [1]:
# 把網圖型資料與地點位置結合

In [52]:
%matplotlib inline

In [53]:
import pandas as pd
from pathlib import Path
import collections 
import numpy as np
import json
import dask.dataframe as dd
import time
import zipfile as zfile
import threading
import datetime as dt

In [54]:
df_fn = []
zf = zfile.ZipFile('E:\POSTDB.zip')
for fname in zfile.ZipFile.namelist(zf):
    df_fn.append(fname.encode('cp437').decode('big5'))
df_fn

['CC.csv',
 'CC.SQL',
 'GPS.SQL',
 'GPSS1.CSV',
 'TT.SQL',
 'TTS1.csv',
 '中華郵政大數據競賽資料欄位規格.xlsx',
 '中華郵政大數據競賽資料欄位規格_新.xlsx',
 'ORACLE相關/',
 'ORACLE相關/ACC_Oracle.SQL',
 'ORACLE相關/CC_Oracle.SQL',
 'ORACLE相關/createDB.txt',
 'ORACLE相關/GPS_Oracle.SQL',
 'ORACLE相關/sqlldr.exe',
 'ORACLE相關/TTS1.CTL',
 'ORACLE相關/TT_Oracle.SQL',
 '培訓暨說明會 - 簡報檔/',
 '培訓暨說明會 - 簡報檔/1.競賽辦法說明.pdf',
 '培訓暨說明會 - 簡報檔/2.競賽資料說明-1.pdf',
 '培訓暨說明會 - 簡報檔/2.競賽資料說明-2(中華郵政大數據競賽資料欄位規格).xlsx',
 '培訓暨說明會 - 簡報檔/3.郵務知識培訓-1.pdf',
 '培訓暨說明會 - 簡報檔/3.郵務知識培訓-2.pdf',
 '培訓暨說明會 - 簡報檔/4.微軟Power BI分析軟體介紹.pdf',
 '培訓暨說明會 - 簡報檔/5.競賽命題及方向之工研院楊奇達經理建議.pdf',
 '培訓暨說明會 - 簡報檔/5.競賽命題及方向之高端訓博士建議.pdf',
 'ACC.SQL',
 'ACCS1.csv']

In [55]:
# 載入特種郵件追蹤查詢資料檔
zf = zfile.ZipFile('E:\POSTDB.zip')
df = pd.read_csv(zf.open('TTS1.csv'), # 檔案位置
                 header=None, # 首行是否有欄位名稱
                 encoding="big5", # 編碼
                 # dtype = "str", # 資料類型
                 nrows = 50000, #讀取行數
                 names = ["Status_code", "Mail_num", "Mail_date", "Mail_time", "OP_office", "other", "Mail_datetime"] # 欄位名稱
                )

In [56]:
df.Mail_datetime = pd.to_datetime(df.Mail_date + " " + df.Mail_time)

In [None]:
# 統計相關

df.groupby(['Mail_num'], sort = True)['Mail_datetime'].max()

mail_counts = df.groupby(['Mail_num']).size().reset_index(name='counts').sort_values(['counts'], ascending=False)
mail_cnts = mail_counts.groupby(['counts']).size().reset_index(name='cnts').sort_values(['cnts'], ascending=False)
mail_cnts

In [57]:
# 要執行以下程式時，需重載資料
del df['Mail_date']
del df['Mail_time']
del df['other']


In [58]:
mail_df = df.sort_values(["Mail_num", "Mail_datetime"])

In [59]:
df.sort_values(["Mail_num", "Mail_datetime"]).head(3)

Unnamed: 0,Status_code,Mail_num,OP_office,Mail_datetime
0,Y4,0,330031,2018-01-01 09:49:04
1,Y4,0,330031,2018-01-01 09:58:08
2,I4,0,330031,2018-01-01 14:11:51


In [9]:
df.reset_index().head(3)

Unnamed: 0,index,Status_code,Mail_num,OP_office,Mail_datetime
0,0,Y4,0,330031,2018-01-01 09:49:04
1,1,Y4,0,330031,2018-01-01 09:58:08
2,2,I4,0,330031,2018-01-01 14:11:51


In [14]:
# 載入郵局資料檔
post_office = pd.read_csv('D:\GitHub\POSTBD\coach\Post_All_new.csv')
new_po = post_office[["郵務局號", "局名", "緯度", "經度"]]
new_po.columns = ["post_code", "name", "lon", "lat"]
po_dict = new_po.set_index("post_code").to_dict()


In [38]:
# 輸入郵務局號，回傳經緯度座標
def getPOInfo(post_code):
    if isPOCode(post_code):
        return (po_dict['name'][post_code],
                po_dict['lon'][post_code],
                po_dict['lat'][post_code])
    else:
        return ("","","")

In [39]:
# 判斷郵務局號是否存在
unique_po_code = set(po_dict['name'].keys())
def isPOCode(post_code):
    if post_code in unique_po_code:
        return True
    else:
        return False

In [40]:
# 整合郵件資料的郵號與地點

start_time = time.time()

all_mail = {}
all_mail_key = set()
for idx, row in df.iterrows():
    mail_code = row.Mail_num.strip()
    if not mail_code in all_mail_key:
        all_mail[mail_code] = []
        all_mail_key.add(mail_code)
        
    all_mail[mail_code].append( (row.OP_office, row.Mail_datetime, row.Status_code) )
    
time_diff = time.time() - start_time
print("處理 ",len(df), "筆資料，組成 ", len(all_mail.keys()), " 筆資料，共執行：", time_diff, "秒")

處理  50000 筆資料，組成  28364  筆資料，共執行： 4.182816028594971 秒


In [41]:
def convert_2_edge(mail_status):
    edges = []
    for idx in range(len(mail_status) - 1):
        edges.append((mail_status[idx][0], mail_status[idx + 1][0],
                     mail_status[idx][2], mail_status[idx + 1][2]))
        
    return edges

In [42]:
all_edges = [] 
for mail_code in all_mail:
    status_num = len(all_mail[mail_code])
    
    if (status_num) > 1:
        mail_status = all_mail[mail_code]
        all_edges.extend(convert_2_edge(mail_status))

In [43]:
src_nodes = set([ ele[0] for ele in all_edges])
tar_nodes = set([ ele[1] for ele in all_edges]) 
all_nodes = list(src_nodes.union(tar_nodes))

In [44]:
print(len(src_nodes), len(tar_nodes), len(all_nodes))

696 553 738


In [None]:
getPOInfo(all_nodes[12])



In [48]:
import simplekml

In [49]:
kml = simplekml.Kml()

In [50]:
for x in all_edges[:3]:
    src, target, src_state, target_state = x
    if isPOCode(src) and isPOCode(target):
        src_pnt = getPOInfo(src)
        tar_pnt = getPOInfo(target)
        
        # point
        kml.newpoint(name=str(src), 
                     description=src_pnt[0],
                     coords=[(src_pnt[2],src_pnt[1])])
        kml.newpoint(name=str(target), 
                     description=tar_pnt[0],
                     coords=[(tar_pnt[2],tar_pnt[1])])
        
        
        # line
        kml.newlinestring(
            name="Pathway", 
            description="%s to %s"%(src_pnt[0],tar_pnt[0]) ,
            coords=[(src_pnt[2],src_pnt[1]), 
                    (tar_pnt[2],tar_pnt[1])])

In [51]:
kml.save("test.kml")