In [11]:
# Imports
import os
from datetime import datetime, timedelta, timezone

import pandas as pd
import pyspark.sql.functions as F
from notebookutils import mssparkutils
from openpyxl import load_workbook
from pyspark.sql.types import StringType, StructField, StructType

In [12]:
# Helper functions
def schema_from_pandas(df):
    """Build an all-string Spark schema from a pandas DataFrame's columns.

    Args:
        df: pandas DataFrame; only its column labels are read. Each label is
            converted with ``str`` and used as a field name.

    Returns:
        A ``StructType`` holding one nullable ``StringType`` field per column,
        in the same order as ``df.columns``.
    """
    fields = []
    for column_label in df.columns:
        fields.append(StructField(str(column_label), StringType(), True))
    return StructType(fields)

In [13]:
# Step 1: load the source data
ods_reimburse = 'abfss://data-warehouse-ods@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn/ods_mau_reimburse_cn_mf.csv'
# Read every column as text. The Spark schema built below is all-string, so
# letting pandas infer dtypes only damages values before they are stringified:
# codes with leading zeros lose them, and integer columns that contain blanks
# are turned into floats. dtype=str keeps each cell exactly as written in the
# CSV; fillna('') then turns missing cells into empty strings.
df_pd = pd.read_csv(ods_reimburse, dtype=str).fillna('')

# One nullable string field per CSV column, then hand the frame to Spark.
schema = schema_from_pandas(df_pd)
df = spark.createDataFrame(df_pd, schema=schema)


# Step 2: prepare the data
# Maps each source column name (key) to the column name used from here on
# (value). The values also define which columns are selected, and in what
# order, by the later select(list(cols.values())).
cols = {
    'Project': 'project',
    'Key': 'reimbursement_key',
    'Approved Travel Application': 'TA_key',
    'Summary': 'summary',
    'Issue Type': 'issue_type',
    'Status': 'status',
    'Priority': 'priority',
    'Resolution': 'resolution',
    'Current Assignee': 'current_assignee',
    'Applicant': 'applicant',
    'Created': 'created',
    'Updated': 'updated',
    'Due Date': 'due_date',
    'NT-ID Second Level': 'NT_ID_second_level',
    'NT-ID First Level': 'NT_ID_first_level',
    'Application reason & additional comment': 'application_reason_additional_comment',
    'Cost Center': 'cost_center',
    'Company Code': 'company_code',
    'Special Approval *': 'special_approval',
    'Adjustment necessary *': 'adjustment_necessary',
    'Approvers *': 'approvers',
    'Advance applied or not *': 'advance_applied_or_not',
    'Approved Advance Application': 'approved_advance_application',
    'Charge to other cost center *': 'charge_to_other_cost_center',
    'apcntr.field.trainingProvider': 'training_provider',
    'apcntr.field.trainingType': 'training_type',
    'apcntr.field.trainingNeedsCaused': 'training_needs_caused',
    'Name (Modify & Post)': 'name_modify_post',
    'apcntr.field.thisTrainingCost': 'this_training_cost',
    'apcntr.field.tmpTrainingType': 'tmp_training_type',
    'Open advance': 'open_advance',
    'Time (Modify & Post)': 'time_modify_post',
    'apcntr.field.estimatedTravelExpenses': 'estimated_travel_expenses',
    'apcntr.field.participantNumber': 'participant_number',
    'apcntr.field.estimatedOtherCost': 'estimated_other_cost',
    'apcntr.field.estimatedTotalCost': 'estimated_total_cost',
    'common.field.employee.companycode': 'employee_company_code',
    'Total - FIN Deduction': 'total_FIN_deduction',
    'Total - Kilometers': 'total_kilometers',
    'Global CC2': 'global_CC2',
    'Policy Compliance Result': 'policy_compliance_result',
    'Non-compliance Reason': 'non_compliance_reason',
    'Personnel number *': 'personnel_number',
    'Tel.': 'Telephone',
    'FIN Remark Ref': 'FIN_remark_ref',
    'FIN Remark Reason Type': 'FIN_remark_reason_type',
    'FIN Remark Reason Details': 'FIN_remark_reason_details',
    'FIN Remark Reason Code': 'FIN_remark_reason_code',
    'FIN Remark Comment': 'FIN_remark_reason_comment',
    'Actual Start Date, Time*:': 'actual_start_date_time',
    'SAP number': 'SAP_number',
    'Charge to Cost Center *': 'charge_to_cost_center',
    'Travel or not *': 'travel_or_not',
    'Charge to other division (company code)': 'charge_to_other_division_company_code',
    'Travel Type': 'travel_type',
    'Planned End Date:': 'planned_end_date',
    'Planned Start Date:': 'planned_start_date',
    'Planned Duration:': 'planned_duration',
    'WBS/Internal order Posted': 'WBS_Internal_order_posted',
    'Total - Payable': 'total_payable',
    'Org.Unit': 'org_unit',
    'Message returned from SAP': 'message_returned_from_SAP',
    'Actual Duration:': 'actual_duration',
    'Name': 'name',
    'Actual End Date, Time*:': 'actual_end_date_time',
    'WBS/Internal order': 'WBS_Internal_order',
    'Global CC': 'global_CC',
    'Applicant NT/Display Name': 'applicant_NT_display_name',
    'Travel to*:': 'travel_to',
    'travel_allow': 'travel_allow',
    'WBS or Internal order': 'WBS_or_internal_order',
    'Total - Claim': 'total_claim',
    'Request need print and delivery?': 'request_need_print_and_delivery',
    'Fapiao & Supporting': 'Fapiao_and_supporting',
    'Total - Actual payment': 'total_actual_payment',
    'Text sent to SAP *': 'text_sent_to_SAP',
}

# Apply the mapping one column at a time, in dictionary order.
for source_col in cols:
    df = df.withColumnRenamed(source_col, cols[source_col])

# Step 3: transform
# Take the current time directly in UTC+8 rather than adding 8 hours to the
# host's local clock, so the stamp is the same whatever timezone the cluster
# is configured with. On a UTC host the result is identical to the old code.
formatted_endtime = datetime.now(timezone(timedelta(hours=8)))
etl_load_time = formatted_endtime.strftime("%Y-%m-%d %H:%M:%S")

# Keep only the mapped columns (in mapping order), drop exact duplicate rows,
# tag every row with the load time, and collect the result into pandas.
# From here on `df` is a pandas DataFrame, no longer a Spark one.
df = (
    df.select(list(cols.values()))
    .dropDuplicates()
    .withColumn('etl_load_time', F.lit(etl_load_time))
    .toPandas()
)

display(df)

In [14]:
# Step 4: persist the result as Parquet (row index not written)
save_path = (
    'abfss://data-warehouse-dwd@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn'
    '/dwd_fi_te_reimburse_header.parquet'
)
df.to_parquet(save_path, index=False)

In [15]:
# Step 5: persist the same frame as CSV (header row written, row index not)
save_path = (
    'abfss://data-warehouse-dwd@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn'
    '/dwd_fi_te_reimburse_header.csv'
)
df.to_csv(save_path, index=False, header=True)