In [33]:
import pyspark.sql.functions as F
from pyspark.sql.window import Window

In [34]:
# Step 1: read data
# Storage locations of the four inputs, in source-priority order:
# employee extract, travel application headers, reimbursement headers,
# air-ticket records.
employee_path, travel_path, reimburse_path, air_path = (
    'abfss://data-warehouse-ods@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn/ods_mau_employee_cn_mf.csv',
    'abfss://data-warehouse-dwd@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn/dwd_fi_te_travel_app_header_cn.parquet',
    'abfss://data-warehouse-dwd@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn/dwd_fi_te_reimburse_header.parquet',
    'abfss://data-warehouse-dwd@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn/dwd_fi_te_air_ticket_cn.csv',
)
 

def _keep_first_per_key(df, key, order_cols, source):
    """Tag ``df`` with a source id and keep one row per ``key`` value.

    Adds a literal ``source`` column, numbers the rows of every ``key``
    partition in ascending ``order_cols`` order as ``rn``, and keeps only the
    ``rn == 1`` row. Both helper columns are left in the result because the
    master-data cell reads ``source`` from these frames.

    No ``dropDuplicates()`` is needed afterwards: the ``rn == 1`` filter
    already leaves a single row per key, so no two rows can be identical.

    Args:
        df: input DataFrame.
        key: name of the column that identifies an employee.
        order_cols: column names used to decide which row of a key survives.
        source: integer written to the ``source`` column.

    Returns:
        DataFrame with the input columns followed by ``source`` and ``rn``,
        one row per distinct ``key`` value.
    """
    first_in_group = Window.partitionBy(key).orderBy(*order_cols)
    return (
        df.withColumn('source', F.lit(source))
          .withColumn('rn', F.row_number().over(first_in_group))
          .where(F.col('rn') == 1)
    )


# Source 1: employee extract (CSV). Rows without a personnel number are
# dropped, and 'Org.Unit' is renamed so it can be referenced as 'org_unit'.
df_emp = _keep_first_per_key(
    spark.read.csv(employee_path, header=True)
         .where(F.col('Personnel number *').isNotNull())
         .withColumnRenamed('Org.Unit', 'org_unit'),
    key='Personnel number *',
    order_cols=['org_unit', 'name'],
    source=1,
)

# Source 2: travel application headers (Parquet). Parquet files carry their
# own schema, so no 'header' option is passed to the reader.
df_travel = _keep_first_per_key(
    spark.read.parquet(travel_path).select(
        F.col('personnel_number').alias('t_no'),
        F.col('org_unit').alias('t_org'),
        F.col('name').alias('t_name'),
    ),
    key='t_no',
    order_cols=['t_org', 't_name'],
    source=2,
)

# Source 3: reimbursement headers (Parquet).
df_reimburse = _keep_first_per_key(
    spark.read.parquet(reimburse_path).select(
        F.col('personnel_number').alias('r_no'),
        F.col('org_unit').alias('r_org'),
        F.col('name').alias('r_name'),
    ),
    key='r_no',
    order_cols=['r_org', 'r_name'],
    source=3,
)

# Source 4: air-ticket records (CSV).
df_air = _keep_first_per_key(
    spark.read.csv(air_path, header=True).select(
        F.col('employee_no').alias('air_no'),
        F.col('department').alias('air_org'),
        F.col('name').alias('air_name'),
    ),
    key='air_no',
    order_cols=['air_org', 'air_name'],
    source=4,
)

In [35]:
# Step 2: master data
# Build the list of known personnel numbers together with the lowest-numbered
# source each one appears in (1 = employee extract, 2 = travel,
# 3 = reimbursement, 4 = air ticket).
#
# union() matches columns by position, not by name, and the four frames do not
# share a schema (df_emp carries every column of the employee CSV). Each frame
# is therefore reduced to the same two columns before being stacked.
df_ids = (
    df_emp.select(F.col('Personnel number *').alias('personnel_number'), 'source')
    .union(df_travel.select(F.col('t_no').alias('personnel_number'), 'source'))
    .union(df_reimburse.select(F.col('r_no').alias('personnel_number'), 'source'))
    .union(df_air.select(F.col('air_no').alias('personnel_number'), 'source'))
)

# One row per personnel number; 'from' is the smallest source id seen for it.
# NOTE(review): personnel_number is compared with the integer 0, which relies
# on Spark's implicit cast of the column -- confirm that non-numeric personnel
# numbers are not expected here.
df_no = (
    df_ids.groupBy('personnel_number')
          .agg(F.min('source').alias('from'))
          .where(F.col('personnel_number').isNotNull() & ~(F.col('personnel_number') == 0))
)

# Attach the attributes of every source to the personnel-number list.
# Left joins keep every row of df_no; the columns contributed by a source are
# NULL where that source has no row for the personnel number.
emp_match = df_no['personnel_number'] == df_emp['Personnel number *']
travel_match = df_no['personnel_number'] == df_travel['t_no']
reimburse_match = df_no['personnel_number'] == df_reimburse['r_no']
air_match = df_no['personnel_number'] == df_air['air_no']

df_master = (
    df_no
    .join(df_emp, on=emp_match, how='left')
    .join(df_travel, on=travel_match, how='left')
    .join(df_reimburse, on=reimburse_match, how='left')
    .join(df_air, on=air_match, how='left')
)

# Resolve one organisation unit and one name per personnel number: take the
# first non-null value in source-priority order (employee extract, travel,
# reimbursement, air ticket) and fall back to an empty string when none of the
# sources provides one.
blank = F.lit('')

df_result = df_master.select(
    'personnel_number',
    F.coalesce(
        F.col('org_unit'), F.col('t_org'), F.col('r_org'), F.col('air_org'), blank
    ).alias('org_unit'),
    F.coalesce(
        F.col('name'), F.col('t_name'), F.col('r_name'), F.col('air_name'), blank
    ).alias('employee_name'),
    # 'from' holds the smallest source id of the personnel number; 1 means it
    # is present in the employee extract.
    F.when(F.col('from') == 1, 'yes').otherwise('no').alias('onboard_or_not'),
)

display(df_result)


In [36]:
# Step 3: write the result file
save_path = 'abfss://data-warehouse-dim@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn/dim_employee_mf.csv'

# Collect the result to the driver as a pandas frame, then write it as one CSV
# file with a header row and without the pandas index column.
result_pdf = df_result.toPandas()
result_pdf.to_csv(save_path, mode='w', index=False, header=True)