In [21]:
# 导包
from datetime import datetime, timedelta, timezone

import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, StructField, StructType



In [22]:
# 方法
def schema_from_pandas(df):
    """Build a Spark schema that mirrors the columns of a pandas DataFrame.

    Every column of ``df`` becomes a nullable ``StringType`` field whose name is
    ``str(column label)``; the pandas dtypes are not consulted.

    Args:
        df: Object exposing a ``columns`` iterable (a pandas DataFrame here).

    Returns:
        A ``StructType`` with one nullable string field per column, in column order.
    """
    fields = []
    for column_label in df.columns:
        fields.append(StructField(str(column_label), StringType(), True))
    return StructType(fields)

In [23]:
# Step 1: load the source data
read_path = 'abfss://data-warehouse-ods@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn/ods_mau_travel_app_cn_mf.csv'
# Read every column as text: the Spark schema built below declares all fields as
# StringType, so values that pandas would otherwise infer as int/float/bool could
# be rejected by createDataFrame.
df_pd = pd.read_csv(read_path, dtype=str)
# Missing cells are float NaN even with dtype=str; replace them with None so they
# are loaded as NULLs instead of non-string values in a string column.
df_pd = df_pd.astype(object).where(df_pd.notna(), None)

schema = schema_from_pandas(df_pd)
df = spark.createDataFrame(df_pd, schema=schema)

# Step 2: transform the data
# ETL load timestamp as UTC+8 wall-clock time. The offset is taken from an explicit
# time zone instead of being added to the driver's local clock, so the value no
# longer depends on how the cluster's time zone is configured. The datetime is kept
# naive (tzinfo stripped) to match the type this variable had before.
formatted_endtime = datetime.now(timezone(timedelta(hours=8))).replace(tzinfo=None)
etl_load_time = formatted_endtime.strftime("%Y-%m-%d %H:%M:%S")

# Regex patterns are raw strings: '\*' and '\|' are invalid escape sequences in a
# normal string literal and trigger a warning on recent Python versions.
# Everything after the 'Tools* ' marker is treated as a newline-separated list of legs.
itinerary_lines = F.split(
    F.split(F.col('trip_itinerary'), r'Tools\* ').getItem(1),
    "\n",
)
# Each leg is a '|'-separated record; items 0, 1, 3, 4 and 6 are extracted below.
leg_fields = F.split(F.col('trip_itinerary'), r'\|')

# NOTE: `df` is a Spark DataFrame on entry and a pandas DataFrame after .toPandas().
df = (
    df.withColumnRenamed('Key', 'TA_key')
      .select('TA_key', 'trip_itinerary')
      # One output row per itinerary line.
      .withColumn('trip_itinerary', F.explode(itinerary_lines))
      # Drop the empty strings produced by blank lines / trailing newlines.
      .where(F.col('trip_itinerary') != '')
      .withColumn('from_place', leg_fields.getItem(0))
      .withColumn('from_date', leg_fields.getItem(1))
      .withColumn('to_place', leg_fields.getItem(3))
      .withColumn('to_date', leg_fields.getItem(4))
      .withColumn('transportation_tools', leg_fields.getItem(6))
      .withColumn('etl_load_time', F.lit(etl_load_time))
      .drop('trip_itinerary')
      .toPandas()
)

display(df)


In [24]:
# Step 3: write the result
# Persist the pandas DataFrame as CSV with a header row and without the index column.
save_path = 'abfss://data-warehouse-dwd@dlsaaddpnorth3001.dfs.core.chinacloudapi.cn/dwd_fi_te_travel_app_item_cn.csv'
df.to_csv(
    path_or_buf=save_path,
    header=True,
    index=False,
)