In [0]:
#Your task is to:
#1. Join these DataFrames on `customer_id`.
#2. Calculate the difference in days between the `account_open_date` and the current date.
#3. Select the required columns: `customer_name`, `customer_phone_number`, and `difference` (number of days since the account was opened).

from pyspark.sql import SparkSession
from pyspark.sql.functions import datediff, current_date, to_date

# Spark session used by every DataFrame in this exercise.
spark = SparkSession.builder.appName('AppName1').getOrCreate()

# Customers: column names, then one (name, id, phone number) tuple per row.
customer_schema = ["customer_name", "customer_id", "customer_phone_number"]
customer_data = [
    ("Alice", 1, "123-456-7890"),
    ("Bob", 2, "987-654-3210"),
    ("Charlie", 3, "555-666-7777"),
]
customer_df = spark.createDataFrame(customer_data, schema=customer_schema)

# Accounts: column names, then one (customer id, open date) tuple per row.
# The open date is supplied as a "yyyy-MM-dd" string, not a date object.
account_schema = ["customer_id", "account_open_date"]
account_data = [
    (1, "2020-01-01"),
    (2, "2021-06-15"),
    (3, "2023-03-25"),
]
account_df = spark.createDataFrame(account_data, schema=account_schema)

# Parse account_open_date into a DateType column BEFORE joining.
# DataFrames are immutable, so converting account_df after the join would
# leave joined_df holding the raw string column and make the conversion a
# no-op for the final result.
account_df = account_df.withColumn(
    "account_open_date",
    to_date(account_df["account_open_date"], "yyyy-MM-dd"),
)

# Task 1: Join the DataFrames on customer_id (inner join: only customers
# that have a matching account row are kept).
joined_df = customer_df.join(account_df, on="customer_id", how="inner")

# Task 2: Number of days between account_open_date and the current date.
# datediff(end, start) returns end - start in days, so the current date
# goes first to get a positive count for past open dates.
final_df = joined_df.withColumn(
    "difference",
    datediff(current_date(), joined_df["account_open_date"]),
)

# Task 3: Keep only the columns required in the output.
result_df = final_df.select("customer_name", "customer_phone_number", "difference")
result_df.show()

+-------------+---------------------+----------+
|customer_name|customer_phone_number|difference|
+-------------+---------------------+----------+
|        Alice|         123-456-7890|      1685|
|          Bob|         987-654-3210|      1154|
|      Charlie|         555-666-7777|       506|
+-------------+---------------------+----------+

