### Problem Statement
Write a solution to report the first name, last name, city, and state of each person in the Person table. If a person's personId has no matching row in the Address table, report null for city and state instead.

Return the result table in any order.

In [0]:
from pyspark.sql import SparkSession

# Obtain a SparkSession: getOrCreate() returns the active session when one
# exists and otherwise builds a new one named "PersonAddressJoin".
spark = (
    SparkSession.builder
    .appName("PersonAddressJoin")
    .getOrCreate()
)

# Person table: (personId, lastName, firstName).
data_person = [
    (1, 'Doe', 'John'),
    (2, 'Smith', 'Jane'),
    (3, 'Johnson', 'Michael'),
]
person_df = spark.createDataFrame(
    data_person,
    schema=['personId', 'lastName', 'firstName'],
)

# Address table: (addressId, personId, city, state).
# personId 3 has no row here, which exercises the "report null" case
# from the problem statement.
data_address = [
    (1, 1, 'New York', 'NY'),
    (2, 2, 'Los Angeles', 'CA'),
]
address_df = spark.createDataFrame(
    data_address,
    schema=['addressId', 'personId', 'city', 'state'],
)

# Render both input tables before joining them.
person_df.display()
address_df.display()


personId,lastName,firstName
1,Doe,John
2,Smith,Jane
3,Johnson,Michael


addressId,personId,city,state
1,1,New York,NY
2,2,Los Angeles,CA


In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Left join on personId: every row of Person is kept, and city/state are
# null for any person without a matching Address row.
# The aliases "p" and "a" make each column's source table explicit.
result_df = (
    person_df.alias("p")
    .join(
        address_df.alias("a"),
        on=col("p.personId") == col("a.personId"),
        how="left",
    )
    .select(
        col("p.firstName"),
        col("p.lastName"),
        col("a.city"),
        col("a.state"),
    )
)

# Render the joined table.
result_df.display()


firstName,lastName,city,state
John,Doe,New York,NY
Jane,Smith,Los Angeles,CA
Michael,Johnson,,


In [0]:
# Expose each DataFrame to Spark SQL as a temporary view, replacing any
# existing view of the same name so the cell can be re-run safely.
for view_name, frame in (("Person", person_df), ("Address", address_df)):
    frame.createOrReplaceTempView(view_name)

In [0]:
%sql
-- Left join keeps every Person row; city and state are NULL when no
-- Address row shares the personId.
-- View names are written exactly as registered ("Person", "Address") so the
-- query does not rely on case-insensitive identifier resolution.
SELECT
    p.firstName,
    p.lastName,
    a.city,
    a.state
FROM Person AS p
LEFT JOIN Address AS a
    ON p.personId = a.personId;


firstName,lastName,city,state
John,Doe,New York,NY
Jane,Smith,Los Angeles,CA
Michael,Johnson,,
