In [0]:
from pyspark import SparkConf
from pyspark.sql.session import SparkSession

from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, FloatType, BooleanType, TimestampType, DateType, StructField, StructType
from pyspark.sql.dataframe import DataFrame

# Build (or reuse) the SparkSession used by every cell below.
# The master URL for in-process local mode is "local[*]" (one worker thread per
# core); "localhost[*]" is not a recognised master URL and is rejected when a
# new session actually has to be created. getOrCreate() returns an already
# active session if one exists, in which case these builder settings are not
# what started it.
spark = (
    SparkSession.builder
    .appName("app")
    .master("local[*]")
    .getOrCreate()
)

In [0]:
# Person table: three non-nullable columns, one row per person.
schema = (
    StructType()
    .add("personId", IntegerType(), False)
    .add("lastName", StringType(), False)
    .add("firstName", StringType(), False)
)

# Rows are (personId, lastName, firstName).
data = [
    (1, "Wang", "Allen"),
    (2, "Alice", "Bob"),
]

person = spark.createDataFrame(data, schema)
person.show()


+--------+--------+---------+
|personId|lastName|firstName|
+--------+--------+---------+
|       1|    Wang|    Allen|
|       2|   Alice|      Bob|
+--------+--------+---------+



In [0]:
# Address table rows are (addressId, personId, city, state).
data = [
    (1, 2, "NYC", "NY"),
    (2, 3, "CMD", "CAL"),
]

# Every column is declared non-nullable.
schema = StructType([
    StructField("addressId", IntegerType(), nullable=False),
    StructField("personId", IntegerType(), nullable=False),
    StructField("city", StringType(), nullable=False),
    StructField("state", StringType(), nullable=False),
])

address = spark.createDataFrame(data, schema)
address.show()

+---------+--------+----+-----+
|addressId|personId|city|state|
+---------+--------+----+-----+
|        1|       2| NYC|   NY|
|        2|       3| CMD|  CAL|
+---------+--------+----+-----+



In [0]:
# LeetCode 175 - Combine Two Tables
# Report firstName, lastName, city and state for every row of `person`.
# When a personId has no matching row in `address`, city and state are null.
# The result may be returned in any row order.

# A left join keeps every person, matched or not.
final_df = (
    person
    .join(address, on=person.personId == address.personId, how="left")
    .select(
        person.firstName,
        person.lastName,
        address.city,
        address.state,
    )
)
final_df.show()


+---------+--------+----+-----+
|firstName|lastName|city|state|
+---------+--------+----+-----+
|    Allen|    Wang|null| null|
|      Bob|   Alice| NYC|   NY|
+---------+--------+----+-----+



In [0]:
# Same report via Spark SQL: register both DataFrames as temporary views,
# then run the left join as a query and print the result.
address.createOrReplaceTempView("address_sql")
person.createOrReplaceTempView("person_sql")

(
    spark
    .sql("select p.firstName,p.lastName,a.city,a.state from person_sql p left join address_sql a on p.personId=a.personId")
    .show()
)


+---------+--------+----+-----+
|firstName|lastName|city|state|
+---------+--------+----+-----+
|    Allen|    Wang|null| null|
|      Bob|   Alice| NYC|   NY|
+---------+--------+----+-----+



In [0]:
# Stop the SparkSession bound to `spark` now that all cells above have run.
spark.stop()