In [3]:
import pandas
import pyspark
from pyspark.sql import SparkSession
# Obtain the notebook's SparkSession through the builder's getOrCreate() call.
spark = (
    SparkSession.builder
    .getOrCreate()
)

In [4]:
from pyspark.sql.functions import *

In [5]:
# Sample person rows, one tuple per person: (personId, lastName, firstName).
person = [
    (1, 'Wang', 'Allen'),
    (2, 'Alice', 'Bob'),
]
# Column names, listed in the same order as the tuple fields above.
personSchema = ['personId', 'lastName', 'firstName']
# Only column names are supplied, so Spark infers the column types from the data.
PersonDF = spark.createDataFrame(data=person, schema=personSchema)

In [6]:
# Inspect the person DataFrame: print its schema, then show its rows
# with cell values left untruncated.
PersonDF.printSchema()
PersonDF.show(truncate=False)

root
 |-- personId: long (nullable = true)
 |-- lastName: string (nullable = true)
 |-- firstName: string (nullable = true)



                                                                                

+--------+--------+---------+
|personId|lastName|firstName|
+--------+--------+---------+
|1       |Wang    |Allen    |
|2       |Alice   |Bob      |
+--------+--------+---------+



In [7]:
# Sample address rows, one tuple per address: (addressId, personId, city, state).
# Note that personId 3 has no matching row in the person data.
address = [
    (1, 2, 'New York City', 'New York'),
    (2, 3, 'Leetcode', 'California'),
]
# Column names, listed in the same order as the tuple fields above.
addressSchema = ['addressId', 'personId', 'city', 'state']
# Only column names are supplied, so Spark infers the column types from the data.
AddressDF = spark.createDataFrame(data=address, schema=addressSchema)

In [8]:
# Inspect the address DataFrame: print its schema, then show its rows
# with cell values left untruncated.
AddressDF.printSchema()
AddressDF.show(truncate=False)

root
 |-- addressId: long (nullable = true)
 |-- personId: long (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)

+---------+--------+-------------+----------+
|addressId|personId|city         |state     |
+---------+--------+-------------+----------+
|1        |2       |New York City|New York  |
|2        |3       |Leetcode     |California|
+---------+--------+-------------+----------+



alias() can be used to give a DataFrame a short name in place of a long one. In this case, alias() is applied before the join() operation, and select() is then used to keep only the four listed columns.

In [9]:
# Rebind both names to aliased DataFrames so their columns can be referenced
# with the short 'p.' / 'a.' prefixes. From this cell onward, PersonDF and
# AddressDF refer to the aliased versions.
PersonDF = PersonDF.alias('p')
AddressDF = AddressDF.alias('a')

# Left join on personId: every person row is kept, and the address columns
# are null where no address matches. Only the four listed columns are kept.
outputDF = (
    PersonDF
    .join(AddressDF, PersonDF.personId == AddressDF.personId, how="left")
    .select('p.firstName', 'p.lastName', 'a.city', 'a.state')
)

In [10]:
# Inspect the joined result: print its schema, then show its rows
# with cell values left untruncated.
outputDF.printSchema()
outputDF.show(truncate=False)

root
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)



                                                                                

+---------+--------+-------------+--------+
|firstName|lastName|city         |state   |
+---------+--------+-------------+--------+
|Allen    |Wang    |null         |null    |
|Bob      |Alice   |New York City|New York|
+---------+--------+-------------+--------+



Another way to apply alias() is inline, as part of the join() operation itself. Here, the col() function is used to reference the aliased columns; it needs to be imported from pyspark.sql.functions.

In [17]:
# Alias each DataFrame inline, left-join on personId via col() references to
# the aliased columns, and keep two columns from each side.
outputDF = (
    PersonDF.alias('p')
    .join(AddressDF.alias('a'), col('p.personId') == col('a.personId'), how="left")
    .select(
        col('p.firstName'),
        col('p.lastName'),
        col('a.city'),
        col('a.state'),
    )
)

In [18]:
# Inspect the joined result: print its schema, then show its rows
# with cell values left untruncated.
outputDF.printSchema()
outputDF.show(truncate=False)

root
 |-- firstName: string (nullable = true)
 |-- lastName: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)

+---------+--------+-------------+--------+
|firstName|lastName|city         |state   |
+---------+--------+-------------+--------+
|Allen    |Wang    |null         |null    |
|Bob      |Alice   |New York City|New York|
+---------+--------+-------------+--------+



                                                                                