In [1]:
from pyspark.sql import SparkSession

from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, sum, count

spark = SparkSession.builder.master('local').getOrCreate()

In [2]:
employees = spark.read.parquet("data/employees.parquet")
job_history = spark.read.parquet("data/job_history.parquet")
department = spark.read.parquet("data/department.parquet")
jobs = spark.read.parquet("data/jobs.parquet")
locations = spark.read.parquet("data/locations.parquet")
countries = spark.read.parquet("data/countries.parquet")
regions = spark.read.parquet("data/regions.parquet")

In [3]:
employees.createOrReplaceTempView("employees")
job_history.createOrReplaceTempView("job_history")
department.createOrReplaceTempView("department")
jobs.createOrReplaceTempView("jobs")
locations.createOrReplaceTempView("locations")
countries.createOrReplaceTempView("countries")
regions.createOrReplaceTempView("regions")

In [4]:
locations.printSchema()

root
 |-- location_id: integer (nullable = true)
 |-- street_address: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state_province: string (nullable = true)
 |-- country_id: string (nullable = true)



In [5]:
df = employees.join(department, ["department_id"]).join(locations, ["location_id"]) \
    .select("department_id", "street_address", "employee_id", "first_name", "last_name", "hire_date")

w2 = Window.partitionBy("department_id").orderBy(col("hire_date"))
df.withColumn("row", row_number().over(w2)).filter(col("row") == 1).drop("row").show()

+-------------+--------------------+-----------+-----------+---------+----------+
|department_id|      street_address|employee_id| first_name|last_name| hire_date|
+-------------+--------------------+-----------+-----------+---------+----------+
|           10|     2004 Charade Rd|        200|   Jennifer|   Whalen|17.09.2003|
|           20|     147 Spadina Ave|        201|    Michael|Hartstein|17.02.2004|
|           30|     2004 Charade Rd|        114|        Den| Raphaely|07.12.2002|
|           40|      8204 Arthur St|        203|      Susan|   Mavris|07.06.2002|
|           50| 2011 Interiors Blvd|        122|      Payam| Kaufling|01.05.2003|
|           60| 2014 Jabberwocky Rd|        103|  Alexander|   Hunold|03.01.2006|
|           70|Schwanthalerstr. ...|        204|    Hermann|     Baer|07.06.2002|
|           80|Magdalen Centre, ...|        158|      Allan|   McEwen|01.08.2004|
|           90|     2004 Charade Rd|        102|        Lex|  De Haan|13.01.2001|
|          100| 