In [20]:
from pyspark.sql import SparkSession

from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number, sum, count, split

# Obtain a Spark session bound to the "local" master.
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)

In [21]:
# Load every source table from its parquet file. The target names on the left
# line up one-to-one, in order, with the paths on the right.
(
    employees,
    job_history,
    department,
    jobs,
    locations,
    countries,
    regions,
) = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        "data/employees.parquet",
        "data/job_history.parquet",
        "data/department.parquet",
        "data/jobs.parquet",
        "data/locations.parquet",
        "data/countries.parquet",
        "data/regions.parquet",
    )
)

In [22]:
# Register each DataFrame as a temp view under its own name so it can be
# referenced from Spark SQL, then display the employees schema.
for view_name, frame in (
    ("employees", employees),
    ("job_history", job_history),
    ("department", department),
    ("jobs", jobs),
    ("locations", locations),
    ("countries", countries),
    ("regions", regions),
):
    frame.createOrReplaceTempView(view_name)

employees.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- hire_date: string (nullable = true)
 |-- job_id: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- commission_pct: string (nullable = true)
 |-- manager_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)



In [27]:
# Derive "code": the leading segment of each phone number, i.e. the text
# before the first literal dot. The split pattern is a regular expression, so
# the dot must be escaped; the raw string keeps the backslash from being
# treated as an (invalid) Python escape sequence.
df1 = (
    employees
    .withColumn("code", split("phone_number", r"\.")[0])
    .select("employee_id", "code")
)
df1.show(10, truncate=False)

+-----------+----+
|employee_id|code|
+-----------+----+
|100        |515 |
|101        |515 |
|102        |515 |
|103        |590 |
|104        |590 |
|105        |590 |
|106        |590 |
|107        |590 |
|108        |515 |
|109        |515 |
+-----------+----+
only showing top 10 rows



In [24]:
# Count how many rows of df1 share each "code" value.
rows_by_code = df1.groupBy("code")
df2 = rows_by_code.count()
df2.show()

+----+-----+
|code|count|
+----+-----+
| 011|   35|
| 650|   45|
| 515|   21|
| 603|    1|
| 590|    5|
+----+-----+



In [25]:
# Pull the (code, count) rows to the driver and print them as a plain dict.
print({code: total for code, total in df2.collect()})

{'011': 35, '650': 45, '515': 21, '603': 1, '590': 5}
