In [0]:
import pyspark.sql.functions as F
# Sample employee records: (name, department, state, salary, age, bonus).
data = [
    ("Anderson", "Sales", "NY", 90000, 34, 10000),
    ("Kenedy", "Sales", "CA", 86000, 56, 20000),
    ("Billy", "Sales", "NY", 81000, 30, 23000),
    ("Andy", "Finance", "CA", 90000, 24, 23000),
    ("Mary", "Finance", "NY", 99000, 40, 24000),
    ("Eduardo", "Finance", "NY", 83000, 36, 19000),
    ("Mendes", "Finance", "CA", 79000, 53, 15000),
    ("Keyth", "Marketing", "CA", 80000, 25, 18000),
    ("Truman", "Marketing", "NY", 91000, 50, 21000),
]

# Only column names are given, so Spark infers each column's type from the
# Python values (strings -> string, ints -> long, as printSchema() shows).
schema = ["emp_name", "dep_name", "state", "salary", "age", "bonus"]

# Build the DataFrame used by every later cell, then inspect it.
df = spark.createDataFrame(data, schema)
df.printSchema()
df.show(truncate=False)

root
 |-- emp_name: string (nullable = true)
 |-- dep_name: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)
 |-- age: long (nullable = true)
 |-- bonus: long (nullable = true)

+--------+---------+-----+------+---+-----+
|emp_name|dep_name |state|salary|age|bonus|
+--------+---------+-----+------+---+-----+
|Anderson|Sales    |NY   |90000 |34 |10000|
|Kenedy  |Sales    |CA   |86000 |56 |20000|
|Billy   |Sales    |NY   |81000 |30 |23000|
|Andy    |Finance  |CA   |90000 |24 |23000|
|Mary    |Finance  |NY   |99000 |40 |24000|
|Eduardo |Finance  |NY   |83000 |36 |19000|
|Mendes  |Finance  |CA   |79000 |53 |15000|
|Keyth   |Marketing|CA   |80000 |25 |18000|
|Truman  |Marketing|NY   |91000 |50 |21000|
+--------+---------+-----+------+---+-----+



In [0]:
from pyspark.sql.functions import monotonically_increasing_id

# Attach a unique row identifier. The generated ids are increasing and unique
# but NOT consecutive (see the large jumps in the output below), so they
# must not be treated as a dense 0..n-1 row number.
df = df.withColumn(colName="id", col=monotonically_increasing_id())
display(df)


emp_name,dep_name,state,salary,age,bonus,id
Anderson,Sales,NY,90000,34,10000,0
Kenedy,Sales,CA,86000,56,20000,8589934592
Billy,Sales,NY,81000,30,23000,17179869184
Andy,Finance,CA,90000,24,23000,25769803776
Mary,Finance,NY,99000,40,24000,34359738368
Eduardo,Finance,NY,83000,36,19000,42949672960
Mendes,Finance,CA,79000,53,15000,51539607552
Keyth,Marketing,CA,80000,25,18000,60129542144
Truman,Marketing,NY,91000,50,21000,60129542145


In [0]:
# Check how many unique salaries there are (approximate distinct count).
# A global aggregation always yields exactly one row, so take its only value.
approx = df.agg(F.approx_count_distinct("salary")).first()[0]
print(f"approx_count_distinct: {approx}")

approx_count_distinct: 8


In [0]:
# Mean salary across all employees; the aggregate yields a single row/value.
avg = df.agg(F.avg("salary")).first()[0]
print(f"avg: {avg}")

avg: 86555.55555555556


In [0]:
# Gather every salary into a single list (duplicates are kept).
df.agg(F.collect_list("salary")).show(truncate=False)

+---------------------------------------------------------------+
|collect_list(salary)                                           |
+---------------------------------------------------------------+
|[90000, 86000, 81000, 90000, 99000, 83000, 79000, 80000, 91000]|
+---------------------------------------------------------------+



In [0]:
# Gather the unique salaries into a single collection (duplicates removed;
# the element order is not the input order, as the output below shows).
df.agg(F.collect_set("salary")).show(truncate=False)

+--------------------------------------------------------+
|collect_set(salary)                                     |
+--------------------------------------------------------+
|[79000, 83000, 91000, 99000, 90000, 80000, 86000, 81000]|
+--------------------------------------------------------+



In [0]:
# Number of distinct values: per salary, per department, and per
# (department, salary) combination.
df2 = df.select(
    F.countDistinct("salary").alias("Salarios únicos"),
    F.countDistinct("dep_name").alias("Departamentos únicos"),
    F.countDistinct("dep_name", "salary").alias(
        "Salarios únicos em relação aos departamentos"
    ),
)
df2.show()


+---------------+--------------------+--------------------------------------------+
|Salarios únicos|Departamentos únicos|Salarios únicos em relação aos departamentos|
+---------------+--------------------+--------------------------------------------+
|              8|                   3|                                           9|
+---------------+--------------------+--------------------------------------------+



In [0]:
# Pull the aggregated rows to the driver and print the first one as a Row.
distinct_count_rows = df2.collect()
print(f"Contagem distinta de departamentos e salários: {distinct_count_rows[0]}")

Contagem distinta de departamentos e salários: Row(Salarios únicos=8, Departamentos únicos=3, Salarios únicos em relação aos departamentos=9)


In [0]:
# Number of salary entries. NOTE: despite its name, `salary` holds a count,
# not a salary amount.
salary = df.agg(F.count("salary")).first()[0]
print(f"Count Salarios: {salary}")

Count Salarios: 9


In [0]:
# First salary value encountered. The result depends on row order, which
# Spark does not guarantee unless the data is explicitly sorted.
df.agg(F.first("salary")).show(truncate=False)

+-------------+
|first(salary)|
+-------------+
|90000        |
+-------------+



In [0]:
# Highest salary in the dataset.
df.agg(F.max("salary")).show(truncate=False)

+-----------+
|max(salary)|
+-----------+
|99000      |
+-----------+



In [0]:
# Lowest salary in the dataset.
df.agg(F.min("salary")).show(truncate=False)

+-----------+
|min(salary)|
+-----------+
|79000      |
+-----------+



In [0]:
# Total payroll: sum of all salaries.
df.agg(F.sum("salary")).show(truncate=False)

+-----------+
|sum(salary)|
+-----------+
|779000     |
+-----------+

