In [35]:
%pip install ipython-autotime
%pip install pyspark

time: 9.15 s (started: 2023-09-18 19:48:53 +00:00)


In [36]:
%%script echo 'ignore cell'
import os
import sys

# resolver problema de versão python x pyspark workers
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

ignore cell
time: 6.73 ms (started: 2023-09-18 19:49:02 +00:00)


In [37]:
from google.colab          import drive, files
from pyspark.sql           import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types     import *
from pyspark.sql.window    import Window

%load_ext autotime

The autotime extension is already loaded. To reload it, use:
  %reload_ext autotime
time: 1.2 ms (started: 2023-09-18 19:49:02 +00:00)


In [38]:
drive.mount('/content/drive', force_remount=True)
spark = SparkSession.builder.master('local').appName('pyspark_app').getOrCreate()
spark

Mounted at /content/drive


time: 4.61 s (started: 2023-09-18 19:49:02 +00:00)


In [39]:
df = spark.read.csv("/content/drive/MyDrive/datasets/housing/housing.csv", header=True, inferSchema=True, encoding='utf-8')
print(type(df))
print(f'rows: {df.count()}')
print(f'cols: {len(df.columns)}')
df.show(5)

<class 'pyspark.sql.dataframe.DataFrame'>
rows: 20640
cols: 10
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|       

In [40]:
print(df.columns)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']
time: 660 µs (started: 2023-09-18 19:49:08 +00:00)


In [41]:
df.printSchema()

root
 |-- longitude: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- housing_median_age: double (nullable = true)
 |-- total_rooms: double (nullable = true)
 |-- total_bedrooms: double (nullable = true)
 |-- population: double (nullable = true)
 |-- households: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- median_house_value: double (nullable = true)
 |-- ocean_proximity: string (nullable = true)

time: 4.58 ms (started: 2023-09-18 19:49:08 +00:00)


Verificação de valores NaN.

In [42]:
for column in df.columns:
  mask = df['longitude'].isNull()
  nan_amount = df.filter(mask).count()
  print(f'{column}: {nan_amount}')

longitude: 0
latitude: 0
housing_median_age: 0
total_rooms: 0
total_bedrooms: 0
population: 0
households: 0
median_income: 0
median_house_value: 0
ocean_proximity: 0
time: 3.57 s (started: 2023-09-18 19:49:08 +00:00)


Renomeação das colunas transformando tudo em letras maiúsculas.

In [43]:
upper = [column.upper() for column in df.columns]
for column, up in zip(df.columns, upper):
  df = df.withColumnRenamed(column, up)
print(df.columns)

['LONGITUDE', 'LATITUDE', 'HOUSING_MEDIAN_AGE', 'TOTAL_ROOMS', 'TOTAL_BEDROOMS', 'POPULATION', 'HOUSEHOLDS', 'MEDIAN_INCOME', 'MEDIAN_HOUSE_VALUE', 'OCEAN_PROXIMITY']
time: 88.3 ms (started: 2023-09-18 19:49:12 +00:00)


Renomeação das colunas transformando tudo em letras minúsculas.

In [44]:
lower = [column.lower() for column in df.columns]
for column, low in zip(df.columns, lower):
  df = df.withColumnRenamed(column, low)
print(df.columns)

['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income', 'median_house_value', 'ocean_proximity']
time: 108 ms (started: 2023-09-18 19:49:12 +00:00)


Selecionando colunas.

In [45]:
print(type(df.select(['longitude', 'latitude', 'households'])))
df.select(['longitude', 'latitude', 'households']).show(5)

<class 'pyspark.sql.dataframe.DataFrame'>
+---------+--------+----------+
|longitude|latitude|households|
+---------+--------+----------+
|  -122.23|   37.88|     126.0|
|  -122.22|   37.86|    1138.0|
|  -122.24|   37.85|     177.0|
|  -122.25|   37.85|     219.0|
|  -122.25|   37.85|     259.0|
+---------+--------+----------+
only showing top 5 rows

time: 191 ms (started: 2023-09-18 19:49:12 +00:00)


Selecionando colunas. Forma alternativa utilizando a função col() que retorna um objeto da classe Column.

In [46]:
print(type(df.select([col('latitude'), col('longitude'), col('households')])))
df.select([col('latitude'), col('longitude'), col('households')]).show(5)

<class 'pyspark.sql.dataframe.DataFrame'>
+--------+---------+----------+
|latitude|longitude|households|
+--------+---------+----------+
|   37.88|  -122.23|     126.0|
|   37.86|  -122.22|    1138.0|
|   37.85|  -122.24|     177.0|
|   37.85|  -122.25|     219.0|
|   37.85|  -122.25|     259.0|
+--------+---------+----------+
only showing top 5 rows

time: 266 ms (started: 2023-09-18 19:49:12 +00:00)


Atribuindo alias para cada coluna selecionada. Só é possível atribuir alias à colunas através da função col() que retorna um objeto Column.


In [47]:
lat = col('latitude').alias('lat')
lon = col('longitude').alias('lon')

print(lat)
df.select([lat, lon]).show(5)

Column<'latitude AS lat'>
+-----+-------+
|  lat|    lon|
+-----+-------+
|37.88|-122.23|
|37.86|-122.22|
|37.85|-122.24|
|37.85|-122.25|
|37.85|-122.25|
+-----+-------+
only showing top 5 rows

time: 264 ms (started: 2023-09-18 19:49:13 +00:00)


Aplicação de filtros.

In [48]:
mask = df['ocean_proximity'] == 'NEAR BAY'
print(mask)
df.where(mask).show(5)

Column<'(ocean_proximity = NEAR BAY)'>
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR 

In [49]:
mask = (col('ocean_proximity') == 'NEAR BAY')
print(mask)
df.filter(mask).show(5)

Column<'(ocean_proximity = NEAR BAY)'>
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|       NEAR 

Filtros compostos.

In [50]:
mask = (col('ocean_proximity') == 'NEAR BAY') & (col('median_income') < 5.0)
print(mask)
df.where(mask).show(5)

Column<'((ocean_proximity = NEAR BAY) AND (median_income < 5.0))'>
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.25|   37.85|              52.0|     1627.0|         280.0|     565.0|     259.0|       3.8462|          342200.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|      919.0|         213.0|     413.0|     193.0|       4.0368|          269700.0|       NEAR BAY|
|  -122.25|   37.84|              52.0|     2535.0|         489.0|    1094.0|     514.0|       3.6591|          299200.0|       NEAR BAY|
|  -122.25|   37.84|              52.0|     3104.0|         687.0|    1157.0|     647.0|         3.12|   

In [51]:
mask = (df['ocean_proximity'] == 'NEAR BAY') & (df['median_income'] < 5.0)
print(mask)
df.filter(mask).show(5)

Column<'((ocean_proximity = NEAR BAY) AND (median_income < 5.0))'>
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+
|  -122.25|   37.85|              52.0|     1627.0|         280.0|     565.0|     259.0|       3.8462|          342200.0|       NEAR BAY|
|  -122.25|   37.85|              52.0|      919.0|         213.0|     413.0|     193.0|       4.0368|          269700.0|       NEAR BAY|
|  -122.25|   37.84|              52.0|     2535.0|         489.0|    1094.0|     514.0|       3.6591|          299200.0|       NEAR BAY|
|  -122.25|   37.84|              52.0|     3104.0|         687.0|    1157.0|     647.0|         3.12|   

Criar novas colunas.

In [52]:
# atribuo um valor literal(True) à nova coluna chamada 'new_col'. A função lit() retorna um objeto Column.
df.withColumn('new_col', lit(True)).show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|new_col|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|   true|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|   true|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|   true|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|   

In [53]:
result = df['total_bedrooms'] / df['total_rooms']
print(type(result))
df.withColumn('new_col', result).show(5)

<class 'pyspark.sql.column.Column'>
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|            new_col|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|0.14659090909090908|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|0.15579659106916466|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|0.12951601908657123|
|  -122.25|   37

In [54]:
# nova coluna usando a função substring()
df.withColumn('new_col', substring('ocean_proximity', 1, 4)).show(5)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|new_col|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+-------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|   NEAR|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|   NEAR|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|   NEAR|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|       5.6431|          341300.0|   

In [55]:
# concatenando duas colunas para formar uma nova
df.withColumn('new_col', concat(df['latitude'], df['longitude'])).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|     new_col|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|37.88-122.23|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|37.86-122.22|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|37.85-122.24|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0|     219.0|      

In [56]:
# concatenando duas colunas para formar uma nova
df.withColumn('new_col', concat_ws(' # ', df['latitude'], df['longitude'])).show()

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|ocean_proximity|        new_col|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+---------------+---------------+
|  -122.23|   37.88|              41.0|      880.0|         129.0|     322.0|     126.0|       8.3252|          452600.0|       NEAR BAY|37.88 # -122.23|
|  -122.22|   37.86|              21.0|     7099.0|        1106.0|    2401.0|    1138.0|       8.3014|          358500.0|       NEAR BAY|37.86 # -122.22|
|  -122.24|   37.85|              52.0|     1467.0|         190.0|     496.0|     177.0|       7.2574|          352100.0|       NEAR BAY|37.85 # -122.24|
|  -122.25|   37.85|              52.0|     1274.0|         235.0|     558.0

Datas: Vou usar outro dataset que contenha datas em formato string para fazer a conversão de string para o formato DateType.

In [57]:
df_ = spark.read.csv("/content/drive/MyDrive/datasets/wc2018-players.csv", header=True, inferSchema=True, encoding='utf-8')
print(type(df_))
print(f'rows: {df_.count()}')
print(f'cols: {len(df_.columns)}')
df_.show(5)

<class 'pyspark.sql.dataframe.DataFrame'>
rows: 736
cols: 9
+---------+---+----+------------------+----------+----------+--------------------+------+------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
only showing top 5 rows

time: 1.27 s (started: 2023-09-18 19:49:15

In [58]:
df_.printSchema()

root
 |-- Team: string (nullable = true)
 |-- #: integer (nullable = true)
 |-- Pos.: string (nullable = true)
 |-- FIFA Popular Name: string (nullable = true)
 |-- Birth Date: string (nullable = true)
 |-- Shirt Name: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)

time: 5.14 ms (started: 2023-09-18 19:49:16 +00:00)


Note que 'Birth Date' está no formato string e também o separador dos componentes YYY, MM, DD é um ponto. Abaixo vemos uma forma de extrair os componentes.

In [59]:
dia = udf(lambda date:date.split('.')[0])
mes = udf(lambda date:date.split('.')[1])
ano = udf(lambda date:date.split('.')[2])

df_ = df_.withColumn('Dia', dia('Birth Date'))
df_ = df_.withColumn('Mes', mes('Birth Date'))
df_ = df_.withColumn('Ano', ano('Birth Date'))

df_.show(5)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65| 31| 08|1992|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65| 21| 01|1996|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66| 15| 02|1993|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69| 13| 07|1990|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72| 24| 06|1987|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
only showi

A conversão de "Birth Date" para o formato DateType pode ser feita da seguinte forma.

In [60]:
df_.withColumn('Data Nascimento', to_date(col("Birth Date"), "dd.MM.yyyy")).printSchema()
# ou
#df_.withColumn('Data', to_date(col("Birth Date"), "dd.MM.yyyy").cast(DateType())).printSchema()

root
 |-- Team: string (nullable = true)
 |-- #: integer (nullable = true)
 |-- Pos.: string (nullable = true)
 |-- FIFA Popular Name: string (nullable = true)
 |-- Birth Date: string (nullable = true)
 |-- Shirt Name: string (nullable = true)
 |-- Club: string (nullable = true)
 |-- Height: integer (nullable = true)
 |-- Weight: integer (nullable = true)
 |-- Dia: string (nullable = true)
 |-- Mes: string (nullable = true)
 |-- Ano: string (nullable = true)
 |-- Data Nascimento: date (nullable = true)

time: 57.6 ms (started: 2023-09-18 19:49:18 +00:00)


Outra conversão de tipo com a função cast().

In [61]:
df_.withColumn('Height', col('Height').cast(FloatType())).show(5)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)| 169.0|    65| 31| 08|1992|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...| 169.0|    65| 21| 01|1996|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...| 167.0|    66| 15| 02|1993|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)| 167.0|    69| 13| 07|1990|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)| 170.0|    72| 24| 06|1987|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
only showi

Remoção de uma coluna.

In [62]:
df_ = df_.withColumn('new_col', lit(True))
df_ = df_.drop('new_col')
df_.show(5)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65| 31| 08|1992|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65| 21| 01|1996|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66| 15| 02|1993|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69| 13| 07|1990|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72| 24| 06|1987|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
only showi

Criando backup do dataframe

In [63]:
df_backup = df_

time: 543 µs (started: 2023-09-18 19:49:20 +00:00)


# Where
Função que implementa a cláusula WHERE do SQL. Uma alternativa é a função filter().

# GroupBy

Tendo uma coluna como referência, todas as linhas onde os valores dessa coluna são iguais são "colapsadas" em apenas uma. É preciso especificar o que deve ser feito com as outras colunas caso contrário elas serão ignoradas. Normalmente aplicamos funções de estatística descritiva.

In [64]:
df_.groupBy('Team').mean('Weight').orderBy('avg(Weight)', ascending=True).show(10)

+--------------+-----------------+
|          Team|      avg(Weight)|
+--------------+-----------------+
|         Japan|71.52173913043478|
|  Saudi Arabia|73.04347826086956|
|      Portugal| 73.6086956521739|
|        Mexico|74.08695652173913|
|    Costa Rica| 74.1304347826087|
|Korea Republic|74.43478260869566|
|       Uruguay| 74.6086956521739|
|       Morocco|74.65217391304348|
|         Spain|74.73913043478261|
|       Tunisia|             75.0|
+--------------+-----------------+
only showing top 10 rows

time: 1.06 s (started: 2023-09-18 19:49:20 +00:00)


Para especificar qual função de agregação deve ser usada em cada coluna podemos usar a função agg().

In [65]:
df_.groupBy('Team').agg({'Weight':'avg', 'Dia':'min', 'Height':'max'}).orderBy('max(Height)', ascending=False).show(10)

+--------------+-----------+--------+-----------------+
|          Team|max(Height)|min(Dia)|      avg(Weight)|
+--------------+-----------+--------+-----------------+
|       Croatia|        201|      02|79.30434782608695|
|       Denmark|        200|      01| 82.6086956521739|
|     Argentina|        199|      02|75.56521739130434|
|       Belgium|        199|      02|79.56521739130434|
|       Iceland|        198|      01|80.73913043478261|
|        Sweden|        198|      02|78.82608695652173|
|       Nigeria|        197|      01|80.47826086956522|
|Korea Republic|        197|      03|74.43478260869566|
|        France|        197|      03|             80.0|
|        Panama|        197|      01|             80.0|
+--------------+-----------+--------+-----------------+
only showing top 10 rows

time: 1.94 s (started: 2023-09-18 19:49:21 +00:00)


In [66]:
df_.groupBy('Team').agg(avg('Weight')).orderBy('avg(Weight)', ascending=True).show(10)

+--------------+-----------------+
|          Team|      avg(Weight)|
+--------------+-----------------+
|         Japan|71.52173913043478|
|  Saudi Arabia|73.04347826086956|
|      Portugal| 73.6086956521739|
|        Mexico|74.08695652173913|
|    Costa Rica| 74.1304347826087|
|Korea Republic|74.43478260869566|
|       Uruguay| 74.6086956521739|
|       Morocco|74.65217391304348|
|         Spain|74.73913043478261|
|       Tunisia|             75.0|
+--------------+-----------------+
only showing top 10 rows

time: 861 ms (started: 2023-09-18 19:49:23 +00:00)


# Window Functions
Tem o conceito muito parecido com groupby, mas enquanto neste, as linhas iguais são agrupadas formando agrupamentos de instâncias de dados baseados em uma ou mais colunas.

*   row_number()
*   rank()
*   dense_rank()
*   persent_rank()
*   ntile()

**Obs.** A função orderBy() usada com Window.partitionBy() não é a mesma usada com as funções de agregação de groupBy(). Enquanto essa retorna um DataFrame o outro cria uma WindowSpec.

In [68]:
df_.show(5)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65| 31| 08|1992|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65| 21| 01|1996|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66| 15| 02|1993|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69| 13| 07|1990|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72| 24| 06|1987|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+
only showi

row_number()

In [69]:
prt = Window.partitionBy('Team').orderBy(desc('Height'))
print(type(prt))
print(type(row_number()))
df_.withColumn('row', row_number().over(prt)).show(10)

<class 'pyspark.sql.window.WindowSpec'>
<class 'pyspark.sql.column.Column'>
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+---+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|row|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+---+
|Argentina|  6|  DF|    FAZIO Federico|17.03.1987|     FAZIO|       AS Roma (ITA)|   199|    85| 17| 03|1987|  1|
|Argentina|  1|  GK|     GUZMAN Nahuel|10.02.1986|    GUZMÁN|   Tigres UANL (MEX)|   192|    90| 10| 02|1986|  2|
|Argentina| 16|  DF|       ROJO Marcos|20.03.1990|      ROJO|Manchester United...|   189|    82| 20| 03|1990|  3|
|Argentina| 12|  GK|     ARMANI Franco|16.10.1986|    ARMANI|CA River Plate (ARG)|   189|    85| 16| 10|1986|  4|
|Argentina| 23|  GK|CABALLERO Wilfredo|28.09.1981| CABALLERO|    Chelsea FC (ENG)|   186|    80| 28| 09|1981|  5|
|Argentina| 

rank(): Note como rank=3 se repete duas vezes e depois há um salto para rank=5. Esta é uma peculiaridade dessa função.

In [70]:
prt = Window.partitionBy('Team').orderBy(desc('Height'))
df_.withColumn('rank', rank().over(prt)).show(10)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+----+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|rank|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+----+
|Argentina|  6|  DF|    FAZIO Federico|17.03.1987|     FAZIO|       AS Roma (ITA)|   199|    85| 17| 03|1987|   1|
|Argentina|  1|  GK|     GUZMAN Nahuel|10.02.1986|    GUZMÁN|   Tigres UANL (MEX)|   192|    90| 10| 02|1986|   2|
|Argentina| 16|  DF|       ROJO Marcos|20.03.1990|      ROJO|Manchester United...|   189|    82| 20| 03|1990|   3|
|Argentina| 12|  GK|     ARMANI Franco|16.10.1986|    ARMANI|CA River Plate (ARG)|   189|    85| 16| 10|1986|   3|
|Argentina| 23|  GK|CABALLERO Wilfredo|28.09.1981| CABALLERO|    Chelsea FC (ENG)|   186|    80| 28| 09|1981|   5|
|Argentina|  9|  FW|   HIGUAIN Gonzalo|10.12.1987|   HIGUAÍN|   Juventus FC (ITA

dense_rank(): Aqui, mesmo que rank=3 se repita o próximo valor de rank é 4 e assim por diante. Não há saltos de valores.

In [71]:
prt = Window.partitionBy('Team').orderBy(desc('Height'))
df_.withColumn('dense_rank', dense_rank().over(prt)).show(10)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+----------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|dense_rank|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+----------+
|Argentina|  6|  DF|    FAZIO Federico|17.03.1987|     FAZIO|       AS Roma (ITA)|   199|    85| 17| 03|1987|         1|
|Argentina|  1|  GK|     GUZMAN Nahuel|10.02.1986|    GUZMÁN|   Tigres UANL (MEX)|   192|    90| 10| 02|1986|         2|
|Argentina| 16|  DF|       ROJO Marcos|20.03.1990|      ROJO|Manchester United...|   189|    82| 20| 03|1990|         3|
|Argentina| 12|  GK|     ARMANI Franco|16.10.1986|    ARMANI|CA River Plate (ARG)|   189|    85| 16| 10|1986|         3|
|Argentina| 23|  GK|CABALLERO Wilfredo|28.09.1981| CABALLERO|    Chelsea FC (ENG)|   186|    80| 28| 09|1981|         4|
|Argentina|  9|  FW|   HIGUAIN G

persent_rank(): ranking relativo(percentual)

In [72]:
prt = Window.partitionBy('Team').orderBy(desc('Height'))
df_.withColumn('persent_rank', percent_rank().over(prt)).show(10)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+--------------------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|        persent_rank|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+--------------------+
|Argentina|  6|  DF|    FAZIO Federico|17.03.1987|     FAZIO|       AS Roma (ITA)|   199|    85| 17| 03|1987|                 0.0|
|Argentina|  1|  GK|     GUZMAN Nahuel|10.02.1986|    GUZMÁN|   Tigres UANL (MEX)|   192|    90| 10| 02|1986|0.045454545454545456|
|Argentina| 16|  DF|       ROJO Marcos|20.03.1990|      ROJO|Manchester United...|   189|    82| 20| 03|1990| 0.09090909090909091|
|Argentina| 12|  GK|     ARMANI Franco|16.10.1986|    ARMANI|CA River Plate (ARG)|   189|    85| 16| 10|1986| 0.09090909090909091|
|Argentina| 23|  GK|CABALLERO Wilfredo|28.09.1981| CABALLERO|    Chelsea FC (ENG)| 

ntile(): Divide cada partição em uma quantidade n de quartiles. Cada quartile recebe um valor único. Caso uma partição não seja divisível por n o aloritmo ajustará a quantidade de instâncias de dados pertencentes aos últimos quartiles de modo que a partição tenha n quartiles. Por exemplo, na seleção da Argentina, quando n=5, o penúltimo quartil tem 4 instâncias para poder formar mais um, o último quartil que terá apenas uma instância.

In [73]:
prt = Window.partitionBy('Team').orderBy(desc('Height'))
df_.withColumn('ntile', ntile(5).over(prt)).show(20)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+-----+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|ntile|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+-----+
|Argentina|  6|  DF|    FAZIO Federico|17.03.1987|     FAZIO|       AS Roma (ITA)|   199|    85| 17| 03|1987|    1|
|Argentina|  1|  GK|     GUZMAN Nahuel|10.02.1986|    GUZMÁN|   Tigres UANL (MEX)|   192|    90| 10| 02|1986|    1|
|Argentina| 16|  DF|       ROJO Marcos|20.03.1990|      ROJO|Manchester United...|   189|    82| 20| 03|1990|    1|
|Argentina| 12|  GK|     ARMANI Franco|16.10.1986|    ARMANI|CA River Plate (ARG)|   189|    85| 16| 10|1986|    1|
|Argentina| 23|  GK|CABALLERO Wilfredo|28.09.1981| CABALLERO|    Chelsea FC (ENG)|   186|    80| 28| 09|1981|    1|
|Argentina|  9|  FW|   HIGUAIN Gonzalo|10.12.1987|   HIGUAÍN|   Juventus

Lag function: O mesmo tipo de lag usado em séries temporais.

In [74]:
prt = Window.partitionBy('Team').orderBy(desc('Height'))
df_.withColumn('lag', lag('Weight', offset=2).over(prt)).show(10)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+----+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano| lag|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+----+
|Argentina|  6|  DF|    FAZIO Federico|17.03.1987|     FAZIO|       AS Roma (ITA)|   199|    85| 17| 03|1987|null|
|Argentina|  1|  GK|     GUZMAN Nahuel|10.02.1986|    GUZMÁN|   Tigres UANL (MEX)|   192|    90| 10| 02|1986|null|
|Argentina| 16|  DF|       ROJO Marcos|20.03.1990|      ROJO|Manchester United...|   189|    82| 20| 03|1990|  85|
|Argentina| 12|  GK|     ARMANI Franco|16.10.1986|    ARMANI|CA River Plate (ARG)|   189|    85| 16| 10|1986|  90|
|Argentina| 23|  GK|CABALLERO Wilfredo|28.09.1981| CABALLERO|    Chelsea FC (ENG)|   186|    80| 28| 09|1981|  82|
|Argentina|  9|  FW|   HIGUAIN Gonzalo|10.12.1987|   HIGUAÍN|   Juventus FC (ITA

Forward function: O mesmo tipo de forward usado em séries temporais.

In [75]:
prt = Window.partitionBy('Team').orderBy(desc('Height'))
df_.withColumn('lead', lead('Weight', offset=1).over(prt)).show(10)

+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+----+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|Dia|Mes| Ano|lead|
+---------+---+----+------------------+----------+----------+--------------------+------+------+---+---+----+----+
|Argentina|  6|  DF|    FAZIO Federico|17.03.1987|     FAZIO|       AS Roma (ITA)|   199|    85| 17| 03|1987|  90|
|Argentina|  1|  GK|     GUZMAN Nahuel|10.02.1986|    GUZMÁN|   Tigres UANL (MEX)|   192|    90| 10| 02|1986|  82|
|Argentina| 16|  DF|       ROJO Marcos|20.03.1990|      ROJO|Manchester United...|   189|    82| 20| 03|1990|  85|
|Argentina| 12|  GK|     ARMANI Franco|16.10.1986|    ARMANI|CA River Plate (ARG)|   189|    85| 16| 10|1986|  80|
|Argentina| 23|  GK|CABALLERO Wilfredo|28.09.1981| CABALLERO|    Chelsea FC (ENG)|   186|    80| 28| 09|1981|  75|
|Argentina|  9|  FW|   HIGUAIN Gonzalo|10.12.1987|   HIGUAÍN|   Juventus FC (ITA