In [0]:
df = spark.table("life_expectancy_csv")

In [0]:
display(df)

Country,Sum of Females Life Expectancy,Sum of Life Expectancy (both sexes),Sum of Males Life Expectancy
Chad,57.19,55.24,53.36
Nigeria,54.94,54.64,54.33
South Sudan,60.75,57.74,54.76
Lesotho,60.44,57.8,55.03
Central African Republic,59.56,57.67,55.51
Somalia,61.55,58.97,56.49
Burkina Faso,63.43,61.29,59.11
Mali,62.15,60.68,59.25
Benin,62.42,60.96,59.52
Guinea,62.09,60.9,59.66


In [0]:
df_pandas = df.toPandas()

In [0]:
df_pandas.head()

Unnamed: 0,Country,Sum of Females Life Expectancy,Sum of Life Expectancy (both sexes),Sum of Males Life Expectancy
0,Chad,57.19,55.24,53.36
1,Nigeria,54.94,54.64,54.33
2,South Sudan,60.75,57.74,54.76
3,Lesotho,60.44,57.8,55.03
4,Central African Republic,59.56,57.67,55.51


In [0]:
# Converter data frame de pandas para Spark
# Ao fazer um data frame ou extrair um df em Pandas é possível convertê-lo
df_spark = spark.createDataFrame(df_pandas)

In [0]:
df.select('Country').display

Out[8]: <bound method apply_dataframe_display_patch.<locals>.df_display of DataFrame[Country: string]>

In [0]:
# Dados descritivos
df.select(
    ["`Sum of Females  Life Expectancy`", "`Sum of Life Expectancy  (both sexes)`"]
).describe().display()


summary,Sum of Females Life Expectancy,Sum of Life Expectancy (both sexes)
count,200.0,200.0
mean,76.77689999999998,74.13370000000002
stddev,8.820332232601405,8.571318206025527
min,54.94,54.64
max,149.22,143.28


In [0]:
# Filtrando um país
df_filtered = df.filter(df["Country"] == "Brazil")
display(df_filtered)

Country,Sum of Females Life Expectancy,Sum of Life Expectancy (both sexes),Sum of Males Life Expectancy
Brazil,79.15,76.02,72.95


In [0]:
# Filtrando expectativa maior que 75
df_filtered = df.filter(df["`Sum of Females  Life Expectancy`"] > 75)
display(df_filtered)

Country,Sum of Females Life Expectancy,Sum of Life Expectancy (both sexes),Sum of Males Life Expectancy
Moldova,75.67,71.33,66.7
Russia,79.19,73.34,67.48
Mongolia,76.66,71.99,67.49
El Salvador,76.49,72.3,67.76
Kyrgyzstan,75.4,71.82,68.31
Venezuela,76.66,72.67,68.89
Saint Lucia,76.45,72.85,69.45
Tonga,76.54,73.07,69.5
Uzbekistan,75.55,72.53,69.56
Belarus,79.22,74.62,69.74


In [0]:
print(df.columns)

['Country', 'Sum of Females  Life Expectancy', 'Sum of Life Expectancy  (both sexes)', 'Sum of Males  Life Expectancy']


In [0]:
# Filtrando múltiplas condições
df_filtered = df.filter(
    (df["`Sum of Females  Life Expectancy`"] > 75) & 
    (df["`Sum of Life Expectancy  (both sexes)`"] > 70)
)
display(df_filtered)

Country,Sum of Females Life Expectancy,Sum of Life Expectancy (both sexes),Sum of Males Life Expectancy
Moldova,75.67,71.33,66.7
Russia,79.19,73.34,67.48
Mongolia,76.66,71.99,67.49
El Salvador,76.49,72.3,67.76
Kyrgyzstan,75.4,71.82,68.31
Venezuela,76.66,72.67,68.89
Saint Lucia,76.45,72.85,69.45
Tonga,76.54,73.07,69.5
Uzbekistan,75.55,72.53,69.56
Belarus,79.22,74.62,69.74


In [0]:
# Filtro between
df_filtered = df.filter(df["`Sum of Females  Life Expectancy`"].between(70, 80))
display(df_filtered)

Country,Sum of Females Life Expectancy,Sum of Life Expectancy (both sexes),Sum of Males Life Expectancy
Namibia,71.48,67.52,63.45
Myanmar,70.38,67.1,63.99
Malawi,70.77,67.56,64.27
Ethiopia,71.02,67.6,64.36
Tanzania,70.0,67.21,64.4
State of Palestine,74.25,69.21,65.22
Pakistan,70.32,67.8,65.46
Uganda,71.37,68.49,65.49
Rwanda,70.14,68.02,65.71
Gabon,71.24,68.51,66.06


In [0]:
# Renomeando as colunas
df = df.withColumnRenamed("Sum of Females  Life Expectancy", "Expectativa_de_Vida_Mulheres") \
       .withColumnRenamed("Sum of Life Expectancy  (both sexes)", "Expectativa_de_Vida_Geral") \
       .withColumnRenamed("Sum of Males  Life Expectancy", "Expectativa_de_Vida_Homens") \
       .withColumnRenamed("Country", "Pais")

print(df.columns)  # Verificar os novos nomes

['Pais', 'Expectativa_de_Vida_Mulheres', 'Expectativa_de_Vida_Geral', 'Expectativa_de_Vida_Homens']


In [0]:
# Realizando filtros com os novos nomes
df_filtrado = df.filter(df["Expectativa_de_Vida_Mulheres"].between(70, 80))
display(df_filtrado)

Pais,Expectativa_de_Vida_Mulheres,Expectativa_de_Vida_Geral,Expectativa_de_Vida_Homens
Namibia,71.48,67.52,63.45
Myanmar,70.38,67.1,63.99
Malawi,70.77,67.56,64.27
Ethiopia,71.02,67.6,64.36
Tanzania,70.0,67.21,64.4
State of Palestine,74.25,69.21,65.22
Pakistan,70.32,67.8,65.46
Uganda,71.37,68.49,65.49
Rwanda,70.14,68.02,65.71
Gabon,71.24,68.51,66.06


In [0]:
# Paises com expectativa de vida mais alta
df_top_vida = df.filter(df["Expectativa_de_Vida_Geral"] > 85).select("Pais", "Expectativa_de_Vida_Geral")
display(df_top_vida)

Pais,Expectativa_de_Vida_Geral
Hong Kong,85.63
Micronesia,143.28


In [0]:
# Comparar expectativas de vidas entre homens e mulheres
df_comparacao = df.withColumn("Diferenca_Expectativa_Vida", df["Expectativa_de_Vida_Mulheres"] - df["Expectativa_de_Vida_Homens"])
df_comparacao_filtrado = df_comparacao.select("Pais", "Expectativa_de_Vida_Mulheres", "Expectativa_de_Vida_Homens", "Diferenca_Expectativa_Vida")
display(df_comparacao_filtrado)

Pais,Expectativa_de_Vida_Mulheres,Expectativa_de_Vida_Homens,Diferenca_Expectativa_Vida
Chad,57.19,53.36,3.8299999999999983
Nigeria,54.94,54.33,0.6099999999999994
South Sudan,60.75,54.76,5.990000000000002
Lesotho,60.44,55.03,5.409999999999997
Central African Republic,59.56,55.51,4.050000000000004
Somalia,61.55,56.49,5.059999999999995
Burkina Faso,63.43,59.11,4.32
Mali,62.15,59.25,2.8999999999999986
Benin,62.42,59.52,2.8999999999999986
Guinea,62.09,59.66,2.430000000000007


In [0]:
# Distribuição da expectativa de vida
df.describe(["Expectativa_de_Vida_Mulheres", "Expectativa_de_Vida_Homens", "Expectativa_de_Vida_Geral"]).show()

+-------+----------------------------+--------------------------+-------------------------+
|summary|Expectativa_de_Vida_Mulheres|Expectativa_de_Vida_Homens|Expectativa_de_Vida_Geral|
+-------+----------------------------+--------------------------+-------------------------+
|  count|                         200|                       200|                      200|
|   mean|           76.77689999999998|         71.51719999999999|        74.13370000000002|
| stddev|           8.820332232601405|         8.419632546679534|        8.571318206025527|
|    min|                       54.94|                     53.36|                    54.64|
|    max|                      149.22|                    137.64|                   143.28|
+-------+----------------------------+--------------------------+-------------------------+



In [0]:
# Paises com maiores diferenças entre homens e mulheres
df_diferenca = df.withColumn("Diferenca", df["Expectativa_de_Vida_Mulheres"] - df["Expectativa_de_Vida_Homens"])
df_maior_diferenca = df_diferenca.orderBy(df_diferenca["Diferenca"], ascending=False).select("Pais", "Diferenca")
display(df_maior_diferenca)

Pais,Diferenca
Russia,11.709999999999994
Micronesia,11.580000000000013
U.S. Virgin Islands,10.75
Ukraine,9.599999999999994
Belarus,9.480000000000004
Georgia,9.480000000000004
Lithuania,9.379999999999995
Vietnam,9.329999999999998
Mongolia,9.170000000000002
State of Palestine,9.03


In [0]:
from pyspark.sql.functions import avg

df_media = df.select(
    avg("Expectativa_de_Vida_Mulheres").alias("Media_Expectativa_Mulheres"),
    avg("Expectativa_de_Vida_Homens").alias("Media_Expectativa_Homens"),
    avg("Expectativa_de_Vida_Geral").alias("Media_Expectativa_Geral")
)
df_media.show()

+--------------------------+------------------------+-----------------------+
|Media_Expectativa_Mulheres|Media_Expectativa_Homens|Media_Expectativa_Geral|
+--------------------------+------------------------+-----------------------+
|         76.77689999999998|       71.51719999999999|      74.13370000000002|
+--------------------------+------------------------+-----------------------+



In [0]:
# Criando coluna da diferença de expectativa de vidas
df = df.withColumn("Diferenca_Expectativa_Vida", 
                   df["Expectativa_de_Vida_Mulheres"] - df["Expectativa_de_Vida_Homens"])

In [0]:
# Criando coluna com a proporção da expectativa de vida geral para mulheres
df = df.withColumn("Proporcao_Mulheres_para_Geral", 
                   df["Expectativa_de_Vida_Mulheres"] / df["Expectativa_de_Vida_Geral"])

In [0]:
# Criando coluna com a proporção da expectativa de vida geral para homens
df = df.withColumn("Proporcao_Homens_para_Geral", 
                   df["Expectativa_de_Vida_Homens"] / df["Expectativa_de_Vida_Geral"])

In [0]:
# Classificação das diferenças de idade
from pyspark.sql.functions import when

df = df.withColumn("Classificacao_Expectativa_Geral", 
                   when(df["Expectativa_de_Vida_Geral"] < 60, "Baixa")
                   .when(df["Expectativa_de_Vida_Geral"] <= 80, "Média")
                   .otherwise("Alta"))

In [0]:
# Criando um índice da diferença de idades
df = df.withColumn("Indice_Diferenca", 
                   (df["Expectativa_de_Vida_Mulheres"] - df["Expectativa_de_Vida_Homens"]) /
                   df["Expectativa_de_Vida_Geral"])

In [0]:
# Expectativa acima da média
media_global = 75
df = df.withColumn("Acima_da_Media", df["Expectativa_de_Vida_Geral"] > media_global)