In [1]:
# Spark SQL helpers. The two wildcard imports keep their original order.
# Note: `pyspark.sql.functions` rebinds names such as `max` in this namespace
# (the notebook calls `max(col("year"))` on a Spark column further down).
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
# Translator client; the notebook uses it to translate a country name to Spanish.
from googletrans import Translator

# Obtain the session for this notebook (created on first call, reused otherwise).
spark = (
    SparkSession.builder
    .appName("pysparkdf")
    .getOrCreate()
)
translator = Translator()

In [2]:
# Folder that contains both World Happiness Report CSV files. It is the only
# machine-specific value in this cell: edit it here to run the notebook elsewhere.
DATA_DIR = "C:/Users/Gonzalo/Desktop/BOOTCAMP BIGDATA/big-data-processing/practica"


def _read_report_csv(file_name):
    """Load one CSV file located in DATA_DIR as a Spark DataFrame.

    The file is read with a header row and with schema inference enabled, the
    same options for every report so the two datasets are parsed consistently.

    Args:
        file_name: Name of the CSV file inside DATA_DIR (not a full path).

    Returns:
        The DataFrame produced by the Spark CSV reader.
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(f"{DATA_DIR}/{file_name}")
    )


# 2021 edition of the report.
felices2021 = _read_report_csv("world-happiness-report-2021.csv")
# Multi-year report (it carries a `year` column used by the cells below).
felices = _read_report_csv("world-happiness-report.csv")

1 - ¿Cuál es el país más feliz del 2021?

In [3]:
# Sort the 2021 countries by ladder score, highest first, and keep the top row.
pais_mas_feliz = (
    felices2021
    .select("Country name", "Ladder score")
    .orderBy(desc("Ladder score"))
    .limit(1)
)

# `cache` is a method: without the parentheses the statement does nothing and
# the query is executed once for show() and again for first().
pais_mas_feliz.cache()
pais_mas_feliz.show()

# Single remaining row -> country name -> Spanish translation via googletrans.
x = pais_mas_feliz.first()
pais = x["Country name"]
pais_traducido = translator.translate(pais, dest='es').text

print(f"El país más felíz es {pais_traducido}")

+------------+------------+
|Country name|Ladder score|
+------------+------------+
|     Finland|       7.842|
+------------+------------+

El país más felíz es Finlandia


2 - País más feliz del 2021 por continente

In [18]:
# Region -> continent lookup table. The assignment is made per region, so every
# country inherits the continent of the region it is listed under.
# NOTE(review): this is a coarse grouping. "North America and ANZ" is mapped
# entirely to "America" and "Middle East and North Africa" entirely to
# "Africa", so countries of those regions that lie on another continent are
# counted there too. Confirm this is acceptable for the per-continent winners.
region_a_continente = [
    ("Western Europe", "Europe"),
    ("North America and ANZ", "America"),
    ("Middle East and North Africa", "Africa"),
    ("Latin America and Caribbean", "America"),
    ("Central and Eastern Europe", "Europe"),
    ("East Asia", "Asia"),
    ("Southeast Asia", "Asia"),
    ("Commonwealth of Independent States", "Asia"),
    ("Sub-Saharan Africa", "Africa"),
    ("South Asia", "Asia"),
]
continentes = spark.createDataFrame(
    region_a_continente,
    ["Regional Indicator", "Continent"],
)

# Attach the continent to every 2021 row through its regional indicator.
cruce2021 = felices2021.join(continentes, ["Regional Indicator"])

# Within each continent, order countries from highest to lowest ladder score.
w_conti = Window.partitionBy("Continent").orderBy(desc("Ladder score"))

# Rank 1 per continent; rank() keeps every country tied for first place.
conti = (
    cruce2021
    .withColumn("Rank", rank().over(w_conti))
    .filter(col("Rank") == 1)
    .select("Continent", "Country name", "Ladder score")
)

# `cache` must be called; the bare attribute access was a no-op.
conti.cache()
conti.show(truncate=False)

+---------+------------------------+------------+
|Continent|Country name            |Ladder score|
+---------+------------------------+------------+
|Africa   |Israel                  |7.157       |
|America  |New Zealand             |7.277       |
|Asia     |Taiwan Province of China|6.584       |
|Europe   |Finland                 |7.842       |
+---------+------------------------+------------+



3 - ¿Cuál es el país que más veces ha sido top 1?

In [16]:
# Per-year window, highest "Life Ladder" first. The GDP cell below reuses it.
w_y_felices = Window.partitionBy("year").orderBy(desc("Life Ladder"))

# Keep the rank-1 rows of every year, count how often each country appears
# among them and retain the country with the highest count.
top = (
    felices
    .withColumn("Rank", rank().over(w_y_felices))
    .filter(col("Rank") == 1)
    .groupBy("Country name")
    .count()
    .withColumnRenamed("count", "Times first")
    .orderBy(desc("Times first"))
    .limit(1)
)

# `cache` must be called; the bare attribute access was a no-op.
top.cache()
top.show()

+------------+-----------+
|Country name|Times first|
+------------+-----------+
|     Denmark|          7|
+------------+-----------+



4 - ¿Qué puesto de felicidad tiene el país con mayor GDP del 2020?

In [17]:
# Per-year window, highest logged GDP per capita first.
w_y_GDP = Window.partitionBy("year").orderBy(desc("Log GDP per capita"))

# Rank every row twice within its year (by GDP and by life ladder, the latter
# with the `w_y_felices` window defined in the previous cell), then keep the
# 2020 row(s) whose GDP rank is 1.
top_GDP = (
    felices
    .withColumn("GDP Rank", rank().over(w_y_GDP))
    .withColumn("Life Ladder Rank", rank().over(w_y_felices))
    .filter((col("year") == 2020) & (col("GDP Rank") == 1))
    .select("Country name", "Life Ladder Rank", "Log GDP per capita")
)

# `cache` must be called; the bare attribute access was a no-op.
top_GDP.cache()
top_GDP.show()

+------------+----------------+------------------+
|Country name|Life Ladder Rank|Log GDP per capita|
+------------+----------------+------------------+
|     Ireland|              13|            11.323|
+------------+----------------+------------------+



5 - ¿Qué % ha variado a nivel mundial el GDP promedio del 2020 respecto al 2021?

In [14]:
# World average of the logged GDP per capita, one single-row frame per year.
avg_2020 = felices.filter(col("year") == 2020).agg(
    avg(col("Log GDP per capita")).alias("Avg GDP 2020")
)
avg_2021 = felices2021.agg(
    avg(col("Logged GDP per capita")).alias("Avg GDP 2021")
)

# Each frame has exactly one row, so a cross join puts both averages side by
# side. It is spelled out with crossJoin() because a join() without condition
# is an implicit cartesian product, which Spark versions that do not enable
# cross joins by default refuse to run.
# NOTE(review): the percentage is (2020 - 2021) / 2021, i.e. 2020 measured
# against 2021, and it is computed on logged values — confirm that this is the
# intended reading of "con respecto al año anterior" in the message below.
variacion = (
    avg_2020.crossJoin(avg_2021)
    .withColumn(
        "Difference percentage",
        ((col("Avg GDP 2020") - col("Avg GDP 2021")) / col("Avg GDP 2021")) * 100,
    )
    .withColumn(
        "Type of change",
        when(col("Difference percentage") > 0, "Increased")
        .when(col("Difference percentage") == 0, "Unchanged")
        .otherwise("Decreased"),
    )
)

# `cache` must be called; the bare attribute access was a no-op and the
# aggregation ran again for the lookup after show().
variacion.cache()
variacion.show()
diferencia = variacion.first()["Difference percentage"]
if diferencia > 0:
    estado = "Aumenta"
else:
    estado = "Desciende"

# Print the size of the change without its sign: the verb already carries the
# direction ("Desciende un -3.38%" would be a double negative). The builtin
# abs() is not used because the wildcard import of pyspark.sql.functions
# rebinds `abs` to the Spark column function.
magnitud = -diferencia if diferencia < 0 else diferencia

print(f"{estado} un {magnitud:.2f}% con respecto al año anterior")

+-----------------+-----------------+---------------------+--------------+
|     Avg GDP 2020|     Avg GDP 2021|Difference percentage|Type of change|
+-----------------+-----------------+---------------------+--------------+
|9.751329545454546|9.432208053691273|    3.383316927984697|     Increased|
+-----------------+-----------------+---------------------+--------------+

Aumenta un 3.38% con respecto al año anterior


6 - País con mayor expectativa de vida y cuánto tenía en el 2019

In [11]:
# Bring both datasets to the same three columns so they can be stacked.
expe = felices.select(
    col("year"),
    col("Country name"),
    col("Healthy life expectancy at birth"),
)
# The 2021 file has no year column, so a literal one is added. union() matches
# columns by position and keeps the names of the left-hand frame.
expe2021 = felices2021.select(
    lit(2021).alias("year"),
    col("Country name"),
    col("Healthy life expectancy").alias("Healthy life expectancy at birth"),
)
expecruce = expe.union(expe2021)

# Most recent year present in the combined data. `max` here is the Spark
# aggregate brought in by the wildcard import, not the Python builtin.
lastYear = expecruce.agg(max(col("year"))).first()[0]

# Per-year window, highest healthy life expectancy first.
w_year_expe = Window.partitionBy("year").orderBy(desc("Healthy life expectancy at birth"))

# Ranking for every year; the next cell reads the 2019 slice of it.
rank_expe = expecruce.withColumn("Rank", rank().over(w_year_expe))

# Leader(s) of the most recent year, taken from the ranking computed above
# instead of building the same ranked frame a second time.
pais_expe = (
    rank_expe
    .filter((col("Rank") == 1) & (col("year") == lastYear))
    .drop("Rank")
)

# `cache` must be called; the bare attribute access was a no-op.
pais_expe.cache()
pais_expe.show()

+----+------------+--------------------------------+
|year|Country name|Healthy life expectancy at birth|
+----+------------+--------------------------------+
|2021|   Singapore|                          76.953|
+----+------------+--------------------------------+



En 2019

In [13]:
# Look up the 2019 row of the country (or countries) that lead the latest year.
pais_expe_2019 = (
    pais_expe.select("Country name")
    .join(rank_expe.filter(col("year") == 2019), ["Country name"])
    .drop("Rank")
    .drop("year")
)

# `cache` and `unpersist` are methods; as bare attribute accesses they did
# nothing. unpersist() is issued only after show(), the action that reads
# rank_expe, so the ranking is not released before it has been used.
pais_expe_2019.cache()
pais_expe_2019.show()
rank_expe.unpersist()

+------------+--------------------------------+
|Country name|Healthy life expectancy at birth|
+------------+--------------------------------+
|   Singapore|                            77.1|
+------------+--------------------------------+

