In [45]:
import os
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import (
    col,
    sum as _sum,
    avg as _avg
)
from schemas import (
    base,
    data_dir,
    get_schemas,
    load_dataframes_from_schemas
)

# Spark Context

In [8]:
spark = SparkContext(master="local", appName="Replicacion")
sql_context = SQLContext(spark)

# Load Dataframes

In [11]:
dataframes = load_dataframes_from_schemas(sql_context)

In [12]:
dataframes.keys()

dict_keys(['deporte', 'deportista', 'evento', 'juegos', 'paises', 'resultados'])

# Class - DF and replication

In [31]:
medallas_x_anio = dataframes["deportista"].join(
    dataframes["resultados"],
    dataframes["deportista"].deportista_id == dataframes["resultados"].deportista_id,
    "left"
).join(
    dataframes["juegos"],
    dataframes["juegos"].juego_id == dataframes["resultados"].juego_id,
    "left"
).join(
    dataframes["paises"],
    dataframes["paises"].pais_id == dataframes["deportista"].equipo_id,
    "left"
).join(
    dataframes["evento"],
    dataframes["evento"].evento_id == dataframes["resultados"].evento_id,
    "left"
).join(
    dataframes["deporte"],
    dataframes["evento"].deporte_id == dataframes["deporte"].deporte_id,
    "left"
).select(
    "sigla", "anio", "medalla",
    dataframes["evento"].evento.alias("Nombre subdisciplina"),
    dataframes["deporte"].deporte.alias("Nombre disciplina"),
    dataframes["deportista"].nombre
)


In [32]:
medallas_x_anio.show()

[Stage 20:>                                                         (0 + 1) / 1]21/08/21 17:29:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , annio
 Schema: juego_id, anio
Expected: juego_id but found: 
CSV file: file:///home/jovyan/work/files/juegos.csv
                                                                                21/08/21 17:29:22 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, sigla
 Schema: pais_id, sigla
Expected: pais_id but found: id
CSV file: file:///home/jovyan/work/files/paises.csv


+-----+----+-------+--------------------+--------------------+--------------------+
|sigla|anio|medalla|Nombre subdisciplina|   Nombre disciplina|              nombre|
+-----+----+-------+--------------------+--------------------+--------------------+
|  CHN|1992|     NA|Basketball Men's ...|          Basketball|           A Dijiang|
|  CHN|2012|     NA|Judo Men's Extra-...|                Judo|            A Lamusi|
|  DEN|1920|     NA|Football Men's Fo...|            Football| Gunnar Nielsen Aaby|
|  SWE|1900|   Gold|Tug-Of-War Men's ...|          Tug-Of-War|Edgar Lindenau Aabye|
|  NED|1994|     NA|Speed Skating Wom...|       Speed Skating|Christine Jacoba ...|
|  NED|1994|     NA|Speed Skating Wom...|       Speed Skating|Christine Jacoba ...|
|  NED|1992|     NA|Speed Skating Wom...|       Speed Skating|Christine Jacoba ...|
|  NED|1992|     NA|Speed Skating Wom...|       Speed Skating|Christine Jacoba ...|
|  NED|1988|     NA|Speed Skating Wom...|       Speed Skating|Christine Jaco

In [40]:
medallas_x_anio_2 = (medallas_x_anio
     .filter(medallas_x_anio.medalla != "NA")
     .sort("anio")
     .groupBy("sigla", "anio", "Nombre subdisciplina")
     .count()
)

In [41]:
medallas_x_anio_2.printSchema()

root
 |-- sigla: string (nullable = true)
 |-- anio: integer (nullable = true)
 |-- Nombre subdisciplina: string (nullable = true)
 |-- count: long (nullable = false)



In [42]:
medallas_x_anio_2.show(5)

21/08/21 17:35:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , annio
 Schema: juego_id, anio
Expected: juego_id but found: 
CSV file: file:///home/jovyan/work/files/juegos.csv
21/08/21 17:35:17 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, sigla
 Schema: pais_id, sigla
Expected: pais_id but found: id
CSV file: file:///home/jovyan/work/files/paises.csv


+-----+----+--------------------+-----+
|sigla|anio|Nombre subdisciplina|count|
+-----+----+--------------------+-----+
|  MEX|1984|Wrestling Men's F...|    1|
|  FIN|1960|Cross Country Ski...|    4|
|  CAN|2010|Snowboarding Men'...|    1|
|  YUG|1984|Water Polo Men's ...|   13|
|  RUS|2012|Athletics Women's...|    6|
+-----+----+--------------------+-----+
only showing top 5 rows



In [46]:
(medallas_x_anio_2
    .groupBy("sigla", "anio")
    .agg(
        _sum("count").alias("Total de medallas"),
        _avg("count").alias("Promedio medallas")
    )
    .show()
)

21/08/21 17:38:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , annio
 Schema: juego_id, anio
Expected: juego_id but found: 
CSV file: file:///home/jovyan/work/files/juegos.csv
21/08/21 17:38:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, sigla
 Schema: pais_id, sigla
Expected: pais_id but found: id
CSV file: file:///home/jovyan/work/files/paises.csv

+-----+----+-----------------+------------------+
|sigla|anio|Total de medallas| Promedio medallas|
+-----+----+-----------------+------------------+
|  USA|2012|              248|2.7252747252747254|
|  FRA|2006|               15|1.6666666666666667|
|  KOR|2010|               18|               1.5|
|  FIN|1988|               38|               3.8|
|  BLR|2000|               15|               1.5|
|  VEN|2012|                1|               1.0|
|  FRA|1948|               77|2.3333333333333335|
|  GBR|2000|               55|1.9642857142857142|
|  FRG|1994|                6|               1.0|
|  JPN|1932|               31|2.5833333333333335|
|  QAT|2012|                2|               1.0|
|  KOR|1972|                1|               1.0|
|  NED|1972|               15|1.3636363636363635|
|  GER|1932|               57|              2.28|
|  NZL|1988|               24|1.8461538461538463|
|  AUS|1972|               20|1.1764705882352942|
|  THA|1988|                1|               1.0|


                                                                                