In [1]:
import os
from pyspark import SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.types import (
    StructType,
    StructField,
    IntegerType,
    StringType,
    FloatType,
    Row
)

In [2]:
# Start a single-threaded local Spark driver. Note that `spark` is bound to a
# SparkContext (the RDD entry point), not a SparkSession; `sql_context` wraps
# it and is the object used for DataFrame reads below.
spark = SparkContext(appName="DataFrames", master="local")
sql_context = SQLContext(spark)

21/08/21 00:25:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Set data directory (relative to the parent directory)

In [3]:
# `base` is the absolute path of the parent of the current working directory;
# the CSV inputs are read from its "work/files" subfolder.
base = os.path.abspath(os.pardir)
data_dir = os.path.join(os.path.join(base, "work"), "files")
data_dir

'/home/jovyan/work/files'

# Schema

In [4]:
# Explicit schema for juegos.csv. Columns are applied by position: the file's
# own header row uses different names (blank first column, "annio"), so Spark
# logs a CSVHeaderChecker warning when the data is first read.
juego_schema = StructType([
    StructField("juego_id", IntegerType(), False),
    StructField("nombre_juego", StringType(), False),
    # Typed as an integer so this schema agrees with the "juegos" entry of the
    # `schemas` dictionary used for the bulk load later in the notebook.
    StructField("anio", IntegerType(), False),
    StructField("temporada", StringType(), False),
    StructField("ciudad", StringType(), False),
])

# Lazily define the DataFrame; the first row of the file is treated as a
# header and skipped.
juego_df = (
    sql_context.read
    .schema(juego_schema)
    .option("header", "true")
    .csv(f"{data_dir}/juegos.csv")
)

In [5]:
# Preview the first five rows; this is the action that actually reads the file.
juego_df.show(5)

+--------+------------+----+---------+---------+
|juego_id|nombre_juego|anio|temporada|   ciudad|
+--------+------------+----+---------+---------+
|       1| 1896 Verano|1896|   Verano|   Athina|
|       2| 1900 Verano|1900|   Verano|    Paris|
|       3| 1904 Verano|1904|   Verano|St. Louis|
|       4| 1906 Verano|1906|   Verano|   Athina|
|       5| 1908 Verano|1908|   Verano|   London|
+--------+------------+----+---------+---------+
only showing top 5 rows



21/08/21 00:25:13 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , nombre_juego, annio, temporada, ciudad
 Schema: juego_id, nombre_juego, anio, temporada, ciudad
Expected: juego_id but found: 
CSV file: file:///home/jovyan/work/files/juegos.csv


In [6]:
# Display the driver object. Despite its name, `spark` is the SparkContext
# created above, not a SparkSession.
spark

## Convert RDD to DataFrame

In [28]:
# Load deportista.csv as an RDD of text lines and break every line into a list
# of string fields. NOTE: a plain split(",") does not understand CSV quoting,
# so a quoted field that contains a comma would be split in two.
deportista_olimpico_rdd = (
    spark
    .textFile(f"{data_dir}/deportista.csv")
    .map(lambda line: line.split(","))
)

In [29]:
# Inspect the first five records; at this point the header row is still the
# first element and every field is a string.
deportista_olimpico_rdd.take(5)

[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id'],
 ['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278']]

In [30]:
def remove_headers(idx, iterator):
    """Drop the CSV header record from a partition stream.

    Intended for ``RDD.mapPartitionsWithIndex``. Only partition 0 starts with
    the file's header line, so only that partition loses its first element;
    every other partition is passed through untouched. The previous version
    ignored ``idx`` and removed the first element of *every* partition,
    silently discarding one data row per additional partition.

    Args:
        idx: Zero-based index of the partition being processed.
        iterator: Iterable over the records of that partition.

    Returns:
        An iterator over the partition's records without the header.
    """
    rows = iter(iterator)
    if idx == 0:
        # Consume the header lazily; the default keeps an empty partition
        # from raising StopIteration.
        next(rows, None)
    return rows

In [31]:
# Strip the header record from the RDD.
# NOTE: the result is bound to the same name as the input, so re-running this
# cell applies `remove_headers` again to the already-processed RDD.
deportista_olimpico_rdd = deportista_olimpico_rdd.mapPartitionsWithIndex(remove_headers)

In [32]:
# Confirm the header row is gone: the first record should now be a data row.
deportista_olimpico_rdd.take(5)

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278'],
 ['5', 'Christine Jacoba Aaftink', '2', '21', '185', '82', '705']]

In [39]:
def to_typed_record(fields):
    """Convert one split CSV line into a tuple of typed values.

    The positions follow the header of deportista.csv:
    deportista_id, nombre, genero, edad, altura, peso, equipo_id.

    Args:
        fields: Sequence of seven values for one athlete (strings when they
            come straight from the split text line).

    Returns:
        A 7-tuple ``(int, str, str, int, int, float, int)``.

    Raises:
        ValueError: If a numeric column holds a non-numeric value.
        IndexError: If the record has fewer than seven fields.
    """
    return (
        int(fields[0]),    # deportista_id
        fields[1],         # nombre
        fields[2],         # genero: kept as text to match StringType in the schema
        int(fields[3]),    # edad
        int(fields[4]),    # altura
        float(fields[5]),  # peso
        int(fields[6]),    # equipo_id
    )


deportista_olimpico_rdd = deportista_olimpico_rdd.map(to_typed_record)

In [40]:
# Column layout for the deportista DataFrame, listed as (name, Spark type)
# pairs in file order. Every field is declared non-nullable.
schema = StructType([
    StructField(column, dtype, False)
    for column, dtype in (
        ("deportista_id", IntegerType()),
        ("nombre", StringType()),
        ("genero", StringType()),
        ("edad", IntegerType()),
        ("altura", IntegerType()),
        ("peso", FloatType()),
        ("equipo_id", IntegerType()),
    )
])


In [41]:
# Build a DataFrame from the RDD of typed tuples, applying the explicit schema
# defined in the previous cell.
deportista_dataframe = sql_context.createDataFrame(deportista_olimpico_rdd, schema)

In [42]:
# Preview the first five athletes to check the column types and values.
deportista_dataframe.show(5)

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|80.0|      199|
|            2|            A Lamusi|     1|  23|   170|60.0|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|     0| 0.0|      273|
|            4|Edgar Lindenau Aabye|     1|  34|     0| 0.0|      278|
|            5|Christine Jacoba ...|     2|  21|   185|82.0|      705|
+-------------+--------------------+------+----+------+----+---------+
only showing top 5 rows



## Reto
- Transformar todos los CSV a dataframes

In [48]:
# Schemas
#
# One explicit StructType per CSV table, keyed by table name. Each table is
# described as an ordered list of (column name, Spark type) pairs, and every
# column is declared non-nullable.
schemas = {
    table: StructType([
        StructField(column, dtype, False) for column, dtype in columns
    ])
    for table, columns in {
        "deporte": [
            ("deporte_id", IntegerType()),
            ("deporte", StringType()),
        ],
        "deportista": [
            ("deportista_id", IntegerType()),
            ("nombre", StringType()),
            ("genero", StringType()),
            ("edad", IntegerType()),
            ("altura", IntegerType()),
            ("peso", FloatType()),
            ("equipo_id", IntegerType()),
        ],
        "evento": [
            ("evento_id", IntegerType()),
            ("evento", StringType()),
            ("deporte_id", IntegerType()),
        ],
        "juegos": [
            ("juego_id", IntegerType()),
            ("nombre_juego", StringType()),
            ("anio", IntegerType()),
            ("temporada", StringType()),
            ("ciudad", StringType()),
        ],
        "paises": [
            ("pais_id", IntegerType()),
            ("equipo", StringType()),
            ("sigla", StringType()),
        ],
        "resultados": [
            ("resultado_id", IntegerType()),
            ("medalla", StringType()),
            ("deportista_id", IntegerType()),
            ("juego_id", IntegerType()),
            ("evento_id", IntegerType()),
        ],
    }.items()
}

In [49]:
# Dataframes

def read_csv_table(name, schema_key=None, header=True):
    """Read ``<data_dir>/<name>.csv`` into a DataFrame with an explicit schema.

    Args:
        name: File name without the ``.csv`` extension.
        schema_key: Key in ``schemas`` to use; defaults to ``name``.
        header: Whether the first line of the file is a header row to skip.

    Returns:
        A lazily evaluated DataFrame over the file.

    Raises:
        KeyError: If no schema is registered under the requested key.
    """
    return (
        sql_context.read
        .schema(schemas[schema_key or name])
        .option("header", "true" if header else "false")
        .csv(f"{data_dir}/{name}.csv")
    )


# Every key of `schemas` is also the name of its CSV file, so one pass over
# the schema registry loads every table, in the same order as the registry.
dataframes = {name: read_csv_table(name) for name in schemas}

# The athletes are split across two files that share one schema; the second
# file is read with header handling disabled. Rows are appended by position.
dataframes["deportista"] = dataframes["deportista"].union(
    read_csv_table("deportista2", schema_key="deportista", header=False)
)

In [53]:
# Preview the first five rows of the sports table.
dataframes['deporte'].show(5)

+----------+-------------+
|deporte_id|      deporte|
+----------+-------------+
|         1|   Basketball|
|         2|         Judo|
|         3|     Football|
|         4|   Tug-Of-War|
|         5|Speed Skating|
+----------+-------------+
only showing top 5 rows



In [54]:
# Preview the first five rows of the combined athletes table
# (deportista.csv followed by deportista2.csv).
dataframes['deportista'].show(5)

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|80.0|      199|
|            2|            A Lamusi|     1|  23|   170|60.0|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|     0| 0.0|      273|
|            4|Edgar Lindenau Aabye|     1|  34|     0| 0.0|      278|
|            5|Christine Jacoba ...|     2|  21|   185|82.0|      705|
+-------------+--------------------+------+----+------+----+---------+
only showing top 5 rows



In [55]:
# Preview the first five rows of the events table.
dataframes['evento'].show(5)

+---------+--------------------+----------+
|evento_id|              evento|deporte_id|
+---------+--------------------+----------+
|        1|Basketball Men's ...|         1|
|        2|Judo Men's Extra-...|         2|
|        3|Football Men's Fo...|         3|
|        4|Tug-Of-War Men's ...|         4|
|        5|Speed Skating Wom...|         5|
+---------+--------------------+----------+
only showing top 5 rows



In [56]:
# Preview the first five rows of the games table. Spark warns here because the
# file's header (blank first column, "annio") does not match the schema names;
# the schema is still applied by column position.
dataframes['juegos'].show(5)

+--------+------------+----+---------+---------+
|juego_id|nombre_juego|anio|temporada|   ciudad|
+--------+------------+----+---------+---------+
|       1| 1896 Verano|1896|   Verano|   Athina|
|       2| 1900 Verano|1900|   Verano|    Paris|
|       3| 1904 Verano|1904|   Verano|St. Louis|
|       4| 1906 Verano|1906|   Verano|   Athina|
|       5| 1908 Verano|1908|   Verano|   London|
+--------+------------+----+---------+---------+
only showing top 5 rows



21/08/21 01:34:53 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , nombre_juego, annio, temporada, ciudad
 Schema: juego_id, nombre_juego, anio, temporada, ciudad
Expected: juego_id but found: 
CSV file: file:///home/jovyan/work/files/juegos.csv


In [57]:
# Preview the first five rows of the countries table. Spark warns here because
# the file's first header is "id" while the schema names that column "pais_id".
dataframes['paises'].show(5)

+-------+--------------------+-----+
|pais_id|              equipo|sigla|
+-------+--------------------+-----+
|      1|         30. Februar|  AUT|
|      2|A North American ...|  MEX|
|      3|           Acipactli|  MEX|
|      4|             Acturus|  ARG|
|      5|         Afghanistan|  AFG|
+-------+--------------------+-----+
only showing top 5 rows



21/08/21 01:35:04 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: id, equipo, sigla
 Schema: pais_id, equipo, sigla
Expected: pais_id but found: id
CSV file: file:///home/jovyan/work/files/paises.csv


In [58]:
# Preview the first five rows of the results table.
dataframes['resultados'].show(5)

+------------+-------+-------------+--------+---------+
|resultado_id|medalla|deportista_id|juego_id|evento_id|
+------------+-------+-------------+--------+---------+
|           1|     NA|            1|      39|        1|
|           2|     NA|            2|      49|        2|
|           3|     NA|            3|       7|        3|
|           4|   Gold|            4|       2|        4|
|           5|     NA|            5|      36|        5|
+------------+-------+-------------+--------+---------+
only showing top 5 rows



In [59]:
# Shut down the SparkContext created at the top of the notebook; DataFrames
# and RDDs defined above cannot be evaluated after this point.
spark.stop()