In [1]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType
from pyspark.sql.types import Row
from pyspark.sql import SQLContext

In [2]:
spark = SparkContext(master="local", appName="DataFrames")
sqlContext = SQLContext(spark)

In [3]:
!ls ./data

data.csv	 deportista.csv       juegos.csv	     resultados.csv
deporte.csv	 deportistaError.csv  modelo_relacional.jpg
deportista2.csv  evento.csv	      paises.csv


In [4]:
juegoSchema = StructType([
    StructField("juego_id", IntegerType(), False),
    StructField("anio", StringType(), False),
    StructField("temporada",StringType(),False),
    StructField("ciudad", StringType(), False)
])

juegoDF = sqlContext.read.schema(juegoSchema)\
            .option("header", "true").csv("./data/juegos.csv")

In [5]:
juegoDF.show(4)

+--------+-----------+---------+------+
|juego_id|       anio|temporada|ciudad|
+--------+-----------+---------+------+
|       1|1896 Verano|     1896|Verano|
|       2|1900 Verano|     1900|Verano|
|       3|1904 Verano|     1904|Verano|
|       4|1906 Verano|     1906|Verano|
+--------+-----------+---------+------+
only showing top 4 rows



In [6]:
spark


In [35]:
deportistaOlimpicoRDD = spark.textFile("./data/deportista.csv") \
    .map(lambda l: l.split(","))
deportistaolimpicoRDD2 = spark.textFile("./data/deportista2.csv") \
    .map(lambda l: l.split(","))
# Concatenacion
deportistaOlimpicoRDD = deportistaOlimpicoRDD.union(deportistaolimpicoRDD2)
deportistaOlimpicoRDD.take(5)

[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id'],
 ['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278']]

In [36]:
def delete_header(index, iterator):
    return iter(list(iterator)[1:])

In [37]:
deportistaOlimpicoRDD = deportistaOlimpicoRDD \
                        .mapPartitionsWithIndex(delete_header)

In [38]:
deportistaOlimpicoRDD.take(5)

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199'],
 ['3', 'Gunnar Nielsen Aaby', '1', '24', '0', '0', '273'],
 ['4', 'Edgar Lindenau Aabye', '1', '34', '0', '0', '278'],
 ['5', 'Christine Jacoba Aaftink', '2', '21', '185', '82', '705']]

In [39]:
deportistaOlimpicoRDD = deportistaOlimpicoRDD.map(lambda l: (
int(l[0]),l[1],int(l[2]),int(l[3]),int(l[4]),float(l[5]),int(l[6])))

In [40]:
schema = StructType([
    StructField("deportista_id", IntegerType(), False),
    StructField("nombre", StringType(), False),
    StructField("genero", IntegerType(), False),
    StructField("edad", IntegerType(), False),
    StructField("altura", IntegerType(), False),
    StructField("peso", FloatType(), False),
    StructField("equipo_id", IntegerType(), False)
])

In [41]:
deportistaOlimpicoDF = sqlContext.createDataFrame(deportistaOlimpicoRDD,schema)

In [42]:
deportistaOlimpicoDF.show(10)

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|80.0|      199|
|            2|            A Lamusi|     1|  23|   170|60.0|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|     0| 0.0|      273|
|            4|Edgar Lindenau Aabye|     1|  34|     0| 0.0|      278|
|            5|Christine Jacoba ...|     2|  21|   185|82.0|      705|
|            6|     Per Knut Aaland|     1|  31|   188|75.0|     1096|
|            7|        John Aalberg|     1|  31|   183|72.0|     1096|
|            8|Cornelia Cor Aalt...|     2|  18|   168| 0.0|      705|
|            9|    Antti Sami Aalto|     1|  26|   186|96.0|      350|
|           10|Einar Ferdinand E...|     1|  26|     0| 0.0|      350|
+-------------+--------------------+------+----+------+----+---------+
only s

In [15]:
!ls ./data

data.csv	 deportista.csv       juegos.csv	     resultados.csv
deporte.csv	 deportistaError.csv  modelo_relacional.jpg
deportista2.csv  evento.csv	      paises.csv


In [16]:
#! cat ./data/evento.csv

In [17]:
schema_evento = StructType([
    StructField("evento_id", IntegerType(), False),
    StructField("evento", StringType(), False),
    StructField("deporte_id", IntegerType(), False),
])

eventoDF = sqlContext.read.format("csv")\
                .option("header", True)\
                .schema(schema_evento)\
                .load("./data/evento.csv")
    
eventoDF.show(5)

+---------+--------------------+----------+
|evento_id|              evento|deporte_id|
+---------+--------------------+----------+
|        1|Basketball Men's ...|         1|
|        2|Judo Men's Extra-...|         2|
|        3|Football Men's Fo...|         3|
|        4|Tug-Of-War Men's ...|         4|
|        5|Speed Skating Wom...|         5|
+---------+--------------------+----------+
only showing top 5 rows



In [18]:
#! cat ./data/paises.csv

In [19]:
schema_paises = StructType([
    StructField("id", IntegerType(), False),
    StructField("equipo", StringType(), False),
    StructField("sigla", StringType(), False),
])

paisesDF = sqlContext.read.format("csv")\
                .option("header", True)\
                .schema(schema_paises)\
                .load("./data/paises.csv")
    
paisesDF.show(5)

+---+--------------------+-----+
| id|              equipo|sigla|
+---+--------------------+-----+
|  1|         30. Februar|  AUT|
|  2|A North American ...|  MEX|
|  3|           Acipactli|  MEX|
|  4|             Acturus|  ARG|
|  5|         Afghanistan|  AFG|
+---+--------------------+-----+
only showing top 5 rows



In [20]:
# ! cat ./data/resultados.csv

In [21]:
schema_resultados = StructType([
    StructField("resultado_id", IntegerType(), False),
    StructField("medalla", StringType(), False),
    StructField("deportista_id", IntegerType(), False),
    StructField("juego_id", IntegerType(), False),
    StructField("evento_id", IntegerType(), False),
])

resultadosDF = sqlContext.read.format("csv")\
                .option("header", True)\
                .schema(schema_resultados)\
                .load("./data/resultados.csv")
    
resultadosDF.show(5)

+------------+-------+-------------+--------+---------+
|resultado_id|medalla|deportista_id|juego_id|evento_id|
+------------+-------+-------------+--------+---------+
|           1|     NA|            1|      39|        1|
|           2|     NA|            2|      49|        2|
|           3|     NA|            3|       7|        3|
|           4|   Gold|            4|       2|        4|
|           5|     NA|            5|      36|        5|
+------------+-------+-------------+--------+---------+
only showing top 5 rows



In [22]:
schema_deporte = StructType([
    StructField("id", IntegerType(), False),
    StructField("deporte", StringType(), False),
])

deportesDF = sqlContext.read.format("csv")\
                .option("header", True)\
                .schema(schema_deporte)\
                .load("./data/deporte.csv")
    
deportesDF.show(5)

+---+-------------+
| id|      deporte|
+---+-------------+
|  1|   Basketball|
|  2|         Judo|
|  3|     Football|
|  4|   Tug-Of-War|
|  5|Speed Skating|
+---+-------------+
only showing top 5 rows



In [23]:
deportesDF.printSchema()

root
 |-- id: integer (nullable = true)
 |-- deporte: string (nullable = true)



In [24]:
deportistaOlimpicoDF.printSchema()

root
 |-- deportista_id: integer (nullable = false)
 |-- nombre: string (nullable = false)
 |-- genero: integer (nullable = false)
 |-- edad: integer (nullable = false)
 |-- altura: integer (nullable = false)
 |-- peso: float (nullable = false)
 |-- equipo_id: integer (nullable = false)



In [25]:
deportistaOlimpicoDF = deportistaOlimpicoDF.withColumnRenamed("genero","sexo").drop("altura")
deportistaOlimpicoDF.printSchema()


root
 |-- deportista_id: integer (nullable = false)
 |-- nombre: string (nullable = false)
 |-- sexo: integer (nullable = false)
 |-- edad: integer (nullable = false)
 |-- peso: float (nullable = false)
 |-- equipo_id: integer (nullable = false)



In [26]:
from pyspark.sql.functions import *

deportistaOlimpicoDF = deportistaOlimpicoDF.select("deportista_id", 
                            "nombre", 
                            col("edad").alias("edadAlJugar"),"equipo_id")

In [27]:
deportistaOlimpicoDF.show(5)

+-------------+--------------------+-----------+---------+
|deportista_id|              nombre|edadAlJugar|equipo_id|
+-------------+--------------------+-----------+---------+
|            1|           A Dijiang|         24|      199|
|            2|            A Lamusi|         23|      199|
|            3| Gunnar Nielsen Aaby|         24|      273|
|            4|Edgar Lindenau Aabye|         34|      278|
|            5|Christine Jacoba ...|         21|      705|
+-------------+--------------------+-----------+---------+
only showing top 5 rows



In [28]:
deportistaOlimpicoDF.filter(
    deportistaOlimpicoDF.edadAlJugar != 0).sort("edadAlJugar").show()

+-------------+--------------------+-----------+---------+
|deportista_id|              nombre|edadAlJugar|equipo_id|
+-------------+--------------------+-----------+---------+
|        71691|  Dimitrios Loundras|         10|      333|
|        70616|          Liu Luyang|         11|      199|
|       118925|Megan Olwen Deven...|         11|      413|
|        52070|        Etsuko Inada|         11|      514|
|        22411|Magdalena Cecilia...|         11|      413|
|        40129|    Luigina Giavotti|         11|      507|
|        47618|Sonja Henie Toppi...|         11|      742|
|        76675|   Marcelle Matthews|         11|      967|
|        37333|Carlos Bienvenido...|         11|      982|
|        51268|      Beatrice Hutiu|         11|      861|
|       126307|        Liana Vicens|         11|      825|
|        48939|             Ho Gang|         12|      738|
|        49142|        Jan Hoffmann|         12|      302|
|        42835|   Werner Grieshofer|         12|       7

In [29]:
deportistaOlimpicoDF.printSchema()

root
 |-- deportista_id: integer (nullable = false)
 |-- nombre: string (nullable = false)
 |-- edadAlJugar: integer (nullable = false)
 |-- equipo_id: integer (nullable = false)



In [30]:
juegoDF.printSchema()

root
 |-- juego_id: integer (nullable = true)
 |-- anio: string (nullable = true)
 |-- temporada: string (nullable = true)
 |-- ciudad: string (nullable = true)



In [31]:
resultadosDF.printSchema()

root
 |-- resultado_id: integer (nullable = true)
 |-- medalla: string (nullable = true)
 |-- deportista_id: integer (nullable = true)
 |-- juego_id: integer (nullable = true)
 |-- evento_id: integer (nullable = true)



In [32]:
deportesDF.printSchema()

root
 |-- id: integer (nullable = true)
 |-- deporte: string (nullable = true)



In [33]:
eventoDF.printSchema()

root
 |-- evento_id: integer (nullable = true)
 |-- evento: string (nullable = true)
 |-- deporte_id: integer (nullable = true)



In [43]:
deportistaOlimpicoDF.join(resultadosDF,
    deportistaOlimpicoDF.deportista_id == resultadosDF.deportista_id,"left") \
    .join(juegoDF,juegoDF.juego_id == resultadosDF.juego_id,"left") \
    .join(deportesDF, deportesDF.id == resultadosDF.evento_id,"left") \
    .select(deportistaOlimpicoDF.nombre,col("edad").alias("Edad al jugar"),
             "medalla",col("anio").alias("Año del juego"),
             deportesDF.deporte.alias("Diciplina")).show()

+--------------------+-------------+-------+-------------+--------------------+
|              nombre|Edad al jugar|medalla|Año del juego|           Diciplina|
+--------------------+-------------+-------+-------------+--------------------+
|           A Dijiang|           24|     NA|  1992 Verano|          Basketball|
|            A Lamusi|           23|     NA|  2012 Verano|                Judo|
| Gunnar Nielsen Aaby|           24|     NA|  1920 Verano|            Football|
|Edgar Lindenau Aabye|           34|   Gold|  1900 Verano|          Tug-Of-War|
|Christine Jacoba ...|           21|     NA|1994 Invierno|Cross Country Skiing|
|Christine Jacoba ...|           21|     NA|1994 Invierno|       Speed Skating|
|Christine Jacoba ...|           21|     NA|1992 Invierno|Cross Country Skiing|
|Christine Jacoba ...|           21|     NA|1992 Invierno|       Speed Skating|
|Christine Jacoba ...|           21|     NA|1988 Invierno|Cross Country Skiing|
|Christine Jacoba ...|           21|    

In [44]:
resultadosDF.filter(resultadosDF.medalla != "NA") \
.join(deportistaOlimpicoDF, deportistaOlimpicoDF.deportista_id == resultadosDF.deportista_id, "left") \
.join(paisesDF, paisesDF.id == deportistaOlimpicoDF.equipo_id, "left") \
.select(resultadosDF.medalla, paisesDF.equipo,paisesDF.sigla) \
.sort(col("sigla").desc()).show()

+-------+--------+-----+
|medalla|  equipo|sigla|
+-------+--------+-----+
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
| Bronze|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
+-------+--------+-----+
only showing top 20 rows



In [45]:
medallistaXAnio = deportistaOlimpicoDF \
.join(resultadosDF, deportistaOlimpicoDF.deportista_id == resultadosDF.deportista_id) \
.join(juegoDF,juegoDF.juego_id == resultadosDF.juego_id,"left") \
.join(paisesDF, paisesDF.id == deportistaOlimpicoDF.equipo_id, "left") \
.join(deportesDF, deportesDF.id == resultadosDF.evento_id,"left")\
.join(eventoDF, eventoDF.deporte_id == deportesDF.id,"left")\
.select("sigla", 
        "anio", 
        "medalla", 
        eventoDF.evento.alias("Nombre subdisciplina"), 
        deportesDF.deporte.alias("Nombre diciplina"),
        deportistaOlimpicoDF.nombre)

In [46]:
medallistaXAnio.show()

+-----+-----------+-------+--------------------+----------------+--------------------+
|sigla|       anio|medalla|Nombre subdisciplina|Nombre diciplina|              nombre|
+-----+-----------+-------+--------------------+----------------+--------------------+
|  CHN|1992 Verano|     NA|Basketball Women'...|      Basketball|           A Dijiang|
|  CHN|1992 Verano|     NA|Basketball Men's ...|      Basketball|           A Dijiang|
|  CHN|2012 Verano|     NA|Judo Women's Ligh...|            Judo|            A Lamusi|
|  CHN|2012 Verano|     NA|Judo Women's Midd...|            Judo|            A Lamusi|
|  CHN|2012 Verano|     NA|Judo Women's Extr...|            Judo|            A Lamusi|
|  CHN|2012 Verano|     NA|Judo Women's Heav...|            Judo|            A Lamusi|
|  CHN|2012 Verano|     NA|Judo Women's Half...|            Judo|            A Lamusi|
|  CHN|2012 Verano|     NA|Judo Men's Half-H...|            Judo|            A Lamusi|
|  CHN|2012 Verano|     NA|Judo Men's Half-

In [47]:
medallistaXAnio2 = medallistaXAnio.filter(medallistaXAnio.medalla != "NA")\
    .sort("anio")\
    .groupBy("sigla","anio","Nombre subdisciplina")\
    .count()

In [48]:
medallistaXAnio2.printSchema()

root
 |-- sigla: string (nullable = true)
 |-- anio: string (nullable = true)
 |-- Nombre subdisciplina: string (nullable = true)
 |-- count: long (nullable = false)



In [49]:
medallistaXAnio2.groupBy("sigla","anio")\
    .agg(sum("count").alias("Total de medallas"), avg("count").alias("Medallas promedio")).show()

+-----+-----------+-----------------+------------------+
|sigla|       anio|Total de medallas| Medallas promedio|
+-----+-----------+-----------------+------------------+
|  USA|1896 Verano|               69|1.3529411764705883|
|  FRA|1896 Verano|               12|               3.0|
|  GRE|1896 Verano|              164|2.6031746031746033|
|  AUS|1896 Verano|                4|1.3333333333333333|
|  GER|1896 Verano|              268|1.8741258741258742|
|  SUI|1896 Verano|               44|               1.0|
|  SWE|1900 Verano|                6|               3.0|
|  FRA|1900 Verano|              282|20.142857142857142|
|  NOR|1900 Verano|               10|3.3333333333333335|
|  USA|1900 Verano|              199|3.9019607843137254|
|  GBR|1900 Verano|              108|              12.0|
|  BEL|1900 Verano|               66| 9.428571428571429|
|  CUB|1900 Verano|                2|               1.0|
|  HUN|1900 Verano|                6|               2.0|
|  DEN|1900 Verano|            

In [50]:
resultadosDF.registerTempTable("resultado")
deportistaOlimpicoDF.registerTempTable("deportista")
paisesDF.registerTempTable("paises")

In [51]:
sqlContext.sql("SELECT * FROM deportista").show(5)

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|80.0|      199|
|            2|            A Lamusi|     1|  23|   170|60.0|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|     0| 0.0|      273|
|            4|Edgar Lindenau Aabye|     1|  34|     0| 0.0|      278|
|            5|Christine Jacoba ...|     2|  21|   185|82.0|      705|
+-------------+--------------------+------+----+------+----+---------+
only showing top 5 rows



In [52]:
sql_query = """
SELECT medalla, equipo, sigla FROM resultado r
JOIN deportista d
ON r.deportista_id = d.deportista_id
JOIN paises p
ON p.id = d.equipo_id
WHERE medalla <> "NA"
ORDER BY sigla DESC
"""

sqlContext.sql(sql_query).show()

+-------+--------+-----+
|medalla|  equipo|sigla|
+-------+--------+-----+
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
| Bronze|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
| Silver|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
|   Gold|Zimbabwe|  ZIM|
+-------+--------+-----+
only showing top 20 rows



In [53]:
spark

In [54]:
! head -n 5 ./data/deportistaError.csv

deportista_id,nombre,genero,edad,altura,peso,equipo_id
1,A Dijiang,1,24,180,80,199
2,A Lamusi,1,23,170,60,199
3,Gunnar Nielsen Aaby,1,24,,,273
4,Edgar Lindenau Aabye,1,34,,,278


In [55]:
def delete_header(index, iterator):
    return iter(list(iterator)[1:])

In [56]:
deportistaError = spark.textFile("./data/deportistaError.csv")\
    .map(lambda l: l.split(","))
deportistaError.take(2)

[['deportista_id', 'nombre', 'genero', 'edad', 'altura', 'peso', 'equipo_id'],
 ['1', 'A Dijiang', '1', '24', '180', '80', '199']]

In [57]:
deportistaError = deportistaError.mapPartitionsWithIndex(delete_header)

In [58]:
deportistaError.take(2)

[['1', 'A Dijiang', '1', '24', '180', '80', '199'],
 ['2', 'A Lamusi', '1', '23', '170', '60', '199']]

In [59]:
deportistaOlimpicoRDD =  spark.textFile("./data/deportistaError.csv").map(lambda line : line.split(","))
deportistaOlimpicoRDD=deportistaOlimpicoRDD.mapPartitionsWithIndex(delete_header)

deportistaOlimpicoRDD = deportistaOlimpicoRDD.map(lambda l : (
l[0],
l[1],
l[2],
l[3],
l[4],
l[5],
l[6]
))

schema = StructType([
StructField("deportista_id",StringType(),False)     ,
StructField("nombre",StringType(),False)        ,
StructField("genero",StringType(),False)        ,
StructField("edad",StringType(),True)      ,
StructField("altura",StringType(),True)        ,
StructField("peso",StringType(),True)      ,
StructField("equipo_id",StringType(),True)     
])

deportistaErrorDF = sqlContext.createDataFrame(deportistaOlimpicoRDD,schema)

In [60]:
deportistaErrorDF.show()

+-------------+--------------------+------+----+------+----+---------+
|deportista_id|              nombre|genero|edad|altura|peso|equipo_id|
+-------------+--------------------+------+----+------+----+---------+
|            1|           A Dijiang|     1|  24|   180|  80|      199|
|            2|            A Lamusi|     1|  23|   170|  60|      199|
|            3| Gunnar Nielsen Aaby|     1|  24|      |    |      273|
|            4|Edgar Lindenau Aabye|     1|  34|      |    |      278|
|            5|Christine Jacoba ...|     2|  21|   185|  82|      705|
|            6|     Per Knut Aaland|     1|  31|   188|  75|     1096|
|            7|        John Aalberg|     1|  31|   183|  72|     1096|
|            8|"Cornelia ""Cor""...|     2|  18|   168|    |      705|
|            9|    Antti Sami Aalto|     1|  26|   186|  96|      350|
|           10|"Einar Ferdinand ...|     1|  26|      |    |      350|
|           11|  Jorma Ilmari Aalto|     1|  22|   182|76.5|      350|
|     

In [61]:
def conversionEnteros(valor):
    try:
        return int(valor) if len(valor) > 0 else None
    except ValueError as e:
        return None
    
conversionEnteros_udf = udf(lambda z: conversionEnteros(z), IntegerType())
sqlContext.udf.register("conversionEnteros_udf", conversionEnteros_udf)


<function __main__.<lambda>(z)>

In [62]:
def ci(value: str) -> int:
    return int(value) if len(value) > 0 else None

ci_udf = udf(lambda z : ci(z), IntegerType())

sqlContext.udf.register("ci_udf", ci_udf)

deportistaErrorDF.select(ci_udf("altura").alias("altura")).show()

+------+
|altura|
+------+
|   180|
|   170|
|  null|
|  null|
|   185|
|   188|
|   183|
|   168|
|   186|
|  null|
|   182|
|   172|
|   159|
|   171|
|  null|
|   184|
|   175|
|   189|
|  null|
|   176|
+------+
only showing top 20 rows



In [63]:
# int("hola")

In [64]:
deportistaErrorDF.select(conversionEnteros_udf("altura").alias("alturaUDF")).show()

+---------+
|alturaUDF|
+---------+
|      180|
|      170|
|     null|
|     null|
|      185|
|      188|
|      183|
|      168|
|      186|
|     null|
|      182|
|      172|
|      159|
|      171|
|     null|
|      184|
|      175|
|      189|
|     null|
|      176|
+---------+
only showing top 20 rows



In [65]:
from pyspark.storagelevel import StorageLevel

In [66]:
medallistaXAnio.show(3)

+-----+-----------+-------+--------------------+----------------+---------+
|sigla|       anio|medalla|Nombre subdisciplina|Nombre diciplina|   nombre|
+-----+-----------+-------+--------------------+----------------+---------+
|  CHN|1992 Verano|     NA|Basketball Women'...|      Basketball|A Dijiang|
|  CHN|1992 Verano|     NA|Basketball Men's ...|      Basketball|A Dijiang|
|  CHN|2012 Verano|     NA|Judo Women's Ligh...|            Judo| A Lamusi|
+-----+-----------+-------+--------------------+----------------+---------+
only showing top 3 rows



In [67]:
medallistaXAnio.is_cached

False

In [70]:
medallistaXAnio.rdd.cache()

MapPartitionsRDD[215] at javaToPython at NativeMethodAccessorImpl.java:0

In [71]:
medallistaXAnio.is_cached

False

In [72]:
medallistaXAnio.rdd.getStorageLevel()

StorageLevel(False, True, False, False, 1)

In [73]:
medallistaXAnio.rdd.unpersist()

MapPartitionsRDD[215] at javaToPython at NativeMethodAccessorImpl.java:0

In [74]:
medallistaXAnio.rdd.persist(StorageLevel.MEMORY_AND_DISK_2)

MapPartitionsRDD[215] at javaToPython at NativeMethodAccessorImpl.java:0

In [75]:
medallistaXAnio.rdd.getStorageLevel()

StorageLevel(True, True, False, False, 2)

In [77]:
StorageLevel.MEMORY_AND_DISK_3 = StorageLevel(True, True, False,False,3)

In [78]:
medallistaXAnio.rdd.unpersist()
medallistaXAnio.rdd.persist(StorageLevel.MEMORY_AND_DISK_3)

MapPartitionsRDD[215] at javaToPython at NativeMethodAccessorImpl.java:0

In [79]:
medallistaXAnio.rdd.getStorageLevel()

StorageLevel(True, True, False, False, 3)

In [80]:
spark.stop()