In [11]:
%%html
<style>
/* Disable line wrapping for text output so wide printed tables stay aligned.
   The two selectors cover two different output-area class layouts. */
.output_subarea.output_text.output_stream.output_stdout > pre,
.p-Widget.jp-RenderedText.jp-OutputArea-output > pre {
   white-space: pre;
}
</style>

In [1]:
import os

# Global data variables
# Root folder of the course datasets. Set the COURSE_DATA_PATH environment
# variable to run the notebook on another machine; the original local folder
# stays the default so existing setups keep working.
# os.path.join(..., "") guarantees the trailing separator that the
# `DATA_PATH + 'movies/...'` concatenations in later cells rely on.
DATA_PATH = os.path.join(
    os.environ.get("COURSE_DATA_PATH") or "/Users/luis/Documents/Work/Telefonica/Courses/DATA/",
    "",
)

In [2]:
from pyspark.sql import functions as F



# Creación o modificación de columnas

En Spark el método principal para la creación o modificación de columnas es `withColumn` (también se pueden crear columnas con `select`/`selectExpr`, como se muestra más abajo). Este método es de nuevo una transformación y toma dos parámetros: el nombre de la columna a crear (o sobrescribir) y la operación que crea la nueva columna.

Para una ejecución más óptima se recomienda utilizar únicamente las funciones de PySpark cuando se define la operación, pero como se detallará más adelante se pueden utilizar funciones propias. 

In [3]:
# Read both comma-separated files, using the first row as header and letting
# Spark infer the column types.
movies_df = spark.read.csv(
    path=DATA_PATH + 'movies/movies.csv',
    sep=',',
    header=True,
    inferSchema=True,
)
ratings_df = spark.read.csv(
    path=DATA_PATH + 'movies/ratings.csv',
    sep=',',
    header=True,
    inferSchema=True,
)

In [4]:
# Attach the movie metadata to every rating through the shared movieId key.
ratings_movies_df = ratings_df.join(
    other=movies_df,
    on='movieId',
    how='inner',
)

In [5]:
# Mark the joined DataFrame for caching; it is reused by most of the cells below.
ratings_movies_df = ratings_movies_df.cache()

In [5]:
# Preview the first rows of the joined data.
ratings_movies_df.show(n=5)

+-------+------+------+----------+--------------------+----------------+
|movieId|userId|rating| timestamp|               title|          genres|
+-------+------+------+----------+--------------------+----------------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|     Crime|Drama|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|           Drama|
+-------+------+------+----------+--------------------+----------------+
only showing top 5 rows





## Funciones de Spark



__valor fijo__

El ejemplo más sencillo es crear una columna con un valor fijo, en este caso, columna `now` con valor '2019/01/21 14:08', y columna `rating2` con valor 4.0.

Hint: `withColumn`

In [6]:
# Constant string column: F.lit turns the Python value into a Column.
ratings_movies_df = ratings_movies_df.withColumn(
    colName='now',
    col=F.lit('2019/01/21 14:08'),
)

In [8]:
# Check that the new `now` column is present.
ratings_movies_df.show(n=3)

+-------+------+------+----------+--------------------+----------------+----------------+
|movieId|userId|rating| timestamp|               title|          genres|             now|
+-------+------+------+----------+--------------------+----------------+----------------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|
+-------+------+------+----------+--------------------+----------------+----------------+
only showing top 3 rows



In [7]:
# Constant numeric column, used later to demonstrate column arithmetic.
ratings_movies_df = ratings_movies_df.withColumn(
    colName='rating2',
    col=F.lit(4.0),
)

In [10]:
# Check that the new `rating2` column is present.
ratings_movies_df.show(n=3)

+-------+------+------+----------+--------------------+----------------+----------------+-------+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|
+-------+------+------+----------+--------------------+----------------+----------------+-------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|
+-------+------+------+----------+--------------------+----------------+----------------+-------+
only showing top 3 rows





__duplicar columna__

In [12]:
# Duplicate a column by adding a new one that references the original.
(ratings_movies_df
 .withColumn('title2', F.col('title'))
 .select('title', 'title2')
 .show(10))

+--------------------+--------------------+
|               title|              title2|
+--------------------+--------------------+
|   Braveheart (1995)|   Braveheart (1995)|
|Basketball Diarie...|Basketball Diarie...|
|Godfather, The (1...|Godfather, The (1...|
|Godfather: Part I...|Godfather: Part I...|
|Dead Poets Societ...|Dead Poets Societ...|
|Breakfast Club, T...|Breakfast Club, T...|
|Sixth Sense, The ...|Sixth Sense, The ...|
|Ferris Bueller's ...|Ferris Bueller's ...|
|   Fight Club (1999)|   Fight Club (1999)|
|      Memento (2000)|      Memento (2000)|
+--------------------+--------------------+
only showing top 10 rows



In [13]:
# Same duplication in a single select: the copy is the original column under an alias.
(ratings_movies_df
 .select(F.col('title'), F.col('title').alias('title2'))
 .show())

+--------------------+--------------------+
|               title|              title2|
+--------------------+--------------------+
|   Braveheart (1995)|   Braveheart (1995)|
|Basketball Diarie...|Basketball Diarie...|
|Godfather, The (1...|Godfather, The (1...|
|Godfather: Part I...|Godfather: Part I...|
|Dead Poets Societ...|Dead Poets Societ...|
|Breakfast Club, T...|Breakfast Club, T...|
|Sixth Sense, The ...|Sixth Sense, The ...|
|Ferris Bueller's ...|Ferris Bueller's ...|
|   Fight Club (1999)|   Fight Club (1999)|
|      Memento (2000)|      Memento (2000)|
| Donnie Darko (2001)| Donnie Darko (2001)|
|Igby Goes Down (2...|Igby Goes Down (2...|
|Batman Begins (2005)|Batman Begins (2005)|
|     Superbad (2007)|     Superbad (2007)|
|Dark Knight, The ...|Dark Knight, The ...|
|     Iron Man (2008)|     Iron Man (2008)|
|    Star Trek (2009)|    Star Trek (2009)|
|Harry Potter and ...|Harry Potter and ...|
|Sherlock Holmes (...|Sherlock Holmes (...|
|Harry Potter and ...|Harry Pott



__operaciones aritméticas__

In [14]:
# Rescale the rating by multiplying the column by a scalar.
(ratings_movies_df
 .withColumn('rating_10', F.col('rating') * 2)
 .select('rating', 'rating_10')
 .show(10))

+------+---------+
|rating|rating_10|
+------+---------+
|   1.0|      2.0|
|   4.5|      9.0|
|   5.0|     10.0|
|   5.0|     10.0|
|   5.0|     10.0|
|   4.0|      8.0|
|   4.5|      9.0|
|   5.0|     10.0|
|   4.0|      8.0|
|   4.0|      8.0|
+------+---------+
only showing top 10 rows



In [15]:
# Arithmetic between two columns: row-wise mean of rating and rating2.
(ratings_movies_df
 .withColumn('rating_avg', (F.col('rating') + F.col('rating2')) / 2)
 .select('rating', 'rating2', 'rating_avg')
 .show(10))

+------+-------+----------+
|rating|rating2|rating_avg|
+------+-------+----------+
|   1.0|    4.0|       2.5|
|   4.5|    4.0|      4.25|
|   5.0|    4.0|       4.5|
|   5.0|    4.0|       4.5|
|   5.0|    4.0|       4.5|
|   4.0|    4.0|       4.0|
|   4.5|    4.0|      4.25|
|   5.0|    4.0|       4.5|
|   4.0|    4.0|       4.0|
|   4.0|    4.0|       4.0|
+------+-------+----------+
only showing top 10 rows



In [16]:
# Same row-wise mean, written as a SQL expression string.
(ratings_movies_df
 .selectExpr('rating', 'rating2', '(rating + rating2)/2 as mean_rating')
 .show())

+------+-------+-----------+
|rating|rating2|mean_rating|
+------+-------+-----------+
|   1.0|    4.0|        2.5|
|   4.5|    4.0|       4.25|
|   5.0|    4.0|        4.5|
|   5.0|    4.0|        4.5|
|   5.0|    4.0|        4.5|
|   4.0|    4.0|        4.0|
|   4.5|    4.0|       4.25|
|   5.0|    4.0|        4.5|
|   4.0|    4.0|        4.0|
|   4.0|    4.0|        4.0|
|   5.0|    4.0|        4.5|
|   5.0|    4.0|        4.5|
|   4.0|    4.0|        4.0|
|   3.5|    4.0|       3.75|
|   4.0|    4.0|        4.0|
|   5.0|    4.0|        4.5|
|   5.0|    4.0|        4.5|
|   5.0|    4.0|        4.5|
|   5.0|    4.0|        4.5|
|   5.0|    4.0|        4.5|
+------+-------+-----------+
only showing top 20 rows



 

__if/else__

Crea la columna `kind_rating`, que sea 'high' en caso de que rating sea mayor o igual que 4, y 'low' en caso contrario.

In [17]:
# Two-way conditional column: 'high' for ratings of 4 or more, 'low' otherwise.
(ratings_movies_df
 .withColumn('kind_rating',
             F.when(F.col('rating') >= 4, 'high').otherwise('low'))
 .show(10))

+-------+------+------+----------+--------------------+--------------------+----------------+-------+-----------+
|movieId|userId|rating| timestamp|               title|              genres|             now|rating2|kind_rating|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+-----------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|    Action|Drama|War|2019/01/21 14:08|    4.0|        low|
|    147|     1|   4.5|1425942435|Basketball Diarie...|               Drama|2019/01/21 14:08|    4.0|       high|
|    858|     1|   5.0|1425941523|Godfather, The (1...|         Crime|Drama|2019/01/21 14:08|    4.0|       high|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|         Crime|Drama|2019/01/21 14:08|    4.0|       high|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|               Drama|2019/01/21 14:08|    4.0|       high|
|   1968|     1|   4.0|1425942148|Breakfast Club, T...|        Comedy|Drama|2019/01/21 1

In [10]:
quants = list(movie_rates.select(F.expr('percentile_approx(mean_rating, array(.25, .5, .75))')).first())[0]
quants

[3.01, 3.38, 3.67]

In [17]:
movie_rates.withColumn('quality', (F.when(F.col('mean_rating') < quants[0], 'bad')
                                  .when(F.col('mean_rating') < quants[1], 'regular')
                                  .when(F.col('mean_rating') < quants[2], 'good')
                                  .otherwise('very good'))).show()

+-------+-----------+-----------+---------+
|movieId|mean_rating|total_rates|  quality|
+-------+-----------+-----------+---------+
| 159817|       4.48|        754|very good|
|    318|       4.43|      91082|very good|
| 170705|       4.39|        284|very good|
|    858|       4.34|      57070|very good|
|     50|        4.3|      59271|very good|
|    527|       4.27|      67662|very good|
| 100044|       4.27|        197|very good|
|   2019|       4.26|      13994|very good|
|   1221|       4.26|      36679|very good|
|   1203|       4.23|      16896|very good|
|   2959|       4.23|      60024|very good|
|    904|       4.23|      21335|very good|
| 166024|       4.23|        183|very good|
|   1193|       4.23|      40103|very good|
| 163809|       4.22|        157|very good|
| 142115|       4.22|        273|very good|
|    912|       4.21|      30043|very good|
|   1212|       4.21|       7676|very good|
|    908|       4.21|      19013|very good|
|    750|       4.21|      28280

In [9]:
movie_rates = (ratings_movies_df.groupBy('movieId')
 .agg(F.round(F.avg(F.col('rating')), 2).alias('mean_rating'), 
      F.count('*').alias('total_rates'))
 .filter(F.col('total_rates') > 100)
 .orderBy(F.col('mean_rating').desc())
)

movie_rates.show(5)

+-------+-----------+-----------+
|movieId|mean_rating|total_rates|
+-------+-----------+-----------+
| 159817|       4.48|        754|
|    318|       4.43|      91082|
| 170705|       4.39|        284|
|    858|       4.34|      57070|
|     50|        4.3|      59271|
+-------+-----------+-----------+
only showing top 5 rows



In [28]:
# 1.- Clasificar movies dependiendo mean_rating (q1, q2, q3)
# 2.- Obtener distribucion de generos por grupo (opcional)



Se pueden concatenar múltiples sentencias _when_. Esta vez, sobrescribe la columna `kind_rating` para crear un nivel intermedio, donde si es mayor o igual que 2 y menor que 4, `kind_rating` sea 'med'.

In [None]:
# Three-way conditional column: the when() clauses are chained, and
# otherwise() supplies the value for rows matched by none of them.
(ratings_movies_df
 .withColumn('kind_rating',
             F.when(F.col('rating') >= 4, 'high')
              .when(F.col('rating') >= 2, 'med')
              .otherwise('low'))
 .show(20))



__operaciones con strings__

Pon en mayúsculas todos los títulos de las películas

In [18]:
# Upper-case copy of the title, stored in a new column.
(ratings_movies_df
 .withColumn('title upper', F.upper(F.col('title')))
 .show(3))

+-------+------+------+----------+--------------------+----------------+----------------+-------+--------------------+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|         title upper|
+-------+------+------+----------+--------------------+----------------+----------------+-------+--------------------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|   BRAVEHEART (1995)|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|BASKETBALL DIARIE...|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|GODFATHER, THE (1...|
+-------+------+------+----------+--------------------+----------------+----------------+-------+--------------------+
only showing top 3 rows





Extrae los 10 primeros caracteres de la columna `title`

In [19]:
# First 10 characters of the title. Spark's substring is 1-based, so the
# first character is position 1 (position 0 only works as a lenient alias).
(ratings_movies_df
 .withColumn('short_title', F.substring(F.col('title'), 1, 10))
 .select('title', 'short_title')
 .show(10, False))

+-------------------------------+-----------+
|title                          |short_title|
+-------------------------------+-----------+
|Braveheart (1995)              |Braveheart |
|Basketball Diaries, The (1995) |Basketball |
|Godfather, The (1972)          |Godfather, |
|Godfather: Part II, The (1974) |Godfather: |
|Dead Poets Society (1989)      |Dead Poets |
|Breakfast Club, The (1985)     |Breakfast  |
|Sixth Sense, The (1999)        |Sixth Sens |
|Ferris Bueller's Day Off (1986)|Ferris Bue |
|Fight Club (1999)              |Fight Club |
|Memento (2000)                 |Memento (2 |
+-------------------------------+-----------+
only showing top 10 rows





Separa los diferentes géneros de la columna `genres` para obtener una lista, usando el separador '|'

In [20]:
# Split the pipe-delimited genres into an array. The separator is a regex, so
# the pipe is escaped; the raw string avoids Python's invalid-escape warning.
(ratings_movies_df
 .withColumn('genres', F.split(F.col('genres'), r'\|'))
 .show(4))

+-------+------+------+----------+--------------------+--------------------+----------------+-------+
|movieId|userId|rating| timestamp|               title|              genres|             now|rating2|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|[Action, Drama, War]|2019/01/21 14:08|    4.0|
|    147|     1|   4.5|1425942435|Basketball Diarie...|             [Drama]|2019/01/21 14:08|    4.0|
|    858|     1|   5.0|1425941523|Godfather, The (1...|      [Crime, Drama]|2019/01/21 14:08|    4.0|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|      [Crime, Drama]|2019/01/21 14:08|    4.0|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+
only showing top 4 rows



In [22]:
%%time 

(ratings_movies_df
 .withColumn('genres', F.split(F.col('genres'), '\|'))
 .filter(F.expr('array_contains(genres, "Horror")'))
).show()

+-------+------+------+----------+--------------------+--------------------+----------------+-------+
|movieId|userId|rating| timestamp|               title|              genres|             now|rating2|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+
|   2762|     1|   4.5|1425941300|Sixth Sense, The ...|[Drama, Horror, M...|2019/01/21 14:08|    4.0|
|   1644|     4|   3.0|1042674845|I Know What You D...|[Horror, Mystery,...|2019/01/21 14:08|    4.0|
|   2338|     4|   2.0|1042674886|I Still Know What...|[Horror, Mystery,...|2019/01/21 14:08|    4.0|
|   2710|     4|   2.0|1042668544|Blair Witch Proje...|[Drama, Horror, T...|2019/01/21 14:08|    4.0|
|   2762|     4|   5.0|1042672546|Sixth Sense, The ...|[Drama, Horror, M...|2019/01/21 14:08|    4.0|
|   3476|     4|   4.0|1042674436|Jacob's Ladder (1...|   [Horror, Mystery]|2019/01/21 14:08|    4.0|
|   3798|     4|   4.0|1042672580|What Lies Beneath...|[Drama, Horror, M...|2019/0

In [23]:
%%time

ratings_movies_df.filter(F.col('genres').like('%Horror%')).show()

+-------+------+------+----------+--------------------+--------------------+----------------+-------+
|movieId|userId|rating| timestamp|               title|              genres|             now|rating2|
+-------+------+------+----------+--------------------+--------------------+----------------+-------+
|   2762|     1|   4.5|1425941300|Sixth Sense, The ...|Drama|Horror|Mystery|2019/01/21 14:08|    4.0|
|   1644|     4|   3.0|1042674845|I Know What You D...|Horror|Mystery|Th...|2019/01/21 14:08|    4.0|
|   2338|     4|   2.0|1042674886|I Still Know What...|Horror|Mystery|Th...|2019/01/21 14:08|    4.0|
|   2710|     4|   2.0|1042668544|Blair Witch Proje...|Drama|Horror|Thri...|2019/01/21 14:08|    4.0|
|   2762|     4|   5.0|1042672546|Sixth Sense, The ...|Drama|Horror|Mystery|2019/01/21 14:08|    4.0|
|   3476|     4|   4.0|1042674436|Jacob's Ladder (1...|      Horror|Mystery|2019/01/21 14:08|    4.0|
|   3798|     4|   4.0|1042672580|What Lies Beneath...|Drama|Horror|Mystery|2019/0



Crea una nueva columna `1st_genre` seleccionando el primer elemento de la lista del código anterior

In [24]:
# First genre of each movie: element 0 of the array produced by the split.
# Raw string keeps the regex escape '\|' free of Python escape warnings.
(ratings_movies_df
 .withColumn('1st_genre', F.split(F.col('genres'), r'\|')[0])
 .select('genres', '1st_genre')
 .show(10, False))

+---------------------------+---------+
|genres                     |1st_genre|
+---------------------------+---------+
|Action|Drama|War           |Action   |
|Drama                      |Drama    |
|Crime|Drama                |Crime    |
|Crime|Drama                |Crime    |
|Drama                      |Drama    |
|Comedy|Drama               |Comedy   |
|Drama|Horror|Mystery       |Drama    |
|Comedy                     |Comedy   |
|Action|Crime|Drama|Thriller|Action   |
|Mystery|Thriller           |Mystery  |
+---------------------------+---------+
only showing top 10 rows





Reemplaza el caracter '|' por '-' en la columna `genres`

In [25]:
# Replace every '|' in genres with '-'. The pattern is a regex, hence the
# escaped pipe; the raw string avoids Python's invalid-escape warning.
(ratings_movies_df
 .withColumn('genres', F.regexp_replace(F.col('genres'), r'\|', '-'))
 .select('title', 'genres')
 .show(10, truncate=False))

+-------------------------------+---------------------------+
|title                          |genres                     |
+-------------------------------+---------------------------+
|Braveheart (1995)              |Action-Drama-War           |
|Basketball Diaries, The (1995) |Drama                      |
|Godfather, The (1972)          |Crime-Drama                |
|Godfather: Part II, The (1974) |Crime-Drama                |
|Dead Poets Society (1989)      |Drama                      |
|Breakfast Club, The (1985)     |Comedy-Drama               |
|Sixth Sense, The (1999)        |Drama-Horror-Mystery       |
|Ferris Bueller's Day Off (1986)|Comedy                     |
|Fight Club (1999)              |Action-Crime-Drama-Thriller|
|Memento (2000)                 |Mystery-Thriller           |
+-------------------------------+---------------------------+
only showing top 10 rows





_Con expresiones regulares_

https://regexr.com/

In [29]:
# Full, untruncated titles: the year sits in parentheses at the end.
(ratings_movies_df
 .select(F.col('title'))
 .show(n=5, truncate=False))

+------------------------------+
|title                         |
+------------------------------+
|Braveheart (1995)             |
|Basketball Diaries, The (1995)|
|Godfather, The (1972)         |
|Godfather: Part II, The (1974)|
|Dead Poets Society (1989)     |
+------------------------------+
only showing top 5 rows



In [31]:
# Extract the four digits found between parentheses in the title (capture
# group 1). Raw string so that '\(' and '\d' reach the regex engine untouched
# and Python does not warn about invalid escape sequences.
ratings_movies_df = ratings_movies_df.withColumn(
    'year',
    F.regexp_extract(F.col('title'), r'\((\d{4})\)', 1),
)

ratings_movies_df.show(5)

+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|year|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|1995|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|1995|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|1972|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|     Crime|Drama|2019/01/21 14:08|    4.0|1974|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|           Drama|2019/01/21 14:08|    4.0|1989|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
only showing top 5 rows





## Casting

Con el método `withColumn` también es posible convertir el tipo de una columna con la función `cast`. Es importante saber que en caso de no poder convertirse (por ejemplo una letra a número) no saltará error y el resultado será un valor nulo.

In [33]:
# Inspect the column types before casting (year is still a string here).
ratings_movies_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- now: string (nullable = false)
 |-- rating2: double (nullable = false)
 |-- year: string (nullable = true)





Cambia el formato de `year` a entero, y `movieId` a string.

In [34]:
# Overwrite year with its integer version.
ratings_movies_df = ratings_movies_df.withColumn(
    colName='year',
    col=F.col('year').cast('int'),
)
ratings_movies_df.show(n=5)

+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|year|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|1995|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|1995|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|1972|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|     Crime|Drama|2019/01/21 14:08|    4.0|1974|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|           Drama|2019/01/21 14:08|    4.0|1989|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+
only showing top 5 rows



In [35]:
# Overwrite movieId with its string version.
ratings_movies_df = ratings_movies_df.withColumn(
    colName='movieId',
    col=F.col('movieId').cast('string'),
)

In [36]:
# Confirm the new types: movieId should now be string and year integer.
ratings_movies_df.printSchema()

root
 |-- movieId: string (nullable = true)
 |-- userId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- now: string (nullable = false)
 |-- rating2: double (nullable = false)
 |-- year: integer (nullable = true)



In [37]:
# Casting a non-numeric string does not raise: the 'error' column is null.
(ratings_movies_df
 .withColumn('error', F.col('title').cast('int'))
 .show(5))

+-------+------+------+----------+--------------------+----------------+----------------+-------+----+-----+
|movieId|userId|rating| timestamp|               title|          genres|             now|rating2|year|error|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+-----+
|    110|     1|   1.0|1425941529|   Braveheart (1995)|Action|Drama|War|2019/01/21 14:08|    4.0|1995| null|
|    147|     1|   4.5|1425942435|Basketball Diarie...|           Drama|2019/01/21 14:08|    4.0|1995| null|
|    858|     1|   5.0|1425941523|Godfather, The (1...|     Crime|Drama|2019/01/21 14:08|    4.0|1972| null|
|   1221|     1|   5.0|1425941546|Godfather: Part I...|     Crime|Drama|2019/01/21 14:08|    4.0|1974| null|
|   1246|     1|   5.0|1425941556|Dead Poets Societ...|           Drama|2019/01/21 14:08|    4.0|1989| null|
+-------+------+------+----------+--------------------+----------------+----------------+-------+----+-----+
only showing top 5 



## UDF (User Defined Functions)

Cuando no es posible definir la operación con las funciones de Spark se pueden crear funciones propias usando UDFs. Primero se crea una función de Python normal y posteriormente se crea la UDF a partir de ella. Es necesario indicar el tipo de la columna de salida en la UDF.

In [39]:
from pyspark.sql.types import StringType, IntegerType, DoubleType, DateType



_Aumenta el rating en un 15% para cada película más antigua que 2000 (el máximo siempre es 5)._

In [40]:
def increase_rating(year, rating):
    """Raise the rating of movies released before 2000 by 15%, capped at 5.0.

    Args:
        year: release year as an int, or None when it is unknown (the
            `year` column is nullable).
        rating: original rating, or None.

    Returns:
        The boosted rating for movies older than 2000; otherwise the rating
        unchanged. If either argument is None the rating is returned as-is,
        instead of raising TypeError on the comparison/multiplication.
    """
    # Nulls reach a Python UDF as None; comparing None with an int raises.
    if year is None or rating is None:
        return rating

    if year < 2000:
        return min(rating * 1.15, 5.0)

    return rating

In [41]:
# Wrap the Python function as a Spark UDF that returns a double column.
increase_rating_udf = F.udf(
    f=increase_rating,
    returnType=DoubleType(),
)

In [42]:
# Apply the UDF row by row, passing the year and rating columns as arguments.
(ratings_movies_df
 .withColumn('rating_inc',
             increase_rating_udf(F.col('year'), F.col('rating')))
 .select('title', 'year', 'rating', 'rating_inc')
 .show(20))

+--------------------+----+------+----------+
|               title|year|rating|rating_inc|
+--------------------+----+------+----------+
|   Braveheart (1995)|1995|   1.0|      1.15|
|Basketball Diarie...|1995|   4.5|       5.0|
|Godfather, The (1...|1972|   5.0|       5.0|
|Godfather: Part I...|1974|   5.0|       5.0|
|Dead Poets Societ...|1989|   5.0|       5.0|
|Breakfast Club, T...|1985|   4.0|       4.6|
|Sixth Sense, The ...|1999|   4.5|       5.0|
|Ferris Bueller's ...|1986|   5.0|       5.0|
|   Fight Club (1999)|1999|   4.0|       4.6|
|      Memento (2000)|2000|   4.0|       4.0|
| Donnie Darko (2001)|2001|   5.0|       5.0|
|Igby Goes Down (2...|2002|   5.0|       5.0|
|Batman Begins (2005)|2005|   4.0|       4.0|
|     Superbad (2007)|2007|   3.5|       3.5|
|Dark Knight, The ...|2008|   4.0|       4.0|
|     Iron Man (2008)|2008|   5.0|       5.0|
|    Star Trek (2009)|2009|   5.0|       5.0|
|Harry Potter and ...|2009|   5.0|       5.0|
|Sherlock Holmes (...|2009|   5.0|

In [43]:
# Exercise: create a new column rating_len = length of the title (using UDFs)

# 1.- Define the function
def str_len(x):
    """Return the length of x, or None when x is None.

    The title column is nullable, and len(None) would raise TypeError inside
    the UDF; returning None keeps null titles as null lengths.
    """
    return None if x is None else len(x)

# 2.- Define the UDF
str_len_udf = F.udf(str_len, IntegerType())

# 3.- Apply the UDF
(ratings_movies_df
 .withColumn('rating_len', str_len_udf(F.col('title')))
 .select(F.col('title'), F.col('rating'), F.col('rating_len'))
).show(10, False)

+-------------------------------+------+----------+
|title                          |rating|rating_len|
+-------------------------------+------+----------+
|Braveheart (1995)              |1.0   |17        |
|Basketball Diaries, The (1995) |4.5   |30        |
|Godfather, The (1972)          |5.0   |21        |
|Godfather: Part II, The (1974) |5.0   |30        |
|Dead Poets Society (1989)      |5.0   |25        |
|Breakfast Club, The (1985)     |4.0   |26        |
|Sixth Sense, The (1999)        |4.5   |23        |
|Ferris Bueller's Day Off (1986)|5.0   |31        |
|Fight Club (1999)              |4.0   |17        |
|Memento (2000)                 |4.0   |14        |
+-------------------------------+------+----------+
only showing top 10 rows





Extrae el año de la película sin usar expresiones regulares.

In [44]:
# Sample title used to prototype the year extraction in plain Python.
title = 'Trainspotting (1996)'

In [45]:
# Step 1: drop both parentheses so the year becomes a standalone token.
title.replace('(', '').replace(')', '')

'Trainspotting 1996'

In [46]:
# Step 2: the last whitespace-separated token is the year; parse it as int.
year = int(title.replace('(', '').replace(')', '').split()[-1])
year

1996

In [47]:
def get_year(title):
    """Extract the release year from a title shaped like ``'Name (1996)'``.

    Parentheses are removed and the last whitespace-separated token is taken
    as the year candidate, without using regular expressions.

    Parameters
    ----------
    title : str or None
        Movie title; may be ``None`` or empty.

    Returns
    -------
    int
        The year, or ``-1`` when the title is missing, empty, or its last
        token is not made only of decimal digits.
    """
    # A missing or empty title has no year to extract.
    if not title:
        return -1

    tokens = title.replace(')', '').replace('(', '').split()
    # Whitespace-only or parentheses-only titles leave no tokens behind.
    if not tokens:
        return -1

    candidate = tokens[-1]
    # isdecimal() only accepts characters that int() can parse; isnumeric()
    # also accepts characters such as superscript digits, which make int() raise.
    return int(candidate) if candidate.isdecimal() else -1

In [48]:
# Wrap get_year as a Spark UDF declared to return an integer
get_year_udf = F.udf(get_year, IntegerType())

In [49]:
# Show the UDF-derived year2 next to the year column for a side-by-side check
(
    ratings_movies_df
    .withColumn('year2', get_year_udf(F.col('title')))
    .select('title', 'year', 'year2')
    .show(10, truncate=False)
)

+-------------------------------+----+-----+
|title                          |year|year2|
+-------------------------------+----+-----+
|Braveheart (1995)              |1995|1995 |
|Basketball Diaries, The (1995) |1995|1995 |
|Godfather, The (1972)          |1972|1972 |
|Godfather: Part II, The (1974) |1974|1974 |
|Dead Poets Society (1989)      |1989|1989 |
|Breakfast Club, The (1985)     |1985|1985 |
|Sixth Sense, The (1999)        |1999|1999 |
|Ferris Bueller's Day Off (1986)|1986|1986 |
|Fight Club (1999)              |1999|1999 |
|Memento (2000)                 |2000|2000 |
+-------------------------------+----+-----+
only showing top 10 rows



In [53]:
# 1.- Generar udf que obtenga el numero de vocales unicas dentro del titulo
# 2.- (Opcional) Basketball Diaries, The (1995) -> The Basketball Diaries 95


######


# Definir funcion
def uniq_v(x):
    """Count the distinct vowels (a, e, i, o, u) present in ``x``, ignoring case.

    Accented vowels are not counted. Returns ``None`` when ``x`` is ``None``,
    which is how Spark passes SQL NULLs to Python UDFs, instead of raising
    ``AttributeError``.
    """
    if x is None:
        return None
    return len(set(x.lower()) & set('aeiou'))

# Register the function as a Spark UDF that returns an integer
uniq_v_udf = F.udf(uniq_v, IntegerType())

# Apply the UDF to every title and preview the result untruncated
(
    ratings_movies_df
    .select('title', uniq_v_udf(F.col('title')).alias('unique_vowels'))
    .show(5, truncate=False)
)



+------------------------------+-------------+
|title                         |unique_vowels|
+------------------------------+-------------+
|Braveheart (1995)             |2            |
|Basketball Diaries, The (1995)|3            |
|Godfather, The (1972)         |3            |
|Godfather: Part II, The (1974)|4            |
|Dead Poets Society (1989)     |4            |
+------------------------------+-------------+
only showing top 5 rows





# Datetimes

Hay varias funciones de _pyspark_ que permiten trabajar con fechas: diferencia entre fechas, día de la semana, año... Pero para ello primero es necesario transformar las columnas a tipo fecha. Se permite la conversión desde dos formatos de fecha:
* timestamp de unix: una columna de tipo entero con los segundos transcurridos desde la medianoche (UTC) del 1 de enero de 1970 hasta la fecha.
* cadena: la fecha representada como una cadena siguiendo un formato específico que puede variar.

In [54]:
# Preview the two date sources: the integer `timestamp` column and the `now` string column
ratings_movies_df.select('title', 'timestamp', 'now').show(5)

+--------------------+----------+----------------+
|               title| timestamp|             now|
+--------------------+----------+----------------+
|   Braveheart (1995)|1425941529|2019/01/21 14:08|
|Basketball Diarie...|1425942435|2019/01/21 14:08|
|Godfather, The (1...|1425941523|2019/01/21 14:08|
|Godfather: Part I...|1425941546|2019/01/21 14:08|
|Dead Poets Societ...|1425941556|2019/01/21 14:08|
+--------------------+----------+----------------+
only showing top 5 rows



 

## unix timestamp a datetime

In [55]:
# from_unixtime renders the epoch seconds stored in `timestamp` as a
# 'yyyy-MM-dd HH:mm:ss' string.
ratings_movies_df = ratings_movies_df.withColumn('datetime', F.from_unixtime(F.col('timestamp')))

# Display the new column next to its sources so the cell reproduces its output on re-run
ratings_movies_df.select('datetime', 'timestamp', 'now').show(10)


+-------------------+----------+----------------+
|           datetime| timestamp|             now|
+-------------------+----------+----------------+
|2015-03-09 16:52:09|1425941529|2019/01/21 14:08|
|2015-03-09 17:07:15|1425942435|2019/01/21 14:08|
|2015-03-09 16:52:03|1425941523|2019/01/21 14:08|
|2015-03-09 16:52:26|1425941546|2019/01/21 14:08|
|2015-03-09 16:52:36|1425941556|2019/01/21 14:08|
|2015-03-09 17:02:28|1425942148|2019/01/21 14:08|
|2015-03-09 16:48:20|1425941300|2019/01/21 14:08|
|2015-03-09 16:53:13|1425941593|2019/01/21 14:08|
|2015-03-09 16:53:21|1425941601|2019/01/21 14:08|
|2015-03-09 17:03:48|1425942228|2019/01/21 14:08|
+-------------------+----------+----------------+
only showing top 10 rows



In [76]:
# Parse the `now` string with the Java-style pattern 'yyyy/MM/dd HH:mm'
# (the strftime equivalent would be %Y/%m/%d %H:%M).

ratings_movies_df.select('datetime', 'timestamp', 
                         F.to_timestamp(F.col('now'), 
                                   format='yyyy/MM/dd HH:mm').alias('now')).show(10)

+-------------------+----------+-------------------+
|           datetime| timestamp|                now|
+-------------------+----------+-------------------+
|2015-03-09 16:52:09|1425941529|2019-01-21 14:08:00|
|2015-03-09 17:07:15|1425942435|2019-01-21 14:08:00|
|2015-03-09 16:52:03|1425941523|2019-01-21 14:08:00|
|2015-03-09 16:52:26|1425941546|2019-01-21 14:08:00|
|2015-03-09 16:52:36|1425941556|2019-01-21 14:08:00|
|2015-03-09 17:02:28|1425942148|2019-01-21 14:08:00|
|2015-03-09 16:48:20|1425941300|2019-01-21 14:08:00|
|2015-03-09 16:53:13|1425941593|2019-01-21 14:08:00|
|2015-03-09 16:53:21|1425941601|2019-01-21 14:08:00|
|2015-03-09 17:03:48|1425942228|2019-01-21 14:08:00|
+-------------------+----------+-------------------+
only showing top 10 rows





## string a datetime

In [77]:
# Parse `now` into epoch seconds using the given pattern, then render it back with from_unixtime.
# NOTE(review): from_unixtime returns a string column, so `now_datetime` is presumably a
# formatted string rather than a TimestampType — confirm with printSchema().
ratings_movies_df = ratings_movies_df.withColumn('now_datetime', 
                                                 F.from_unixtime(F.unix_timestamp(F.col('now'), 'yyyy/MM/dd HH:mm')))

# Compare the original string with the converted value
ratings_movies_df.select('now', 'now_datetime').show(10)

+----------------+-------------------+
|             now|       now_datetime|
+----------------+-------------------+
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
|2019/01/21 14:08|2019-01-21 14:08:00|
+----------------+-------------------+
only showing top 10 rows





## funciones con datetimes

In [78]:
# datediff(end, start): number of days from `datetime` to `now_datetime`
ratings_movies_df.select('now_datetime', 'datetime', 
                          F.datediff(F.col('now_datetime'), F.col('datetime'))).show(10)

+-------------------+-------------------+--------------------------------+
|       now_datetime|           datetime|datediff(now_datetime, datetime)|
+-------------------+-------------------+--------------------------------+
|2019-01-21 14:08:00|2015-03-09 16:52:09|                            1414|
|2019-01-21 14:08:00|2015-03-09 17:07:15|                            1414|
|2019-01-21 14:08:00|2015-03-09 16:52:03|                            1414|
|2019-01-21 14:08:00|2015-03-09 16:52:26|                            1414|
|2019-01-21 14:08:00|2015-03-09 16:52:36|                            1414|
|2019-01-21 14:08:00|2015-03-09 17:02:28|                            1414|
|2019-01-21 14:08:00|2015-03-09 16:48:20|                            1414|
|2019-01-21 14:08:00|2015-03-09 16:53:13|                            1414|
|2019-01-21 14:08:00|2015-03-09 16:53:21|                            1414|
|2019-01-21 14:08:00|2015-03-09 17:03:48|                            1414|
+-------------------+----

In [79]:
# date_add shifts the date by 10 days; the recorded output shows the time of day is dropped
ratings_movies_df.select('datetime', F.date_add(F.col('datetime'), 10)).show(10)

+-------------------+----------------------+
|           datetime|date_add(datetime, 10)|
+-------------------+----------------------+
|2015-03-09 16:52:09|            2015-03-19|
|2015-03-09 17:07:15|            2015-03-19|
|2015-03-09 16:52:03|            2015-03-19|
|2015-03-09 16:52:26|            2015-03-19|
|2015-03-09 16:52:36|            2015-03-19|
|2015-03-09 17:02:28|            2015-03-19|
|2015-03-09 16:48:20|            2015-03-19|
|2015-03-09 16:53:13|            2015-03-19|
|2015-03-09 16:53:21|            2015-03-19|
|2015-03-09 17:03:48|            2015-03-19|
+-------------------+----------------------+
only showing top 10 rows



In [80]:
# Shift each rating date forward by four calendar months and show it next to the original
(
    ratings_movies_df
    .select('datetime', F.add_months(F.col('datetime'), 4).alias('datetime_plus_4_months'))
    .show(5)
)

+-------------------+----------------------+
|           datetime|datetime_plus_4_months|
+-------------------+----------------------+
|2015-03-09 16:52:09|            2015-07-09|
|2015-03-09 17:07:15|            2015-07-09|
|2015-03-09 16:52:03|            2015-07-09|
|2015-03-09 16:52:26|            2015-07-09|
|2015-03-09 16:52:36|            2015-07-09|
+-------------------+----------------------+
only showing top 5 rows



In [81]:
# Extract the month number from each rating date
ratings_movies_df.select('datetime', F.month(F.col('datetime')).alias('month')).show(10)

+-------------------+-----+
|           datetime|month|
+-------------------+-----+
|2015-03-09 16:52:09|    3|
|2015-03-09 17:07:15|    3|
|2015-03-09 16:52:03|    3|
|2015-03-09 16:52:26|    3|
|2015-03-09 16:52:36|    3|
|2015-03-09 17:02:28|    3|
|2015-03-09 16:48:20|    3|
|2015-03-09 16:53:13|    3|
|2015-03-09 16:53:21|    3|
|2015-03-09 17:03:48|    3|
+-------------------+-----+
only showing top 10 rows



In [82]:
# last_day returns the final calendar day of the month that contains the date
ratings_movies_df.select('datetime', F.last_day(F.col('datetime')).alias('last_day')).show(10)

+-------------------+----------+
|           datetime|  last_day|
+-------------------+----------+
|2015-03-09 16:52:09|2015-03-31|
|2015-03-09 17:07:15|2015-03-31|
|2015-03-09 16:52:03|2015-03-31|
|2015-03-09 16:52:26|2015-03-31|
|2015-03-09 16:52:36|2015-03-31|
|2015-03-09 17:02:28|2015-03-31|
|2015-03-09 16:48:20|2015-03-31|
|2015-03-09 16:53:13|2015-03-31|
|2015-03-09 16:53:21|2015-03-31|
|2015-03-09 17:03:48|2015-03-31|
+-------------------+----------+
only showing top 10 rows



In [84]:
# Day of the month, day of the year and full weekday name ('EEEE' pattern) for each rating date
ratings_movies_df.select('datetime', F.dayofmonth(F.col('datetime')).alias('day'),
                                     F.dayofyear(F.col('datetime')).alias('year_day'),
                                     F.date_format(F.col('datetime'), 'EEEE').alias('weekday')).show(10)

+-------------------+---+--------+-------+
|           datetime|day|year_day|weekday|
+-------------------+---+--------+-------+
|2015-03-09 16:52:09|  9|      68| Monday|
|2015-03-09 17:07:15|  9|      68| Monday|
|2015-03-09 16:52:03|  9|      68| Monday|
|2015-03-09 16:52:26|  9|      68| Monday|
|2015-03-09 16:52:36|  9|      68| Monday|
|2015-03-09 17:02:28|  9|      68| Monday|
|2015-03-09 16:48:20|  9|      68| Monday|
|2015-03-09 16:53:13|  9|      68| Monday|
|2015-03-09 16:53:21|  9|      68| Monday|
|2015-03-09 17:03:48|  9|      68| Monday|
+-------------------+---+--------+-------+
only showing top 10 rows





Para filtrar por fechas se pueden comparar directamente con una cadena en el formato YYYY-MM-DD hh:mm:ss ya que será interpretada como una fecha.

In [85]:
# Keep ratings made at or after the given instant by comparing against a date-time string
ratings_movies_df.filter(F.col('datetime') >= "2015-09-30 20:00:00").select('datetime', 'title', 'rating').show(10)

+-------------------+--------------------+------+
|           datetime|               title|rating|
+-------------------+--------------------+------+
|2017-02-04 18:14:07|Léon: The Profess...|   5.0|
|2017-02-04 18:13:15|Shawshank Redempt...|   4.0|
|2017-02-04 18:13:19|  Matrix, The (1999)|   4.0|
|2017-02-04 18:23:06|American Beauty (...|   4.5|
|2017-02-04 18:13:22|   Fight Club (1999)|   4.5|
|2017-02-04 18:16:21|American Psycho (...|   5.0|
|2017-02-04 18:14:59|Meet the Parents ...|   1.5|
|2017-02-04 18:14:43|    Cast Away (2000)|   4.0|
|2017-02-04 18:14:32|        Shrek (2001)|   2.5|
|2017-02-04 18:14:47|Harry Potter and ...|   4.5|
+-------------------+--------------------+------+
only showing top 10 rows



In [86]:
# Keep ratings whose `datetime` falls between the two bounds (between() is inclusive on both ends).
# NOTE(review): `datetime` was produced by from_unixtime, so this is presumably a string
# comparison; a bare upper bound of "2003-02-10" would then exclude ratings made during
# 2003-02-10 itself — confirm, and use "2003-02-10 23:59:59" if that day should be included.
ratings_movies_df.filter(F.col('datetime').between("2003-01-31", "2003-02-10"))\
                  .select('datetime', 'title', 'rating').show(5)

+-------------------+--------------------+------+
|           datetime|               title|rating|
+-------------------+--------------------+------+
|2003-01-31 07:46:21|Sense and Sensibi...|   4.0|
|2003-01-31 07:33:25|   Braveheart (1995)|   3.0|
|2003-01-31 07:35:28|  French Kiss (1995)|   1.0|
|2003-01-31 07:41:21| Pulp Fiction (1994)|   3.0|
|2003-01-31 07:41:31|Muriel's Wedding ...|   2.0|
+-------------------+--------------------+------+
only showing top 5 rows



In [87]:
# Keep only the ratings made in 2012 or later, based on the year part of `datetime`
(
    ratings_movies_df
    .where(F.year(F.col('datetime')) >= 2012)
    .select('datetime', 'title', 'rating')
    .show(5)
)

+-------------------+--------------------+------+
|           datetime|               title|rating|
+-------------------+--------------------+------+
|2015-03-09 16:52:09|   Braveheart (1995)|   1.0|
|2015-03-09 17:07:15|Basketball Diarie...|   4.5|
|2015-03-09 16:52:03|Godfather, The (1...|   5.0|
|2015-03-09 16:52:26|Godfather: Part I...|   5.0|
|2015-03-09 16:52:36|Dead Poets Societ...|   5.0|
+-------------------+--------------------+------+
only showing top 5 rows





# Ejercicio 1

1) Cree una función que acepte un DataFrame y un diccionario. La función debe usar el diccionario para renombrar un grupo de columnas y devolver el DataFrame ya modificado.

Use el siguiente DataFrame y diccionario:

In [None]:
# Load the exercise dataset, taking column names from the header row and inferring types
pokemon_df = spark.read.csv(DATA_PATH + 'pokemon.csv', sep=',', header=True, inferSchema=True)

# Rename mapping for the exercise: current column name -> new column name
rename_dict = {'Sp. Atk': 'sp_atk',
               'Sp. Def': 'sp_def'}

In [None]:
# Quick look at the first rows and the original column names
pokemon_df.show(3)

In [None]:
# Respuesta aqui



2) Use la función definida en el punto anterior para cambiar los nombres del DF usando el diccionario dado.

3) Modifique la función de tal forma que también acepte una función en lugar de un diccionario. Use la función para renombrar las columnas.

4) Estandarice según las buenas prácticas los nombres de las columnas usando la función que acaba de definir.

5) Cree otra función que acepte un DataFrame y una lista con un subconjunto de columnas. El objetivo de esta función es determinar el número de filas duplicadas del DF.

6) Use la función creada para obtener el número de duplicados del DataFrame pokemon_df en todas las columnas excepto el nombre (`name`)

In [None]:
# Respuesta aqui

In [None]:
# Respuesta aqui

In [None]:
# Respuesta aqui

In [None]:
# Respuesta aqui

In [None]:
# Respuesta aqui



# Ejercicio 2

Crea la misma lógica definida en el siguiente UDF, pero sin usar UDFs, es decir, usando exclusivamente funciones de SparkSQL.

In [None]:
# NOTE(review): this reloads movies_df from 'movie-ratings/movies.csv', while the load at the
# top of the notebook reads 'movies/movies.csv' — confirm which path is the intended one.
movies_df = spark.read.csv(DATA_PATH + 'movie-ratings/movies.csv', sep=',', header=True, inferSchema=True)

# F.split takes a regular expression, so the pipe must be escaped. The raw string keeps the
# backslash explicitly instead of relying on Python tolerating the invalid escape '\|'.
movies_df = movies_df.withColumn('genres', F.split(F.col('genres'), r'\|'))

from pyspark.sql.types import StringType, IntegerType, DoubleType, BooleanType

def value_in_col(col, value):
    """Tell whether ``value`` is contained in ``col``.

    Parameters
    ----------
    col : container or None
        The cell content to search in; ``None`` represents a SQL NULL.
    value : object
        The element to look for.

    Returns
    -------
    bool or None
        ``True``/``False`` for a non-null ``col``; ``None`` when ``col`` is
        ``None``, so a NULL input yields a NULL result instead of raising
        ``TypeError``.
    """
    if col is None:
        return None
    return value in col

# Wrap value_in_col as a Spark UDF declared to return a boolean
value_in_col_udf = F.udf(value_in_col, BooleanType())



*Pista*: Mira la función *explode*.

In [88]:
# Load the "Crimes in Vancouver" CSV, taking column names from the header row and
# inferring column types, then preview the first rows
crime = spark.read.csv(DATA_PATH + 'crime_in_vancouver.csv', 
                       header=True, 
                       inferSchema=True,
                       sep=',')
crime.show(5)

+--------------------+----+-----+---+----+------+--------------------+--------------------+---------+----------+
|                TYPE|YEAR|MONTH|DAY|HOUR|MINUTE|       HUNDRED_BLOCK|       NEIGHBOURHOOD|        X|         Y|
+--------------------+----+-----+---+----+------+--------------------+--------------------+---------+----------+
|Offence Against a...|2003|    5| 17|null|  null|OFFSET TO PROTECT...|                null|      0.0|       0.0|
|  Theft from Vehicle|2003|    1|  7|  10|     0|CARDERO ST / W GE...|            West End|490503.48|5459766.67|
|  Theft from Vehicle|2003|    7| 27|   5|     0|    53XX CHAMBERS ST| Renfrew-Collingwood|496145.89|5453740.68|
|Break and Enter R...|2003|    3|  8|   4|    15|     19XX E 12TH AVE|Kensington-Cedar ...|495302.97|5456313.79|
|  Theft from Vehicle|2003|   10|  9|  16|     0|     16XX CHARLES ST|  Grandview-Woodland|494877.89| 5457816.4|
+--------------------+----+-----+---+----+------+--------------------+--------------------+-----

In [None]:
# 1.- Hacer una columna fecha.
# 2.- Hacer una columna weekend (True si la fecha es (viernes, sabado, domingo), False e.o.c.)
# 3.- Agrupar por weekend y neigh y obtener crimenes mas frecuentes. 

In [108]:
# Count crimes per (TYPE, NEIGHBOURHOOD, WEEKEND), where WEEKEND flags Friday to Sunday.
# Steps: build DATE from the YEAR/MONTH/DAY columns, drop rows without a neighbourhood,
# derive the WEEKEND flag, aggregate, and sort so the most frequent crime comes first
# inside each neighbourhood/weekend group.
w_crime = (crime
 # Concatenate YEAR-MONTH-DAY into a string and parse it as a date
 .select(F.col('TYPE'), 
         F.col('NEIGHBOURHOOD'), 
         F.to_date(F.concat(F.col('YEAR'), 
                      F.lit('-'), 
                      F.col('MONTH'),
                      F.lit('-'),
                      F.col('DAY')
                     )).alias('DATE'))
 .filter(F.col('NEIGHBOURHOOD').isNotNull())
 # 'E' formats the abbreviated day name; otherwise(False) also maps a null DATE to False
 .withColumn('WEEKEND', 
             F.when((F
                     .date_format(F.col('DATE'), 'E')
                     .isin(['Fri', 'Sat', 'Sun'])), True)
             .otherwise(False))
 .groupBy(F.col('TYPE'), F.col('NEIGHBOURHOOD'), F.col('WEEKEND'))
 .agg(F.count('*').alias('total_crime'))
 .orderBy(F.col('NEIGHBOURHOOD'), F.col('WEEKEND'), F.col('total_crime').desc())
)

In [115]:
w_crime.show()

+--------------------+--------------------+-------+-----------+
|                TYPE|       NEIGHBOURHOOD|WEEKEND|total_crime|
+--------------------+--------------------+-------+-----------+
|  Theft from Vehicle|       Arbutus Ridge|  false|       1094|
|Break and Enter R...|       Arbutus Ridge|  false|        936|
|            Mischief|       Arbutus Ridge|  false|        463|
|    Theft of Vehicle|       Arbutus Ridge|  false|        267|
|         Other Theft|       Arbutus Ridge|  false|        217|
|Vehicle Collision...|       Arbutus Ridge|  false|        188|
|Break and Enter C...|       Arbutus Ridge|  false|        182|
|    Theft of Bicycle|       Arbutus Ridge|  false|         95|
|Vehicle Collision...|       Arbutus Ridge|  false|          2|
|  Theft from Vehicle|       Arbutus Ridge|   true|        826|
|Break and Enter R...|       Arbutus Ridge|   true|        779|
|            Mischief|       Arbutus Ridge|   true|        499|
|    Theft of Vehicle|       Arbutus Rid

In [107]:
from pyspark.sql import Window

In [109]:
# Window over each (NEIGHBOURHOOD, WEEKEND) group, highest crime count first
wc = Window.partitionBy('NEIGHBOURHOOD', 'WEEKEND').orderBy(F.desc('total_crime'))

In [114]:
# Rank crime types inside each (NEIGHBOURHOOD, WEEKEND) window by total_crime and keep ranks 1-3.
# dense_rank gives tied counts the same rank, so a group can return more than three rows.
# The final sort is descending on top_crime, so rank 3 is listed before rank 1 in each group.
(w_crime
 .select('*', F.dense_rank().over(wc).alias('top_crime'))
 .filter(F.col('top_crime') <= 3)
 .orderBy(F.col('NEIGHBOURHOOD'), F.col('WEEKEND'), F.col('top_crime').desc())
).show(27, False)

+---------------------------------+-------------------------+-------+-----------+---------+
|TYPE                             |NEIGHBOURHOOD            |WEEKEND|total_crime|top_crime|
+---------------------------------+-------------------------+-------+-----------+---------+
|Mischief                         |Arbutus Ridge            |false  |463        |3        |
|Break and Enter Residential/Other|Arbutus Ridge            |false  |936        |2        |
|Theft from Vehicle               |Arbutus Ridge            |false  |1094       |1        |
|Mischief                         |Arbutus Ridge            |true   |499        |3        |
|Break and Enter Residential/Other|Arbutus Ridge            |true   |779        |2        |
|Theft from Vehicle               |Arbutus Ridge            |true   |826        |1        |
|Mischief                         |Central Business District|false  |9181       |3        |
|Other Theft                      |Central Business District|false  |11513      