In [24]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Create (or reuse) the Spark session shared by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("joins")
    .getOrCreate()
)

# Load the MovieLens movie catalogue. `header=True` takes the column names
# (movieId, title, genres) from the first row of the file.
movies = spark.read.csv(
    "../../../data-sets/ml-latest-small/movies.csv",
    header=True,
)

# Preview the first rows, then print the column names and types.
movies.show()
movies.printSchema()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [10]:
# User-supplied tags: one row per (userId, movieId, tag) plus the time the
# tag was applied. Column names come from the CSV header row.
tags = spark.read.csv(
    "../../../data-sets/ml-latest-small/tags.csv",
    header=True,
)
tags.show()

# User ratings, read the same way; joined against the tagged movies later on.
ratings = spark.read.csv(
    "../../../data-sets/ml-latest-small/ratings.csv",
    header=True,
)
ratings.show()

+------+-------+-----------------+----------+
|userId|movieId|              tag| timestamp|
+------+-------+-----------------+----------+
|     2|  60756|            funny|1445714994|
|     2|  60756|  Highly quotable|1445714996|
|     2|  60756|     will ferrell|1445714992|
|     2|  89774|     Boxing story|1445715207|
|     2|  89774|              MMA|1445715200|
|     2|  89774|        Tom Hardy|1445715205|
|     2| 106782|            drugs|1445715054|
|     2| 106782|Leonardo DiCaprio|1445715051|
|     2| 106782|  Martin Scorsese|1445715056|
|     7|  48516|     way too long|1169687325|
|    18|    431|        Al Pacino|1462138765|
|    18|    431|         gangster|1462138749|
|    18|    431|            mafia|1462138755|
|    18|   1221|        Al Pacino|1461699306|
|    18|   1221|            Mafia|1461699303|
|    18|   5995|        holocaust|1455735472|
|    18|   5995|       true story|1455735479|
|    18|  44665|     twist ending|1456948283|
|    18|  52604|  Anthony Hopkins|

In [13]:
# Inner join: keep only movies that have at least one tag, one output row
# per (movie, tag) pair.
# `.show()` prints the table and returns None, so it must not be part of the
# assignment; otherwise `opinions` would be bound to None, not the DataFrame.
opinions = movies.join(tags, ["movieId"], "inner")
opinions.show()

+-------+--------------------+--------------------+------+----------------+----------+
|movieId|               title|              genres|userId|             tag| timestamp|
+-------+--------------------+--------------------+------+----------------+----------+
|      1|    Toy Story (1995)|Adventure|Animati...|   567|             fun|1525286013|
|      1|    Toy Story (1995)|Adventure|Animati...|   474|           pixar|1137206825|
|      1|    Toy Story (1995)|Adventure|Animati...|   336|           pixar|1139045764|
|      2|      Jumanji (1995)|Adventure|Childre...|   474|            game|1137375552|
|      2|      Jumanji (1995)|Adventure|Childre...|    62|  Robin Williams|1528843907|
|      2|      Jumanji (1995)|Adventure|Childre...|    62|magic board game|1528843932|
|      2|      Jumanji (1995)|Adventure|Childre...|    62|         fantasy|1528843929|
|      3|Grumpier Old Men ...|      Comedy|Romance|   289|             old|1143424860|
|      3|Grumpier Old Men ...|      Comedy|

In [14]:
# Left join: keep every movie; movies without any tag get nulls in the
# tag-side columns (userId, tag, timestamp).
# `.show()` prints the table and returns None, so it must not be part of the
# assignment; otherwise `opinions` would be bound to None, not the DataFrame.
opinions = movies.join(tags, ["movieId"], "left")
opinions.show()

+-------+--------------------+--------------------+------+----------------+----------+
|movieId|               title|              genres|userId|             tag| timestamp|
+-------+--------------------+--------------------+------+----------------+----------+
|      1|    Toy Story (1995)|Adventure|Animati...|   567|             fun|1525286013|
|      1|    Toy Story (1995)|Adventure|Animati...|   474|           pixar|1137206825|
|      1|    Toy Story (1995)|Adventure|Animati...|   336|           pixar|1139045764|
|      2|      Jumanji (1995)|Adventure|Childre...|   474|            game|1137375552|
|      2|      Jumanji (1995)|Adventure|Childre...|    62|  Robin Williams|1528843907|
|      2|      Jumanji (1995)|Adventure|Childre...|    62|magic board game|1528843932|
|      2|      Jumanji (1995)|Adventure|Childre...|    62|         fantasy|1528843929|
|      3|Grumpier Old Men ...|      Comedy|Romance|   289|             old|1143424860|
|      3|Grumpier Old Men ...|      Comedy|

In [22]:
# Attach tags to every movie (left join, so untagged movies survive with
# null userId/tag/timestamp) and keep only the columns needed downstream.
opinions = (
    movies
    .join(tags, on=["movieId"], how="left")
    .select("movieId", "userId", "title", "tag", "timestamp")
)

opinions.show()

+-------+------+--------------------+----------------+----------+
|movieId|userId|               title|             tag| timestamp|
+-------+------+--------------------+----------------+----------+
|      1|   567|    Toy Story (1995)|             fun|1525286013|
|      1|   474|    Toy Story (1995)|           pixar|1137206825|
|      1|   336|    Toy Story (1995)|           pixar|1139045764|
|      2|   474|      Jumanji (1995)|            game|1137375552|
|      2|    62|      Jumanji (1995)|  Robin Williams|1528843907|
|      2|    62|      Jumanji (1995)|magic board game|1528843932|
|      2|    62|      Jumanji (1995)|         fantasy|1528843929|
|      3|   289|Grumpier Old Men ...|             old|1143424860|
|      3|   289|Grumpier Old Men ...|           moldy|1143424860|
|      4|  null|Waiting to Exhale...|            null|      null|
|      5|   474|Father of the Bri...|          remake|1137373903|
|      5|   474|Father of the Bri...|       pregnancy|1137373903|
|      6| 

In [23]:
# `ratings` also has a `timestamp` column, so rename the tag-side one first
# to keep the two distinguishable after the join.
opinions_time = opinions.withColumnRenamed("timestamp", "tag_time")

# Pair each tag with the rating the same user gave the same movie.
# Inner join: only (movieId, userId) combinations present on both sides remain.
(
    opinions_time
    .join(ratings, on=["movieId", "userId"], how="inner")
    .show()
)

+-------+------+--------------------+----------------+----------+------+----------+
|movieId|userId|               title|             tag|  tag_time|rating| timestamp|
+-------+------+--------------------+----------------+----------+------+----------+
|      1|   567|    Toy Story (1995)|             fun|1525286013|   3.5|1525286001|
|      1|   474|    Toy Story (1995)|           pixar|1137206825|   4.0| 978575760|
|      1|   336|    Toy Story (1995)|           pixar|1139045764|   4.0|1122227329|
|      2|   474|      Jumanji (1995)|            game|1137375552|   3.0|1046886814|
|      2|    62|      Jumanji (1995)|  Robin Williams|1528843907|   4.0|1528843890|
|      2|    62|      Jumanji (1995)|magic board game|1528843932|   4.0|1528843890|
|      2|    62|      Jumanji (1995)|         fantasy|1528843929|   4.0|1528843890|
|      3|   289|Grumpier Old Men ...|             old|1143424860|   2.5|1143424657|
|      3|   289|Grumpier Old Men ...|           moldy|1143424860|   2.5|1143