In [4]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("DataWrangling")
    .getOrCreate()
)

# Location of the movies CSV, relative to the notebook's working directory.
MOVIES_CSV = "../../../data-sets/ml-latest-small/movies.csv"

# The first line of the file is a header row; column types are inferred by Spark
# rather than declared here.
df = spark.read.options(header=True, inferSchema=True).csv(MOVIES_CSV)

# Preview ten rows with full-width cells, then print the inferred schema.
df.show(n=10, truncate=False)
df.printSchema()

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
|6      |Heat (1995)                       |Action|Crime|Thriller                      |
|7      |Sabrina (1995)                    |Comedy|Romance                             |
|8      |Tom and Huck (1995)               |Adventure|Children                         |
|9      |Sudden Death

In [5]:
# Equality test on the whole `genres` string: only rows whose value is exactly
# "Action" pass; combined values such as "Action|Crime|Thriller" do not.
is_action_only = f.col("genres") == "Action"
df.filter(is_action_only).show(n=10, truncate=False)

+-------+-----------------------------------------------------------+------+
|movieId|title                                                      |genres|
+-------+-----------------------------------------------------------+------+
|9      |Sudden Death (1995)                                        |Action|
|71     |Fair Game (1995)                                           |Action|
|204    |Under Siege 2: Dark Territory (1995)                       |Action|
|251    |Hunted, The (1995)                                         |Action|
|667    |Bloodsport 2 (a.k.a. Bloodsport II: The Next Kumite) (1996)|Action|
|1170   |Best of the Best 3: No Turning Back (1995)                 |Action|
|1497   |Double Team (1997)                                         |Action|
|1599   |Steel (1997)                                               |Action|
|2196   |Knock Off (1998)                                           |Action|
|2534   |Avalanche (1978)                                           |Action|

In [7]:
# Reshape the pipe-delimited `genres` string into one row per (movie, genre).
#
# split() takes a regular expression, so the pipe has to be escaped. The pattern
# is written as a raw string because a plain "\|" is an invalid Python escape
# sequence, which modern interpreters warn about.
#
# The guard keeps this cell safe to re-run: the select() below drops `genres`,
# so a second execution against the already-reshaped frame would otherwise fail
# on the missing column.
if "genres" in df.columns:
    df = (
        df.withColumn("genre", f.explode(f.split("genres", r"\|")))
        .select("movieId", "title", "genre")
    )

df.show(10, truncate=False)

+-------+-----------------------+---------+
|movieId|title                  |genre    |
+-------+-----------------------+---------+
|1      |Toy Story (1995)       |Adventure|
|1      |Toy Story (1995)       |Animation|
|1      |Toy Story (1995)       |Children |
|1      |Toy Story (1995)       |Comedy   |
|1      |Toy Story (1995)       |Fantasy  |
|2      |Jumanji (1995)         |Adventure|
|2      |Jumanji (1995)         |Children |
|2      |Jumanji (1995)         |Fantasy  |
|3      |Grumpier Old Men (1995)|Comedy   |
|3      |Grumpier Old Men (1995)|Romance  |
+-------+-----------------------+---------+
only showing top 10 rows



In [11]:
# Unique values of the exploded `genre` column.
unique_genres = df.select("genre").distinct()

# Show up to 35 of them, then print how many there are in total.
unique_genres.show(35)
print(unique_genres.count())

+------------------+
|             genre|
+------------------+
|             Crime|
|           Romance|
|          Thriller|
|         Adventure|
|             Drama|
|               War|
|       Documentary|
|           Fantasy|
|           Mystery|
|           Musical|
|         Animation|
|         Film-Noir|
|(no genres listed)|
|              IMAX|
|            Horror|
|           Western|
|            Comedy|
|          Children|
|            Action|
|            Sci-Fi|
+------------------+

20


In [14]:
# Rows whose genre is the literal placeholder "(no genres listed)".
no_genre_rows = df.filter(f.col("genre") == "(no genres listed)")

# Display them (default show() limits and truncation), then print the row count.
no_genre_rows.show()
print(no_genre_rows.count())

+-------+--------------------+------------------+
|movieId|               title|             genre|
+-------+--------------------+------------------+
| 114335|   La cravate (1957)|(no genres listed)|
| 122888|      Ben-hur (2016)|(no genres listed)|
| 122896|Pirates of the Ca...|(no genres listed)|
| 129250|   Superfast! (2015)|(no genres listed)|
| 132084| Let It Be Me (1995)|(no genres listed)|
| 134861|Trevor Noah: Afri...|(no genres listed)|
| 141131|    Guardians (2016)|(no genres listed)|
| 141866|   Green Room (2015)|(no genres listed)|
| 142456|The Brand New Tes...|(no genres listed)|
| 143410|          Hyena Road|(no genres listed)|
| 147250|The Adventures of...|(no genres listed)|
| 149330|A Cosmic Christma...|(no genres listed)|
| 152037|  Grease Live (2016)|(no genres listed)|
| 155589|Noin 7 veljestä (...|(no genres listed)|
| 156605|            Paterson|(no genres listed)|
| 159161|Ali Wong: Baby Co...|(no genres listed)|
| 159779|A Midsummer Night...|(no genres listed)|
