In [0]:
# 1. Load the raw IMDb files
basics = spark.read.option("sep", "\t").option("header", True).csv("/FileStore/tables/title_basics.tsv")
ratings = spark.read.option("sep", "\t").option("header", True).csv("/FileStore/tables/title_ratings.tsv")

# 2. Keep only full-length movies and drop rows with missing data
from pyspark.sql.functions import col

movies_only = basics.filter(col("titleType") == "movie") \
                    .dropna(subset=["primaryTitle", "startYear", "genres"])

# 3. Join with ratings and cast rating columns to proper types
imdb_clean = movies_only.join(ratings, on="tconst", how="inner") \
    .withColumn("averageRating", col("averageRating").cast("float")) \
    .withColumn("numVotes", col("numVotes").cast("int"))

# 4. Optional: Preview the cleaned data
imdb_clean.select("primaryTitle", "startYear", "genres", "averageRating", "numVotes").show(10, truncate=False)



+-------------------------------------------+---------+------+-------------+--------+
|primaryTitle                               |startYear|genres|averageRating|numVotes|
+-------------------------------------------+---------+------+-------------+--------+
|Hamlet                                     |1908     |Drama |3.2          |33      |
|Don Quijote                                |1908     |Drama |4.3          |23      |
|Faldgruben                                 |1909     |\N    |4.5          |19      |
|Locura de amor                             |1909     |Drama |4.6          |30      |
|Hamlet                                     |1910     |Drama |3.9          |53      |
|Orientalsk dans                            |1910     |\N    |4.0          |13      |
|Captain Starlight, or Gentleman of the Road|1911     |\N    |4.6          |16      |
|The Infant at Snakeville                   |1911     |\N    |5.0          |20      |
|Les Misérables, Part 1: Jean Valjean       |1913     

In [0]:
imdb_clean.write.mode("overwrite").option("header", True) \
    .csv("/FileStore/moviedata/imdb/cleaned_imdb_data")


In [0]:

# Rename  part file to a clean name
dbutils.fs.cp(
  "dbfs:/FileStore/moviedata/imdb/cleaned_imdb_data/part-00000-tid-5116392208614131044-d2bdc5de-c2cf-4ff6-a851-55ddd9ae2a2e-304-1-c000.csv",
  "dbfs:/FileStore/cleaned_imdb.csv"
)



Out[53]: True