
**Load data**

In [None]:
# Session setup.
# On Databricks a SparkSession already exists as `spark`.
# When running locally, uncomment the lines below and adjust as needed:
#
# from pyspark.sql import SparkSession
#
# spark = (
#     SparkSession.builder
#     .appName("Movies101")
#     .getOrCreate()
# )

# Bare expression: evaluating `spark` as the last line shows the session in
# the cell output, which confirms the name is defined before it is used.
spark

In [None]:
# Load the movies table; the name is qualified with its catalog and schema.
df = spark.table("workspace.default.movies")

# Preview five rows. Long values are shortened with "..." in this view;
# pass truncate=False to print full cell contents instead.
df.show(n=5)

+--------------------+---------+------------+-----------+--------------------+-------+--------+---------+--------+--------+
|               title| industry|release_year|imdb_rating|              studio| budget| revenue|     unit|currency|language|
+--------------------+---------+------------+-----------+--------------------+-------+--------+---------+--------+--------+
|     Pather Panchali|Bollywood|        1955|        8.3|Government of Wes...|70000.0|100000.0|Thousands|     INR| Bengali|
|Doctor Strange in...|Hollywood|        2022|          7|      Marvel Studios|  200.0|   954.8| Millions|     USD| English|
|Thor: The Dark Wo...|Hollywood|        2013|        6.8|      Marvel Studios|  165.0|   644.8| Millions|     USD| English|
|     Thor: Ragnarok |Hollywood|        2017|        7.9|      Marvel Studios|  180.0|   854.0| Millions|     USD| English|
|Thor: Love and Th...|Hollywood|        2022|        6.8|      Marvel Studios|  250.0|   670.0| Millions|     USD| English|
+-------

In [None]:
# Render the DataFrame as a rich table.
# NOTE(review): `display` is not imported anywhere in this notebook; presumably
# it is the built-in supplied by the notebook runtime — confirm when running elsewhere.
display(df)

title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
Pather Panchali,Bollywood,1955,8.3,Government of West Bengal,70000.0,100000.0,Thousands,INR,Bengali
Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English
Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English
Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English
Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English
The Shawshank Redemption,Hollywood,1994,9.3,Castle Rock Entertainment,25.0,73.3,Millions,USD,English
Interstellar,Hollywood,2014,8.6,Warner Bros. Pictures,165.0,701.8,Millions,USD,English
The Pursuit of Happyness,Hollywood,2006,8.0,Columbia Pictures,55.0,307.1,Millions,USD,English
Gladiator,Hollywood,2000,8.5,Universal Pictures,103.0,460.5,Millions,USD,English
Titanic,Hollywood,1997,7.9,Paramount Pictures,200.0,2202.0,Millions,USD,English


In [None]:
# Print the schema (column name, type, nullability) of the DataFrame.
# NOTE(review): the printed schema lists imdb_rating as string rather than a
# numeric type — consider casting it before sorting, comparing or aggregating.
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- release_year: long (nullable = true)
 |-- imdb_rating: string (nullable = true)
 |-- studio: string (nullable = true)
 |-- budget: double (nullable = true)
 |-- revenue: double (nullable = true)
 |-- unit: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- language: string (nullable = true)



In [None]:
# Gather the column names and the total number of rows, then report both.
column_names = df.columns
row_count = df.count()

print("Columns:", column_names)
print("Row Count:", row_count)

Columns: ['title', 'industry', 'release_year', 'imdb_rating', 'studio', 'budget', 'revenue', 'unit', 'currency', 'language']
Row Count: 37


### DataFrame Statistics

In [None]:
# describe() reports count, mean, stddev, min and max for every column,
# string columns included (mean/stddev are blank for columns such as title
# or studio). It does not report quartiles.
# NOTE(review): imdb_rating is typed as string and its max is blank in the
# output, which suggests at least one non-numeric value — verify and clean
# the column before relying on these statistics.
display(df.describe())


summary,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
count,37,37,37.0,37.0,34,37.0,37.0,37,37,37
mean,,,2007.027027027027,7.919444444444445,,2084.9751351351347,4117.135135135136,,,
stddev,,,17.657995492263687,1.2049468143436146,,11477.487145324878,16372.462681608891,,,
min,3 Idiots,Bollywood,1946.0,1.9,20th Century Fox,1.0,3.1,Billions,INR,Bengali
max,Titanic,Hollywood,2022.0,,Zee Studios,70000.0,100000.0,Thousands,USD,Telugu


In [None]:
# summary() reports the same statistics as describe() (count, mean, stddev,
# min, max) plus the 25%, 50% and 75% percentiles. The output does not include
# variance, skewness or kurtosis.
display(df.summary())

summary,title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language
count,37,37,37.0,37.0,34,37.0,37.0,37,37,37
mean,,,2007.027027027027,7.919444444444445,,2084.9751351351347,4117.135135135136,,,
stddev,,,17.657995492263687,1.2049468143436146,,11477.487145324878,16372.462681608891,,,
min,3 Idiots,Bollywood,1946.0,1.9,20th Century Fox,1.0,3.1,Billions,INR,Bengali
25%,,,2001.0,7.8,,15.5,263.1,,,
50%,,,2014.0,8.1,,165.0,701.8,,,
75%,,,2018.0,8.4,,250.0,2000.0,,,
max,Titanic,Hollywood,2022.0,,Zee Studios,70000.0,100000.0,Thousands,USD,Telugu


### Column Filtering 

**==> Print only movie name, rating and industry**

In [None]:
# Project the DataFrame down to the three columns of interest.
columns_to_keep = ["title", "imdb_rating", "industry"]
df_trimmed = df.select(*columns_to_keep)

# Print three rows with full, untruncated cell values.
df_trimmed.show(n=3, truncate=False)

+-------------------------------------------+-----------+---------+
|title                                      |imdb_rating|industry |
+-------------------------------------------+-----------+---------+
|Pather Panchali                            |8.3        |Bollywood|
|Doctor Strange in the Multiverse of Madness|7          |Hollywood|
|Thor: The Dark World                       |6.8        |Hollywood|
+-------------------------------------------+-----------+---------+
only showing top 3 rows


### Row Filtering 

**==> Print movies released between 2000 and 2010 (inclusive)**

In [None]:
from pyspark.sql.functions import col

# Keep only movies released from 2000 through 2010; both bounds are inclusive.
year_col = col("release_year")
df_filtered = df.filter((year_col >= 2000) & (year_col <= 2010))

# Print the filtered rows.
df_filtered.show()

+--------------------+---------+------------+-----------+--------------------+------+-------+--------+--------+--------+
|               title| industry|release_year|imdb_rating|              studio|budget|revenue|    unit|currency|language|
+--------------------+---------+------------+-----------+--------------------+------+-------+--------+--------+--------+
|The Pursuit of Ha...|Hollywood|        2006|          8|   Columbia Pictures|  55.0|  307.1|Millions|     USD| English|
|           Gladiator|Hollywood|        2000|        8.5|Universal Pictures  | 103.0|  460.5|Millions|     USD| English|
|              Avatar|Hollywood|        2009|        7.8|    20th Century Fox| 237.0| 2847.0|Millions|     USD| English|
|     The Dark Knight|Hollywood|        2008|          9|             Syncopy| 185.0| 1006.0|Millions|     USD| English|
|            3 Idiots|Bollywood|        2009|        8.4|  Vinod Chopra Films| 550.0| 4000.0|Millions|     INR|   Hindi|
|Kabhi Khushi Kabh...|Bollywood|

In [None]:
# Same 2000-2010 year filter as the previous cell, written with Column.between
# instead of two explicit comparisons; the printed rows match.
released_2000_to_2010 = col("release_year").between(2000, 2010)
df_filtered = df.filter(released_2000_to_2010)

df_filtered.show()

+--------------------+---------+------------+-----------+--------------------+------+-------+--------+--------+--------+
|               title| industry|release_year|imdb_rating|              studio|budget|revenue|    unit|currency|language|
+--------------------+---------+------------+-----------+--------------------+------+-------+--------+--------+--------+
|The Pursuit of Ha...|Hollywood|        2006|          8|   Columbia Pictures|  55.0|  307.1|Millions|     USD| English|
|           Gladiator|Hollywood|        2000|        8.5|Universal Pictures  | 103.0|  460.5|Millions|     USD| English|
|              Avatar|Hollywood|        2009|        7.8|    20th Century Fox| 237.0| 2847.0|Millions|     USD| English|
|     The Dark Knight|Hollywood|        2008|          9|             Syncopy| 185.0| 1006.0|Millions|     USD| English|
|            3 Idiots|Bollywood|        2009|        8.4|  Vinod Chopra Films| 550.0| 4000.0|Millions|     INR|   Hindi|
|Kabhi Khushi Kabh...|Bollywood|

**==> Print movies from Marvel Studios**

In [None]:
from pyspark.sql.functions import col, trim

# Select the Marvel Studios titles.
# The studio value is trimmed before comparing because the data carries stray
# trailing whitespace in places (e.g. "Universal Pictures  " and
# "Thor: Ragnarok " in the outputs of earlier cells); an exact string match
# would silently skip any "Marvel Studios " row padded the same way.
# Rows whose studio is null are still excluded, as before.
df_marvel = df.filter(trim(col("studio")) == "Marvel Studios")

df_marvel.show()

+--------------------+---------+------------+-----------+--------------+------+-------+--------+--------+--------+
|               title| industry|release_year|imdb_rating|        studio|budget|revenue|    unit|currency|language|
+--------------------+---------+------------+-----------+--------------+------+-------+--------+--------+--------+
|Doctor Strange in...|Hollywood|        2022|          7|Marvel Studios| 200.0|  954.8|Millions|     USD| English|
|Thor: The Dark Wo...|Hollywood|        2013|        6.8|Marvel Studios| 165.0|  644.8|Millions|     USD| English|
|     Thor: Ragnarok |Hollywood|        2017|        7.9|Marvel Studios| 180.0|  854.0|Millions|     USD| English|
|Thor: Love and Th...|Hollywood|        2022|        6.8|Marvel Studios| 250.0|  670.0|Millions|     USD| English|
|   Avengers: Endgame|Hollywood|        2019|        8.4|Marvel Studios| 400.0| 2798.0|Millions|     USD| English|
|Avengers: Infinit...|Hollywood|        2018|        8.4|Marvel Studios| 400.0| 

**How many distinct movie industries are there in our dataset?**

In [None]:
# List each distinct value found in the 'industry' column (one row per value).
industry_column = df.select("industry")
unique_industries = industry_column.distinct()
display(unique_industries)

industry
Bollywood
Hollywood


In [None]:
# List each distinct value found in the 'language' column (one row per value).
language_column = df.select("language")
unique_language = language_column.distinct()
display(unique_language)

language
Hindi
Kannada
English
Bengali
Telugu


### Add / Modify Columns

In [None]:
# Derive a 'profit' column as revenue minus budget, computed row by row.
# NOTE(review): amounts are stored in each row's own `unit` and `currency`
# (e.g. Thousands/INR vs Millions/USD), so profit values are not directly
# comparable across rows without normalising them first.
profit_expr = col("revenue") - col("budget")
df = df.withColumn("profit", profit_expr)
display(df)

title,industry,release_year,imdb_rating,studio,budget,revenue,unit,currency,language,profit
Pather Panchali,Bollywood,1955,8.3,Government of West Bengal,70000.0,100000.0,Thousands,INR,Bengali,30000.0
Doctor Strange in the Multiverse of Madness,Hollywood,2022,7.0,Marvel Studios,200.0,954.8,Millions,USD,English,754.8
Thor: The Dark World,Hollywood,2013,6.8,Marvel Studios,165.0,644.8,Millions,USD,English,479.8
Thor: Ragnarok,Hollywood,2017,7.9,Marvel Studios,180.0,854.0,Millions,USD,English,674.0
Thor: Love and Thunder,Hollywood,2022,6.8,Marvel Studios,250.0,670.0,Millions,USD,English,420.0
The Shawshank Redemption,Hollywood,1994,9.3,Castle Rock Entertainment,25.0,73.3,Millions,USD,English,48.3
Interstellar,Hollywood,2014,8.6,Warner Bros. Pictures,165.0,701.8,Millions,USD,English,536.8
The Pursuit of Happyness,Hollywood,2006,8.0,Columbia Pictures,55.0,307.1,Millions,USD,English,252.1
Gladiator,Hollywood,2000,8.5,Universal Pictures,103.0,460.5,Millions,USD,English,357.5
Titanic,Hollywood,1997,7.9,Paramount Pictures,200.0,2202.0,Millions,USD,English,2002.0


In [None]:
# Rename a column

# Rename 'revenue' to 'total_revenue'.
# NOTE(review): this rebinds `df`, so from here on the frame has no 'revenue'
# column; re-running the earlier profit cell (which reads col("revenue"))
# after this one would presumably fail — confirm, or bind to a new name.
df = df.withColumnRenamed("revenue", "total_revenue")

# Verify the change by printing the updated schema.
df.printSchema()

root
 |-- title: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- release_year: long (nullable = true)
 |-- imdb_rating: string (nullable = true)
 |-- studio: string (nullable = true)
 |-- budget: double (nullable = true)
 |-- total_revenue: double (nullable = true)
 |-- unit: string (nullable = true)
 |-- currency: string (nullable = true)
 |-- language: string (nullable = true)
 |-- profit: double (nullable = true)

