In [1]:
%%bash
python --version
java -version
#scala -version

Python 3.9.7


java version "1.8.0_311"
Java(TM) SE Runtime Environment (build 1.8.0_311-b11)
Java HotSpot(TM) 64-Bit Server VM (build 25.311-b11, mixed mode)


In [2]:
java.sun.com
http://java.sun.com/products/autodl/j2se
/Library/Internet Plug-Ins/JavaAppletPlugin.plugin/Contents/Home/bin/java

SyntaxError: invalid syntax (2910056770.py, line 2)

In [None]:
import pyspark

In [None]:
from pyspark import SparkContext
from pyspark.sql import SQLContext

In [None]:
sc = SparkContext()
sc

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName("Analyzing Movies Dataset") \
        .getOrCreate()

#### Load the dataset

Both datasets have been sourced from https://www.kaggle.com/rounakbanik/the-movies-dataset. <br />
The first file (ratings.csv) contains 26 million ratings from 270,000 users on all the 45,000 movies but here we are considering only 2 million ratings for this demo. <br />
The second file (movies_metadata.csv) includes information about budget, revenue, release dates, languages, production countries etc. for each movie.

In [None]:
movie_ratings = spark.read\
               .format("csv")\
               .option("sep", ",")\
               .option("header", "true")\
               .load("../datasets/ratings.csv")

In [None]:
movie_ratings.printSchema()

In [None]:
movie_metadata = spark.read\
                         .format("csv")\
                         .option("sep", ",")\
                         .option("header", "true")\
                         .load("../datasets/movies_metadata.csv")

In [None]:
movie_metadata.printSchema()

In [None]:
movie_ratings.count(),movie_metadata.count()

#### Distinct number of Id

In [None]:
movie_metadata.select('id')\
                 .distinct()\
                 .count()

#### Drop columns which are not necessary for our analysis

In [None]:
movie_metadata = movie_metadata.drop(
                        'belongs_to_collection', 
                        'genres',
                        'homepage',
                        'imdb_id',
                        'overview',
                        'poster_path',
                        'production_companies',
                        'production_countries',
                        'spoken_languages',
                        'tagline')
movie_metadata

#### View the schema of the dataframes
Though many numeric columns are currently formatted as strings, we'll adjust their types when we need to later on

In [None]:
movie_metadata.printSchema()

#### Drop the invalid data from the dataframes

In [None]:
movie_metadata = movie_metadata.dropna()
movie_ratings = movie_ratings.dropna()

#### The size of movie_metadata has reduced

In [None]:
movie_ratings.count(),movie_metadata.count()

#### Cast the numeric columns as float
These are currently strings

In [None]:
for col_name in ["budget", "revenue", "vote_average", "popularity"]:
    movie_metadata = movie_metadata.withColumn(col_name, 
                                               movie_metadata[col_name].cast("float"))
    
movie_metadata.printSchema()

#### We are interested in positive budget and revenue values

In [None]:
movie_metadata = movie_metadata[(movie_metadata.budget > 0) \
                                & (movie_metadata.revenue > 0)]

In [None]:
movie_metadata.count()

In [None]:
movie_metadata.select("budget", "revenue").show()

In [None]:
movie_metadata = movie_metadata.withColumn("profit",
                                           (movie_metadata["revenue"] - movie_metadata["budget"]))

In [None]:
movie_metadata.select("profit").sort(movie_metadata["profit"].desc())\
                               .show()

In [None]:
movie_metadata.printSchema()

In [None]:
from pyspark.sql.functions import udf

#### We are spliting the date column and from here we will take only year and save it in another column.

In [None]:
year_extract_udf = udf(lambda date: date.split('-')[0])

movie_metadata = movie_metadata.withColumn(
                    "release_year",
                    year_extract_udf(movie_metadata.release_date)
)

#### Drop the date column

In [None]:
movie_metadata =movie_metadata.drop("release_date")

In [None]:
movie_metadata.s

#### Filter the dataset to movies released since 1995 

In [None]:
movies_1995 = movie_metadata.filter(movie_metadata.release_year >= 1995)

In [None]:
movies_1995.count()

#### Check the number of distinct languages in this filtered dataset

In [None]:
movies_1995.select(movies_1995.original_language)\
       .distinct()\
       .count()

#### Check average popularity and profits for movies in each language

In [None]:
movies_1995 = movies_1995.groupBy("original_language")\
                         .agg({"popularity": "avg",
                               "profit": "avg"
                              })

In [None]:
movies_1995 =movies_1995.withColumnRenamed("avg(popularity)", "popularity")\
                        .withColumnRenamed("avg(profit)", "profit")

#### How popular and profitable are movies in each language?
* English, German, Spanish films are popular and profitable
* Vietnamese films are neither profitable nor popular
* Romanian films are mildly popular but unprofitable

In [None]:
movies_1995.show()

#### Define your own metric to calculate how "good" a movie is
We scale the popularity and profitabiltiy values

In [None]:
weight_popularity = 10
weight_profit=0.000001

In [None]:
movies = movies_1995.withColumn("score",
                                    (movies_1995.popularity * weight_popularity + \
                                     movies_1995.profit * weight_profit))

In [None]:
movies.show()

#### We are interested in movies with a score greater than 50
English, Japanese and Chinese movies top our list

In [None]:
movies = movies.filter(movies.score > 50)\
                   .sort(movies.score.desc())
    
movies.show(15)

In [None]:
from pyspark.sql.functions import broadcast

In [None]:
movie_ratings.count(),movie_metadata.count()

### Broadcast & Join

* Broadcast the smaller dataframe so it is available on all cluster machines
* The data should be small enough so it is held in memory
* All nodes in the cluster distribute the data as fast as they can so overall computation is faster

In [None]:
movies_joined = movie_ratings.select('movieId',
                                     'rating')\
                             .join(broadcast(movie_metadata),
                                   movie_ratings.movieId == movie_metadata.id, 'inner')

In [None]:
movies_joined.columns

In [None]:
movies_joined = movies_joined.drop(
                                     'adult',
                                     'id',
                                     'budget',
                                     'original_title',
                                     'popularity',
                                     'revenue',
                                     'runtime',
                                     'status',
                                     'video',
                                     'vote_average',
                                     'vote_count',
                                     'profit',
                                     'release_year')
movies_joined

In [None]:
movies_joined.show(truncate = False)

In [None]:
movies_joined.groupBy(["movieID", "title"])\
             .agg({"rating": "avg"}).show()

In [None]:
movies_joined.select("rating").describe().show()

### Accumulators
* Shared variables which are updated by processes running across multiple nodes

In [None]:
terrible_count = spark.sparkContext.accumulator(0)
subpar_count = spark.sparkContext.accumulator(0)
average_count = spark.sparkContext.accumulator(0)
good_count = spark.sparkContext.accumulator(0)

#### Get movie count by ratings

In [None]:
def count_movie_by_rating(row):
    rating = float(row.rating)
    
    if (rating <= 2.0 ):
        terrible_count.add(1)
    elif (rating <= 3.0 and rating > 2.0 ):
        subpar_count.add(1)
    elif (rating <= 4.0 and rating > 3.0 ):
        average_count.add(1)
    elif (rating > 4.0) :
        good_count.add(1)

In [None]:
movies_joined.foreach(lambda x: count_movie_by_rating(x))

In [None]:
print("Terrible movies: ", terrible_count.value)
print("Sub-par movies: ", subpar_count.value)
print("Average movies: ", average_count.value)
print("Good movies: ", good_count.value)

### Save the dataframe to a file

#### Save to a CSV file
Saves the data into a directory. You could also give a path in HDFS

In [None]:
movies_1995.write\
           .option("header","true")\
           .csv("movies_1995_csv")

#### Coalesce data into one file
Data from all partitions will coalesce into a single list

In [None]:
movies_1995.coalesce(1)\
           .write\
           .option("header","true")\
           .csv("movies_1995_coalesce_csv")

#### Save data as a JSON file

In [None]:
movies_1995.coalesce(1)\
           .write\
           .json("movie_overall_json")