In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from datetime import datetime
from pyspark.sql.functions import from_unixtime

# Create Spark session
spark = SparkSession.builder \
    .appName("Spark with Hive") \
    .enableHiveSupport() \
    .getOrCreate()

23/12/04 03:42:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [2]:
# Reading movies data

hdfs_path = '/tmp/spark_movie/movies.csv'
df_movies = spark.read.format('csv').option('header', 'true').option('inferSchema', 'true').load(hdfs_path)

# Print schema and sample data
df_movies.printSchema()
df_movies.show(5)

                                                                                

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [3]:
# Define the correct schema based on your CSV structure
schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("rating", FloatType(), True),
    StructField("timestamp",IntegerType(), True),
])
hdfs_path = '/tmp/spark_movie/ratings.csv'
# Read the CSV file into a DataFrame
df_ratings = spark.read.format('csv').option('header', 'true').option('inferSchema', 'false').schema(schema).load(hdfs_path)

# Convert timestamp to TimestampType
df_ratings = df_ratings.withColumn("timestamp", from_unixtime("timestamp").cast(TimestampType()))

# Show the DataFrame
df_ratings.show()

[Stage 3:>                                                          (0 + 1) / 1]

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|      1|   4.0|2000-07-30 18:45:03|
|     1|      3|   4.0|2000-07-30 18:20:47|
|     1|      6|   4.0|2000-07-30 18:37:04|
|     1|     47|   5.0|2000-07-30 19:03:35|
|     1|     50|   5.0|2000-07-30 18:48:51|
|     1|     70|   3.0|2000-07-30 18:40:00|
|     1|    101|   5.0|2000-07-30 18:14:28|
|     1|    110|   4.0|2000-07-30 18:36:16|
|     1|    151|   5.0|2000-07-30 19:07:21|
|     1|    157|   5.0|2000-07-30 19:08:20|
|     1|    163|   5.0|2000-07-30 19:00:50|
|     1|    216|   5.0|2000-07-30 18:20:08|
|     1|    223|   3.0|2000-07-30 18:16:25|
|     1|    231|   5.0|2000-07-30 18:19:39|
|     1|    235|   4.0|2000-07-30 18:15:08|
|     1|    260|   5.0|2000-07-30 18:28:00|
|     1|    296|   3.0|2000-07-30 18:49:27|
|     1|    316|   3.0|2000-07-30 18:38:30|
|     1|    333|   5.0|2000-07-30 18:19:39|
|     1|    349|   4.0|2000-07-3

                                                                                

In [4]:
# Define the correct schema based on your CSV structure
schema = StructType([
    StructField("userId", IntegerType(), True),
    StructField("movieId", IntegerType(), True),
    StructField("tag", StringType(), True),
    StructField("timestamp",IntegerType(), True),
])

hdfs_path = '/tmp/spark_movie/tags.csv'
# Read the CSV file into a DataFrame
df_tags = spark.read.format('csv').option('header', 'true').option('inferSchema', 'false').schema(schema).load(hdfs_path)

# Convert timestamp to TimestampType
df_tags = df_tags.withColumn("timestamp", from_unixtime("timestamp").cast(TimestampType()))

# Show the DataFrame
df_tags.show()

+------+-------+-----------------+-------------------+
|userId|movieId|              tag|          timestamp|
+------+-------+-----------------+-------------------+
|     2|  60756|            funny|2015-10-24 19:29:54|
|     2|  60756|  Highly quotable|2015-10-24 19:29:56|
|     2|  60756|     will ferrell|2015-10-24 19:29:52|
|     2|  89774|     Boxing story|2015-10-24 19:33:27|
|     2|  89774|              MMA|2015-10-24 19:33:20|
|     2|  89774|        Tom Hardy|2015-10-24 19:33:25|
|     2| 106782|            drugs|2015-10-24 19:30:54|
|     2| 106782|Leonardo DiCaprio|2015-10-24 19:30:51|
|     2| 106782|  Martin Scorsese|2015-10-24 19:30:56|
|     7|  48516|     way too long|2007-01-25 01:08:45|
|    18|    431|        Al Pacino|2016-05-01 21:39:25|
|    18|    431|         gangster|2016-05-01 21:39:09|
|    18|    431|            mafia|2016-05-01 21:39:15|
|    18|   1221|        Al Pacino|2016-04-26 19:35:06|
|    18|   1221|            Mafia|2016-04-26 19:35:03|
|    18|  

In [5]:
# Work with spark SQL

df_movies.createOrReplaceTempView("MOVIES")
df_ratings.createOrReplaceTempView("RATINGS")
df_tags.createOrReplaceTempView("TAGS")

In [6]:
# Aggregated number of ratings per year

query= """Select year(timestamp) as year,count(rating) as ratings 
       from RATINGS 
       group by 1 
        order by year(timestamp) desc"""


output = spark.sql(query)
output.show()

# Write data in HDFS into single file

# output.coalesce(1).write.format('csv').option('header', 'true').option('delimiter', ',').save('/tmp/output_data/spark_movie/')
output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/agg_Ratings.csv')
print("Write Successfull")

                                                                                

+----+-------+
|year|ratings|
+----+-------+
|2018|   6418|
|2017|   8198|
|2016|   6703|
|2015|   6616|
|2014|   1439|
|2013|   1664|
|2012|   4656|
|2011|   1690|
|2010|   2301|
|2009|   4158|
|2008|   4351|
|2007|   7114|
|2006|   4059|
|2005|   5813|
|2004|   3279|
|2003|   4014|
|2002|   3478|
|2001|   3922|
|2000|  10061|
|1999|   2439|
+----+-------+
only showing top 20 rows



                                                                                

Write Successfull


In [7]:
# Average Monthly number of ratings

query= """Select left(timestamp,7) as year_month,avg(rating) as avg_rating
       from RATINGS 
       group by 1 
        order by  left(timestamp,7) desc"""


output = spark.sql(query)
output.show()

# Write data in HDFS into single file

# output.coalesce(1).write.format('csv').option('header', 'true').option('delimiter', ',').save('/tmp/output_data/spark_movie/')
output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/avg_monthly_Ratings.csv')
print("Write Successfull")

                                                                                

+----------+------------------+
|year_month|        avg_rating|
+----------+------------------+
|   2018-09| 3.568708609271523|
|   2018-08|3.5577617328519855|
|   2018-07| 4.010238907849829|
|   2018-06| 3.979713603818616|
|   2018-05|2.9516298633017874|
|   2018-04|              3.75|
|   2018-03| 3.786817713697219|
|   2018-02|2.7386655260906756|
|   2018-01|3.4194736842105264|
|   2017-12|3.2611940298507465|
|   2017-11| 3.652173913043478|
|   2017-10|3.5244444444444443|
|   2017-09|3.6827830188679247|
|   2017-08| 4.076923076923077|
|   2017-07| 4.052941176470588|
|   2017-06|2.9594240837696337|
|   2017-05| 3.480183562786817|
|   2017-04| 3.626218851570964|
|   2017-03| 3.051001821493625|
|   2017-02|2.7547619047619047|
+----------+------------------+
only showing top 20 rows



                                                                                

Write Successfull


                                                                                

In [8]:
# Ratings Level Distribution

query= """with t1 as (
    Select rating,case when rating between 0 and 2 THEN '0.0-2.0'
    WHEN rating between 2.3 and 4 THEN '2.5-4.0'
    ELSE '>4' END as rating_bucket
    from RATINGS),
    
    t2 as (select rating_bucket,count(rating) as counts
    from t1
    group by 1
    order by 1)
    
    Select rating_bucket,counts,counts*100/sum(counts)over() as percentage
    from t2"""

# Select rating_bucket,count(*) as counts
# from t1
# group by 1
# order by 1"""


output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/distribution_ratings.csv')
print("Write Successfull")

23/12/04 03:43:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:09 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:10 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 0

+-------------+------+------------------+
|rating_bucket|counts|        percentage|
+-------------+------+------------------+
|      0.0-2.0| 13523|13.410885001388394|
|      2.5-4.0| 65551| 65.00753699075727|
|           >4| 21762|21.581578007854336|
+-------------+------+------------------+



23/12/04 03:43:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:12 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:13 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 0

Write Successfull


In [9]:
# Movies Tagged but not Rated

query= """
          with t1 as (Select distinct 
          t.movieID from TAGS as t
          left join RATINGS as r
          on t.movieID=r.movieID
          where r.movieID IS NULL)
          
          Select m.title 
          from MOVIES as m
          inner join t1
          on m.movieID=t1.movieID
          order by 1"""
          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/tagged_not_rated.csv')
print("Write Successfull")

+--------------------+
|               title|
+--------------------+
|Browning Version,...|
|Call Northside 77...|
|  Chalet Girl (2011)|
|  Chosen, The (1981)|
|Color of Paradise...|
|For All Mankind (...|
|I Know Where I'm ...|
|In the Realms of ...|
|Innocents, The (1...|
|Mutiny on the Bou...|
|      Niagara (1953)|
|Parallax View, Th...|
|        Proof (1991)|
|Road Home, The (W...|
|Roaring Twenties,...|
|      Scrooge (1970)|
|This Gun for Hire...|
|Twentieth Century...|
+--------------------+

Write Successfull


In [10]:
# Movies Rated but not Tagged

query= """
          with t1 as (Select distinct 
          r.movieID from RATINGS as r
          left join TAGS as t
          on t.movieID=r.movieID
          where t.movieID IS NULL)
          
          Select m.title 
          from MOVIES as m
          inner join t1
          on m.movieID=t1.movieID
          order by 1"""
          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/rated_not_tagged.csv')
print("Write Successfull")

                                                                                

+--------------------+
|               title|
+--------------------+
|          '71 (2014)|
|'Hellboy': The Se...|
|'Round Midnight (...|
| 'Salem's Lot (2004)|
|'Til There Was Yo...|
|'Tis the Season f...|
|  'burbs, The (1989)|
|'night Mother (1986)|
|*batteries not in...|
|...All the Marble...|
|00 Schneider - Ja...|
|   1-900 (06) (1994)|
|           10 (1979)|
|10 Cent Pistol (2...|
|10 Items or Less ...|
|     10 Years (2011)|
|    10,000 BC (2008)|
|    100 Girls (2000)|
|  100 Streets (2016)|
|101 Dalmatians II...|
+--------------------+
only showing top 20 rows

Write Successfull


In [11]:
# Rated but untagged movies (With more than 30 user ratings) -- Top Movies in terms of avg rating and number of ratings

query= """ with t1 as 
            (Select movieid
                from ratings
                 group by 1
              having count(distinct userid)>30),

        t2 as (Select 
           t1.movieID from t1
           left join TAGS as t
           on t1.movieID=t.movieID
           where t.movieID IS NULL),
          
           t3 as (Select m.title,m.movieID 
           from MOVIES as m
          inner join t2
           on m.movieID=t2.movieID
           order by 1),
           
           t4 as (Select t3.title,avg(r.rating) as avg_rating,
           dense_rank()over(order by avg(r.rating) desc) as avg_rank
           from t3 left join RATINGS as r
           on t3.movieID=r.movieID
           group by 1),
           
           t5 as (Select t3.title,count(rating) as counts,
           dense_rank()over(order by count(rating) desc) as count_rank
           from t3 left join RATINGS as r
           on t3.movieID=r.movieID
           group by 1)
           
           Select t4.title as Movie_title1,t4.avg_rank,Round(t4.avg_rating,4) as avg_rating,t5.title as Movie_title2,t5.count_rank,t5.counts
           from t4 inner join t5
           on t4.avg_rank=t5.count_rank
           where t4.avg_rank<=10 and t5.count_rank<=10
        
           """
          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/top_10_avgratings&count_ratings.csv')
print("Write Successfull")

23/12/04 03:43:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 0

+--------------------+--------+----------+--------------------+----------+------+
|        Movie_title1|avg_rank|avg_rating|        Movie_title2|count_rank|counts|
+--------------------+--------+----------+--------------------+----------+------+
|Boondock Saints, ...|       1|    4.2209|American Beauty (...|         1|   204|
|       Brazil (1985)|       2|     4.178|Ace Ventura: Pet ...|         2|   161|
|Cinema Paradiso (...|       3|    4.1618|    Mask, The (1994)|         3|   157|
|       Snatch (2000)|       4|    4.1559|     Die Hard (1988)|         4|   145|
|For a Few Dollars...|       5|    4.1515|Die Hard: With a ...|         5|   144|
|Lives of Others, ...|       6|    4.1176|Groundhog Day (1993)|         6|   143|
|  Toy Story 3 (2010)|       7|    4.1091|Dumb & Dumber (Du...|         7|   133|
|Boogie Nights (1997)|       8|    4.0769|Monsters, Inc. (2...|         8|   132|
|Boogie Nights (1997)|       8|    4.0769|    GoldenEye (1995)|         8|   132|
|American Beauty

23/12/04 03:43:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:43:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 0

Write Successfull


In [12]:
# Tags per movie vs Tags per User

query= """with t1 as(
          Select '1' as key, round((sum(CASE when tag IS NOT NULL THEN 1 ELSE 0 END)/count(distinct movieid)),2) as tags_per_movie
          from TAGS),
          
          t2 as ( Select '1' as key, (sum(CASE WHEN tag IS NOT NULL THEN 1 ELSE 0 END)/count(distinct userid)) as tags_per_user
          from TAGS)
          
          Select t1.tags_per_movie,t2.tags_per_user,
          CASE WHEN tags_per_user>tags_per_movie THEN 'tags_per_user is higher'
          ELSE 'tags_per_movie is higher' END as Comparison
          from t1 inner join t2 on t1.key=t2.key"""
          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/tags_per_movieVStags_per_user.csv')
print("Write Successfull")

+--------------+-------------+--------------------+
|tags_per_movie|tags_per_user|          Comparison|
+--------------+-------------+--------------------+
|          2.34|         63.5|tags_per_user is ...|
+--------------+-------------+--------------------+

Write Successfull


In [13]:
# Users that tagged but did not Rate movies

query= """
         
         Select distinct t.userid
         from TAGS as t
         left join RATINGS as r
         on t.movieID=r.movieID
         where r.userID is NULL"""
          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/users_tagged_not_rate.csv')
print("Write Successfull")

                                                                                

+------+
|userid|
+------+
|   474|
|   318|
|   543|
|   288|
+------+

Write Successfull


In [14]:
# Ratings per user versus Ratings per Movie

query= """with t1 as(
          Select '1' as key, round((SUM(CASE when rating IS NOT NULL THEN 1 ELSE 0 END)/count(distinct userid)),2) as ratings_per_user
          from RATINGS),
          
          t2 as ( Select '1' as key, round((sum(CASE WHEN rating IS NOT NULL THEN 1 ELSE 0 END)/count(distinct movieid)),2) as ratings_per_movie
          from RATINGS)
          
          Select t1.ratings_per_user,t2.ratings_per_movie
          from t1 inner join t2 on t1.key=t2.key"""


output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/ratings_per_userVSratings_per_movie.csv')
print("Write Successfull")

+----------------+-----------------+
|ratings_per_user|ratings_per_movie|
+----------------+-----------------+
|           165.3|            10.37|
+----------------+-----------------+

Write Successfull


In [15]:
# Predominant Genre per rating level

query= """with t1 as(
          Select r.rating,m.genres,count(*) as counts,
          dense_rank()over(partition by r.rating order by count(*) desc) as ranker
          from RATINGS AS r
          left join MOVIES as m
          on r.movieID=m.movieID
          group by 1,2)
          
          Select rating,genres as most_frequent_genre from t1 
          where ranker=1
          order by rating desc"""


          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/freq_genre_per_rating.csv')
print("Write Successfull")

                                                                                

+------+-------------------+
|rating|most_frequent_genre|
+------+-------------------+
|   5.0|              Drama|
|   4.5|              Drama|
|   4.0|              Drama|
|   3.5|             Comedy|
|   3.0|             Comedy|
|   2.5|             Comedy|
|   2.0|             Comedy|
|   1.5|             Comedy|
|   1.0|             Comedy|
|   0.5|             Comedy|
+------+-------------------+



                                                                                

Write Successfull


In [17]:
# Predominant tag per genre

query= """with t1 as(
          Select m.genres,t.tag,count(*) as counts,
          dense_rank()over(partition by m.genres order by count(*) desc) as ranker
          from MOVIES AS m
          left join TAGS as t
          on t.movieID=m.movieID
          group by 1,2)
          
          Select genres,tag as most_frequent_tag from t1 
          where ranker=1
          order by genres desc"""


          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/freq_tag_per_genre.csv')
print("Write Successfull")

+--------------------+-----------------+
|              genres|most_frequent_tag|
+--------------------+-----------------+
|             Western|             null|
|                 War|             null|
|            Thriller|             null|
|Sci-Fi|Thriller|IMAX|             null|
|     Sci-Fi|Thriller|             null|
|         Sci-Fi|IMAX|      time-travel|
|         Sci-Fi|IMAX|           sci-fi|
|              Sci-Fi|             null|
|     Romance|Western|             null|
|         Romance|War|        Hemingway|
|    Romance|Thriller|             null|
|Romance|Sci-Fi|Th...|         artistic|
|Romance|Sci-Fi|Th...|            artsy|
|Romance|Sci-Fi|Th...|             null|
|Romance|Sci-Fi|Th...|        dreamlike|
|Romance|Sci-Fi|Th...|      atmospheric|
|Romance|Sci-Fi|Th...|   existentialism|
|Romance|Sci-Fi|Th...|        Beautiful|
|      Romance|Sci-Fi|             null|
|             Romance|             null|
+--------------------+-----------------+
only showing top

In [18]:
# Top 10 popular movies (most users seen/rated it)

query= """with t1 as(
          Select r.movieID,m.title,count(distinct r.userID) as counts,
          dense_rank()over(order by count(distinct r.userid) desc) as ranker
          from RATINGS as r
          left join MOVIES as m
          on r.movieID=m.movieID
          group by 1,2)
          
          Select title,counts from t1 
          where ranker<=10
          """


          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/popular_movies.csv')
print("Write Successfull")

23/12/04 03:44:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:30 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 0

+--------------------+------+
|               title|counts|
+--------------------+------+
| Forrest Gump (1994)|   329|
|Shawshank Redempt...|   317|
| Pulp Fiction (1994)|   307|
|Silence of the La...|   279|
|  Matrix, The (1999)|   278|
|Star Wars: Episod...|   251|
|Jurassic Park (1993)|   238|
|   Braveheart (1995)|   237|
|Terminator 2: Jud...|   224|
|Schindler's List ...|   220|
+--------------------+------+



23/12/04 03:44:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 0

Write Successfull


                                                                                

In [19]:
# Top 10 movies in terms of avg rating (>30 users reviewed)

query= """with t1 as(
          Select movieid,avg(rating) as avg_rating,
          dense_rank()over (order by avg(rating) desc) as ranker
          from RATINGS
          group by 1
          having count(distinct userID)>30)
          
          Select m.title,round(t1.avg_rating,9) as avg_rating,t1.ranker from t1
          left join MOVIES as m
          on t1.movieID=m.movieID
          where ranker<=10
          """


          

output = spark.sql(query)
output.show()

# Write data in HDFS into single file

output.coalesce(1).write.mode("overwrite").format('csv').option('header', 'true') .option('delimiter', ',').save('/tmp/output_data/spark_movie/top_10_morethan30users.csv')
print("Write Successfull")

23/12/04 03:44:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:39 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:40 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 0

+--------------------+-----------+------+
|               title| avg_rating|ranker|
+--------------------+-----------+------+
|Shawshank Redempt...|4.429022082|     1|
|Lawrence of Arabi...|        4.3|     2|
|Godfather, The (1...|  4.2890625|     3|
|   Fight Club (1999)| 4.27293578|     4|
|Cool Hand Luke (1...|4.271929825|     5|
|Dr. Strangelove o...|4.268041237|     6|
|  Rear Window (1954)|4.261904762|     7|
|Godfather: Part I...|4.259689922|     8|
|Departed, The (2006)|4.252336449|     9|
|   Goodfellas (1990)|       4.25|    10|
+--------------------+-----------+------+



23/12/04 03:44:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:42 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 03:44:43 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
23/12/04 0

Write Successfull
