In [1]:
import findspark
findspark.init()

In [3]:
from pyspark.conf import SparkConf
config = SparkConf()
# config.set("property", "value")
config.setMaster("local").setAppName("MovieLens")

from pyspark.sql import SparkSession
spark =SparkSession.builder\
                    .config(conf=config)\
                    .getOrCreate()

sc= spark.sparkContext

In [4]:
from pyspark.sql.types import StructType, LongType, StringType, IntegerType, DoubleType

movieSchema = StructType()\
                    .add('movieId', IntegerType(), True)\
                    .add('title', StringType(), True)\
                    .add('genres', StringType(), True)

ratingSchema =  StructType()\
                    .add('userId', IntegerType(), True)\
                    .add('movieId', StringType(), True)\
                    .add('rating', StringType(), True)\
                    .add('timestamp', LongType(), True)

In [7]:
movieDf = spark.read.format('csv')\
                .option('header', True)\
                .schema(movieSchema)\
                .load('hdfs://localhost:9000/movies')

movieDf.printSchema()
movieDf.show(2)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)

+-------+----------------+--------------------+
|movieId|           title|              genres|
+-------+----------------+--------------------+
|      1|Toy Story (1995)|Adventure|Animati...|
|      2|  Jumanji (1995)|Adventure|Childre...|
+-------+----------------+--------------------+
only showing top 2 rows



In [9]:
ratingDf = spark.read.format('csv')\
                .option('header', True)\
                .schema(ratingSchema)\
                .load('hdfs://localhost:9000/ratings')

ratingDf.printSchema()
ratingDf.show(2)

root
 |-- userId: integer (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: long (nullable = true)

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|     1|      1|   4.0|964982703|
|     1|      3|   4.0|964981247|
+------+-------+------+---------+
only showing top 2 rows



In [10]:
print (movieDf.count())
print (ratingDf.count())

9742
100836


In [11]:
ratingDf.take(2)

[Row(userId=1, movieId='1', rating='4.0', timestamp=964982703),
 Row(userId=1, movieId='3', rating='4.0', timestamp=964981247)]

In [12]:
ratingDf.select('rating').distinct().show()

                                                                                

+------+
|rating|
+------+
|   1.0|
|   4.5|
|   2.5|
|   3.5|
|   5.0|
|   0.5|
|   4.0|
|   1.5|
|   2.0|
|   3.0|
+------+



In [14]:
from pyspark.sql.functions import col, desc, avg, count

df = ratingDf\
    .groupBy('movieId')\
    .agg(count('userId').alias('total_ratings'))


df.printSchema()
df.show(20)

root
 |-- movieId: string (nullable = true)
 |-- total_ratings: long (nullable = false)

+-------+-------------+
|movieId|total_ratings|
+-------+-------------+
|    296|          307|
|   1090|           63|
| 115713|           28|
|   3210|           42|
|  88140|           32|
|    829|            9|
|   2088|           18|
|   2294|           45|
|   4821|            5|
|  48738|           20|
|   3959|            8|
|  89864|           19|
|   2136|           14|
|    691|            3|
|   3606|            4|
| 121007|            1|
|   6731|            8|
|  27317|            6|
|  26082|            3|
| 100553|            2|
+-------+-------------+
only showing top 20 rows



In [16]:
from pyspark.sql.functions import col, desc, avg, count

df = ratingDf\
    .groupBy('movieId')\
    .agg(avg('rating').alias('avg_rating'))\
    .sort(desc('avg_rating'))


df.printSchema()
df.show(20)

root
 |-- movieId: string (nullable = true)
 |-- avg_rating: double (nullable = true)





+-------+----------+
|movieId|avg_rating|
+-------+----------+
| 179135|       5.0|
| 151769|       5.0|
|  69469|       5.0|
| 102217|       5.0|
|    495|       5.0|
|  47736|       5.0|
| 149350|       5.0|
| 170597|       5.0|
|   5059|       5.0|
| 136447|       5.0|
| 149508|       5.0|
| 136355|       5.0|
| 140627|       5.0|
| 147330|       5.0|
| 139640|       5.0|
|    467|       5.0|
|   6402|       5.0|
|   1349|       5.0|
|  67618|       5.0|
| 160644|       5.0|
+-------+----------+
only showing top 20 rows



                                                                                

In [24]:
from pyspark.sql.functions import col, desc, avg, count

mostPopularMoviesDf = ratingDf\
    .groupBy('movieId')\
    .agg(avg('rating').alias('avg_rating'), count('userId').alias('total_ratings'))\
    .filter ((col('total_ratings') >= 100 ) & (col('avg_rating') >=3.5))\
    .sort(desc('total_ratings'))

mostPopularMoviesDf.cache()

mostPopularMoviesDf.printSchema()
mostPopularMoviesDf.show(20)

root
 |-- movieId: string (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- total_ratings: long (nullable = false)



                                                                                

+-------+------------------+-------------+
|movieId|        avg_rating|total_ratings|
+-------+------------------+-------------+
|    356| 4.164133738601824|          329|
|    318| 4.429022082018927|          317|
|    296| 4.197068403908795|          307|
|    593| 4.161290322580645|          279|
|   2571| 4.192446043165468|          278|
|    260| 4.231075697211155|          251|
|    480|              3.75|          238|
|    110| 4.031645569620253|          237|
|    589| 3.970982142857143|          224|
|    527|             4.225|          220|
|   2959| 4.272935779816514|          218|
|      1|3.9209302325581397|          215|
|   1196|4.2156398104265405|          211|
|   2858| 4.056372549019608|          204|
|     50| 4.237745098039215|          204|
|     47|3.9753694581280787|          203|
|    150| 3.845771144278607|          201|
|   1198|            4.2075|          200|
|   4993| 4.106060606060606|          198|
|   1210| 4.137755102040816|          196|
+-------+--

In [27]:
popularMoviesDf = mostPopularMoviesDf.join(movieDf, mostPopularMoviesDf.movieId == movieDf.movieId)\
                                    .select(movieDf.movieId, 'title', 'avg_rating', 'total_ratings')\
                                    .sort (desc ('total_ratings'))

popularMoviesDf.show(100)

+-------+--------------------+------------------+-------------+
|movieId|               title|        avg_rating|total_ratings|
+-------+--------------------+------------------+-------------+
|    356| Forrest Gump (1994)| 4.164133738601824|          329|
|    318|Shawshank Redempt...| 4.429022082018927|          317|
|    296| Pulp Fiction (1994)| 4.197068403908795|          307|
|    593|Silence of the La...| 4.161290322580645|          279|
|   2571|  Matrix, The (1999)| 4.192446043165468|          278|
|    260|Star Wars: Episod...| 4.231075697211155|          251|
|    480|Jurassic Park (1993)|              3.75|          238|
|    110|   Braveheart (1995)| 4.031645569620253|          237|
|    589|Terminator 2: Jud...| 3.970982142857143|          224|
|    527|Schindler's List ...|             4.225|          220|
|   2959|   Fight Club (1999)| 4.272935779816514|          218|
|      1|    Toy Story (1995)|3.9209302325581397|          215|
|   1196|Star Wars: Episod...|4.21563981

In [28]:
popularMoviesDf.rdd.getNumPartitions()

72

In [31]:
popularMoviesDf.write.mode('overwrite')\
                .option('header', True)\
                .csv('hdfs://localhost:9000/most-popular-movies-many-files')

java.lang.InterruptedException
	at java.lang.Object.wait(Native Method)
	at java.lang.Thread.join(Thread.java:1252)
	at java.lang.Thread.join(Thread.java:1326)
	at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.closeResponder(DFSOutputStream.java:609)
	at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.endBlock(DFSOutputStream.java:370)
	at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:546)
java.lang.InterruptedException
	at java.lang.Object.wait(Native Method)
	at java.lang.Thread.join(Thread.java:1252)
	at java.lang.Thread.join(Thread.java:1326)
	at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.closeResponder(DFSOutputStream.java:609)
	at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.endBlock(DFSOutputStream.java:370)
	at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:546)
                                                                                

In [33]:
popularMoviesDf.coalesce(1).write.mode('overwrite')\
                .option('header', True)\
                .csv('hdfs://localhost:9000/most-popular-movies')

In [34]:
popularMovies = spark.read.format('csv')\
                    .option('header', True)\
                    .option('inferSchema', True)\
                    .load('hdfs://localhost:9000/most-popular-movies-many-files')

popularMovies.printSchema()
print('Partitions', popularMovies.rdd.getNumPartitions())
popularMovies.show()

                                                                                

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- total_ratings: integer (nullable = true)

Partitions 3
+-------+--------------------+------------------+-------------+
|movieId|               title|        avg_rating|total_ratings|
+-------+--------------------+------------------+-------------+
|   1206|Clockwork Orange,...| 3.995833333333333|          120|
|   2716|Ghostbusters (a.k...|             3.775|          120|
|   4973|Amelie (Fabuleux ...| 4.183333333333334|          120|
|   5445|Minority Report (...|            3.6375|          120|
|   1089|Reservoir Dogs (1...| 4.202290076335878|          131|
|   1240|Terminator, The (...|3.8969465648854964|          131|
|   6874|Kill Bill: Vol. 1...|3.9618320610687023|          131|
|   7361|Eternal Sunshine ...|4.1603053435114505|          131|
|   1208|Apocalypse Now (1...| 4.219626168224299|          107|
|   4896|Harry Potter and ...|3.7616822429906542

In [35]:
popularMovies = spark.read.format('csv')\
                    .option('header', True)\
                    .option('inferSchema', True)\
                    .load('hdfs://localhost:9000/most-popular-movies')

popularMovies.printSchema()
print('Partitions', popularMovies.rdd.getNumPartitions())
popularMovies.show()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- total_ratings: integer (nullable = true)

Partitions 1
+-------+--------------------+------------------+-------------+
|movieId|               title|        avg_rating|total_ratings|
+-------+--------------------+------------------+-------------+
|    356| Forrest Gump (1994)| 4.164133738601824|          329|
|    318|Shawshank Redempt...| 4.429022082018927|          317|
|    296| Pulp Fiction (1994)| 4.197068403908795|          307|
|    593|Silence of the La...| 4.161290322580645|          279|
|   2571|  Matrix, The (1999)| 4.192446043165468|          278|
|    260|Star Wars: Episod...| 4.231075697211155|          251|
|    480|Jurassic Park (1993)|              3.75|          238|
|    110|   Braveheart (1995)| 4.031645569620253|          237|
|    589|Terminator 2: Jud...| 3.970982142857143|          224|
|    527|Schindler's List ...|             4.225