# Connecting to Spark

We're going to be using Python with DataFrames, which are only available in Spark 1.3 or later (the `read`/`write` DataFrame API used below requires Spark 1.4 or later).  We're going to be using a recent version of open-source Spark.  To use it, you'll have to import the `SQLContext`.

In [1]:
from pyspark.sql import SQLContext
sql = SQLContext(sc)

Let's set up some common functions

# Reading a table

In [2]:
# Load the `user` table of the `training` keyspace via the Cassandra data source.
user = (
    sql.read
    .format("org.apache.spark.sql.cassandra")
    .load(keyspace="training", table="user")
)

# Display

In [3]:
user.collect()

[Row(user_id=1, age=34, favorite_foods=[u'Bacon', u'Cheese'], name=u'Jon'),
 Row(user_id=2, age=22, favorite_foods=[u'Kale', u'Pizza', u'Wine'], name=u'Dani'),
 Row(user_id=4, age=1, favorite_foods=[u'Candy', u'Fear'], name=u'Baby Luke'),
 Row(user_id=3, age=108, favorite_foods=[u'Muffins', u'Pie', u'Steak'], name=u'Patrick'),
 Row(user_id=5, age=10, favorite_foods=[u'Anger'], name=u'Larry')]

# Basic Filtering

In [15]:
from pyspark.sql.functions import *

In [4]:
user[user.name == "Jon"].collect()

[Row(user_id=1, age=34, favorite_foods=[u'Bacon', u'Cheese'], name=u'Jon')]

In [5]:
# Builds an "age > 100" predicate from the `age` column; it is not applied to
# `user` anywhere in this cell.
# NOTE(review): `tmp` is reassigned to a SQL query result a couple of cells
# below before this value is ever used — confirm whether this cell is needed.
tmp = user.age > 100

In [6]:
user.registerTempTable("user")

In [7]:
tmp = sql.sql("SELECT * from user WHERE name = 'Larry' or age > 30")

In [8]:
tmp.explain()

Filter ((name#3 = Larry) || (age#1 > 30))
 PhysicalRDD [user_id#0,age#1,favorite_foods#2,name#3], MapPartitionsRDD[13] at executedPlan at NativeMethodAccessorImpl.java:-2



In [9]:
user.filter('name = "Larry" or name = "Jon"').show()

+-------+---+--------------------+-----+
|user_id|age|      favorite_foods| name|
+-------+---+--------------------+-----+
|      1| 34|ArrayBuffer(Bacon...|  Jon|
|      5| 10|  ArrayBuffer(Anger)|Larry|
+-------+---+--------------------+-----+



In [10]:
user.filter("age > 100 or name = 'Larry'").collect()

[Row(user_id=3, age=108, favorite_foods=[u'Muffins', u'Pie', u'Steak'], name=u'Patrick'),
 Row(user_id=5, age=10, favorite_foods=[u'Anger'], name=u'Larry')]

In [22]:
# Find users who like a certain food: explode() gives every entry of
# favorite_foods its own `food` value, which can then be filtered like any column.
(
    user
    .select("user_id", "name", explode(user.favorite_foods).alias("food"))
    .where("food='Bacon'")
    .collect()
)

[Row(user_id=1, name=u'Jon', food=u'Bacon')]

# A nicer reader

In [23]:
def create_reader(sql, keyspace):
    """Return a function that loads tables from one Cassandra keyspace.

    Args:
        sql: object exposing ``read.format(...).load(...)`` (the SQLContext
            in this notebook).
        keyspace: keyspace name bound into every load performed by the
            returned function.

    Returns:
        A one-argument callable ``f(table)`` that returns whatever
        ``load(keyspace=keyspace, table=table)`` returns.
    """
    def load_table(table):
        """Load ``table`` from the keyspace captured by the enclosing call."""
        cassandra_source = sql.read.format("org.apache.spark.sql.cassandra")
        return cassandra_source.load(keyspace=keyspace, table=table)

    return load_table

reader = create_reader(sql, "training")

# A Nicer Writer

In [24]:
def create_writer(sql, keyspace, mode="append"):
    """Return a function that saves DataFrames into one Cassandra keyspace.

    Args:
        sql: accepted for symmetry with ``create_reader``; not used here.
        keyspace: keyspace name bound into every save performed by the
            returned function.
        mode: save mode forwarded to ``DataFrameWriter.save`` (defaults to
            ``"append"``). Previously this argument was ignored and
            ``"append"`` was always used.

    Returns:
        A callable ``writer(df, table)`` that writes ``df`` to ``table`` in
        ``keyspace`` and returns ``None``.
    """
    def writer(df, table):
        """Write ``df`` to ``table`` using the keyspace and mode bound above."""
        df.write.format("org.apache.spark.sql.cassandra").\
                 options(table=table, keyspace=keyspace).save(mode=mode)
    return writer

writer = create_writer(sql, "training")

# Data Migrations

In [25]:

result = user.select(explode(user.favorite_foods).alias("food"), "user_id")
writer(result, "favorite_foods_index")

# SparkSQL

Register the DataFrame as a temporary table.

Once it is registered, you can run whatever SQL queries you want against it.

In [26]:
user.registerTempTable("user")

In [27]:
sql.sql("select * from user where age > 15").collect()

[Row(user_id=1, age=34, favorite_foods=[u'Bacon', u'Cheese'], name=u'Jon'),
 Row(user_id=2, age=22, favorite_foods=[u'Kale', u'Pizza', u'Wine'], name=u'Dani'),
 Row(user_id=3, age=108, favorite_foods=[u'Muffins', u'Pie', u'Steak'], name=u'Patrick')]

# Load the MovieLens data

The dataset lives in the `ml-10M100K` directory.


In [28]:
movies = sc.textFile("ml-10M100K/movies.dat").map(lambda x: x.split("::") )

In [29]:
# Split the pipe-delimited third field into a list (it becomes the `tags`
# column below). Fields are accessed by index instead of a tuple-unpacking
# lambda (`lambda (x, y, z): ...`), which is Python 2-only syntax and a
# SyntaxError on Python 3.
movies = movies.map(lambda rec: (rec[0], rec[1], rec[2].split("|")))

In [30]:
movies = movies.toDF(["movie_id", "name", "tags"])

In [31]:
movies.registerTempTable("movies")

In [32]:
movies.head(10)

[Row(movie_id=u'1', name=u'Toy Story (1995)', tags=[u'Adventure', u'Animation', u'Children', u'Comedy', u'Fantasy']),
 Row(movie_id=u'2', name=u'Jumanji (1995)', tags=[u'Adventure', u'Children', u'Fantasy']),
 Row(movie_id=u'3', name=u'Grumpier Old Men (1995)', tags=[u'Comedy', u'Romance']),
 Row(movie_id=u'4', name=u'Waiting to Exhale (1995)', tags=[u'Comedy', u'Drama', u'Romance']),
 Row(movie_id=u'5', name=u'Father of the Bride Part II (1995)', tags=[u'Comedy']),
 Row(movie_id=u'6', name=u'Heat (1995)', tags=[u'Action', u'Crime', u'Thriller']),
 Row(movie_id=u'7', name=u'Sabrina (1995)', tags=[u'Comedy', u'Romance']),
 Row(movie_id=u'8', name=u'Tom and Huck (1995)', tags=[u'Adventure', u'Children']),
 Row(movie_id=u'9', name=u'Sudden Death (1995)', tags=[u'Action']),
 Row(movie_id=u'10', name=u'GoldenEye (1995)', tags=[u'Action', u'Adventure', u'Thriller'])]

In [33]:
writer(movies, "movie")

In [34]:
sql.sql("SELECT * from movies where movie_id=1").show()

+--------+----------------+--------------------+
|movie_id|            name|                tags|
+--------+----------------+--------------------+
|       1|Toy Story (1995)|ArrayBuffer(Adven...|
+--------+----------------+--------------------+



In [35]:
sql.sql("select * from movies where name like 'Toy%'").show()

+--------+-------------------+--------------------+
|movie_id|               name|                tags|
+--------+-------------------+--------------------+
|       1|   Toy Story (1995)|ArrayBuffer(Adven...|
|    2253|        Toys (1992)|ArrayBuffer(Actio...|
|    3114| Toy Story 2 (1999)|ArrayBuffer(Adven...|
|    4929|    Toy, The (1982)| ArrayBuffer(Comedy)|
|    5843|Toy Soldiers (1991)|ArrayBuffer(Actio...|
+--------+-------------------+--------------------+



In [36]:
sql.sql("select * from movies where name like '%(1981)'").show()

+--------+--------------------+--------------------+
|movie_id|                name|                tags|
+--------+--------------------+--------------------+
|     610|  Heavy Metal (1981)|ArrayBuffer(Actio...|
|     681|Clean Slate (Coup...|  ArrayBuffer(Crime)|
|    1033|Fox and the Hound...|ArrayBuffer(Anima...|
|    1124|On Golden Pond (1...|  ArrayBuffer(Drama)|
|    1129|Escape from New Y...|ArrayBuffer(Actio...|
|    1198|Raiders of the Lo...|ArrayBuffer(Actio...|
|    1233|Boat, The (Das Bo...|ArrayBuffer(Actio...|
|    1264|         Diva (1981)|ArrayBuffer(Actio...|
|    1321|American Werewolf...|ArrayBuffer(Comed...|
|    1335|  Blood Beach (1981)|ArrayBuffer(Actio...|
|    1663|      Stripes (1981)|ArrayBuffer(Comed...|
|    1957|Chariots of Fire ...|  ArrayBuffer(Drama)|
|    1975|Friday the 13th P...| ArrayBuffer(Horror)|
|    1983| Halloween II (1981)| ArrayBuffer(Horror)|
|    2041|    Condorman (1981)|ArrayBuffer(Actio...|
|    2044|Devil and Max Dev...|ArrayBuffer(Com

In [39]:
# UserID::MovieID::Rating::Timestamp
ratings_rdd = sc.textFile("ml-10M100K/ratings.dat").map(lambda x: x.split("::") )

In [40]:
ratings = ratings_rdd.toDF(["user_id", "movie_id", "rating", "timestamp"])

In [42]:
ratings = ratings.select("movie_id", "user_id", "rating", "timestamp")

In [43]:
writer(ratings, "rating_by_movie")

# JOINS and Aggregations

In [44]:
ratings.registerTempTable("ratings")

In [45]:
aggregated_ratings = sql.sql("select movie_id, avg(rating) as rating from ratings group by movie_id")

In [46]:
aggregated_ratings.registerTempTable("average_ratings")

In [47]:
writer(aggregated_ratings, "average_rating")

In [48]:
movies.registerTempTable('movie')

In [49]:
sql.sql("""select movie.movie_id, name, tags, rating FROM movie join average_ratings on movie.movie_id = average_ratings.movie_id  order by rating desc limit 20""").collect()

[Row(movie_id=u'51209', name=u'Fighting Elegy (Kenka erejii) (1966)', tags=[u'Action', u'Comedy'], rating=5.0),
 Row(movie_id=u'33264', name=u"Satan's Tango (S\xe1t\xe1ntang\xf3) (1994)", tags=[u'Drama'], rating=5.0),
 Row(movie_id=u'53355', name=u'Sun Alley (Sonnenallee) (1999)', tags=[u'Comedy', u'Romance'], rating=5.0),
 Row(movie_id=u'42783', name=u'Shadows of Forgotten Ancestors (1964)', tags=[u'Drama', u'Romance'], rating=5.0),
 Row(movie_id=u'64275', name=u'Blue Light, The (Das Blaue Licht) (1932)', tags=[u'Drama', u'Fantasy', u'Mystery'], rating=5.0),
 Row(movie_id=u'26048', name=u'Human Condition II, The (Ningen no joken II) (1959)', tags=[u'Drama', u'War'], rating=4.75),
 Row(movie_id=u'26073', name=u'Human Condition III, The (Ningen no joken III) (1961)', tags=[u'Drama', u'War'], rating=4.75),
 Row(movie_id=u'65001', name=u"Constantine's Sword (2007)", tags=[u'Documentary'], rating=4.75),
 Row(movie_id=u'4454', name=u'More (1998)', tags=[u'Animation', u'IMAX', u'Sci-Fi'], ra

# Top movies for a tag

In [50]:
movies.columns

[u'movie_id', u'name', u'tags']

In [51]:
movies_by_tag = movies.select(explode(movies.tags).alias("tag"), "movie_id", "name")

In [53]:
writer(movies_by_tag, "movies_by_tag")

In [54]:
movies_by_tag.head(10)

[Row(tag=u'Adventure', movie_id=u'1', name=u'Toy Story (1995)'),
 Row(tag=u'Animation', movie_id=u'1', name=u'Toy Story (1995)'),
 Row(tag=u'Children', movie_id=u'1', name=u'Toy Story (1995)'),
 Row(tag=u'Comedy', movie_id=u'1', name=u'Toy Story (1995)'),
 Row(tag=u'Fantasy', movie_id=u'1', name=u'Toy Story (1995)'),
 Row(tag=u'Adventure', movie_id=u'2', name=u'Jumanji (1995)'),
 Row(tag=u'Children', movie_id=u'2', name=u'Jumanji (1995)'),
 Row(tag=u'Fantasy', movie_id=u'2', name=u'Jumanji (1995)'),
 Row(tag=u'Comedy', movie_id=u'3', name=u'Grumpier Old Men (1995)'),
 Row(tag=u'Romance', movie_id=u'3', name=u'Grumpier Old Men (1995)')]

In [55]:
aggregated_ratings.columns

[u'movie_id', u'rating']

In [56]:
# Ten highest-rated "Adventure" movies: keep the rows for that tag, attach each
# movie's average rating via a join on movie_id, then sort by rating descending.
(
    movies_by_tag
    .where(movies_by_tag.tag == "Adventure")
    .join(aggregated_ratings, movies_by_tag.movie_id == aggregated_ratings.movie_id)
    .orderBy(aggregated_ratings.rating, ascending=False)
    .select(movies_by_tag.movie_id, movies_by_tag.name, aggregated_ratings.rating)
    .limit(10)
    .collect()
)

[Row(movie_id=u'26649', name=u'Lonesome Dove (1989)', rating=4.3076923076923075),
 Row(movie_id=u'908', name=u'North by Northwest (1959)', rating=4.261366341347299),
 Row(movie_id=u'1198', name=u'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)', rating=4.261317249919736),
 Row(movie_id=u'25789', name=u'Shanghai Express (1932)', rating=4.25),
 Row(movie_id=u'720', name=u'Wallace & Gromit: The Best of Aardman Animation (1996)', rating=4.22979797979798),
 Row(movie_id=u'260', name=u'Star Wars: Episode IV - A New Hope (a.k.a. Star Wars) (1977)', rating=4.2202093397745575),
 Row(movie_id=u'1204', name=u'Lawrence of Arabia (1962)', rating=4.209582164476598),
 Row(movie_id=u'1196', name=u'Star Wars: Episode V - The Empire Strikes Back (1980)', rating=4.1943614395218916),
 Row(movie_id=u'1197', name=u'Princess Bride, The (1987)', rating=4.194158595641646),
 Row(movie_id=u'5618', name=u'Spirited Away (Sen to Chihiro no kamikakushi) (2001)', rating=4.18645294725956

In [60]:
movies_by_tag.show()

+---------+--------+--------------------+
|      tag|movie_id|                name|
+---------+--------+--------------------+
|Adventure|       1|    Toy Story (1995)|
|Animation|       1|    Toy Story (1995)|
| Children|       1|    Toy Story (1995)|
|   Comedy|       1|    Toy Story (1995)|
|  Fantasy|       1|    Toy Story (1995)|
|Adventure|       2|      Jumanji (1995)|
| Children|       2|      Jumanji (1995)|
|  Fantasy|       2|      Jumanji (1995)|
|   Comedy|       3|Grumpier Old Men ...|
|  Romance|       3|Grumpier Old Men ...|
|   Comedy|       4|Waiting to Exhale...|
|    Drama|       4|Waiting to Exhale...|
|  Romance|       4|Waiting to Exhale...|
|   Comedy|       5|Father of the Bri...|
|   Action|       6|         Heat (1995)|
|    Crime|       6|         Heat (1995)|
| Thriller|       6|         Heat (1995)|
|   Comedy|       7|      Sabrina (1995)|
|  Romance|       7|      Sabrina (1995)|
|Adventure|       8| Tom and Huck (1995)|
+---------+--------+--------------