Let's start with the basics.

# Connecting to Spark

We're going to be using Python with DataFrames, which are only available in Spark 1.3 or later.  The `read`/`write` API used below requires Spark 1.4 or later, so we're going to be using a recent version of open-source Spark.  To use DataFrames, you'll have to import the `SQLContext`.

In [1]:
from pyspark.sql import SQLContext
# `sc` is not defined anywhere in this notebook; presumably it is the SparkContext
# pre-created by the PySpark shell/kernel -- confirm before running standalone.
# The resulting `sql` object is the SQLContext used by the later cells.
sql = SQLContext(sc)

Let's set up some common functions

# Reading a table

In [4]:
# Load the "user" table of the "training" keyspace through the Cassandra data source.
user = (
    sql.read
    .format("org.apache.spark.sql.cassandra")
    .load(keyspace="training", table="user")
)

# Display

In [3]:
# collect() returns every row of the DataFrame as a list of Row objects.
user.collect()

[Row(user_id=1, favorite_foods=[u'Bacon', u'Cheese'], name=u'Jon'),
 Row(user_id=2, favorite_foods=[u'Kale', u'Pizza', u'Wine'], name=u'Dani'),
 Row(user_id=3, favorite_foods=[u'Muffins', u'Pie', u'Steak'], name=u'Patrick')]

# Basic Filtering

In [9]:
# Indexing a DataFrame with a boolean Column is shorthand for filter(); spelled out here.
user.filter(user.age > 35).collect()

[Row(user_id=3, age=108, favorite_foods=[u'Muffins', u'Pie', u'Steak'], name=u'Patrick')]

In [10]:
# Combine Column predicates with `&` (each comparison parenthesised), not Python's `and`:
# `and` asks for the truth value of a Column instead of building a combined expression,
# which either raises or silently keeps only the right-hand condition.
user[(user.age < 20) & user.name.startswith("Baby")].collect()

[Row(user_id=4, age=1, favorite_foods=[u'Candy', u'Fear'], name=u'Baby Luke')]

# A nicer reader

In [14]:
def create_reader(sql, keyspace):
    """Build a one-argument table loader bound to a single Cassandra keyspace.

    Args:
        sql: object exposing ``read.format(...).load(...)`` (the SQLContext here).
        keyspace: keyspace name passed to every ``load`` call.

    Returns:
        A function ``reader(table)`` that returns whatever ``load`` returns for
        ``table`` in ``keyspace``.
    """
    cassandra_source = "org.apache.spark.sql.cassandra"

    def reader(table):
        """Load ``table`` from the bound keyspace."""
        return (
            sql.read
            .format(cassandra_source)
            .load(keyspace=keyspace, table=table)
        )

    return reader

# Bind the reader to the "training" keyspace so later cells only pass a table name.
training_reader = create_reader(sql, "training")

# A Nicer Writer

In [15]:
def create_writer(sql, keyspace, mode="append"):
    """Build a writer that saves DataFrames into tables of one Cassandra keyspace.

    Args:
        sql: unused by the writer; kept so the signature mirrors ``create_reader``.
        keyspace: keyspace name passed to every save.
        mode: save mode handed to ``save`` (defaults to ``"append"``).

    Returns:
        A function ``writer(df, table)`` that saves ``df`` into ``table`` and
        returns ``None``.
    """
    def writer(df, table):
        """Save ``df`` into ``table`` of the bound keyspace using the bound mode."""
        # Honour the caller's `mode`; it was previously ignored in favour of a
        # hard-coded "append".
        df.write.format("org.apache.spark.sql.cassandra").\
                 options(table=table, keyspace=keyspace).save(mode=mode)
    return writer

# Bind the writer to the "training" keyspace; `mode` is left at its "append" default.
training_writer = create_writer(sql, "training")

# Aggregations

In [17]:
# Rebinds `user` (first loaded in the "Reading a table" cell) via the keyspace-bound reader.
user = training_reader("user")

# Working with Collections

# Migrating to a new structure

In [20]:
# Import only what is used: a star import from pyspark.sql.functions shadows
# Python builtins such as sum/min/max for the rest of the notebook.
from pyspark.sql.functions import explode

# One output row per element of favorite_foods, paired with the owning user_id.
result = user.select(explode(user.favorite_foods).alias("food"), "user_id")
# `writer` only exists as a closure inside create_writer; the keyspace-bound
# writer available at notebook level is `training_writer`.
training_writer(result, "favorite_foods_index")

# SparkSQL

Register the DataFrame as a temporary table.

Once it is registered, you can query it with SQL however you like.

In [21]:
# Expose the DataFrame to SQL under the table name "user" (queried in the next cell).
user.registerTempTable("user")

In [22]:
# Query the temp table registered above; collect() returns the matching rows as a list.
sql.sql("select * from user where age > 15").collect()

[Row(user_id=1, age=34, favorite_foods=[u'Bacon', u'Cheese'], name=u'Jon'),
 Row(user_id=2, age=22, favorite_foods=[u'Kale', u'Pizza', u'Wine'], name=u'Dani'),
 Row(user_id=3, age=108, favorite_foods=[u'Muffins', u'Pie', u'Steak'], name=u'Patrick')]

# Load the MovieLens data

The dataset lives in the `ml-10M100K` directory.


In [31]:
# Read movies.dat as text and break each line into its "::"-separated fields.
movies = (
    sc.textFile("ml-10M100K/movies.dat")
    .map(lambda line: line.split("::"))
)

In [32]:
def _split_genres(fields):
    """Turn [movie_id, title, "A|B|C"] into (movie_id, title, ["A", "B", "C"])."""
    # Unpack inside the body: tuple parameters in a lambda/def signature are a
    # SyntaxError on Python 3. Unpacking keeps the original strictness -- a record
    # without exactly three fields raises ValueError.
    movie_id, title, genres = fields
    return (movie_id, title, genres.split("|"))


# NOTE: this rebinds `movies`, so run it once after the load cell; a second run
# would call .split on the already-split genre list.
movies = movies.map(_split_genres)

[(u'1',
  u'Toy Story (1995)',
  [u'Adventure', u'Animation', u'Children', u'Comedy', u'Fantasy']),
 (u'2', u'Jumanji (1995)', [u'Adventure', u'Children', u'Fantasy']),
 (u'3', u'Grumpier Old Men (1995)', [u'Comedy', u'Romance']),
 (u'4', u'Waiting to Exhale (1995)', [u'Comedy', u'Drama', u'Romance']),
 (u'5', u'Father of the Bride Part II (1995)', [u'Comedy']),
 (u'6', u'Heat (1995)', [u'Action', u'Crime', u'Thriller']),
 (u'7', u'Sabrina (1995)', [u'Comedy', u'Romance']),
 (u'8', u'Tom and Huck (1995)', [u'Adventure', u'Children']),
 (u'9', u'Sudden Death (1995)', [u'Action']),
 (u'10', u'GoldenEye (1995)', [u'Action', u'Adventure', u'Thriller']),
 (u'11', u'American President, The (1995)', [u'Comedy', u'Drama', u'Romance']),
 (u'12', u'Dracula: Dead and Loving It (1995)', [u'Comedy', u'Horror']),
 (u'13', u'Balto (1995)', [u'Animation', u'Children']),
 (u'14', u'Nixon (1995)', [u'Drama']),
 (u'15', u'Cutthroat Island (1995)', [u'Action', u'Adventure', u'Romance']),
 (u'16', u'Casino

In [34]:
# Convert the RDD of (id, title, genres) tuples into a DataFrame with named columns.
# Rebinds `movies`: re-running this cell would call toDF on the DataFrame, not the RDD.
movies = movies.toDF(["movie_id", "name", "tags"])

In [35]:
# NOTE(review): collect() pulls the whole movies table into the notebook output
# (the saved output below is cut off); take(n) or show() would preview a few rows.
movies.collect()

[Row(movie_id=u'1', name=u'Toy Story (1995)', tags=[u'Adventure', u'Animation', u'Children', u'Comedy', u'Fantasy']),
 Row(movie_id=u'2', name=u'Jumanji (1995)', tags=[u'Adventure', u'Children', u'Fantasy']),
 Row(movie_id=u'3', name=u'Grumpier Old Men (1995)', tags=[u'Comedy', u'Romance']),
 Row(movie_id=u'4', name=u'Waiting to Exhale (1995)', tags=[u'Comedy', u'Drama', u'Romance']),
 Row(movie_id=u'5', name=u'Father of the Bride Part II (1995)', tags=[u'Comedy']),
 Row(movie_id=u'6', name=u'Heat (1995)', tags=[u'Action', u'Crime', u'Thriller']),
 Row(movie_id=u'7', name=u'Sabrina (1995)', tags=[u'Comedy', u'Romance']),
 Row(movie_id=u'8', name=u'Tom and Huck (1995)', tags=[u'Adventure', u'Children']),
 Row(movie_id=u'9', name=u'Sudden Death (1995)', tags=[u'Action']),
 Row(movie_id=u'10', name=u'GoldenEye (1995)', tags=[u'Action', u'Adventure', u'Thriller']),
 Row(movie_id=u'11', name=u'American President, The (1995)', tags=[u'Comedy', u'Drama', u'Romance']),
 Row(movie_id=u'12', nam

In [37]:
# Save the movies DataFrame to the "movie" table via the "training" keyspace writer.
training_writer(movies, "movie")

In [38]:
# Read ratings.dat as text and break each line into its "::"-separated fields.
ratings = (
    sc.textFile("ml-10M100K/ratings.dat")
    .map(lambda line: line.split("::"))
)

In [None]:
# Preview a handful of parsed rows. collect() would pull every row of the ratings
# file into the driver and the notebook output (the directory name suggests on the
# order of 10M ratings -- confirm), so use a bounded take() instead.
ratings.take(5)