In [None]:
# Create a user-defined Docker network so the containers below can reach each other by name.
docker network create network1
# Start a Cassandra server container named cassandra1 in the background on that network.
docker run --detach --name cassandra1 --network network1 cassandra
# Run cqlsh interactively from a throwaway container (removed on exit), connecting to cassandra1.
docker run --interactive --tty --rm --network network1 cassandra cqlsh cassandra1
# Print the IP address of cassandra1 on each network it is attached to.
docker inspect --format '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' cassandra1


In [1]:
import os

from cassandra.cluster import Cluster

# Resolve the Cassandra contact point from the CASSANDRA1 environment variable,
# falling back to the notebook's default address when it is unset or empty.
# The fallback is applied *before* printing so the value shown is the one used.
CASSANDRA_IP = os.getenv('CASSANDRA1') or '172.18.0.2'
print(CASSANDRA_IP)

cluster = Cluster([CASSANDRA_IP])
session = cluster.connect()

# Rebuild the keyspace from scratch so this cell can be re-run on a used cluster.
session.execute('DROP KEYSPACE IF EXISTS classroom')
session.execute("CREATE KEYSPACE classroom WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':'1'}")

# Point the existing session at the new keyspace instead of opening a second
# session and abandoning the first one.
session.set_keyspace('classroom')

# Create the demo table and seed it with two students (one of them updated in place).
session.execute("create table student(id int PRIMARY KEY, firstname text, lastname text, emails set<text>)")
session.execute("insert into student (id, firstname, lastname, emails) values (1, 'Joe', 'Smith', {'joes@xyz.com', 'joe.smith@abc.net'})")
session.execute("update student set firstname = 'Joseph' where id = 1")
session.execute("insert into student (id, firstname, lastname, emails) values (2, 'Mike', 'Jones', {'mikej@xyz.com', 'mike.jones@def.net', 'mike1234@gmail.com'})")

# Read everything back to confirm the writes landed.
rows = session.execute('SELECT id, firstname, lastname, emails from student')
print(list(rows))


172.18.0.2
[Row(id=1, firstname='Joseph', lastname='Smith', emails=SortedSet(['joe.smith@abc.net', 'joes@xyz.com'])), Row(id=2, firstname='Mike', lastname='Jones', emails=SortedSet(['mike.jones@def.net', 'mike1234@gmail.com', 'mikej@xyz.com']))]


### For Spark to talk to Cassandra, the Spark context must be initialized with the Cassandra IP address, and the spark-cassandra-connector must be available.

### Mongo is similar: we need to initialize the Spark context with pointers to the Mongo URI and also include the mongo-spark-connector.

### Whoever configures the cluster may also need to make sure the additional jars are installed in `$SPARK_HOME/jars`.

Don't run the following cell; it is included here only so you can have a look at it.

In [None]:
# Connector packages handed to Spark via --packages; listed one per line so
# each coordinate is easy to read and bump.
_SPARK_PACKAGES = (
    'org.mongodb.spark:mongo-spark-connector_2.11:2.4.1',
    'com.datastax.spark:spark-cassandra-connector_2.11:2.4.0',
)
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages ' + ','.join(_SPARK_PACKAGES) + ' pyspark-shell'

def initspark(appname = "Test", servername = "local", cassandra="127.0.0.1", mongo="mongodb://127.0.0.1/classroom"):
    """Create (or reuse) the Spark entry points configured for Cassandra and MongoDB.

    Parameters
    ----------
    appname : str
        Application name set on both the SparkConf and the SparkSession builder.
    servername : str
        Spark master URL passed to ``setMaster``.
    cassandra : str
        Host stored in ``spark.cassandra.connection.host``.
    mongo : str
        URI used for both ``spark.mongodb.input.uri`` and
        ``spark.mongodb.output.uri``.

    Returns
    -------
    tuple
        ``(sc, spark, conf)``: the SparkContext, the Hive-enabled SparkSession
        and the SparkConf built by this call.

    Notes
    -----
    ``SparkContext.getOrCreate`` is used so that calling this function again in
    the same kernel returns the running context instead of raising. A context
    that is already running keeps the settings it was started with.
    """
    print('initializing pyspark')
    conf = (
        SparkConf()
        .set("spark.cassandra.connection.host", cassandra)
        .setAppName(appname)
        .setMaster(servername)
    )
    # A plain SparkContext(conf=conf) fails when a context already exists,
    # which makes re-running the calling cell impossible.
    sc = SparkContext.getOrCreate(conf=conf)
    spark = (
        SparkSession.builder.appName(appname)
        .config("spark.mongodb.input.uri", mongo)
        .config("spark.mongodb.output.uri", mongo)
        .enableHiveSupport()
        .getOrCreate()
    )
    sc.setLogLevel("WARN")
    print('pyspark initialized')
    return sc, spark, conf


### Let's initialize as usual, but this time pass in the IP address of the Cassandra cluster

In [2]:
import sys

# Make the course helper module importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
if '/class' not in sys.path:
    sys.path.append('/class')

# NOTE(review): the wildcard import is kept on purpose -- later cells may rely
# on helpers exported by initspark beyond the function itself; confirm before
# narrowing it to an explicit import.
from initspark import *

# Build the Spark handles, pointing the Cassandra connector at CASSANDRA_IP.
sc, spark, conf = initspark(cassandra=CASSANDRA_IP)


initializing pyspark
pyspark initialized


### Using the spark-cassandra-connector we can read from a cassandra table in a similar way to how we read from a MySQL table

In [3]:
# Load the classroom.student table through the Cassandra data source.
people = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .option("keyspace", "classroom")
    .option("table", "student")
    .load()
)
display(people)
print(people.collect())


Unnamed: 0,id,emails,firstname,lastname
0,1,"[joe.smith@abc.net, joes@xyz.com]",Joseph,Smith
1,2,"[mike.jones@def.net, mike1234@gmail.com, mikej...",Mike,Jones


[Row(id=1, emails=['joe.smith@abc.net', 'joes@xyz.com'], firstname='Joseph', lastname='Smith'), Row(id=2, emails=['mike.jones@def.net', 'mike1234@gmail.com', 'mikej@xyz.com'], firstname='Mike', lastname='Jones')]


In [4]:
# Append the results of a DataFrame into a Cassandra table
x = sc.parallelize([(3, 'Mary', 'Johnson', ['Mary1@gmail.com', 'Mary2@yahoo.com'])])
x1 = spark.createDataFrame(x, schema = ['id', 'firstname', 'lastname', 'emails'])
x1.write.format("org.apache.spark.sql.cassandra").options(table="student", keyspace="classroom").mode("append").save()



In [5]:
# Read the table again so the freshly appended row shows up.
cassandra_reader = spark.read.format("org.apache.spark.sql.cassandra")
people = cassandra_reader.options(keyspace="classroom", table="student").load()
display(people)



Unnamed: 0,id,emails,firstname,lastname
0,1,"[joe.smith@abc.net, joes@xyz.com]",Joseph,Smith
1,2,"[mike.jones@def.net, mike1234@gmail.com, mikej...",Mike,Jones
2,3,"[Mary1@gmail.com, Mary2@yahoo.com]",Mary,Johnson


In [6]:
# Expose the DataFrame to Spark SQL, then flatten the emails collection so each
# address gets its own row.
people.createOrReplaceTempView('people')
explode_emails_sql = 'select id, firstname, lastname, email from people LATERAL VIEW EXPLODE(emails) EXPLODED_TABLE AS email'
people2 = spark.sql(explode_emails_sql)
display(people2)



Unnamed: 0,id,firstname,lastname,email
0,1,Joseph,Smith,joe.smith@abc.net
1,1,Joseph,Smith,joes@xyz.com
2,2,Mike,Jones,mike.jones@def.net
3,2,Mike,Jones,mike1234@gmail.com
4,2,Mike,Jones,mikej@xyz.com
5,3,Mary,Johnson,Mary1@gmail.com
6,3,Mary,Johnson,Mary2@yahoo.com


In [7]:
# Keep only the addresses ending in ".com", ordered by student id.
dot_com_rows = people2.filter("email like '%.com'")
people3 = dot_com_rows.sort("id")
display(people3)

Unnamed: 0,id,firstname,lastname,email
0,1,Joseph,Smith,joes@xyz.com
1,2,Mike,Jones,mikej@xyz.com
2,2,Mike,Jones,mike1234@gmail.com
3,3,Mary,Johnson,Mary1@gmail.com
4,3,Mary,Johnson,Mary2@yahoo.com


In [None]:
import pymongo

# Connect to the local MongoDB server and start from an empty "classroom" database.
client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
classroom = client["classroom"]
if any(db_info['name'] == 'classroom' for db_info in client.list_databases()):
    client.drop_database('classroom')

# Seed the people collection: one single insert followed by a batch insert.
people = classroom['people']
people.insert_one({"firstname" : "Adam", "personid":4})
people.insert_many([
    {"firstname" : "Betty", "personid":5},
    {"firstname" : "Charlie", "personid":6},
])

# Show what is stored, framed by banner lines so it stands out in the output.
banner = '*' * 80
stored_people = people.find()
print(banner)
print('from mongo directly')
print(list(stored_people))
print(banner)



In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Reuse the SparkContext if an earlier cell already started one: a bare
# SparkContext() raises when a context is already running in this kernel,
# which would stop a top-to-bottom run of the notebook at this cell.
sc = SparkContext.getOrCreate()

# Configure the session with the MongoDB URI used for both reads and writes.
spark = (
    SparkSession.builder.appName("myApp")
    .config("spark.mongodb.input.uri", "mongodb://127.0.0.1/classroom")
    .config("spark.mongodb.output.uri", "mongodb://127.0.0.1/classroom")
    .getOrCreate()
)


In [None]:
# Read the classroom.people collection through the "mongo" data source.
mongo_reader = spark.read.format("mongo")
df = mongo_reader.options(uri="mongodb://127.0.0.1/classroom.people").load()
df.show()


In [None]:
# Build a one-row DataFrame and append it to the classroom.people collection.
new_people = [(7, 'David')]
person_columns = ['personid', 'firstname']

x = sc.parallelize(new_people)
x1 = spark.createDataFrame(x, schema=person_columns)
(
    x1.write
    .format("mongo")
    .option("database", "classroom")
    .option("collection", "people")
    .mode("append")
    .save()
)

In [None]:
# Load the collection again to see the document appended from Spark.
people_uri = "mongodb://127.0.0.1/classroom.people"
df = (
    spark.read
    .format("mongo")
    .option("uri", people_uri)
    .load()
)
df.show()


In [None]:
# Register the Mongo-backed DataFrame under the SQL name "people" and query it.
df.createOrReplaceTempView('people')
select_all_sql = 'select * from people'
spark.sql(select_all_sql).show()
