# <img src="../images/mongo_logo.png" width=200 height=200 /><font color='cadetblue' size="+2"></font>

## Let's do a simple test of reading and writing to Mongo-DB to see if it works

## <img src="../images/python_logo.png" width=50 height=25 /><font size="+2">  This is just basic Mongo-DB directly with Python</font>


In [None]:
# Requires the MongoDB Python driver: pip install pymongo
import pymongo

# Connect to the local MongoDB server and get a handle on the demo database.
client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
classroom = client["classroom"]

# Start from a clean slate: drop the database if an earlier run left it behind.
if any(db['name'] == 'classroom' for db in client.list_databases()):
    client.drop_database('classroom')

# Insert one document on its own ...
people = classroom['people']
name = {"firstname": "Adam", "personid": 4}
x = people.insert_one(name)

# ... followed by a batch of documents.
names = [
    {"firstname": "Betty", "personid": 5},
    {"firstname": "Charlie", "personid": 6},
]
x = people.insert_many(names)

# Read the whole collection back and print it between two banner lines.
x = people.find()
print('*' * 80)
print('from mongo directly')
print(list(x))
print('*' * 80)


In [None]:
# Requires the MongoDB Python driver: pip install pymongo
import pymongo

# Connect to the local MongoDB server and get a handle on the demo database.
client = pymongo.MongoClient("mongodb://127.0.0.1:27017/")
classroom = client["classroom"]

# Drop any database left over from a previous run so the inserts below
# always start from an empty collection.
if any(db['name'] == 'classroom' for db in client.list_databases()):
    client.drop_database('classroom')

# Each student document carries a list of e-mail addresses.
students = classroom['students']
name = {
    "studentid": 1,
    "firstname": 'Joe',
    "lastname": 'Smith',
    "emails": ['joes@xyz.com', 'joe.smith@abc.net'],
}
x = students.insert_one(name)

names = [
    {
        "studentid": 2,
        "firstname": 'Mike',
        "lastname": 'Jones',
        "emails": ['mikej@xyz.com', 'mike.jones@def.net', 'mike1234@gmail.com'],
    },
    {
        "studentid": 3,
        "firstname": 'Betty',
        "lastname": 'Johnson',
        "emails": ['betty@xyz.com'],
    },
]
x = students.insert_many(names)

# Print every stored document, one per line, between two banner lines.
x = students.find()
print('*' * 80)
print('from mongo directly')
for s in x:
    print(s)
print('*' * 80)


## <img src="../images/spark_logo.png" width=100 height=50 /><font size="+2">  This uses the Spark MongoDB connector to read and write between a Spark DataFrame and Mongo-DB</font>


In [None]:
# pip install cassandra-driver
# pyspark --packages com.datastax.spark:spark-cassandra-connector_2.12:3.0.1

import os
import sys

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Connection settings for the local Cassandra node ('localhost' also works).
CASSANDRA_HOST = '127.0.0.1'
CASSANDRA_USER = 'cassandra'
CASSANDRA_PASSWORD = 'student'

# `cluster` is created here because the "Cassandra using Python directly"
# cell further down calls cluster.connect() on it.
ap = PlainTextAuthProvider(username=CASSANDRA_USER, password=CASSANDRA_PASSWORD)
cluster = Cluster([CASSANDRA_HOST], auth_provider = ap)
cluster.connect()

# Make the class helper modules (initspark) importable; guard the append so
# re-running the cell does not keep growing sys.path.
if '/class' not in sys.path:
    sys.path.append('/class')
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# Stop a context left over from an earlier cell/run, then ALWAYS build a new
# one. The previous "if 'sc' not in locals()" guard never fired after a stop
# (the name still exists), leaving the notebook with a stopped context.
if 'sc' in locals():
    sc.stop()

from initspark import initspark
sc, spark, conf = initspark(mongo="mongodb://127.0.0.1/classroom", packages = ['mongo'])



### First read what's already there

In [None]:
# Load the classroom.students collection into a Spark DataFrame through the
# "mongo" data source, then display it.
df = (
    spark.read
    .format("mongo")
    .option("uri", "mongodb://127.0.0.1/classroom.students")
    .load()
)
df.show()


### Add a new document from a Spark DataFrame

In [None]:
# Build a one-row DataFrame whose columns match the student documents ...
x = sc.parallelize([(4, 'Han', 'Solo', ['han@starwars.com'])])
x1 = spark.createDataFrame(
    x,
    schema=['studentid', 'firstname', 'lastname', 'emails'],
)

# ... and append it to the classroom.students collection.
(
    x1.write
    .format("mongo")
    .option("uri", "mongodb://127.0.0.1/classroom.students")
    .mode("append")
    .save()
)
print('Done')


### Confirm that it worked

In [None]:
# Re-read classroom.students so the DataFrame reflects the current contents
# of the collection, then display it.
df = (
    spark.read
    .format("mongo")
    .option("uri", "mongodb://127.0.0.1/classroom.students")
    .load()
)
df.show()


## <img src="../images/beam_logo.png" width=100 height=50 /><font size="+2">The MongoDB connectors for Beam are pretty straightforward.</font>

In [None]:
import apache_beam as beam
from apache_beam.io import ReadFromMongoDB

connection_string = 'mongodb://localhost:27017'

# Read every document of classroom.students and print each one.
with beam.Pipeline() as p:
    student_docs = p | 'read' >> ReadFromMongoDB(connection_string, 'classroom', 'students')
    student_docs | 'print' >> beam.Map(print)


In [None]:
import apache_beam as beam
from apache_beam.io import ReadFromMongoDB, WriteToMongoDB

connection_string = 'mongodb://localhost:27017'

# The single document this pipeline writes to classroom.students.
new_student = {
    "studentid": 5,
    "firstname": "Luke",
    "lastname": "Skywalker",
    "personid": 5,
    "emails": ["luke@tattoine.com", "luke@jedi.org"],
}

# Turn the in-memory document into a PCollection and write it to MongoDB.
with beam.Pipeline() as p:
    created = p | 'create' >> beam.Create([new_student])
    created | 'write' >> WriteToMongoDB(connection_string, 'classroom', 'students')


In [None]:
import apache_beam as beam
from apache_beam.io import ReadFromMongoDB

connection_string = 'mongodb://localhost:27017'

# Read only the documents whose firstname is "Luke", requesting just the
# firstname and lastname fields, and print each result.
with beam.Pipeline() as p:
    luke_docs = p | 'read' >> ReadFromMongoDB(
        connection_string,
        'classroom',
        'students',
        filter={"firstname": "Luke"},
        projection={"firstname": 1, "lastname": 1},
    )
    luke_docs | 'print' >> beam.Map(print)


# __ __ __ __ __ __ __ __ __ __ __ __

# <img src="../images/cassandra_logo.png" width=100 height=50 /><font color='cadetblue' size="+2"></font>

## Let's do a simple test of reading and writing to Cassandra to see if it works

## <img src="../images/python_logo.png" width=50 height=25 /><font size="+2">  This is just basic Cassandra example using Python directly</font>


In [None]:

# Recreate the demo keyspace from scratch using a session that is not bound
# to any keyspace yet. `cluster` comes from the earlier setup cell.
session = cluster.connect()
for ddl in (
    'DROP KEYSPACE IF EXISTS classroom',
    "CREATE KEYSPACE classroom WITH REPLICATION={'class':'SimpleStrategy', 'replication_factor':'1'}",
):
    session.execute(ddl)

# Reconnect with `classroom` as the default keyspace, then create the table
# and run the inserts/update in order.
session = cluster.connect('classroom')
for cql in (
    "create table student(id int PRIMARY KEY, firstname text, lastname text, emails set<text>)",
    "insert into student (id, firstname, lastname, emails) values (1, 'Joe', 'Smith', {'joes@xyz.com', 'joe.smith@abc.net'})",
    "update student set firstname = 'Joseph' where id = 1",
    "insert into student (id, firstname, lastname, emails) values (2, 'Mike', 'Jones', {'mikej@xyz.com', 'mike.jones@def.net', 'mike1234@gmail.com'})",
):
    session.execute(cql)

# Query the table back and print the rows under a banner.
rows = session.execute('SELECT id, firstname, lastname, emails from student')
print('*' * 80, 'student rows from cassandra directly', '*' * 80, sep='\n')
print(list(rows))
print('*' * 80)



## <img src="../images/spark_logo.png" width=100 height=50 /><font size="+2">  This uses the Spark Cassandra connector to read and write between a Spark DataFrame and Cassandra</font>


### Set up the spark context but remember to add the package for Cassandra and any others needed. The initspark.py helper function provided shows examples of how to do that.

In [None]:
# pip install cassandra-driver
# pyspark --packages com.datastax.spark:spark-cassandra-connector_2.12:3.0.0
import os
import sys

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Connection settings for the local Cassandra node ('localhost' also works).
CASSANDRA_HOST = '127.0.0.1'
CASSANDRA_USER = 'cassandra'
CASSANDRA_PASSWORD = 'student'

# Build the authenticated cluster handle and open a connection to it.
ap = PlainTextAuthProvider(username=CASSANDRA_USER, password=CASSANDRA_PASSWORD)
cluster = Cluster([CASSANDRA_HOST], auth_provider = ap)
cluster.connect()

# Make the class helper modules (initspark) importable; guard the append so
# the path is added once instead of twice per run and again on every re-run.
if '/class' not in sys.path:
    sys.path.append('/class')
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'

# Stop any context left over from an earlier cell, then build a fresh one
# configured for Cassandra.
if 'sc' in locals():
    sc.stop()

from initspark import initspark
sc, spark, conf = initspark(cassandra = "127.0.0.1", cassandra_user = 'cassandra'
                            , cassandra_password='student', packages = ['cassandra'])



### Use standard Spark read and write methods using the right package for Cassandra.

In [None]:
# Read the classroom.student table from Cassandra through Spark and show it.
people = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="student", keyspace="classroom")
    .load()
)
print('*' * 80, 'student rows from spark before insert', '*' * 80, sep='\n')
people.show()
print(people.collect())
print('*' * 80)

# Build a one-row DataFrame and append it to the Cassandra table.
x = sc.parallelize([(3, 'Mary', 'Johnson', ['Mary1@gmail.com', 'Mary2@yahoo.com'])])
x1 = spark.createDataFrame(
    x,
    schema=['id', 'firstname', 'lastname', 'emails'],
)
(
    x1.write
    .format("org.apache.spark.sql.cassandra")
    .options(table="student", keyspace="classroom")
    .mode("append")
    .save()
)

# Read the table again to confirm the new row is there.
people = (
    spark.read
    .format("org.apache.spark.sql.cassandra")
    .options(table="student", keyspace="classroom")
    .load()
)
print('*' * 80, 'student rows from spark after insert', '*' * 80, sep='\n')
people.show()
print(people.collect())
print('*' * 80)

# Expose the DataFrame to Spark SQL and explode the emails collection so
# each e-mail address becomes its own row.
print('*' * 80, 'spark sql query from cassandra', '*' * 80, sep='\n')
people.createOrReplaceTempView('people')
people2 = spark.sql('select id, firstname, lastname, email from people LATERAL VIEW EXPLODE(emails) EXPLODED_TABLE AS email')
people2.show()

# Keep only the addresses ending in ".com", ordered by student id.
dot_com_only = people2.where("email like '%.com'")
people3 = dot_com_only.orderBy("id")
people3.show()


## <img src="../images/beam_logo.png" width=100 height=50 /><font size="+2">  Currently Beam CassandraIO is only supported on Java</font>

### Refer here for a list of supported connectors: https://beam.apache.org/documentation/io/built-in/. It's only a matter of time before this is supported on Python. Below is a sample of Java code showing reading and writing PCollections in a pipeline.

In [None]:
// Java example of reading from Apache Cassandra.
// Applies a CassandraIO read for table "Person" in keyspace "beam", contacting
// hosts "host1"/"host2" on port 9042 and mapping rows to the Person entity.
// The statement is now closed with ");" — the original snippet left the
// pipeline.apply( call unterminated.

pipeline.apply(CassandraIO.<Person>read()
     .withHosts(Arrays.asList("host1", "host2"))
     .withPort(9042)
     .withKeyspace("beam")
     .withTable("Person")
     .withEntity(Person.class)
     .withCoder(SerializableCoder.of(Person.class)));

 
// Java example of writing to Apache Cassandra
// The first apply(...) is a placeholder for whatever upstream transform
// produces the PCollection<Person>; the second apply writes those entities
// with CassandraIO to keyspace "beam" via hosts "host1"/"host2" on port 9042.

 pipeline
    .apply(...) // provides a PCollection<Person> where Person is an entity
    .apply(CassandraIO.<Person>write()
        .withHosts(Arrays.asList("host1", "host2"))
        .withPort(9042)
        .withKeyspace("beam")
        .withEntity(Person.class));
 

# __ __ __ __ __ __ __ __ __ __ __ __

# <img src="../images/hbase_logo.png" width=200 height=200 /><font color='cadetblue' size="+2"></font>

In [None]:
import os
import sys

# Make the class helper modules (initspark) importable; guard the append so
# re-running the cell does not keep growing sys.path.
if '/class' not in sys.path:
    sys.path.append('/class')
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
# NOTE(review): the original cell had a truncated line here (`os.environ['`)
# that was a syntax error. Which variable it was meant to set is unknown, so
# it has been dropped — restore the full assignment if one is required.

# Stop a context left over from an earlier cell/run, then ALWAYS build a new
# one. The previous "if 'sc' not in locals()" guard never fired after a stop
# (the name still exists), leaving the notebook with a stopped context.
if 'sc' in locals():
    sc.stop()

from initspark import initspark
sc, spark, conf = initspark(packages = ['hbase'])



In [None]:
data_source_format = 'org.apache.hadoop.hbase.spark'

# Two-row, two-column DataFrame of strings used as the test payload.
df = sc.parallelize([('a', '1.0'), ('b', '2.0')]).toDF(schema=['col0', 'col1'])

# The catalog maps DataFrame columns onto the HBase table layout: col0 is the
# row key and col1 is stored as column "col1" in column family "cf".
# ''.join(string.split()) strips all whitespace so the JSON can be written
# across multiple lines here and still be passed as one compact string.
catalog = ''.join("""{
    "table":{"namespace":"default", "name":"testtable"},
    "rowkey":"key",
    "columns":{
        "col0":{"cf":"rowkey", "col":"key", "type":"string"},
        "col1":{"cf":"cf", "col":"col1", "type":"string"}
    }
}""".split())


# Writing
df.write.options(catalog=catalog).format(data_source_format).save()

# Reading
# Uses `spark`, the session returned by initspark; the original referenced
# `sqlc`, which is never defined in this notebook.
df = spark.read.options(catalog=catalog).format(data_source_format).load()

df.show()