In [0]:
# Session setup for the whole notebook: paths, helper import, Spark handles.
import sys

# Root of the course material; the dataset directory is derived from it.
rootpath = '/class/'
# Note the trailing '/': paths built from datapath need no leading separator.
datapath = f'{rootpath}datasets/'
# Must run before the import below so modules under rootpath are importable
# (presumably where pyspark_helpers lives — confirm).
sys.path.append(rootpath)
# NOTE(review): star import — presumably the source of initspark (and possibly
# display); confirm against pyspark_helpers.
from pyspark_helpers import *
# initspark() returns three objects, bound here as sc, spark and conf.
sc, spark, conf = initspark()

### You can query an existing Hive table and bring it into a Spark DataFrame.

In [0]:
# Two ways to load the existing table `regions` into a DataFrame:
# a SQL statement, and the DataFrameReader's table() method.
regions = spark.sql('select * from regions')
# NOTE(review): Eregions is not referenced again in the visible cells.
Eregions = spark.read.table('regions')


# Alternative kept for reference: read the table's warehouse files directly
# with an explicit schema and a filter.
# r = spark.read.csv('hdfs://localhost:9000/user/hive/warehouse/regions', schema='regionid:int, regionname:string').where('regionid<=2')
# r.show()
display(regions)

### Read in a file to a Spark DataFrame.

In [0]:
# datapath already ends with '/', so no extra separator is added here
# (the previous form produced '/class/datasets//northwind/...').
# header=True takes the column names from the first row of the CSV.
territories = spark.read.csv(f'{datapath}northwind/CSVHeaders/territories', header=True)
territories.show()

### Use createOrReplaceTempView to create a virtual table in the Hive catalog; it can then be queried using SQL as if it were a Hive table.

In [0]:
# Register the DataFrame under a name that SQL statements can reference.
territories.createOrReplaceTempView('territories')

# Filter with SQL first, then sort the result with a DataFrame method.
region_one = spark.sql('select * from territories where regionid = 1')
t1 = region_one.orderBy('TerritoryName')
t1.show()
print(t1.count())

### Spark DataFrames can be saved to a Hive table using either the saveAsTable method or writing a SQL query that uses CREATE TABLE AS.

In [0]:
! hadoop fs -rm -r /user/hive/warehouse/territories2
! hadoop fs -rm -r /user/hive/warehouse/territories3
! hadoop fs -rm -r /user/hive/warehouse/territoryregion
spark.sql('drop table if exists territories3')

spark.sql('CREATE TABLE territories3 STORED AS ORC AS SELECT * FROM territories ')

### Queries use standard HQL to mix Hive tables and virtual tables. Both are read into a Spark DataFrame and the processing happens at the Spark level not at the Hive level. HQL is just used to parse the logic into the corresponding Spark methods.

In [0]:
# SQL version: the Hive table `regions` joined to the temp view `territories`.
sql = """
select r.regionid, r.regionname, t.territoryid, t.territoryname 
from regions as r 
join territories as t on r.regionid = t.regionid 
order by r.regionid, t.territoryid
"""
rt = spark.sql(sql)
rt.show(10)

# DataFrame-method version of the same join, built step by step.
joined = regions.join(territories, regions.regionid == territories.RegionID)
tr = (
    joined
    .select('regions.regionid', 'regionname', 'TerritoryID', 'TerritoryName')
    .orderBy('regionid', 'territoryid')
)
tr.show(10)

### *LAB*: Read the northwind JSON products and make it into a TempView and do the same with the CSVHeaders version of categories. Then join the two using SparkSQL.

In [0]:
# Reference solution for the lab. It is kept executable because later cells
# (the JDBC write and the "value > 500" example) use the `products` DataFrame.
categories = spark.read.csv(f'{datapath}northwind/CSVHeaders/categories', header=True, inferSchema=True)
categories.createOrReplaceTempView('categories')
display(categories)

products = spark.read.json(f'{datapath}northwind/JSON/products')
products.createOrReplaceTempView('products')
display(products)

# Join the two temp views with Spark SQL.
sql = '''
select c.categoryid, c.categoryname, p.productid, p.productname, p.unitprice
from products as p
join categories as c on p.categoryid = c.categoryid
order by c.categoryid, p.productid
'''
display(spark.sql(sql))


### Install the MySQL Python connector. This has nothing to do with Spark but if you want to run SQL queries directly, it is helpful.

In [0]:
#! pip install mysql-connector-python

### Let's make sure we have a database for northwind and no regions table.

In [0]:
import mysql.connector


def _run_mysql_statement(statement, database=None):
    """Open a connection, execute one statement, and always close the connection.

    statement: SQL text to execute.
    database:  optional schema to connect to; omitted for the first call
               because that call is what creates the schema.
    Raises mysql.connector.Error if connecting or executing fails.
    """
    connect_args = dict(host='localhost', user='test', password='password')
    if database is not None:
        connect_args['database'] = database
    cn = mysql.connector.connect(**connect_args)
    try:
        cursor = cn.cursor()
        cursor.execute(statement)
    finally:
        # Close even when execute() raises, so no connection is leaked.
        cn.close()


try:
    _run_mysql_statement('create database if not exists northwind')
    _run_mysql_statement('drop table if exists regions', database='northwind')
except mysql.connector.Error as err:
    # Still best-effort, but report the actual cause instead of hiding it.
    print('something went wrong:', err)
else:
    print('success')


### Write a DataFrame to a SQL database.

In [0]:
# Connection settings shared by both writes. The save mode is chosen only by
# .mode('overwrite') below; no `mode` entry belongs in options(), where it is
# not a save-mode setting and would contradict the call.
mysql_write_options = dict(
    url="jdbc:mysql://localhost/northwind",
    driver='com.mysql.jdbc.Driver',
    user='test',
    password="password",
    useSSL="false",
)

# One write per (DataFrame, target table) pair instead of copy-pasted lines.
for frame, table_name in ((regions, 'regions'), (products, 'products')):
    frame.write.format("jdbc").mode('overwrite').options(dbtable=table_name, **mysql_write_options).save()

print('Done')

### Read a SQL table into a Spark DataFrame.

In [0]:
# Load the MySQL `regions` table through the JDBC data source,
# setting one option per call.
regions_reader = (
    spark.read.format("jdbc")
    .option("url", "jdbc:mysql://localhost/northwind")
    .option("driver", "com.mysql.jdbc.Driver")
    .option("dbtable", "regions")
    .option("user", "test")
    .option("password", "password")
)
regions2 = regions_reader.load()
regions2.show()



### Use the query option to do some of the initial work on the SQL side, then bring that into a dataframe and continue doing more processing with SparkSQL.

In [0]:
# The statement below is handed to the JDBC source via the `query` option.
sql = "select productid, productname, unitprice, unitsinstock from products where categoryid = 2"
jdbc_query_options = {
    "url": "jdbc:mysql://localhost/northwind",
    "driver": "com.mysql.jdbc.Driver",
    "query": sql,
    "user": "test",
    "password": "password",
}
products2 = spark.read.format("jdbc").options(**jdbc_query_options).load()
products2.show()

# Continue in Spark SQL: add a computed column on top of the loaded rows.
products2.createOrReplaceTempView('products2')
products2_valued = spark.sql('select *, unitprice * unitsinstock as value from products2')
display(products2_valued)

### Creating the regions2 DataFrame does not execute anything yet, but by making the DataFrame into a Temp View then running a Spark SQL query, it tells Spark to read the SQL data into a DataFrame and then use the cluster to do the processing, not the SQL source.

In [0]:
# Expose the JDBC-backed DataFrame to SQL, then filter it with Spark SQL.
regions2.createOrReplaceTempView('regions2')
low_regions = spark.sql('select * from regions2 where regionid < 3')
low_regions.show()

### Alternate ways to code a query using SQL and methods.

In [0]:
# x = spark.sql('select count(*) from regions').collect()
# print(x[0][0])
# spark.sql('select * from regions').count()

# Method version: add a computed column, then filter on it.
p2 = products.withColumn('value', products.unitprice * products.unitsinstock).where('value > 500')
display(p2)

# SQL version of the same query. The inner select needs a FROM clause and
# must use the same columns as the method version so both return the same
# rows; the view is (re)registered here so the statement can resolve it.
products.createOrReplaceTempView('products')
sql = """
select *
from (select *, unitprice * unitsinstock as value from products) as t
where value > 500
"""
display(spark.sql(sql))

### Using SQL you can use familiar syntax instead of the withColumn or withColumnRenamed methods.

In [0]:
from pyspark.sql.functions import expr

# SQL version: rename and transform directly in the select list.
t1 = spark.sql('select TerritoryID as TerrID, UPPER(TerritoryName) as TerritoryName, RegionID from territories')
t1.show(5)

# Method version: replace the name column, then rename the id column.
upper_named = territories.withColumn('TerritoryName', expr('UPPER(TerritoryName)'))
upper_named.withColumnRenamed('TerritoryID', 'TerrID').show(5)

### Sometimes there is a function in Python that doesn't exist in SQL and it would be helpful to use, so you could make a udf and use withColumn.

In [0]:
from pyspark.sql.functions import expr, udf
from pyspark.sql.types import *


def _title_or_none(text):
    """Title-case text; a SQL NULL arrives as None and is passed through."""
    return None if text is None else text.title()


# Wrap the Python function as a udf whose declared return type is a string.
title_udf = udf(_title_or_none, StringType())

t2 = spark.sql('select * from territories')
t2.printSchema()
#t2.show()
# Built-in SQL function applied through expr().
t2 = t2.withColumn('upperName', expr('UPPER(TerritoryName)'))
t2.show(5)

# Python-only logic applied through the udf.
t2 = t2.withColumn('titleName', title_udf(t2.upperName))
t2.show(5)



### To make it easier though, you could make the Python function into a udf that SQL can understand similar to how you can make a DataFrame seem like a virtual table with createOrReplaceTempView.

In [0]:
def reverseString(x):
    """Return x reversed.

    x: a sliceable value (a string when used as a SQL udf), or None.
    Returns None for None so that SQL NULLs do not raise a TypeError.
    """
    if x is None:
        return None
    return x[::-1]

# Make the Python function callable from SQL under the name `reverse`.
spark.udf.register(name='reverse', f=reverseString, returnType=StringType())

# Use it like any other SQL function, then sort on its result.
reversed_names = spark.sql('select *, reverse(TerritoryName) as Reversed from Territories')
reversed_names.orderBy('Reversed').show()

### HQL has collect_set and collect_list functions to aggregate items into a list instead of summing them up. 

In [0]:
from pyspark.sql.functions import collect_list

# Method version: gather each region's territory names into one array.
names_by_region = territories.groupBy(territories['RegionID']).agg(
    collect_list(territories['TerritoryName'])
)
names_by_region.show()

# SQL version, naming the array column TerritoryList.
tr1 = spark.sql("SELECT RegionID, collect_list(TerritoryName) AS TerritoryList FROM Territories GROUP BY RegionID")
display(tr1)
tr1.printSchema()
print(tr1.take(1))



### Instead of a simple datatype you could also collect complex structured objects using the HQL NAMED_STRUCT.

In [0]:

# Collect one struct (id + name) per territory into a set for each region.
sql = """SELECT r.RegionID, r.RegionName
, COLLECT_SET(NAMED_STRUCT("TerritoryID", t.TerritoryID, "TerritoryName", t.TerritoryName)) AS TerritoryList
FROM Regions AS r
JOIN Territories AS t ON r.RegionID = t.RegionID
GROUP BY r.RegionID, r.RegionName
ORDER BY r.RegionID"""

tr2 = spark.sql(sql)
tr2.printSchema()
print(tr2)
tr2.show()
print(tr2.take(2))

# Overwrite earlier output so the cell can be re-run; the default save mode
# raises an error when the target path already exists.
tr2.write.mode('overwrite').json('TerritoryRegion.json')

# Same for the table: a previous run may have left it in the metastore.
spark.sql('drop table if exists TerritoryRegion')
spark.sql('create table TerritoryRegion as ' + sql)

In [0]:
# Load the TerritoryRegion.json output back into a DataFrame and inspect it.
tr2 = spark.read.format('json').load('TerritoryRegion.json')
display(tr2)
tr2.printSchema()

### If you have data that is already collected into a complex datatype and want to flatten it, you could use HQL EXPLODE function.

### You could use the Spark explode method.

In [0]:
from pyspark.sql.functions import explode

# One output row per element of TerritoryList, with the element column
# named TerritoryName.
territory_name = explode('TerritoryList').alias('TerritoryName')
tr1.select('RegionID', territory_name).show()

### Or if the DataFrame is turned into a Temp View, you could use the HQL query to do it.

In [0]:
# Make tr1 queryable from SQL under the name RegionTerritories.
tr1.createOrReplaceTempView('RegionTerritories')

# LATERAL VIEW EXPLODE yields one row per array element, exposed here as
# the column TerritoryName.
sql = """SELECT RegionID, TerritoryName
FROM RegionTerritories
LATERAL VIEW EXPLODE(TerritoryList) EXPLODED_TABLE AS TerritoryName
ORDER BY RegionID, TerritoryName
"""
exploded = spark.sql(sql)
exploded.show()

### Or you could select specific elements from a collection.

In [0]:
# Re-point the RegionTerritories view at tr2.
tr2.createOrReplaceTempView('RegionTerritories')

# Adjacent literals are concatenated into a single-line statement: first
# element, last element (index size - 1) and element count of the array.
first_last_sql = (
    "select RegionId, RegionName, "
    "TerritoryList[0] as First, "
    "TerritoryList[size(TerritoryList) - 1] as Last, "
    "size(TerritoryList) as TerritoryCount "
    "from RegionTerritories"
)
spark.sql(first_last_sql).show()

### If the array is of structs note the syntax of fetching the elements from the struct uses the . like an object property.

In [0]:
# Each exploded element is exposed as Territory; its fields are read with
# dot notation in the select list.
sql = """SELECT RegionID, RegionName, Territory.TerritoryID AS TerritoryID, Territory.TerritoryName AS TerritoryName
FROM RegionTerritories
LATERAL VIEW EXPLODE(TerritoryList) EXPLODED_TABLE AS Territory
"""
territory_rows = spark.sql(sql)
territory_rows.show()

### To read from Cassandra:
Requires PYSPARK_SUBMIT_ARGS="--packages com.datastax.spark:spark-cassandra-connector_2.11:2.4.1"


In [0]:
# Read table `student` from keyspace `classroom` through the Cassandra source.
cassandra_reader = (
    spark.read.format("org.apache.spark.sql.cassandra")
    .option("table", "student")
    .option("keyspace", "classroom")
)
people = cassandra_reader.load()
display(people)

### To read from Mongo:
Requires PYSPARK_SUBMIT_ARGS="--packages org.mongodb.spark:mongo-spark-connector_2.11:2.4.1"

In [0]:
# The connection URI is passed to the Mongo source as its `uri` option.
mongo_uri = "mongodb://127.0.0.1/classroom.people"
df = spark.read.format("mongo").options(uri=mongo_uri).load()
display(df)