In [1]:
import os
import sys
# Point this kernel at the local Spark install and put its bundled Python
# archives at the front of sys.path (py4j first, then pyspark).
os.environ['SPARK_HOME'] = '/home/cloudera/spark230hadoop26'
os.environ['PYLIB'] = ''.join([os.environ['SPARK_HOME'], '/python/lib'])
for position, archive in enumerate(('/py4j-0.10.6-src.zip', '/pyspark.zip')):
    sys.path.insert(position, os.environ['PYLIB'] + archive)

In [2]:
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np

In [3]:
# Build (or reuse) a Hive-enabled session named 'TestHive'.
# The warehouse location has to be supplied under `spark.sql.warehouse.dir`;
# the key used previously, `spark.warehouse.dir`, is not a Spark setting and
# was silently ignored.
spark = (
    SparkSession.builder
    .appName('TestHive')
    .config('spark.sql.warehouse.dir', '/apps/hive/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [4]:
# Sanity check: display the version reported by the Spark session that was
# created above.
spark.version

'2.3.0'

In [5]:
# Keep a handle on the session's SparkContext; later cells use it to
# parallelize local Python lists into RDDs.
sc = spark.sparkContext

In [6]:
# Sample records, each a single "accNo,tranAmount" string.
acTransList = [
    "SB10001,1000",
    "SB10002,1200",
    "SB10003,8000",
    "SB10004,400",
    "SB10005,300",
    "SB10006,10000",
    "SB10007,500",
    "SB10008,56",
    "SB10009,30",
    "SB10010,7000",
    "CR10001,7000",
    "SB10002,-10",
]
# Distribute the raw strings as an RDD.
acRDD = sc.parallelize(acTransList)

In [7]:
# Split every record on the comma and name the two resulting columns.
# Both columns hold the string pieces produced by split(',').
acTransDF = (
    acRDD
    .map(lambda record: record.split(','))
    .toDF(['accNo', 'tranAmount'])
)
acTransDF.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|      1000|
|SB10002|      1200|
|SB10003|      8000|
|SB10004|       400|
|SB10005|       300|
|SB10006|     10000|
|SB10007|       500|
|SB10008|        56|
|SB10009|        30|
|SB10010|      7000|
|CR10001|      7000|
|SB10002|       -10|
+-------+----------+



In [22]:
# Using create data frame method of spark
from pyspark.sql.functions import *
# Same records as acRDD, this time turned into a DataFrame through
# spark.createDataFrame(); the columns are then renamed with toDF().
acTransDFFmSQLC = spark.createDataFrame(
    acRDD.map(lambda record: record.split(','))
).toDF('accno', 'tranamount')
# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in print() additionally printed a stray "None".
acTransDFFmSQLC.printSchema()
acTransDFFmSQLC.show()

root
 |-- accno: string (nullable = true)
 |-- tranamount: string (nullable = true)

None
+-------+----------+
|  accno|tranamount|
+-------+----------+
|SB10001|      1000|
|SB10002|      1200|
|SB10003|      8000|
|SB10004|       400|
|SB10005|       300|
|SB10006|     10000|
|SB10007|       500|
|SB10008|        56|
|SB10009|        30|
|SB10010|      7000|
|CR10001|      7000|
|SB10002|       -10|
+-------+----------+



In [13]:
print("\nData frame select column, columns")
# Scala signature, for reference: def select(col: String, cols: String*): DataFrame
# Project the single accNo column and pull the resulting rows to the driver.
accNoRows = acTransDF.select("accNo").collect()
print(accNoRows)
acTransDF.show()


Data frame select column, columns
[Row(accNo='SB10001'), Row(accNo='SB10002'), Row(accNo='SB10003'), Row(accNo='SB10004'), Row(accNo='SB10005'), Row(accNo='SB10006'), Row(accNo='SB10007'), Row(accNo='SB10008'), Row(accNo='SB10009'), Row(accNo='SB10010'), Row(accNo='CR10001'), Row(accNo='SB10002')]
+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|      1000|
|SB10002|      1200|
|SB10003|      8000|
|SB10004|       400|
|SB10005|       300|
|SB10006|     10000|
|SB10007|       500|
|SB10008|        56|
|SB10009|        30|
|SB10010|      7000|
|CR10001|      7000|
|SB10002|       -10|
+-------+----------+



In [14]:
print("Data fame selection using select expr")
# selectExpr takes SQL expression strings, so a column can be renamed inline
# with "as".
renamedDF = acTransDF.selectExpr("accNo as account_no", "tranAmount")
renamedDF.show()

Data fame selection using select expr
+----------+----------+
|account_no|tranAmount|
+----------+----------+
|   SB10001|      1000|
|   SB10002|      1200|
|   SB10003|      8000|
|   SB10004|       400|
|   SB10005|       300|
|   SB10006|     10000|
|   SB10007|       500|
|   SB10008|        56|
|   SB10009|        30|
|   SB10010|      7000|
|   CR10001|      7000|
|   SB10002|       -10|
+----------+----------+



In [16]:
print("filter equivalent to where, operation - filter and where seem to be equivalent")
# The predicate is given as a SQL expression string.
largeTranDF = acTransDF.filter("tranAmount >= 1000")
largeTranDF.show()

filter equivalent to where, operation - filter and where seem to be equivalent
+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|      1000|
|SB10002|      1200|
|SB10003|      8000|
|SB10006|     10000|
|SB10010|      7000|
|CR10001|      7000|
+-------+----------+



In [17]:
# where() with a compound SQL predicate: amount threshold plus an account
# number pattern match.
largeSBTranDF = acTransDF.where("tranAmount >= 1000 and accNo like'SB%' ")
largeSBTranDF.show()

+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|      1000|
|SB10002|      1200|
|SB10003|      8000|
|SB10006|     10000|
|SB10010|      7000|
+-------+----------+



In [41]:
from pyspark.sql.functions import *
print("Filtering for conditions on multiple columns")
# Build each condition as a Column expression, then combine them with &.
isLargeAmount = acTransDF.tranAmount.cast('float') >= 1000
hasSBPrefix = acTransDF.accNo.startswith('SB')
acTransDF.filter(isLargeAmount & hasSBPrefix).show()

Filtering for conditions on multiple columns
+-------+----------+
|  accNo|tranAmount|
+-------+----------+
|SB10001|      1000|
|SB10002|      1200|
|SB10003|      8000|
|SB10006|     10000|
|SB10010|      7000|
+-------+----------+



In [47]:
def tenf(x):
    """Return ``x`` multiplied by ten."""
    return x * 10


# Declare the result type explicitly: udf() defaults to a string return type,
# which would make tranM10 a string column rather than a numeric one.
tenudf = udf(tenf, returnType='double')

# The same projection twice: the column is reached once by attribute access
# and once by key lookup.
acTransDF.select(
    'accNo', 'tranAmount',
    tenudf(acTransDF.tranAmount.cast('float')).alias('tranM10'),
).show()
acTransDF.select(
    'accNo', 'tranAmount',
    tenudf(acTransDF['tranAmount'].cast('float')).alias('tranM10'),
).show()

+-------+----------+--------+
|  accNo|tranAmount| tranM10|
+-------+----------+--------+
|SB10001|      1000| 10000.0|
|SB10002|      1200| 12000.0|
|SB10003|      8000| 80000.0|
|SB10004|       400|  4000.0|
|SB10005|       300|  3000.0|
|SB10006|     10000|100000.0|
|SB10007|       500|  5000.0|
|SB10008|        56|   560.0|
|SB10009|        30|   300.0|
|SB10010|      7000| 70000.0|
|CR10001|      7000| 70000.0|
|SB10002|       -10|  -100.0|
+-------+----------+--------+

+-------+----------+--------+
|  accNo|tranAmount| tranM10|
+-------+----------+--------+
|SB10001|      1000| 10000.0|
|SB10002|      1200| 12000.0|
|SB10003|      8000| 80000.0|
|SB10004|       400|  4000.0|
|SB10005|       300|  3000.0|
|SB10006|     10000|100000.0|
|SB10007|       500|  5000.0|
|SB10008|        56|   560.0|
|SB10009|        30|   300.0|
|SB10010|      7000| 70000.0|
|CR10001|      7000| 70000.0|
|SB10002|       -10|  -100.0|
+-------+----------+--------+



In [55]:
# Sample stock records, one comma-separated string each. The four fields are
# later loaded as symbol, trdate, qty and vlu.
stlist = [
    "INFY,2017-05-01,2000,2164550",
    "INFY,2017-5-02,1954,2174352",
    "INFY,2017-06-03,2341,2934231",
    "INFY,2017-06-04,1814,1904557",
    "SBIN,2017-05-01,200061,3164550",
    "SBIN,2017-5-02,211954,3174352",
    "SBIN,2017-06-03,222341,3434234",
    "SBIN,2017-06-04,301814,4590455",
]

In [59]:
from pyspark.sql.types import *
# Explicit schema: symbol and trade date stay strings, qty is an integer and
# vlu is a double.
stock_schema = StructType([
    StructField('symbol', StringType()),
    StructField('trdate', StringType()),
    StructField('qty', IntegerType()),
    StructField('vlu', DoubleType()),
])
# Split each record on commas and convert the two numeric fields to
# int / float so that the tuples line up with stock_schema.
# (The second map previously read `blambda`, which is a syntax error.)
stock_rdd = (
    sc.parallelize(stlist)
    .map(lambda record: record.split(','))
    .map(lambda fields: (fields[0], fields[1], int(fields[2]), float(fields[3])))
)
stdf = spark.createDataFrame(stock_rdd, stock_schema)
# printSchema() prints the schema itself and returns None, so it is called
# directly; wrapping it in print() additionally printed a stray "None".
stdf.printSchema()
stdf.show()

root
 |-- symbol: string (nullable = true)
 |-- trdate: string (nullable = true)
 |-- qty: integer (nullable = true)
 |-- vlu: double (nullable = true)

None
+------+----------+------+---------+
|symbol|    trdate|   qty|      vlu|
+------+----------+------+---------+
|  INFY|2017-05-01|  2000|2164550.0|
|  INFY| 2017-5-02|  1954|2174352.0|
|  INFY|2017-06-03|  2341|2934231.0|
|  INFY|2017-06-04|  1814|1904557.0|
|  SBIN|2017-05-01|200061|3164550.0|
|  SBIN| 2017-5-02|211954|3174352.0|
|  SBIN|2017-06-03|222341|3434234.0|
|  SBIN|2017-06-04|301814|4590455.0|
+------+----------+------+---------+



In [62]:
# Build the trade-date expression once, then derive the year and month
# columns from it; symbol, qty and vlu are carried over unchanged.
tradeDate = to_date("trdate")
stdf_wym = stdf.select(
    "symbol",
    year(tradeDate).alias("yr"),
    month(tradeDate).alias("mnth"),
    "qty",
    "vlu",
)
stdf_wym.show()

+------+----+----+------+---------+
|symbol|  yr|mnth|   qty|      vlu|
+------+----+----+------+---------+
|  INFY|2017|   5|  2000|2164550.0|
|  INFY|2017|   5|  1954|2174352.0|
|  INFY|2017|   6|  2341|2934231.0|
|  INFY|2017|   6|  1814|1904557.0|
|  SBIN|2017|   5|200061|3164550.0|
|  SBIN|2017|   5|211954|3174352.0|
|  SBIN|2017|   6|222341|3434234.0|
|  SBIN|2017|   6|301814|4590455.0|
+------+----+----+------+---------+



In [67]:
print('Manipulating spark data frames using the underlying rdd')
# Turn every Row of stdf into a plain tuple of its four fields, reading them
# by name (in schema order) rather than by position.
stdf.rdd.map(lambda row: (row.symbol, row.trdate, row.qty, row.vlu)).collect()

Manipulating spark data frames using the underlying rdd


[('INFY', '2017-05-01', 2000, 2164550.0),
 ('INFY', '2017-5-02', 1954, 2174352.0),
 ('INFY', '2017-06-03', 2341, 2934231.0),
 ('INFY', '2017-06-04', 1814, 1904557.0),
 ('SBIN', '2017-05-01', 200061, 3164550.0),
 ('SBIN', '2017-5-02', 211954, 3174352.0),
 ('SBIN', '2017-06-03', 222341, 3434234.0),
 ('SBIN', '2017-06-04', 301814, 4590455.0)]

In [73]:
print("cube will provide every cobmination of the fields used to create the cube")
# `sum` here is the Spark SQL aggregate brought in by the wildcard import of
# pyspark.sql.functions, not Python's builtin sum.
cubeTotals = [sum("qty").alias("qty"), sum("vlu").alias("vlu")]
stdf_wym.cube("symbol", "yr", "mnth").agg(*cubeTotals).show()

cube will provide every cobmination of the fields used to create the cube
+------+----+----+------+-----------+
|symbol|  yr|mnth|   qty|        vlu|
+------+----+----+------+-----------+
|  SBIN|null|null|936170|1.4363591E7|
|  SBIN|2017|   6|524155|  8024689.0|
|  null|2017|   5|415969|1.0677804E7|
|  SBIN|null|   5|412015|  6338902.0|
|  null|null|   5|415969|1.0677804E7|
|  SBIN|2017|null|936170|1.4363591E7|
|  SBIN|2017|   5|412015|  6338902.0|
|  null|2017|   6|528310|1.2863477E7|
|  INFY|null|null|  8109|  9177690.0|
|  SBIN|null|   6|524155|  8024689.0|
|  INFY|null|   6|  4155|  4838788.0|
|  null|null|null|944279|2.3541281E7|
|  INFY|null|   5|  3954|  4338902.0|
|  INFY|2017|   5|  3954|  4338902.0|
|  null|null|   6|528310|1.2863477E7|
|  INFY|2017|null|  8109|  9177690.0|
|  null|2017|null|944279|2.3541281E7|
|  INFY|2017|   6|  4155|  4838788.0|
+------+----+----+------+-----------+



In [74]:
print("\nrollup will rollup aggregates beginning from the first field in the rollup columns")
# `sum` here is the Spark SQL aggregate brought in by the wildcard import of
# pyspark.sql.functions, not Python's builtin sum.
rollupTotals = [sum("qty").alias("qty"), sum("vlu").alias("vlu")]
stdf_wym.rollup("symbol", "yr", "mnth").agg(*rollupTotals).show()


rollup will rollup aggregates beginning from the first field in the rollup columns
+------+----+----+------+-----------+
|symbol|  yr|mnth|   qty|        vlu|
+------+----+----+------+-----------+
|  SBIN|null|null|936170|1.4363591E7|
|  SBIN|2017|   6|524155|  8024689.0|
|  SBIN|2017|null|936170|1.4363591E7|
|  SBIN|2017|   5|412015|  6338902.0|
|  INFY|null|null|  8109|  9177690.0|
|  null|null|null|944279|2.3541281E7|
|  INFY|2017|   5|  3954|  4338902.0|
|  INFY|2017|null|  8109|  9177690.0|
|  INFY|2017|   6|  4155|  4838788.0|
+------+----+----+------+-----------+



In [77]:
# Expose the year/month frame to Spark SQL under the name "stmdtbl".
# createOrReplaceTempView is the supported replacement for
# registerTempTable, which has been deprecated since Spark 2.0.
stdf_wym.createOrReplaceTempView("stmdtbl")
# Also persist the same frame as Parquet, replacing any earlier output at
# this location.
stdf_wym.write.mode('overwrite').parquet("hdfs://localhost:8020/user/cloudera/stdf_parquet")

In [79]:
# DDL for an external, Parquet-backed table over the files at the given HDFS
# location; it is only created when no table of that name exists yet.
# NOTE(review): an earlier cell registers a temporary table that is also
# named stmdtbl - confirm which of the two the query below resolves to.
createTableSql = """create external table if not exists
            stmdtbl(symbol string,yr int, mnth int, qty int, vlu double )
            stored as parquet
            location 'hdfs://localhost:8020/user/cloudera/stdf_parquet'"""
spark.sql(createTableSql)

print("Using sql to carry out multi dimensional aggregations")
# SQL form of the cube aggregation over symbol, yr and mnth.
cubeSql = """select symbol,yr,mnth,sum(qty) as qty, sum(vlu) as vlu
          from stmdtbl
          group by symbol, yr, mnth
          with cube"""
spark.sql(cubeSql).show()


Using sql to carry out multi dimensional aggregations
+------+----+----+------+-----------+
|symbol|  yr|mnth|   qty|        vlu|
+------+----+----+------+-----------+
|  SBIN|null|null|936170|1.4363591E7|
|  SBIN|2017|   6|524155|  8024689.0|
|  null|2017|   5|415969|1.0677804E7|
|  SBIN|null|   5|412015|  6338902.0|
|  null|null|   5|415969|1.0677804E7|
|  SBIN|2017|null|936170|1.4363591E7|
|  SBIN|2017|   5|412015|  6338902.0|
|  null|2017|   6|528310|1.2863477E7|
|  INFY|null|null|  8109|  9177690.0|
|  SBIN|null|   6|524155|  8024689.0|
|  INFY|null|   6|  4155|  4838788.0|
|  null|null|null|944279|2.3541281E7|
|  INFY|null|   5|  3954|  4338902.0|
|  INFY|2017|   5|  3954|  4338902.0|
|  null|null|   6|528310|1.2863477E7|
|  INFY|2017|null|  8109|  9177690.0|
|  null|2017|null|944279|2.3541281E7|
|  INFY|2017|   6|  4155|  4838788.0|
+------+----+----+------+-----------+

