In [1]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.context import SparkContext
import time
from pyspark.sql.types import StructType, StructField, StringType
import os


In [2]:
#Spark session with cluster mode(yarn)
spark = SparkSession.builder \
     .master("yarn") \
     .appName("Pyspark SQL") \
     .config("spark.ui.port", "50032") \
     .getOrCreate()

In [3]:
#to print all spark configuarations
#to get application URL(DAG) use setting PROXY_URI_BASES
spark.sparkContext.getConf().getAll() 

[('spark.history.kerberos.keytab', 'none'),
 ('spark.eventLog.enabled', 'true'),
 ('spark.dynamicAllocation.initialExecutors', '2'),
 ('spark.history.ui.port', '18081'),
 ('spark.dynamicAllocation.maxExecutors', '10'),
 ('spark.driver.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter.param.PROXY_URI_BASES',
  'http://rm01.itversity.com:19088/proxy/application_1589064448439_1422'),
 ('spark.executor.extraLibraryPath',
  '/usr/hdp/current/hadoop-client/lib/native:/usr/hdp/current/hadoop-client/lib/native/Linux-amd64-64'),
 ('spark.executorEnv.PYTHONPATH',
  '/usr/hdp/current/spark2-client/python/lib/py4j-0.10.6-src.zip:/usr/hdp/current/spark2-client/python/<CPS>{{PWD}}/pyspark.zip<CPS>{{PWD}}/py4j-0.10.6-src.zip'),
 ('spark.history.provider',
  'org.apache.spark.deploy.history.FsHistoryProvider'),
 ('spark.org.apache.hadoop.yarn.server.webproxy.amf

In [4]:
#Create Spark Context
sc = spark.sparkContext

In [5]:
#Create SQL Context
sql = spark.sql

In [6]:
#Directly connect to hive Database
sql("use rposam_db")

DataFrame[]

In [7]:
#Query some table from hive database
sql("select * from emp").show()

+------+------+---------+------+----------+------+------+------+
| empno| ename|      job|   mgr|  hiredate|   sal|  comm|deptno|
+------+------+---------+------+----------+------+------+------+
|7369.0| SMITH|    CLERK|7902.0|1980-12-17| 800.0|  null|  20.0|
|7499.0| ALLEN| SALESMAN|7698.0|1981-02-20|1600.0| 300.0|  30.0|
|7521.0|  WARD| SALESMAN|7698.0|1981-02-22|1250.0| 500.0|  30.0|
|7566.0| JONES|  MANAGER|7839.0|1981-04-02|2975.0|  null|  20.0|
|7654.0|MARTIN| SALESMAN|7698.0|1981-09-28|1250.0|1400.0|  30.0|
|7698.0| BLAKE|  MANAGER|7839.0|1981-05-01|2850.0|  null|  30.0|
|7782.0| CLARK|  MANAGER|7839.0|1981-06-09|2450.0|  null|  10.0|
|7788.0| SCOTT|  ANALYST|7566.0|1982-12-09|3000.0|  null|  20.0|
|7839.0|  KING|PRESIDENT|  null|1981-11-17|5000.0|  null|  10.0|
|7844.0|TURNER| SALESMAN|7698.0|1981-09-08|1500.0|   0.0|  30.0|
|7876.0| ADAMS|    CLERK|7788.0|1983-01-12|1100.0|  null|  20.0|
|7900.0| JAMES|    CLERK|7698.0|1981-12-03| 950.0|  null|  30.0|
|7902.0|  FORD|  ANALYST|

In [36]:
emp.to_csv("output/emp.csv")

In [37]:
emp.to_excel("output/emp.xlsx")

In [38]:
emp.to_json("output/emp.json")

In [42]:
emp.to_parquet("output/emp.parquet",engine="pyarrow")

In [46]:
emp_csv = spark.read.csv("data/emp.csv")

DataFrame[_c0: string, _c1: string, _c2: string, _c3: string, _c4: string, _c5: string, _c6: string, _c7: string]

In [22]:
##Read josn file using pandas(if you use spark then reads from HDFS) from local
pandas_df = pd.read_json("output/emp.json")

In [127]:
##Create Dataframe in Spark using Pandas DataFrame
spark_df = spark.createDataFrame(pandas_df)

In [128]:
##Registering as temp Table
spark_df.createOrReplaceTempView("empTab")

In [129]:
#Query using SparkSQL Core
sql("select * from empTab").show()

+-----+------+---------+------+----------+----+------+------+
|empno| ename|      job|   mgr|  hiredate| sal|  comm|deptno|
+-----+------+---------+------+----------+----+------+------+
| 7369| SMITH|    CLERK|7902.0|1980-12-17| 800|   NaN|    20|
| 7499| ALLEN| SALESMAN|7698.0|1981-02-20|1600| 300.0|    30|
| 7521|  WARD| SALESMAN|7698.0|1981-02-22|1250| 500.0|    30|
| 7566| JONES|  MANAGER|7839.0|1981-04-02|2975|   NaN|    20|
| 7654|MARTIN| SALESMAN|7698.0|1981-09-28|1250|1400.0|    30|
| 7698| BLAKE|  MANAGER|7839.0|1981-05-01|2850|   NaN|    30|
| 7782| CLARK|  MANAGER|7839.0|1981-06-09|2450|   NaN|    10|
| 7788| SCOTT|  ANALYST|7566.0|1982-12-09|3000|   NaN|    20|
| 7839|  KING|PRESIDENT|   NaN|1981-11-17|5000|   NaN|    10|
| 7844|TURNER| SALESMAN|7698.0|1981-09-08|1500|   0.0|    30|
| 7876| ADAMS|    CLERK|7788.0|1983-01-12|1100|   NaN|    20|
| 7900| JAMES|    CLERK|7698.0|1981-12-03| 950|   NaN|    30|
| 7902|  FORD|  ANALYST|7566.0|1981-12-03|3000|   NaN|    20|
| 7934|M

In [8]:
#To read json file using Spark from HDFS 
df = spark.read.json("data/emp.json")

In [9]:
def convert_empdf_collection(df):
    l = []
    for row in df.collect():
        for i in range(0,len(row.empno)):
            l.append(
                (row.empno[i],
                 row.ename[i],
                 row.job[i],
                 row.mgr[i],
                 row.hiredate[i],
                 row.sal[i],
                 row.comm[i],
                 row.deptno[i])
            )
    return l

In [10]:
emp_l = convert_empdf_collection(df)
emp_df = spark.createDataFrame(emp_l,("empno","ename","job","mgr","hiredate","sal","comm","deptno"))
emp_df.show()

+-----+------+---------+------+----------+------+------+------+
|empno| ename|      job|   mgr|  hiredate|   sal|  comm|deptno|
+-----+------+---------+------+----------+------+------+------+
| 7369| SMITH|    CLERK|7902.0|1980-12-17| 800.0|  null|    20|
| 7499| ALLEN| SALESMAN|7698.0|1981-02-20|1600.0| 300.0|    30|
| 7876| ADAMS|    CLERK|7788.0|1983-01-12|1100.0|  null|    20|
| 7900| JAMES|    CLERK|7698.0|1981-12-03| 950.0|  null|    30|
| 7902|  FORD|  ANALYST|7566.0|1981-12-03|3000.0|  null|    20|
| 7934|MILLER|    CLERK|7782.0|1982-01-23|1300.0|  null|    10|
| 7521|  WARD| SALESMAN|7698.0|1981-02-22|1250.0| 500.0|    30|
| 7566| JONES|  MANAGER|7839.0|1981-04-02|2975.0|  null|    20|
| 7654|MARTIN| SALESMAN|7698.0|1981-09-28|1250.0|1400.0|    30|
| 7698| BLAKE|  MANAGER|7839.0|1981-05-01|2850.0|  null|    30|
| 7782| CLARK|  MANAGER|7839.0|1981-06-09|2450.0|  null|    10|
| 7788| SCOTT|  ANALYST|7566.0|1982-12-09|3000.0|  null|    20|
| 7839|  KING|PRESIDENT|  null|1981-11-1

In [11]:
emp_df[emp_df.deptno == 10].show()

+-----+------+---------+------+----------+------+----+------+
|empno| ename|      job|   mgr|  hiredate|   sal|comm|deptno|
+-----+------+---------+------+----------+------+----+------+
| 7934|MILLER|    CLERK|7782.0|1982-01-23|1300.0|null|    10|
| 7782| CLARK|  MANAGER|7839.0|1981-06-09|2450.0|null|    10|
| 7839|  KING|PRESIDENT|  null|1981-11-17|5000.0|null|    10|
+-----+------+---------+------+----------+------+----+------+

