In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [74]:
spark = SparkSession.builder.appName('Partitioning').master('local[*]').enableHiveSupport().config("spark.sql.catalogImplementation","hive").getOrCreate()

In [75]:
emp_df = spark.read.csv('file:///home/jovyan/work/HR-Dataset/core_dataset.csv',header=True,inferSchema=True)

### Catalog

In [47]:
spark.catalog.listCatalogs() # Catalog -> database

[CatalogMetadata(name='spark_catalog', description=None)]

In [42]:
spark.catalog.currentCatalog()

'spark_catalog'

In [43]:
# create table employee (
#     emp_id int NOT NULL,
#     name string,
#     dept string
# )

schema = StructType([
    StructField('emp_id',IntegerType(),False),
    StructField('name',StringType(),False),
    StructField('dept',StringType(),False),
])

# spark.sql('''
# create table employee (
#     emp_id int NOT NULL,
#     name string,
#     dept string
# )
# ''')

In [44]:
spark.catalog.createTable('employee',schema=schema)

DataFrame[emp_id: int, name: string, dept: string]

In [41]:
spark.catalog.listTables()

[]

In [33]:
# spark.catalog.dropGlobalTempView('employee')

False

In [51]:
tbl = spark.catalog.getTable('employee')

### createOrReplaceTempView

In [79]:
# emp_df -> dataFrame 
# emp_tbl -> tempView

emp_df.createOrReplaceTempView('emp_tbl')

### spark.sql

In [80]:
spark.sql('select * from emp_tbl limit 5').show()

+--------------------+---------------+-----+------+----------+---+------+-----------+-----------+---------------+--------------------+------------+-------------------+--------------------+-----------------+-------------+--------------------+--------+------------------+--------------------+--------------------+
|       Employee Name|Employee Number|State|   Zip|       DOB|Age|   Sex|MaritalDesc|CitizenDesc|Hispanic/Latino|            RaceDesc|Date of Hire|Date of Termination|     Reason For Term|Employment Status|   Department|            Position|Pay Rate|      Manager Name|     Employee Source|   Performance Score|
+--------------------+---------------+-----+------+----------+---+------+-----------+-----------+---------------+--------------------+------------+-------------------+--------------------+-----------------+-------------+--------------------+--------+------------------+--------------------+--------------------+
|          Brown, Mia|     1103024456|   MA|1450.0|11/24/1985| 3

In [56]:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
+---------+



In [64]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|  default| employee|      false|
|         |  emp_tbl|       true|
+---------+---------+-----------+



In [59]:
spark.sql('create database emp_db').show()

++
||
++
++



In [60]:
spark.sql('show databases').show()

+---------+
|namespace|
+---------+
|  default|
|   emp_db|
+---------+



In [65]:
spark.sql('use database emp_db').show()

++
||
++
++



In [62]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|         |  emp_tbl|       true|
+---------+---------+-----------+



In [66]:
spark.sql('''
create table emp_db.employee (
    emp_id int NOT NULL,
    name string,
    dept string,
    salary double
)
''')

DataFrame[]

In [67]:
spark.sql('show tables').show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|   emp_db| employee|      false|
|         |  emp_tbl|       true|
+---------+---------+-----------+



In [68]:
spark.sql('''
insert into emp_db.employee (emp_id, name, dept, salary) values (1, 'John', 'Sales', 100000)
''')

DataFrame[]

In [71]:
spark.sql('select * from employee').show()

+------+----+-----+--------+
|emp_id|name| dept|  salary|
+------+----+-----+--------+
|     1|John|Sales|100000.0|
+------+----+-----+--------+



In [72]:
spark.sql('select * from default.employee').show()

+------+----+----+
|emp_id|name|dept|
+------+----+----+
+------+----+----+



In [73]:
spark.stop()

In [78]:
spark.sql('select * from emp_db.employee').show()

+------+----+-----+--------+
|emp_id|name| dept|  salary|
+------+----+-----+--------+
|     1|John|Sales|100000.0|
+------+----+-----+--------+



In [81]:
spark.sql('''
select count(*) from emp_tbl where Sex='Male' and Age between 25 and 30 and MaritalDesc in ('Single', 'Seperated', 'Divorced')
''').show()

+--------+
|count(1)|
+--------+
|      11|
+--------+



In [85]:
# emp_df.withColumn('First_name', split(col('Employee Name'),', ').getItem(0)).\
# withColumn('Last_name', split(col('Employee Name'),', ').getItem(1)).\
# withColumn('new_pay_rate', col('Pay Rate')*1.1 ).\
# select('First_name','Last_name','new_pay_rate','Department').show()

spark.sql('''
select 
split(`Employee Name` , ', ')[0] as first_name,
split(`Employee Name` , ', ')[1] as last_name,
`Pay Rate` * 1.1 as new_pay_rate,
Department
from emp_tbl
''').show()

+------------+----------+------------------+----------------+
|  first_name| last_name|      new_pay_rate|      Department|
+------------+----------+------------------+----------------+
|       Brown|       Mia|             31.35|   Admin Offices|
|   LaRotonda| William  |              25.3|   Admin Offices|
|      Steans|  Tyrone  |31.900000000000002|   Admin Offices|
|      Howard|   Estelle|23.650000000000002|   Admin Offices|
|       Singh|      Nan |            18.216|   Admin Offices|
|       Smith| Leigh Ann|             22.55|   Admin Offices|
|     LeBlanc|Brandon  R| 60.50000000000001|   Admin Offices|
|       Quinn|      Sean| 60.50000000000001|   Admin Offices|
|    Boutwell|   Bonalyn| 38.44500000000001|   Admin Offices|
|Foster-Baker|       Amy| 38.44500000000001|   Admin Offices|
|        King|     Janet|              88.0|Executive Office|
|      Zamora|  Jennifer|              71.5|           IT/IS|
|      Becker|     Renee|47.300000000000004|           IT/IS|
|       

In [90]:
ddl = spark.sql('show create table emp_db.employee').first()['createtab_stmt']
print(ddl)

CREATE TABLE emp_db.employee (
  emp_id INT NOT NULL,
  name STRING,
  dept STRING,
  salary DOUBLE)
USING text
TBLPROPERTIES (
  'transient_lastDdlTime' = '1703262655')



In [101]:
df = spark.table('emp_db.employee').withColumn('hike', lit(None).cast('double'))

df.write.mode('overwrite').option('overwriteSchema','true').saveAsTable('emp_db.employee2')