In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T
from pyspark import SparkConf

# Build the Spark configuration for the "dmltest" application.
conf = SparkConf().setAppName("dmltest")
# Ask Spark to resolve the Delta Lake core package (Scala 2.12 build, version 2.1.0).
conf.set('spark.jars.packages', 'io.delta:delta-core_2.12:2.1.0')
# Register Delta's SQL extension and make DeltaCatalog the session catalog.
conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
# Select the S3 single-driver log store implementation for Delta.
# NOTE(review): the inputs loaded in this notebook are local paths - confirm an S3 log store is intended.
conf.set("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore")


In [None]:
# Create the Spark session from `conf`, or reuse a session that is already running.
spark = (
    SparkSession.builder
    .config(conf=conf)
    .getOrCreate()
)

In [None]:
# Source datasets. Trip data is read from Parquet; customer, rental and staff data are
# CSV files read with a header row and schema inference switched on.
df_trp = spark.read.parquet('/home/glue_user/workspace/sparklearning/src_data_pq/tripdata')
df_cus = (
    spark.read
    .options(header='true', inferschema='true')
    .csv('/home/glue_user/workspace/sparklearning/src_data_csv/Customer.csv')
)
df_ren = (
    spark.read
    .options(header='true', inferschema='true')
    .csv('/home/glue_user/workspace/sparklearning/src_data_csv/dvdrental.csv')
)
df_stf = (
    spark.read
    .options(header='true', inferschema='true')
    .csv('/home/glue_user/workspace/sparklearning/src_data_csv/Staff.csv')
)


In [None]:
# selectExpr / alias / withColumnRenamed: three ways to give Company_name a new name.
# Each variant prints its first two rows without truncating cell contents.

# 1) SQL-style "column alias" expression: keeps every column and appends the aliased copy.
(
    df_cus
    .selectExpr("*", "Company_name nameOfCompany_using_selectExpr")
    .limit(2)
    .show(truncate=False)
)

# 2) Column API: same shape as above, with the alias applied through Column.alias.
(
    df_cus
    .select("*", F.col('Company_name').alias('nameOfCompany_using_alias'))
    .limit(2)
    .show(truncate=False)
)

# 3) Rename in place: no extra column is added.
(
    df_cus
    .withColumnRenamed('Company_name', 'nameOfCompany_using_withColumnRenamed')
    .limit(2)
    .show(truncate=False)
)

In [None]:
# groupBy: per-town aggregates, written once with the functions API and once with SQL expressions.
# The average is taken over Company_ref (the column the second query sums). Averaging the
# grouping key Town itself cannot produce a meaningful per-group statistic.
df_cus.groupBy('Town').agg(F.avg('Company_ref').alias('zavg'), F.count('Town').alias('cnt')).show()
# Same grouping via SQL expression strings: row count and the integer-cast sum of Company_ref.
df_cus.groupBy('Town').agg(F.expr("count(*) cnt"), F.expr("cast(sum(Company_ref) as integer) sm")).show()

In [None]:
# windowFunction: number the rows of each town by Company_ref (rn) and attach the town's
# row count (cnt), keep towns that have more than one row, then list them by town with
# the highest rn first.
(
    df_cus
    .selectExpr(
        "*",
        "row_number() over(partition by Town order by Company_ref) as rn",
        "count(*) over(partition by Town) cnt",
    )
    .where("cnt>1")
    .orderBy(F.expr("town"), F.expr("rn desc"))
    .show()
)

In [None]:
# Parse a timestamp literal that carries a fractional second and a +05:30 offset with
# to_timestamp, and print the resulting `bd` column without truncation.
spark.sql('''
select
    to_timestamp('1993-08-15T10:30:45.5+05:30') bd
''').show(truncate=False)

In [None]:
# Print the column names and types of the rental DataFrame (loaded from CSV with schema inference).
df_ren.printSchema()

In [None]:
# join: register the DataFrames as temporary views so they can be queried with SQL.
for view_name, frame in (('dvdrental', df_ren), ('customer', df_cus), ('staff', df_stf)):
    frame.createOrReplaceTempView(view_name)

# Inner join of rentals to customers on customer_id; `*` returns the columns of both views.
# NOTE(review): assumes both views expose a customer_id column - the customer data is only
# seen using Company_name / Town / Company_ref elsewhere in this notebook; confirm.
join_query = '''
select
    *
from
    dvdrental
join
    customer
    on dvdrental.customer_id=customer.customer_id
'''
spark.sql(join_query).show()

In [None]:
# Write the 1000 rows of spark.range(1000) as a Delta table named cloud.test_delta,
# using overwrite mode so a re-run replaces the previous contents.
# NOTE(review): nothing in this notebook creates the `cloud` database - presumably it already
# exists in the catalog; confirm before running on a fresh environment.
spark.range(1000).write.mode('overwrite').format('delta').saveAsTable('cloud.test_delta')

In [None]:
# Load the Delta table written above back into a DataFrame by its catalog name.
df = spark.table("cloud.test_delta")

In [6]:
# Stop the Spark session created earlier in the notebook.
spark.stop()