In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types as T

In [None]:
# Create (or reuse, if one is already running) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName('test_dml')
    .getOrCreate()
)

In [62]:
# Root folder of the tutorial datasets; kept in one place so the notebook can be
# pointed at another location without editing every load call.
DATA_ROOT = '/home/glue_user/workspace/sparklearning'


def _read_csv(file_name):
    """Load one CSV file from ``src_data_csv``.

    The first row is treated as the header and column types are inferred
    from the data (same options for every CSV source in this notebook).
    """
    return spark.read.csv(
        f'{DATA_ROOT}/src_data_csv/{file_name}',
        header=True,
        inferSchema=True,
    )


# Trip data is stored as Parquet; the remaining sources are CSV files.
df_trp = spark.read.parquet(f'{DATA_ROOT}/src_data_pq/tripdata')
df_cus = _read_csv('Customer.csv')
df_ren = _read_csv('dvdrental.csv')
df_stf = _read_csv('Staff.csv')


In [None]:
# selectExpr: three ways to expose Company_name under a different name.

# 1) SQL expression string: keeps every column and appends an aliased copy.
(
    df_cus
    .selectExpr("*", "Company_name nameOfCompany_using_selectExpr")
    .limit(2)
    .show(truncate=False)
)

# 2) Column API: keeps every column and appends an aliased copy via .alias().
(
    df_cus
    .select("*", F.col('Company_name').alias('nameOfCompany_using_alias'))
    .limit(2)
    .show(truncate=False)
)

# 3) withColumnRenamed: renames the existing column instead of adding a new one.
(
    df_cus
    .withColumnRenamed('Company_name', 'nameOfCompany_using_withColumnRenamed')
    .limit(2)
    .show(truncate=False)
)

In [None]:
# groupBy: per-town aggregates, first with column functions, then with SQL expressions.

# The average is taken over Company_ref (the column the SQL variant below also
# aggregates). Averaging 'Town' itself, the grouping key, carries no information
# and presumably yields NULL since Town looks like a text column -- TODO confirm schema.
(
    df_cus
    .groupBy('Town')
    .agg(
        F.avg('Company_ref').alias('zavg'),
        F.count('Town').alias('cnt'),
    )
    .show()
)

# Same grouping expressed through SQL snippets: row count and integer-cast sum per town.
(
    df_cus
    .groupBy('Town')
    .agg(
        F.expr("count(*) cnt"),
        F.expr("cast(sum(Company_ref) as integer) sm"),
    )
    .show()
)

In [None]:
# windowFunction: number the rows inside each town (ordered by Company_ref) and
# keep only the towns that appear more than once.
(
    df_cus
    .selectExpr(
        "*",
        "row_number() over(partition by Town order by Company_ref) as rn",
        "count(*) over(partition by Town) cnt",
    )
    .filter("cnt>1")
    # F.expr("rn desc") is parsed as the alias `rn AS desc`, not as a sort
    # direction, so it sorted ascending. Column.desc() gives the intended
    # descending order within each town.
    .orderBy(F.col("town"), F.col("rn").desc())
    .show()
)

In [None]:
# Parse a timestamp literal that carries fractional seconds and a +05:30 offset,
# and display the full, untruncated value.
bd_query = '''
select
    to_timestamp('1993-08-15T10:30:45.5+05:30') bd
'''
spark.sql(bd_query).show(truncate=False)

In [63]:
# Inspect the column names and inferred types of the rental data before joining it.
df_ren.printSchema()

root
 |-- rental_id: integer (nullable = true)
 |-- rental_date: timestamp (nullable = true)
 |-- inventory_id: integer (nullable = true)
 |-- customer_id: integer (nullable = true)
 |-- return_date: timestamp (nullable = true)
 |-- staff_id: integer (nullable = true)
 |-- last_update: timestamp (nullable = true)



In [64]:
# join: expose the DataFrames as temp views so they can be queried with Spark SQL.
for view_name, frame in (
    ('dvdrental', df_ren),
    ('customer', df_cus),
    ('staff', df_stf),
):
    frame.createOrReplaceTempView(view_name)

# Inner join of rentals to customers on customer_id.
# NOTE(review): `select *` returns the columns of both tables, so the result
# contains customer_id and last_update twice; list columns explicitly if the
# result is ever referenced by name downstream.
rental_customer_query = '''
select
    *
from
    dvdrental
join
    customer
    on dvdrental.customer_id=customer.customer_id
'''
spark.sql(rental_customer_query).show()

+---------+-------------------+------------+-----------+-------------------+--------+-------------------+-----------+--------+----------+-----------+--------------------+----------+----------+-------------------+--------------------+------+
|rental_id|        rental_date|inventory_id|customer_id|        return_date|staff_id|        last_update|customer_id|store_id|first_name|  last_name|               email|address_id|activebool|        create_date|         last_update|active|
+---------+-------------------+------------+-----------+-------------------+--------+-------------------+-----------+--------+----------+-----------+--------------------+----------+----------+-------------------+--------------------+------+
|        2|2005-05-24 22:54:33|        1525|        459|2005-05-28 19:40:33|       1|2006-02-16 02:30:53|        459|       1|     Tommy|    Collazo|tommy.collazo@sak...|       464|         t|2006-02-14 00:00:00|2013-05-26 14:49:...|     1|
|        3|2005-05-24 23:03:39|     