# Bermain dengan Format Tanggal

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os

# Export the locations of the Java 8 JDK and the unpacked Spark distribution
# through the JAVA_HOME / SPARK_HOME environment variables.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-2.4.4-bin-hadoop2.7",
})

In [0]:
import findspark

# Run findspark.init() before importing pyspark, as the original cell does.
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) a local Spark session; "local[*]" uses all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [0]:
from pyspark.sql import Row

# Sample purchase records as (device, purchase_time, day_of_week) tuples.
data_dummy = [
    ("notebook", "2020-01-01 00:00:00", 4),
    ("notebook", "2020-02-10 13:00:00", 2),
    ("small_phone", "2020-02-15 12:00:00", 7),
    ("small_phone", "2020-02-23 09:30:00", 1),
]


def _to_row(record):
    """Turn one (device, purchase_time, day_of_week) tuple into a Row with named fields."""
    return Row(device=record[0], purchase_time=record[1], day_of_week=record[2])


rdd = spark.sparkContext.parallelize(data_dummy)
rdd_data_dummy = rdd.map(_to_row)

# Build the DataFrame, then fix the column order explicitly with select().
df_dataset = (
    spark.createDataFrame(rdd_data_dummy)
    .select('purchase_time', 'device', 'day_of_week')
)

In [37]:
# Check the inferred column types of the raw dataset.
df_dataset.printSchema()

root
 |-- purchase_time: string (nullable = true)
 |-- device: string (nullable = true)
 |-- day_of_week: long (nullable = true)



In [38]:
# Preview the sample rows of the raw dataset.
df_dataset.show()

+-------------------+-----------+-----------+
|      purchase_time|     device|day_of_week|
+-------------------+-----------+-----------+
|2020-01-01 00:00:00|   notebook|          4|
|2020-02-10 13:00:00|   notebook|          2|
|2020-02-15 12:00:00|small_phone|          7|
|2020-02-23 09:30:00|small_phone|          1|
+-------------------+-----------+-----------+



In [73]:
from pyspark.sql.functions import to_date, date_format, to_timestamp, hour, minute, second

# Derive date/time attributes from the string column `purchase_time`.
#
# date_format patterns follow java.text.SimpleDateFormat in Spark 2.4:
#   * 'yyyy' is the calendar year. The upper-case 'YYYY' is the *week-based*
#     year and returns the wrong year for dates close to a year boundary
#     (e.g. 2019-12-30 would be formatted as 2020), so it must not be used
#     for date keys such as date_id / hour_id.
#   * 'E' / 'EEEE' give the short / full day name.
#   * 'u' is the day number of the week (1 = Monday, ..., 7 = Sunday).
df_stg_dataset = (
    df_dataset
    .withColumn('date_form', to_date(df_dataset.purchase_time))
    .withColumn('date_id', date_format(df_dataset.purchase_time, 'yyyyMMdd'))
    .withColumn('hour_id', date_format(df_dataset.purchase_time, 'yyyyMMddHH'))
    .withColumn('day_of_month', date_format(df_dataset.purchase_time, 'dd'))
    .withColumn('day_name_short', date_format(df_dataset.purchase_time, 'E'))
    .withColumn('day_name', date_format(df_dataset.purchase_time, 'EEEE'))
    .withColumn('day_of_week_2', date_format(df_dataset.purchase_time, 'u'))
    .withColumn('datetime_form', to_timestamp(df_dataset.purchase_time, "yyyy-MM-dd HH:mm:ss"))
    .withColumn('hour', hour(df_dataset.purchase_time))
)
df_stg_dataset.show()
df_stg_dataset.printSchema()
# NOTE: the full weekday name is already provided by the 'EEEE' pattern above.
# Python's calendar.day_name cannot be indexed with a Spark Column, so a
# `calendar.day_name[df_dataset.day_of_week_2]` column is not possible.

+-------------------+-----------+-----------+----------+--------+----------+------------+--------------+---------+-------------+-------------------+----+
|      purchase_time|     device|day_of_week| date_form| date_id|   hour_id|day_of_month|day_name_short| day_name|day_of_week_2|      datetime_form|hour|
+-------------------+-----------+-----------+----------+--------+----------+------------+--------------+---------+-------------+-------------------+----+
|2020-01-01 00:00:00|   notebook|          4|2020-01-01|20200101|2020010100|          01|           Wed|Wednesday|            3|2020-01-01 00:00:00|   0|
|2020-02-10 13:00:00|   notebook|          2|2020-02-10|20200210|2020021013|          10|           Mon|   Monday|            1|2020-02-10 13:00:00|  13|
|2020-02-15 12:00:00|small_phone|          7|2020-02-15|20200215|2020021512|          15|           Sat| Saturday|            6|2020-02-15 12:00:00|  12|
|2020-02-23 09:30:00|small_phone|          1|2020-02-23|20200223|2020022309|

In [54]:
# Display the rows and the schema of df_dataset.
df_dataset.show()
df_dataset.printSchema()

+-------------------+-----------+-----------+
|      purchase_time|     device|day_of_week|
+-------------------+-----------+-----------+
|2020-01-01 00:00:00|   notebook|          4|
|2020-02-10 13:00:00|   notebook|          2|
|2020-02-15 12:00:00|small_phone|          7|
|2020-02-23 09:30:00|small_phone|          1|
+-------------------+-----------+-----------+

root
 |-- purchase_time: string (nullable = true)
 |-- device: string (nullable = true)
 |-- day_of_week: long (nullable = true)



In [31]:
# Inspect the schema of the staged DataFrame.
df_stg_dataset.printSchema()

root
 |-- purchase_time: string (nullable = true)
 |-- device: string (nullable = true)
 |-- day_of_week: long (nullable = true)
 |-- date_form: string (nullable = true)



In [18]:
# Rebind df_stg_dataset to just the three base columns of df_dataset
# (this replaces whatever df_stg_dataset previously referred to).
df_stg_dataset = df_dataset.select(
    'purchase_time',
    'device',
    'day_of_week',
)
df_stg_dataset.printSchema()

root
 |-- purchase_time: string (nullable = true)
 |-- device: string (nullable = true)
 |-- day_of_week: long (nullable = true)

