# Dates and Timestamps

You will often find yourself working with time and date information. Let's walk through some ways you can deal with it!

In [1]:
from pyspark.sql import SparkSession

# Creating (or reusing) the session can take a moment on a local machine.
spark = (
    SparkSession.builder
    .appName("dates")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/02 13:48:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/11/02 13:48:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/11/02 13:48:20 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/11/02 13:48:20 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [2]:
import pandas as pd

# The CSV is read with pandas and the resulting frame is handed to Spark.
# Spark-native alternative (currently disabled):
#   spark.read.csv("appl_stock.csv", header=True, inferSchema=True)
df = spark.createDataFrame(
    pd.read_csv("appl_stock.csv", header='infer')
)

# Preview the first rows of the loaded stock data.
df.show()

+----------+----------+------------------+----------+----------+---------+------------------+
|      Date|      Open|              High|       Low|     Close|   Volume|         Adj Close|
+----------+----------+------------------+----------+----------+---------+------------------+
|2010-01-04|213.429998|        214.499996|212.380001|214.009998|123432400|         27.727039|
|2010-01-05|214.599998|        215.589994|213.249994|214.379993|150476200|         27.774976|
|2010-01-06|214.379993|            215.23|210.750004|210.969995|138040000|27.333178000000004|
|2010-01-07|    211.75|        212.000006|209.050005|    210.58|119282800|          27.28265|
|2010-01-08|210.299994|        212.000006|209.060005|211.980005|111902700|         27.464034|
|2010-01-11|212.799997|        213.000002|208.450005|210.110003|115557400|         27.221758|
|2010-01-12|209.189995|        209.769995|206.419998|207.720001|148614900|          26.91211|
|2010-01-13|207.870005|        210.929995|204.099998|210.650

In [3]:
# Preview the DataFrame again (same call as at the end of the loading cell above).
df.show()

+----------+----------+------------------+----------+----------+---------+------------------+
|      Date|      Open|              High|       Low|     Close|   Volume|         Adj Close|
+----------+----------+------------------+----------+----------+---------+------------------+
|2010-01-04|213.429998|        214.499996|212.380001|214.009998|123432400|         27.727039|
|2010-01-05|214.599998|        215.589994|213.249994|214.379993|150476200|         27.774976|
|2010-01-06|214.379993|            215.23|210.750004|210.969995|138040000|27.333178000000004|
|2010-01-07|    211.75|        212.000006|209.050005|    210.58|119282800|          27.28265|
|2010-01-08|210.299994|        212.000006|209.060005|211.980005|111902700|         27.464034|
|2010-01-11|212.799997|        213.000002|208.450005|210.110003|115557400|         27.221758|
|2010-01-12|209.189995|        209.769995|206.419998|207.720001|148614900|          26.91211|
|2010-01-13|207.870005|        210.929995|204.099998|210.650

Let's walk through how to grab parts of the timestamp data:

In [4]:
from pyspark.sql.functions import format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format

In [5]:
# Extract the day-of-month component from every Date value.
df.select(dayofmonth(df.Date)).show()

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
|              11|
|              12|
|              13|
|              14|
|              15|
|              19|
|              20|
|              21|
|              22|
|              25|
|              26|
|              27|
|              28|
|              29|
|               1|
+----------------+
only showing top 20 rows



In [6]:
# Extract the hour component. The Date values shown above carry no
# time-of-day part, so every row comes back as 0.
df.select(hour(df.Date)).show()

+----------+
|hour(Date)|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 20 rows



In [7]:
# Extract the day-of-year component (keeps counting past the end of January).
df.select(dayofyear(df.Date)).show()

+---------------+
|dayofyear(Date)|
+---------------+
|              4|
|              5|
|              6|
|              7|
|              8|
|             11|
|             12|
|             13|
|             14|
|             15|
|             19|
|             20|
|             21|
|             22|
|             25|
|             26|
|             27|
|             28|
|             29|
|             32|
+---------------+
only showing top 20 rows



In [8]:
# Extract the month number from every Date value.
df.select(month(df.Date)).show()

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          1|
|          2|
+-----------+
only showing top 20 rows



For example, let's say we wanted to know the average closing price per year. That's easy with a `groupBy` and the `year()` function:

In [9]:
# Extract the calendar year from every Date value.
df.select(year(df.Date)).show()

+----------+
|year(Date)|
+----------+
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
+----------+
only showing top 20 rows



In [10]:
# Append the extracted year as a "Year" column; the result is only
# displayed here, it is not stored in a variable.
df.withColumn("Year", year(df.Date)).show()

+----------+----------+------------------+----------+----------+---------+------------------+----+
|      Date|      Open|              High|       Low|     Close|   Volume|         Adj Close|Year|
+----------+----------+------------------+----------+----------+---------+------------------+----+
|2010-01-04|213.429998|        214.499996|212.380001|214.009998|123432400|         27.727039|2010|
|2010-01-05|214.599998|        215.589994|213.249994|214.379993|150476200|         27.774976|2010|
|2010-01-06|214.379993|            215.23|210.750004|210.969995|138040000|27.333178000000004|2010|
|2010-01-07|    211.75|        212.000006|209.050005|    210.58|119282800|          27.28265|2010|
|2010-01-08|210.299994|        212.000006|209.060005|211.980005|111902700|         27.464034|2010|
|2010-01-11|212.799997|        213.000002|208.450005|210.110003|115557400|         27.221758|2010|
|2010-01-12|209.189995|        209.769995|206.419998|207.720001|148614900|          26.91211|2010|
|2010-01-1

In [11]:
# Keep a DataFrame that carries the extracted year alongside the original columns.
newdf = df.withColumn("Year", year(df.Date))

# Average the numeric columns per year and display only the averaged
# year and the averaged closing price.
(
    newdf
    .groupBy("Year")
    .mean()
    .select('avg(Year)', 'avg(Close)')
    .show()
)

+---------+------------------+
|avg(Year)|        avg(Close)|
+---------+------------------+
|   2010.0| 259.8424600000001|
|   2011.0| 364.0043253214286|
|   2012.0| 576.0497195639999|
|   2013.0|472.63488028571436|
|   2014.0|295.40234165079374|
|   2015.0|120.03999980555552|
|   2016.0|104.60400786904763|
+---------+------------------+



Still not quite presentable! Let's use the `.alias` method as well as `format_number()` to clean this up!

In [12]:
# Per-year averages, keeping only the averaged year and the averaged close.
result = newdf.groupBy("Year").mean()[['avg(Year)','avg(Close)']]

# Give the year column a readable name.
result = result.withColumnRenamed("avg(Year)","Year")

# Render the mean closing price with 2 decimal places under a friendlier header.
result = result.select('Year',format_number('avg(Close)',2).alias("Mean Close"))

# show() only prints the table and returns None, so it is called on its own
# line; assigning its return value would overwrite `result` with None.
result.show()

+------+----------+
|  Year|Mean Close|
+------+----------+
|2010.0|    259.84|
|2011.0|    364.00|
|2012.0|    576.05|
|2013.0|    472.63|
|2014.0|    295.40|
|2015.0|    120.04|
|2016.0|    104.60|
+------+----------+



Perfect! Now you know how to work with Date and Timestamp information!