# Dates and Timestamps



In [35]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://downloads.apache.org/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark

In [36]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"

In [37]:
import findspark
findspark.init()

In [38]:
from pyspark.sql import SparkSession
# May take a little while on a local computer
spark = SparkSession.builder.appName("dates").getOrCreate()

In [39]:
from google.colab import files


In [40]:
uploaded=files.upload()

Saving appl_stock.csv to appl_stock (1).csv


In [41]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [42]:
! gdrive/MyDrive/Colab\ Notebooks/BigData_Workshop

/bin/bash: gdrive/MyDrive/Colab Notebooks/BigData_Workshop: Is a directory


In [43]:
!ls

'appl_stock (1).csv'   spark-3.0.1-bin-hadoop2.7
 appl_stock.csv        spark-3.0.1-bin-hadoop2.7.tgz
 gdrive		       spark-3.0.1-bin-hadoop2.7.tgz.1
 sample_data


In [44]:
df = spark.read.csv("gdrive/MyDrive/Colab\ Notebooks/BigData_Workshop/appl_stock.csv",header=True,inferSchema=True)

In [45]:
df.show()

+----------+------------------+------------------+------------------+------------------+---------+------------------+
|      Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+----------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|
|2010-01-11|212.79999700000002|        213.000002|      

Let's change the datatype of Date from string to timestamp




In [46]:
from pyspark.sql.types import TimestampType

In [47]:
df=df.withColumn('Date',df['Date'].cast(TimestampType()))

In [48]:
df.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|
|2010-01-08 00:00:00|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|    

In [49]:
from pyspark.sql.functions import * 

Let's add columns with the year,month,day of the week to the dataframe




In [50]:
dfnew=df.withColumn('year',year('Date')).withColumn('month',month('Date')).withColumn('day',dayofweek('Date'))

In [51]:
dfnew.show()

+-------------------+------------------+------------------+------------------+------------------+---------+------------------+----+-----+---+
|               Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|year|month|day|
+-------------------+------------------+------------------+------------------+------------------+---------+------------------+----+-----+---+
|2010-01-04 00:00:00|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|    1|  2|
|2010-01-05 00:00:00|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|    1|  3|
|2010-01-06 00:00:00|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|    1|  4|
|2010-01-07 00:00:00|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|    1|  5|
|2010-

###Let's change the date to the format MM/dd/yyyy

In [96]:
import pyspark.sql.functions as F

In [97]:
dfnew=dfnew.withColumn('date',F.date_format('Date','MM/dd/yyyy'))

In [98]:
dfnew.show()

+----+------------------+------------------+------------------+------------------+---------+------------------+----+-----+---+
|date|              Open|              High|               Low|             Close|   Volume|         Adj Close|year|month|day|
+----+------------------+------------------+------------------+------------------+---------+------------------+----+-----+---+
|null|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|    1|  2|
|null|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|    1|  3|
|null|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|    1|  4|
|null|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|    1|  5|
|null|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|

###Let's check the distinct dates in this dataframe

In [99]:
dfnew.select('date').distinct().count()

1

In [100]:
dfnew2=dfnew.groupBy('year','month').mean()[['year','month','avg(Low)','avg(Close)']]

In [101]:
dfnew2.columns

['year', 'month', 'avg(Low)', 'avg(Close)']

In [102]:
dfnew2=dfnew2.withColumn('low_avg',format_number('avg(Low)',2))

In [103]:
dfnew2.show()

+----+-----+------------------+------------------+-------+
|year|month|          avg(Low)|        avg(Close)|low_avg|
+----+-----+------------------+------------------+-------+
|2012|   10| 628.2138112380952| 634.7142817142857| 628.21|
|2010|    7|251.08762104761905|254.94999942857143| 251.09|
|2010|   12| 320.0027276818182|321.48591127272726| 320.00|
|2015|    2|124.03105294736844|125.43210478947367| 124.03|
|2014|    4| 536.2623773809524| 541.0742899999999| 536.26|
|2015|   12| 110.7418182272727|111.72681809090909| 110.74|
|2016|    7|       98.00249975| 98.55649979999998|  98.00|
|2016|   11|108.98857233333332|110.15428528571428| 108.99|
|2012|    8| 637.1382606521738| 642.6960867391304| 637.14|
|2013|    2|452.10631984210534| 456.8089530526315| 452.11|
|2012|    4| 598.6499981000001|       606.0029994| 598.65|
|2012|   12|      524.96699785|      532.05499735| 524.97|
|2014|   10|100.81304308695653|101.69956573913043| 100.81|
|2016|    5|  94.0490471904762| 94.97476157142857|  94.0

##Let's groupby the year and find the max and min values of all the columns

In [104]:
dfnew2.groupBy('year').min().show()

+----+---------+----------+------------------+------------------+
|year|min(year)|min(month)|     min(avg(Low))|   min(avg(Close))|
+----+---------+----------+------------------+------------------+
|2015|     2015|         1|109.33199965000001|110.64150084999999|
|2013|     2013|         1| 414.7222726818182| 419.7649976818182|
|2014|     2014|         1| 94.83909018181818| 95.62590850000002|
|2012|     2012|         1|425.81050165000005|428.57799885000003|
|2016|     2016|         1|  94.0490471904762| 94.97476157142857|
|2010|     2010|         1|196.32263184210524|198.79578810526314|
|2011|     2011|         1|328.23772877272717| 331.0813620909091|
+----+---------+----------+------------------+------------------+



In [105]:
dfnew2.groupBy('year').max().show()

+----+---------+----------+------------------+------------------+
|year|max(year)|max(month)|     max(avg(Low))|   max(avg(Close))|
+----+---------+----------+------------------+------------------+
|2015|     2015|        12|127.66299984999998|128.76149974999998|
|2013|     2013|        12| 556.4057082857142| 559.6576130952382|
|2014|     2014|        12| 598.3328578095238|  603.195721904762|
|2012|     2012|        12|  676.095266263158| 681.5684264210527|
|2016|     2016|        12|114.94142766666668|115.70714309523811|
|2010|     2010|        12| 320.0027276818182|321.48591127272726|
|2011|     2011|        12| 391.5257136190477|397.23000076190476|
+----+---------+----------+------------------+------------------+



##Let's groupby the year and find the max a of avg(Low) and sum of avg(Close)

In [106]:
dfnew2.groupBy('year').agg(F.sum('avg(Close)'),F.max('avg(Low)').alias('avg')).show()

+----+------------------+------------------+
|year|   sum(avg(Close))|               avg|
+----+------------------+------------------+
|2015|1440.6814124810319|127.66299984999998|
|2013| 5671.432358660076| 556.4057082857142|
|2014|3586.1754387392443| 598.3328578095238|
|2012| 6905.342888621221|  676.095266263158|
|2016| 1254.054804429888|114.94142766666668|
|2010|3107.8098681626143| 320.0027276818182|
|2011| 4366.804782148052| 391.5257136190477|
+----+------------------+------------------+



##You can obtain the above result in another way

In [107]:
dfnew2.groupby('year').agg({'avg(Close)': 'sum','avg(Low)':'max'}).show()

+----+------------------+------------------+
|year|     max(avg(Low))|   sum(avg(Close))|
+----+------------------+------------------+
|2015|127.66299984999998|1440.6814124810319|
|2013| 556.4057082857142| 5671.432358660076|
|2014| 598.3328578095238|3586.1754387392443|
|2012|  676.095266263158| 6905.342888621221|
|2016|114.94142766666668| 1254.054804429888|
|2010| 320.0027276818182|3107.8098681626143|
|2011| 391.5257136190477| 4366.804782148052|
+----+------------------+------------------+



###Let's  find the standard deviation of the column avg(Close)and the distinct values of the same column

In [108]:
from pyspark.sql.functions import countDistinct, avg,stddev

In [109]:
dfnew2.columns

['year', 'month', 'avg(Low)', 'avg(Close)', 'low_avg']

In [114]:
dfnew2.select(stddev('avg(Close)')).show()

+-----------------------+
|stddev_samp(avg(Close))|
+-----------------------+
|     184.14856115839135|
+-----------------------+



In [115]:
dfnew2.select(countDistinct('avg(Close)')).show()

+--------------------------+
|count(DISTINCT avg(Close))|
+--------------------------+
|                        84|
+--------------------------+



###Order by and sort

In [65]:
# OrderBy
# Ascending
df.orderBy("High").show()

+-------------------+-----------------+-----------------+---------+-----------------+---------+-----------------+
|               Date|             Open|             High|      Low|            Close|   Volume|        Adj Close|
+-------------------+-----------------+-----------------+---------+-----------------+---------+-----------------+
|2014-06-25 00:00:00|        90.209999|        90.699997|89.650002|        90.360001| 36869000|        85.737201|
|2014-06-26 00:00:00|        90.370003|        91.050003|89.800003|        90.900002| 32629000|        86.249576|
|2014-06-23 00:00:00|            91.32|        91.620003|90.599998|90.83000200000001| 43694000|        86.183157|
|2016-05-13 00:00:00|             90.0|91.66999799999999|     90.0|        90.519997| 44392800|89.18571700000001|
|2014-06-24 00:00:00|            90.75|        91.739998|90.190002|        90.279999| 39036000|        85.661292|
|2014-06-27 00:00:00|            90.82|             92.0|90.769997|        91.980003| 64

In [67]:
# OrderBy
# descending
df.orderBy(df["High"].desc()).show()

+-------------------+-----------------+-----------------+-----------------+-----------------+---------+-----------------+
|               Date|             Open|             High|              Low|            Close|   Volume|        Adj Close|
+-------------------+-----------------+-----------------+-----------------+-----------------+---------+-----------------+
|2012-09-21 00:00:00|       702.409988|       705.070023|699.3599849999999|       700.089989|142897300|         91.09278|
|2012-09-19 00:00:00|       700.259979|       703.989998|       699.569977|       702.100021| 81718700|91.35431700000001|
|2012-09-18 00:00:00|       699.879997|       702.329987|696.4199980000001|       701.910004| 93375800|        91.329593|
|2012-09-20 00:00:00|699.1599809999999|       700.059975|       693.619987|698.6999969999999| 84142100|         90.91192|
|2012-09-17 00:00:00|       699.349998|       699.799995|694.6100230000001|699.7800219999999| 99507800|        91.052448|
|2012-09-14 00:00:00|   

###Let's look at user defined functions

In [107]:
def square(x):
  return x**2

###Registering UDF with integer type output

In [108]:
from pyspark.sql.types import FloatType
square_udf=udf(lambda z:square(z), FloatType())

In [109]:
dfnew2.select('avg(close)',square_udf('avg(close)').alias('avg_close')).show()

+------------------+---------+
|        avg(close)|avg_close|
+------------------+---------+
| 634.7142817142857|402862.22|
|254.94999942857143|64999.504|
|321.48591127272726|103353.19|
|125.43210478947367|15733.213|
| 541.0742899999999|292761.38|
|111.72681809090909|12482.882|
| 98.55649979999998| 9713.384|
|110.15428528571428|12133.967|
| 642.6960867391304|413058.25|
| 456.8089530526315|208674.42|
|       606.0029994|367239.62|
|      532.05499735|283082.53|
|101.69956573913043|10342.802|
| 94.97476157142857| 9020.205|
|112.41136418181817|12636.314|
| 480.1844989499999|230577.16|
|504.74478286956526| 254767.3|
|  603.195721904762| 363845.1|
|        95.7464999| 9167.393|
| 559.6576130952382|313216.66|
+------------------+---------+
only showing top 20 rows



Let's save the data using repartition and coalesce functions

In [90]:
dfnew2.repartition(1).write.format('csv').mode('append').save('gdrive/MyDrive/Colab\ Notebooks/BigData_Workshop/homework4.csv')

In [89]:
files.download('homework4.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Perfect! Now you know how to work with Date and Timestamp information!