In [1]:
import findspark

# findspark.init() is called for its side effect of making pyspark importable;
# it returns None, so wrapping it in print() only emitted a stray "None".
findspark.init()


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the notebook's SparkSession (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('Dates and Timestamp')
    .getOrCreate()
)

In [4]:
# Load the stock prices; the first row holds the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    "appl_stock.csv",
    header=True,
    inferSchema=True,
)

In [5]:
# Peek at the first record as a one-element list of Row objects.
df.take(1)

[Row(Date='2010-01-04', Open=213.429998, High=214.499996, Low=212.38000099999996, Close=214.009998, Volume=123432400, Adj Close=27.727039)]

In [6]:
# Print the inferred schema as a tree; note that Date was read as a string.
df.printSchema()

root
 |-- Date: string (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [8]:
df.select(['Date','OPen']).show(10)

+----------+------------------+
|      Date|              OPen|
+----------+------------------+
|2010-01-04|        213.429998|
|2010-01-05|        214.599998|
|2010-01-06|        214.379993|
|2010-01-07|            211.75|
|2010-01-08|        210.299994|
|2010-01-11|212.79999700000002|
|2010-01-12|209.18999499999998|
|2010-01-13|        207.870005|
|2010-01-14|210.11000299999998|
|2010-01-15|210.92999500000002|
+----------+------------------+
only showing top 10 rows



In [9]:
from pyspark.sql.functions import (dayofmonth, hour, dayofyear, month, year, weekofyear, 
                                  format_number, date_format)

In [11]:
# Day of the month extracted from each Date value.
df.select(dayofmonth('Date')).show(5)

+----------------+
|dayofmonth(Date)|
+----------------+
|               4|
|               5|
|               6|
|               7|
|               8|
+----------------+
only showing top 5 rows



In [12]:
# Hour component of Date; the values carry no time of day, so this shows 0.
df.select(hour('Date')).show(5)

+----------+
|hour(Date)|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
+----------+
only showing top 5 rows



In [13]:
# Month number extracted from each Date value.
df.select(month('Date')).show(5)

+-----------+
|month(Date)|
+-----------+
|          1|
|          1|
|          1|
|          1|
|          1|
+-----------+
only showing top 5 rows



In [14]:
# Calendar year extracted from each Date value.
df.select(year('Date')).show(5)

+----------+
|year(Date)|
+----------+
|      2010|
|      2010|
|      2010|
|      2010|
|      2010|
+----------+
only showing top 5 rows



In [17]:
# Preview the frame with a derived Year column appended (df itself is unchanged).
df.withColumn('Year', year('Date')).show(5)

+----------+----------+----------+------------------+------------------+---------+------------------+----+
|      Date|      Open|      High|               Low|             Close|   Volume|         Adj Close|Year|
+----------+----------+----------+------------------+------------------+---------+------------------+----+
|2010-01-04|213.429998|214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|
|2010-01-05|214.599998|215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|
|2010-01-06|214.379993|    215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|
|2010-01-07|    211.75|212.000006|        209.050005|            210.58|119282800|          27.28265|2010|
|2010-01-08|210.299994|212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|2010|
+----------+----------+----------+------------------+------------------+---------+------------------+----+
only showing top 5 rows



In [21]:
# Keep a copy of the data with the derived Year column for the per-year
# aggregations below.
new_df = df.withColumn('Year', year('Date'))

In [22]:
# With no column arguments every numeric column is averaged per year --
# including Year itself, which is why an avg(Year) column appears.
new_df.groupBy("Year").avg().show()

+----+------------------+------------------+------------------+------------------+--------------------+------------------+---------+
|Year|         avg(Open)|         avg(High)|          avg(Low)|        avg(Close)|         avg(Volume)|    avg(Adj Close)|avg(Year)|
+----+------------------+------------------+------------------+------------------+--------------------+------------------+---------+
|2015|120.17575393253965|121.24452385714291| 118.8630954325397|120.03999980555547|  5.18378869047619E7|115.96740080555561|   2015.0|
|2013| 473.1281355634922| 477.6389272301587|468.24710264682557| 472.6348802857143|          1.016087E8| 62.61798788492063|   2013.0|
|2014| 295.1426195357143|297.56103184523823| 292.9949599801587| 295.4023416507935| 6.315273055555555E7| 87.63583323809523|   2014.0|
|2012|     576.652720788| 581.8254008040001| 569.9211606079999| 576.0497195640002|       1.319642044E8| 74.81383696800002|   2012.0|
|2016|104.50777772619044| 105.4271825436508|103.69027771825397|104.60

In [24]:
new_df.groupBy("Year").mean().select(['Year','avg(close)']).show(5)

+----+------------------+
|Year|        avg(close)|
+----+------------------+
|2015|120.03999980555547|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2012| 576.0497195640002|
|2016|104.60400786904763|
+----+------------------+
only showing top 5 rows



In [26]:
from pyspark.sql.functions import sum, mean, avg, stddev,count, countDistinct

In [38]:
# Per-year statistics of the closing price. mean() and avg() name the same
# aggregate, so the result carries two identical avg(close) columns.
close_stats = [sum('close'), mean('close'), avg('close'), count('close')]
new_df.groupBy("Year").agg(*close_stats).show(5)

+----+------------------+------------------+------------------+------------+
|Year|        sum(close)|        avg(close)|        avg(close)|count(close)|
+----+------------------+------------------+------------------+------------+
|2015|30250.079950999978|120.03999980555547|120.03999980555547|         252|
|2013|     119103.989832| 472.6348802857143| 472.6348802857143|         252|
|2014| 74441.39009599996| 295.4023416507935| 295.4023416507935|         252|
|2012|144012.42989100004| 576.0497195640002| 576.0497195640002|         250|
|2016|26360.209983000004|104.60400786904763|104.60400786904763|         252|
+----+------------------+------------------+------------------+------------+
only showing top 5 rows



In [42]:
# Average closing price for each year.
new_df.groupBy("Year").avg('Close').show(5)

+----+------------------+
|Year|        avg(Close)|
+----+------------------+
|2015|120.03999980555547|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2012| 576.0497195640002|
|2016|104.60400786904763|
+----+------------------+
only showing top 5 rows



In [44]:
# Per-year averages for a chosen subset of columns.
averaged_columns = ['Close', 'Low', 'Volume']
new_df.groupBy("Year").mean(*averaged_columns).show(5)

+----+------------------+------------------+-------------------+
|Year|        avg(Close)|          avg(Low)|        avg(Volume)|
+----+------------------+------------------+-------------------+
|2015|120.03999980555547| 118.8630954325397| 5.18378869047619E7|
|2013| 472.6348802857143|468.24710264682557|         1.016087E8|
|2014| 295.4023416507935| 292.9949599801587|6.315273055555555E7|
|2012| 576.0497195640002| 569.9211606079999|      1.319642044E8|
|2016|104.60400786904763|103.69027771825397| 3.84153623015873E7|
+----+------------------+------------------+-------------------+
only showing top 5 rows



In [59]:
# Average closing price per year. Passing the column to mean() aggregates only
# what is needed and produces the same two columns (Year, avg(Close)) as
# averaging everything and then selecting them.
result = new_df.groupBy("Year").mean("Close")

In [60]:
# Preview the first five rows of the per-year average closing price.
result.show(5)

+----+------------------+
|Year|        avg(Close)|
+----+------------------+
|2015|120.03999980555547|
|2013| 472.6348802857143|
|2014| 295.4023416507935|
|2012| 576.0497195640002|
|2016|104.60400786904763|
+----+------------------+
only showing top 5 rows



In [62]:
# Replace the generated aggregate column name with a readable one.
new = result.withColumnRenamed(
    existing="avg(Close)",
    new='Average close',
)

In [66]:
# Show each year's average close formatted to two decimal places under a
# presentation-friendly column title.
new.select(
    'Year',
    format_number('Average close', 2).alias('Average Closing Price'),
).show()

+----+---------------------+
|Year|Average Closing Price|
+----+---------------------+
|2015|               120.04|
|2013|               472.63|
|2014|               295.40|
|2012|               576.05|
|2016|               104.60|
|2010|               259.84|
|2011|               364.00|
+----+---------------------+



In [68]:
# Shut down the SparkSession created at the top of the notebook.
spark.stop()