In [41]:
# Bootstrap: point findspark at a Spark installation before importing pyspark.
import os

import findspark

# Prefer the SPARK_HOME environment variable so the notebook is not tied to a
# single machine; fall back to the original local install path when it is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/shashank/spark-2.3.2-bin-hadoop2.7'))

import pyspark
from pyspark.sql import SparkSession

In [42]:
# Session handle used by every query in this notebook.
spark = (
    SparkSession.builder
    .appName("Dates")
    .getOrCreate()
)

In [43]:
# Read the stock CSV: the first row supplies column names and Spark infers the
# column types from the data.
df = spark.read.csv(
    'Spark_DataFrames/appl_stock.csv',
    header=True,
    inferSchema=True,
)

In [44]:
# Register the DataFrame as the temp view that the SQL queries below read from.
df.createOrReplaceTempView("dates_sql")

In [45]:
# Show the column names and the types Spark inferred for them.
df.printSchema()

root
 |-- Date: timestamp (nullable = true)
 |-- Open: double (nullable = true)
 |-- High: double (nullable = true)
 |-- Low: double (nullable = true)
 |-- Close: double (nullable = true)
 |-- Volume: integer (nullable = true)
 |-- Adj Close: double (nullable = true)



In [46]:
# Pull the Date and Open columns from the view and print the first rows.
date_open = spark.sql("FROM dates_sql SELECT Date,Open")
date_open.show()

+-------------------+------------------+
|               Date|              Open|
+-------------------+------------------+
|2010-01-04 00:00:00|        213.429998|
|2010-01-05 00:00:00|        214.599998|
|2010-01-06 00:00:00|        214.379993|
|2010-01-07 00:00:00|            211.75|
|2010-01-08 00:00:00|        210.299994|
|2010-01-11 00:00:00|212.79999700000002|
|2010-01-12 00:00:00|209.18999499999998|
|2010-01-13 00:00:00|        207.870005|
|2010-01-14 00:00:00|210.11000299999998|
|2010-01-15 00:00:00|210.92999500000002|
|2010-01-19 00:00:00|        208.330002|
|2010-01-20 00:00:00|        214.910006|
|2010-01-21 00:00:00|        212.079994|
|2010-01-22 00:00:00|206.78000600000001|
|2010-01-25 00:00:00|202.51000200000001|
|2010-01-26 00:00:00|205.95000100000001|
|2010-01-27 00:00:00|        206.849995|
|2010-01-28 00:00:00|        204.930004|
|2010-01-29 00:00:00|        201.079996|
|2010-02-01 00:00:00|192.36999699999998|
+-------------------+------------------+
only showing top 20 rows

In [47]:
from pyspark.sql.functions import dayofmonth, hour, dayofyear, month, year, weekofyear, format_number, date_format

In [48]:
# Day-of-month component of each Date value.
day_of_month = spark.sql("FROM dates_sql SELECT dayofmonth(Date) AS Day")
day_of_month.show()

+---+
|Day|
+---+
|  4|
|  5|
|  6|
|  7|
|  8|
| 11|
| 12|
| 13|
| 14|
| 15|
| 19|
| 20|
| 21|
| 22|
| 25|
| 26|
| 27|
| 28|
| 29|
|  1|
+---+
only showing top 20 rows



In [49]:
# Hour component of each Date value. The column is labelled Hour so the header
# matches what hour(Date) returns.
spark.sql("FROM dates_sql SELECT hour(Date) AS Hour").show()  # all hours are 0

+---+
|Day|
+---+
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
|  0|
+---+
only showing top 20 rows



In [55]:
# Average closing price per year.
# The SQL text below uses Spark SQL's own mean(); it does not reference this
# Python name.
from pyspark.sql.functions import mean

# Adjacent string literals are concatenated by Python, so the query can span
# several source lines without a backslash inside the literal (which would
# embed the next line's indentation into the SQL text).
spark.sql(
    "FROM dates_sql "
    "SELECT year(Date) AS Year, mean(Close) AS Average_Closing_Price "
    "GROUP BY year(Date)"
).show()

+----+---------------------+
|Year|Average_Closing_Price|
+----+---------------------+
|2015|   120.03999980555547|
|2013|    472.6348802857143|
|2014|    295.4023416507935|
|2012|    576.0497195640002|
|2016|   104.60400786904763|
|2010|    259.8424600000002|
|2011|   364.00432532142867|
+----+---------------------+



In [56]:
# Yearly average closing price, kept as a DataFrame for further processing.
# Adjacent string literals are concatenated by Python, which avoids a backslash
# continuation inside the literal leaking source indentation into the SQL text.
avg_cl_p = spark.sql(
    "FROM dates_sql "
    "SELECT year(Date) AS Year, mean(Close) AS Average_Closing_Price "
    "GROUP BY year(Date)"
)

In [64]:
# Change number format: show the yearly averages with two decimal places.
avg_cl_p.createOrReplaceTempView('final')

# DECIMAL(precision, scale): precision is the total number of digits and scale
# is the number of digits after the decimal point. Precision 15 leaves ample
# headroom; with precision 5 any average of 1000 or more would not fit.
# The explicit alias keeps the output column named Average_Closing_Price
# instead of relying on how Spark auto-names a CAST expression.
spark.sql(
    "FROM final "
    "SELECT Year, CAST(Average_Closing_Price AS DECIMAL(15,2)) AS Average_Closing_Price"
).show()


+----+---------------------+
|Year|Average_Closing_Price|
+----+---------------------+
|2015|               120.04|
|2013|               472.63|
|2014|               295.40|
|2012|               576.05|
|2016|               104.60|
|2010|               259.84|
|2011|               364.00|
+----+---------------------+

