In [1]:
from pyspark.sql import SparkSession

# Build the Spark session used throughout this notebook (app name "window-function").
spark = (
    SparkSession.builder
    .appName("window-function")
    .getOrCreate()
)

In [2]:
# No header or schema options are passed, so the resulting columns carry
# positional names (_c0, _c1, ...).
df = spark.read.format("csv").load("./source/stock_data.csv")

In [12]:
# List the column names of the freshly loaded DataFrame.
df.columns

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6']

### Renaming the columns :

In [16]:
# Pair every positional column with the descriptive name it should carry,
# in the same order as the original select.
column_aliases = [
    ("_c0", "Date"),
    ("_c1", "Ticker"),
    ("_c2", "Open"),
    ("_c3", "High"),
    ("_c4", "Low"),
    ("_c5", "Close"),
    ("_c6", "Volume"),
]
df2 = df.select(*[df[source].alias(target) for source, target in column_aliases])
df2.show()

+--------+------+-----+-----+-----+-----+------+
|    Date|Ticker| Open| High|  Low|Close|Volume|
+--------+------+-----+-----+-----+-----+------+
|20100721|     A|27.68| 28.2|27.41|27.58| 44528|
|20100722|     A|27.95|28.87|27.95|28.72| 36494|
|20100723|     A|28.56|29.41|28.45| 29.3| 37153|
|20100726|     A|29.22|29.67|29.11|29.64| 21256|
|20100727|     A|29.73|29.73|28.81|28.87| 33410|
|20100728|     A|28.79|29.27|28.74|28.78| 31156|
|20100729|     A|28.97|29.15|27.78|28.15| 44085|
|20100730|     A|27.78|28.17|27.66|27.93| 36943|
|20100802|     A|28.35|28.97| 28.2|28.82| 28989|
|20100803|     A| 28.7|28.73| 27.8|27.84| 42401|
|20100804|     A|27.86|28.35|27.75|28.29| 23525|
|20100805|     A|28.03|28.63|27.96|28.46| 20682|
|20100806|     A|28.18|28.75|28.07|28.73| 33777|
|20100809|     A|28.92|29.87|28.87|29.82| 36889|
|20100810|     A|29.44|29.68|29.13|29.46| 34866|
|20100811|     A|28.86| 28.9|27.98|28.22| 28271|
|20100812|     A|27.65|27.78|27.41|27.53| 32566|
|20100813|     A|27.

In [17]:
# Check the column names of df2 after aliasing.
df2.columns

['Date', 'Ticker', 'Open', 'High', 'Low', 'Close', 'Volume']

In [18]:
# Another way to rename the columns of a DataFrame: toDF takes the new
# names positionally, one per existing column.
# NOTE(review): 'high', 'low' and 'volume' are lowercase here, unlike the
# aliases used for df2 ('High', 'Low', 'Volume') — confirm this is intended.
stock_column_names = ['Date', 'Ticker', 'Open', 'high', 'low', 'Close', 'volume']
df1 = df.toDF(*stock_column_names)
df1.columns

['Date', 'Ticker', 'Open', 'high', 'low', 'Close', 'volume']

## Loading data to temp table 

In [21]:
# Expose df1 to Spark SQL under the name "stock_data".
# createOrReplaceTempView is the supported replacement for registerTempTable,
# which has been deprecated since Spark 2.0; re-running this cell simply
# replaces the existing view.
df1.createOrReplaceTempView("stock_data")

In [22]:
# Preview the rows of the stock_data table through Spark SQL.
stock_rows = spark.sql("select * from stock_data")
stock_rows.show()

+--------+------+-----+-----+-----+-----+------+
|    Date|Ticker| Open| high|  low|Close|volume|
+--------+------+-----+-----+-----+-----+------+
|20100721|     A|27.68| 28.2|27.41|27.58| 44528|
|20100722|     A|27.95|28.87|27.95|28.72| 36494|
|20100723|     A|28.56|29.41|28.45| 29.3| 37153|
|20100726|     A|29.22|29.67|29.11|29.64| 21256|
|20100727|     A|29.73|29.73|28.81|28.87| 33410|
|20100728|     A|28.79|29.27|28.74|28.78| 31156|
|20100729|     A|28.97|29.15|27.78|28.15| 44085|
|20100730|     A|27.78|28.17|27.66|27.93| 36943|
|20100802|     A|28.35|28.97| 28.2|28.82| 28989|
|20100803|     A| 28.7|28.73| 27.8|27.84| 42401|
|20100804|     A|27.86|28.35|27.75|28.29| 23525|
|20100805|     A|28.03|28.63|27.96|28.46| 20682|
|20100806|     A|28.18|28.75|28.07|28.73| 33777|
|20100809|     A|28.92|29.87|28.87|29.82| 36889|
|20100810|     A|29.44|29.68|29.13|29.46| 34866|
|20100811|     A|28.86| 28.9|27.98|28.22| 28271|
|20100812|     A|27.65|27.78|27.41|27.53| 32566|
|20100813|     A|27.

In [23]:
# Count the rows available in the stock_data table.
row_count = spark.sql("select count(*) from stock_data")
row_count.show()

+--------+
|count(1)|
+--------+
|   10937|
+--------+



## Lag and Lead functions :

The LAG function is an analytic function that lets you query more than one row in a table at a time without having to join the table to itself. It returns values from a previous row in the table.

The LEAD function is an analytic function that lets you query more than one row in a table at a time without having to join the table to itself. It returns values from the next row in the table.

In [25]:
# Within each Ticker, ordered by Date, pair every Close with the Close of the
# row one position earlier. The first row of a ticker has no predecessor,
# so its yesterday_price is null.
lag_sql = "select Ticker,Date,Close ,\
                    lag(Close,1) over (partition by  Ticker order by Date ) as yesterday_price from stock_data "
yesterday_close_price = spark.sql(lag_sql)

yesterday_close_price.show()

+------+--------+-----+---------------+
|Ticker|    Date|Close|yesterday_price|
+------+--------+-----+---------------+
|   GIS|20100721|35.03|           null|
|   GIS|20100722|35.36|          35.03|
|   GIS|20100723|35.52|          35.36|
|   GIS|20100726|35.44|          35.52|
|   GIS|20100727|35.89|          35.44|
|   GIS|20100728|35.44|          35.89|
|   GIS|20100729|34.13|          35.44|
|   GIS|20100730| 34.2|          34.13|
|   GIS|20100802|34.35|           34.2|
|   GIS|20100803|33.98|          34.35|
|   GIS|20100804|34.62|          33.98|
|   GIS|20100805|33.85|          34.62|
|   GIS|20100806|33.57|          33.85|
|   GIS|20100809| 33.7|          33.57|
|   GIS|20100810|33.98|           33.7|
|   GIS|20100811|33.81|          33.98|
|   GIS|20100812|34.43|          33.81|
|   GIS|20100813|34.86|          34.43|
|   GIS|20100816|35.15|          34.86|
|   GIS|20100817|35.38|          35.15|
+------+--------+-----+---------------+
only showing top 20 rows



In [27]:
# Within each Ticker, ordered by Date, pair every Close with the Close of the
# row one position later. The last row of a ticker has no successor,
# so its next_day_price is null.
lead_sql = "select Ticker,Date,Close ,\
                        lead(Close,1) over (partition by  Ticker order by Date ) as next_day_price from stock_data "
next_day_close = spark.sql(lead_sql)

next_day_close.show(30)


+------+--------+-----+--------------+
|Ticker|    Date|Close|next_day_price|
+------+--------+-----+--------------+
|   GIS|20100721|35.03|         35.36|
|   GIS|20100722|35.36|         35.52|
|   GIS|20100723|35.52|         35.44|
|   GIS|20100726|35.44|         35.89|
|   GIS|20100727|35.89|         35.44|
|   GIS|20100728|35.44|         34.13|
|   GIS|20100729|34.13|          34.2|
|   GIS|20100730| 34.2|         34.35|
|   GIS|20100802|34.35|         33.98|
|   GIS|20100803|33.98|         34.62|
|   GIS|20100804|34.62|         33.85|
|   GIS|20100805|33.85|         33.57|
|   GIS|20100806|33.57|          33.7|
|   GIS|20100809| 33.7|         33.98|
|   GIS|20100810|33.98|         33.81|
|   GIS|20100811|33.81|         34.43|
|   GIS|20100812|34.43|         34.86|
|   GIS|20100813|34.86|         35.15|
|   GIS|20100816|35.15|         35.38|
|   GIS|20100817|35.38|         35.13|
|   GIS|20100819|35.13|         35.14|
|   GIS|20100820|35.14|          null|
|     K|20100721|51.14|  

## First_value and Last_value : 

FIRST_VALUE returns the first value in an ordered set of values.

LAST_VALUE returns the last value in an ordered set of values.

In [35]:
# For every row, report the High of the earliest Date within the same Ticker.
first_value_sql = "select  Ticker,Date,High,\
                       first_value(High) over (partition by Ticker order by Date  ) as first_value \
                       from stock_data "
first_value = spark.sql(first_value_sql)
first_value.show(50)

+------+--------+-------+-----------+
|Ticker|    Date|   High|first_value|
+------+--------+-------+-----------+
|   GIS|20100721|  35.56|      35.56|
|   GIS|20100722|  35.48|      35.56|
|   GIS|20100723|  35.58|      35.56|
|   GIS|20100726|35.6852|      35.56|
|   GIS|20100727|     36|      35.56|
|   GIS|20100728|  35.91|      35.56|
|   GIS|20100729|  35.11|      35.56|
|   GIS|20100730|  34.48|      35.56|
|   GIS|20100802|  35.03|      35.56|
|   GIS|20100803|  34.42|      35.56|
|   GIS|20100804|   34.7|      35.56|
|   GIS|20100805|  34.52|      35.56|
|   GIS|20100806|  33.73|      35.56|
|   GIS|20100809|  33.87|      35.56|
|   GIS|20100810|  34.15|      35.56|
|   GIS|20100811|  33.98|      35.56|
|   GIS|20100812|  34.44|      35.56|
|   GIS|20100813|  35.17|      35.56|
|   GIS|20100816|  35.16|      35.56|
|   GIS|20100817|  35.54|      35.56|
|   GIS|20100819|  35.29|      35.56|
|   GIS|20100820|   35.2|      35.56|
|     K|20100721|  51.84|      51.84|
|     K|2010

In [36]:
# last_value over the same window, deliberately without an explicit frame:
# as the displayed result shows, every row gets its own High back rather
# than the final High of its Ticker.
last_value_sql = "select  Ticker,Date,High,\
                       last_value(High) over (partition by Ticker order by Date  ) as last_value \
                       from stock_data "
last_value = spark.sql(last_value_sql)
last_value.show(10)

+------+--------+-------+----------+
|Ticker|    Date|   High|last_value|
+------+--------+-------+----------+
|   GIS|20100721|  35.56|     35.56|
|   GIS|20100722|  35.48|     35.48|
|   GIS|20100723|  35.58|     35.58|
|   GIS|20100726|35.6852|   35.6852|
|   GIS|20100727|     36|        36|
|   GIS|20100728|  35.91|     35.91|
|   GIS|20100729|  35.11|     35.11|
|   GIS|20100730|  34.48|     34.48|
|   GIS|20100802|  35.03|     35.03|
|   GIS|20100803|  34.42|     34.42|
+------+--------+-------+----------+
only showing top 10 rows



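### In the results above, last_value simply repeats the current row's High instead of the final High of each Ticker. When a window has an ORDER BY but no explicit frame, the frame ends at the current row by default, so the "last" value is always the current one. To get the same last value on every row of a partition, extend the frame with ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING.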

In [41]:
# Same query with the frame widened to the whole partition, so every row of
# a Ticker reports the High of that ticker's latest Date.
last_value_full_frame_sql = "select  Ticker,Date,High,\
last_value(High) over (partition by Ticker order by Date ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)\
                       as last_value \
                       from stock_data "
last_value = spark.sql(last_value_full_frame_sql)
last_value.show(50)

+------+--------+-------+----------+
|Ticker|    Date|   High|last_value|
+------+--------+-------+----------+
|   GIS|20100721|  35.56|      35.2|
|   GIS|20100722|  35.48|      35.2|
|   GIS|20100723|  35.58|      35.2|
|   GIS|20100726|35.6852|      35.2|
|   GIS|20100727|     36|      35.2|
|   GIS|20100728|  35.91|      35.2|
|   GIS|20100729|  35.11|      35.2|
|   GIS|20100730|  34.48|      35.2|
|   GIS|20100802|  35.03|      35.2|
|   GIS|20100803|  34.42|      35.2|
|   GIS|20100804|   34.7|      35.2|
|   GIS|20100805|  34.52|      35.2|
|   GIS|20100806|  33.73|      35.2|
|   GIS|20100809|  33.87|      35.2|
|   GIS|20100810|  34.15|      35.2|
|   GIS|20100811|  33.98|      35.2|
|   GIS|20100812|  34.44|      35.2|
|   GIS|20100813|  35.17|      35.2|
|   GIS|20100816|  35.16|      35.2|
|   GIS|20100817|  35.54|      35.2|
|   GIS|20100819|  35.29|      35.2|
|   GIS|20100820|   35.2|      35.2|
|     K|20100721|  51.84|     49.86|
|     K|20100722| 51.625|     49.86|
|

## Row_number,Rank and Dense_rank:

