In [1]:
import sys; 
sys.path.insert(0, '..')

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder. \
    appName("pyspark-1"). \
    getOrCreate()

In [4]:
spark

In [124]:
# FINNIFTY_5min.csv
# bn_5min.csv
df = spark.read.csv("/dataset/bn_5min.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)



In [125]:
import pyspark.sql.functions as f

In [126]:
df_extended=df.withColumn("date",f.to_date('datetime')) \
              .withColumn('time', date_format('datetime', 'HH:mm:ss')) \
              .withColumn("day",f.date_format('date','EEEE'))
df_extended.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- date: date (nullable = true)
 |-- time: string (nullable = true)
 |-- day: string (nullable = true)



In [127]:
df_extended.show()

+-------------------+--------+--------+--------+--------+----------+--------+------+
|           datetime|    open|    high|     low|   close|      date|    time|   day|
+-------------------+--------+--------+--------+--------+----------+--------+------+
|2022-05-23 03:45:00| 34330.6| 34547.7| 34291.7|34347.35|2022-05-23|03:45:00|Monday|
|2022-05-23 03:50:00| 34343.4|34457.35| 34271.5| 34425.0|2022-05-23|03:50:00|Monday|
|2022-05-23 03:55:00| 34426.4| 34447.7|34312.95|34418.55|2022-05-23|03:55:00|Monday|
|2022-05-23 04:00:00|34416.45| 34438.6|34374.75| 34390.6|2022-05-23|04:00:00|Monday|
|2022-05-23 04:05:00| 34386.1| 34459.9| 34354.4|34431.45|2022-05-23|04:05:00|Monday|
|2022-05-23 04:10:00|34432.55|34535.15|34430.75| 34514.2|2022-05-23|04:10:00|Monday|
|2022-05-23 04:15:00|34514.15| 34595.4|34506.05| 34582.6|2022-05-23|04:15:00|Monday|
|2022-05-23 04:20:00|34581.75|34616.35| 34552.0|34610.35|2022-05-23|04:20:00|Monday|
|2022-05-23 04:25:00|34609.85|34609.85| 34526.1|34560.65|2022-05-

In [128]:
# UTC Time
start_time="06:30:00"
end_time="09:30:00"

In [129]:
avg_df=df_extended.filter(df_extended.time==start_time).selectExpr("date","round((high+low)/2,2) as avg")
avg_df.show()

+----------+--------+
|      date|     avg|
+----------+--------+
|2022-05-23|34763.83|
|2022-05-24|34450.35|
|2022-05-25|34507.45|
|2022-05-26|34493.18|
|2022-05-27| 35443.9|
|2022-05-30|36001.05|
|2022-05-31|35840.83|
|2022-06-01|35656.33|
|2022-06-02|35498.63|
|2022-06-03| 35704.0|
|2022-06-06|35269.85|
|2022-06-07| 34977.6|
|2022-06-08| 35335.5|
|2022-06-09| 34791.1|
|2022-06-10|34589.45|
|2022-06-13| 33329.9|
|2022-06-14|33517.68|
|2022-06-15|33438.77|
|2022-06-16|33103.53|
|2022-06-17|32599.18|
+----------+--------+
only showing top 20 rows



In [130]:
joined_df=df_extended.filter((df_extended.time>=start_time) & (df_extended.time<=end_time)) \
                     .join(avg_df,df_extended.date==avg_df.date).drop(avg_df.date)

In [131]:
updated_df=joined_df.withColumn("diff_high",round(abs(joined_df.avg-joined_df.high),2)) \
         .withColumn("diff_low",round(abs(joined_df.avg-joined_df.low),2)) \
         .withColumn("max_diff",when(col("diff_high")>col("diff_low"),col("diff_high")).otherwise(col("diff_low")))
updated_df.show()

+-------------------+--------+--------+--------+--------+--------+------+----------+--------+---------+--------+--------+
|           datetime|    open|    high|     low|   close|    time|   day|      date|     avg|diff_high|diff_low|max_diff|
+-------------------+--------+--------+--------+--------+--------+------+----------+--------+---------+--------+--------+
|2022-05-23 06:30:00| 34744.4| 34783.9|34743.75| 34763.1|06:30:00|Monday|2022-05-23|34763.83|    20.07|   20.08|   20.08|
|2022-05-23 06:35:00| 34763.4|34765.95| 34732.9| 34738.5|06:35:00|Monday|2022-05-23|34763.83|     2.12|   30.93|   30.93|
|2022-05-23 06:40:00| 34738.6| 34763.1| 34695.8|34762.05|06:40:00|Monday|2022-05-23|34763.83|     0.73|   68.03|   68.03|
|2022-05-23 06:45:00| 34762.8|34780.85|34715.35|34727.15|06:45:00|Monday|2022-05-23|34763.83|    17.02|   48.48|   48.48|
|2022-05-23 06:50:00| 34726.5| 34763.3| 34718.2| 34758.5|06:50:00|Monday|2022-05-23|34763.83|     0.53|   45.63|   45.63|
|2022-05-23 06:55:00|347

In [134]:
final_df=updated_df.filter(col("day")=="Thursday").groupBy("date","day").agg(max(col("max_diff"))) \
          .selectExpr("row_number() over(order by date) as Sl_No","*",) \
          .orderBy("date")
final_df.show(1000)

+-----+----------+--------+-------------+
|Sl_No|      date|     day|max(max_diff)|
+-----+----------+--------+-------------+
|    1|2022-05-26|Thursday|       641.82|
|    2|2022-06-02|Thursday|       158.42|
|    3|2022-06-09|Thursday|        262.5|
|    4|2022-06-16|Thursday|       518.93|
|    5|2022-06-23|Thursday|       468.58|
|    6|2022-06-30|Thursday|       137.25|
|    7|2022-07-07|Thursday|        198.4|
|    8|2022-07-14|Thursday|       179.95|
|    9|2022-07-21|Thursday|       195.03|
|   10|2022-07-28|Thursday|        151.6|
|   11|2022-08-04|Thursday|       279.82|
|   12|2022-08-11|Thursday|         96.9|
|   13|2022-08-18|Thursday|       302.45|
|   14|2022-08-25|Thursday|       487.05|
|   15|2022-09-01|Thursday|       431.68|
|   16|2022-09-08|Thursday|        229.3|
|   17|2022-09-15|Thursday|       285.47|
|   18|2022-09-22|Thursday|       322.42|
|   19|2022-09-29|Thursday|       194.25|
|   20|2022-10-06|Thursday|       219.97|
|   21|2022-10-13|Thursday|       