In [1]:
import sys; 
sys.path.insert(0, '..')

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

spark = SparkSession.builder. \
    appName("pyspark-1"). \
    getOrCreate()

In [4]:
spark

In [5]:
# FINNIFTY_5min.csv
# bn_5min.csv
df = spark.read.csv("/dataset/FINNIFTY_5min.csv", header=True, inferSchema=True)
df.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)



In [6]:
import pyspark.sql.functions as f

In [7]:
from pyspark.sql import Window
df_extended=df.withColumn("date",f.to_date('datetime')) \
              .withColumn("IST",f.from_utc_timestamp(f.col("datetime"),"IST")) \
              .withColumn('time',f.date_format('IST', 'HH:mm:ss')) \
              .withColumn("day",f.date_format('date','EEEE')) \
              .withColumn("duration_unit",f.row_number().over(Window.partitionBy("date").orderBy("datetime") )) \
              .withColumn("duration",f.col("duration_unit")*5) \
              .drop("duration_unit")
df_extended.printSchema()

root
 |-- datetime: timestamp (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- date: date (nullable = true)
 |-- IST: timestamp (nullable = true)
 |-- time: string (nullable = true)
 |-- day: string (nullable = true)
 |-- duration: integer (nullable = true)



In [8]:
df_extended.show()

+-------------------+--------+--------+--------+--------+----------+-------------------+--------+---------+--------+
|           datetime|    open|    high|     low|   close|      date|                IST|    time|      day|duration|
+-------------------+--------+--------+--------+--------+----------+-------------------+--------+---------+--------+
|2022-07-27 03:45:00| 16727.1|16755.95|16694.55| 16741.6|2022-07-27|2022-07-27 09:15:00|09:15:00|Wednesday|       5|
|2022-07-27 03:50:00| 16743.3|16746.55| 16678.0| 16679.2|2022-07-27|2022-07-27 09:20:00|09:20:00|Wednesday|      10|
|2022-07-27 03:55:00| 16678.3| 16688.5| 16666.7| 16667.9|2022-07-27|2022-07-27 09:25:00|09:25:00|Wednesday|      15|
|2022-07-27 04:00:00|16666.25| 16675.7| 16661.7|16670.05|2022-07-27|2022-07-27 09:30:00|09:30:00|Wednesday|      20|
|2022-07-27 04:05:00|16669.45| 16698.2| 16669.1|16685.05|2022-07-27|2022-07-27 09:35:00|09:35:00|Wednesday|      25|
|2022-07-27 04:10:00| 16685.4|16686.65|16661.55| 16678.4|2022-07

In [23]:
# UTC Time
start_time="12:30:00"
end_time="15:00:00"

In [24]:
avg_df=df_extended.filter(df_extended.time==start_time).selectExpr("date","duration as dur_start","round((high+low)/2,2) as avg")
avg_df.show()

+----------+---------+--------+
|      date|dur_start|     avg|
+----------+---------+--------+
|2022-07-27|      200|16772.35|
|2022-08-02|      200|17501.25|
|2022-11-29|      200|19282.43|
|2023-05-22|      200|19361.95|
|2022-05-26|      200|15961.35|
|2022-06-22|      200|15315.33|
|2022-06-06|      200|16328.78|
|2022-11-21|      200|18929.18|
|2023-02-08|      200|18469.35|
|2022-07-07|      200| 16097.7|
|2022-12-09|      200| 19281.6|
|2022-12-02|      200|19208.65|
|2023-03-24|      200|17756.93|
|2022-10-19|      200| 18080.8|
|2022-12-20|      200|19130.03|
|2023-05-03|      200|19122.55|
|2022-08-11|      200|17974.78|
|2022-09-29|      200|17207.35|
|2023-05-16|      200|19502.58|
|2023-06-01|      200|19456.63|
+----------+---------+--------+
only showing top 20 rows



In [25]:
joined_df=df_extended.filter((df_extended.time>=start_time) & (df_extended.time<=end_time)) \
                     .join(avg_df,df_extended.date==avg_df.date).drop(avg_df.date)

In [26]:
updated_df=joined_df.withColumn("diff_high",f.round(f.abs(f.col("avg")-f.col("high")),2)) \
         .withColumn("diff_low",f.round(f.abs(f.col("avg")-f.col("low")),2)) \
         .withColumn("max_diff",f.when(f.col("diff_high")>f.col("diff_low"),f.col("diff_high")).otherwise(f.col("diff_low"))) \
         .drop(*("diff_high","diff_low","datetime","IST")) \
         .withColumn("d1",f.col("duration")-f.col("dur_start"))
updated_df.filter(f.col("day")=="Tuesday").show()

+--------+--------+--------+--------+--------+-------+--------+----------+---------+--------+--------+---+
|    open|    high|     low|   close|    time|    day|duration|      date|dur_start|     avg|max_diff| d1|
+--------+--------+--------+--------+--------+-------+--------+----------+---------+--------+--------+---+
| 17500.0|17507.65|17494.85|17498.65|12:30:00|Tuesday|     200|2022-08-02|      200|17501.25|     6.4|  0|
| 17498.0| 17498.5|17441.75|17449.45|12:35:00|Tuesday|     205|2022-08-02|      200|17501.25|    59.5|  5|
|17449.55| 17459.0|17430.65|17435.35|12:40:00|Tuesday|     210|2022-08-02|      200|17501.25|    70.6| 10|
|17433.15|17459.65|17428.05| 17459.2|12:45:00|Tuesday|     215|2022-08-02|      200|17501.25|    73.2| 15|
|17459.35|17481.25|17451.55|17481.25|12:50:00|Tuesday|     220|2022-08-02|      200|17501.25|    49.7| 20|
| 17481.3|17487.35| 17462.0|17467.55|12:55:00|Tuesday|     225|2022-08-02|      200|17501.25|   39.25| 25|
| 17469.0| 17471.0| 17449.3|17465.45|

In [27]:
final_df=updated_df.filter(f.col("day")=="Tuesday") \
          .withColumn("r1",f.row_number().over(Window.partitionBy("date").orderBy(f.col("max_diff").desc()) )) \
          .filter(f.col("r1")==1) \
          .selectExpr("row_number() over(order by date) as Sl_No","day","date","time","max_diff","d1") \
          .orderBy("max_diff")
final_df.show(1000)

+-----+-------+----------+--------+--------+---+
|Sl_No|    day|      date|    time|max_diff| d1|
+-----+-------+----------+--------+--------+---+
|   51|Tuesday|2023-06-06|15:00:00|   25.02|150|
|   27|Tuesday|2022-12-06|14:00:00|   30.63| 90|
|   49|Tuesday|2023-05-23|14:20:00|   30.67|110|
|   28|Tuesday|2022-12-13|14:40:00|    33.7|130|
|   21|Tuesday|2022-10-18|14:30:00|   34.55|120|
|   16|Tuesday|2022-09-13|14:15:00|   36.65|105|
|   45|Tuesday|2023-04-25|13:45:00|   38.27| 75|
|   52|Tuesday|2023-06-13|14:25:00|   40.38|115|
|   48|Tuesday|2023-05-16|15:00:00|   42.78|150|
|    2|Tuesday|2022-05-31|13:10:00|   44.73| 40|
|   25|Tuesday|2022-11-22|14:45:00|   45.18|135|
|   34|Tuesday|2023-01-24|14:20:00|    49.1|110|
|   19|Tuesday|2022-10-04|13:55:00|    50.1| 85|
|   43|Tuesday|2023-04-11|14:50:00|   52.32|140|
|   12|Tuesday|2022-08-16|13:20:00|   52.35| 50|
|   22|Tuesday|2022-10-25|14:40:00|   56.65|130|
|   46|Tuesday|2023-05-02|14:50:00|   57.45|140|
|   42|Tuesday|2023-