This notebook finds at every 5 min point in the trading day, how likely is it that we have already seen the final high or low of the day

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt




SESSION_BARS = 75


In [9]:
DATA_PATH = "nifty50_minute_complete-5min.csv"  

dt_col = "date"
open_col = "open"
high_col = "high"
low_col = "low"
close_col = "close"

df = pd.read_csv(DATA_PATH)

# Parse datetime and sort
df[dt_col] = pd.to_datetime(df[dt_col])
df = df.sort_values(dt_col).reset_index(drop=True)
df[dt_col] = pd.to_datetime(df[dt_col])
df["date"] = df[dt_col].dt.date

min_date = df["date"].min()
max_date = df["date"].max()

print(f"Data available from {min_date} to {max_date}")


Data available from 2015-01-09 to 2025-02-07


In [10]:
# Set your desired start and end dates here (inclusive)
start_date_str = "2023-11-01"   # example, change as needed
end_date_str   = "2023-12-20"   # example, change as needed

# Convert to date objects
start_date = pd.to_datetime(start_date_str).date()
end_date   = pd.to_datetime(end_date_str).date()

# Basic logical checks
assert start_date <= end_date, "start_date must be on or before end_date"
assert start_date >= min_date, f"start_date {start_date} is before min data date {min_date}"
assert end_date   <= max_date, f"end_date {end_date} is after max data date {max_date}"

print(f"Using data from {start_date} to {end_date}")

Using data from 2023-11-01 to 2023-12-20


In [11]:
df_range = df[(df["date"] >= start_date) & (df["date"] <= end_date)].copy()

print(f"Number of rows in selected range: {len(df_range)}")
print(f"Number of trading days in selected range: {df_range['date'].nunique()}")
df_range.head()


Number of rows in selected range: 2550
Number of trading days in selected range: 34


Unnamed: 0,date,open,high,low,close,open-s,high-s,low-s,close-s
162815,2023-11-01,19064.05,19086.55,19032.6,19074.75,-14.5,-5.1,-42.65,-11.8
162816,2023-11-01,19077.25,19092.35,19067.55,19082.15,13.2,5.8,34.95,7.4
162817,2023-11-01,19082.15,19083.2,19063.15,19070.65,4.9,-9.15,-4.4,-11.5
162818,2023-11-01,19071.25,19096.05,19065.8,19087.75,-10.9,12.85,2.65,17.1
162819,2023-11-01,19088.2,19092.2,19079.25,19085.05,16.95,-3.85,13.45,-2.7


In [12]:
# Compute day high and low for each day
day_high = df.groupby("date")[high_col].transform("max")
day_low  = df.groupby("date")[low_col].transform("min")

df["day_high"] = day_high
df["day_low"]  = day_low

# Check a single random day
sample_date = df["date"].iloc[0]
df[df["date"] == sample_date].head(10)


Unnamed: 0,date,open,high,low,close,open-s,high-s,low-s,close-s,day_high,day_low
0,2015-01-09,8285.45,8301.3,8285.45,8301.2,,,,,8303.0,8191.1
1,2015-01-09,8300.5,8303.0,8293.25,8301.0,15.05,1.7,7.8,-0.2,8303.0,8191.1
2,2015-01-09,8301.65,8302.55,8286.8,8294.15,1.15,-0.45,-6.45,-6.85,8303.0,8191.1
3,2015-01-09,8294.1,8295.75,8280.65,8288.5,-7.55,-6.8,-6.15,-5.65,8303.0,8191.1
4,2015-01-09,8289.1,8290.45,8278.0,8283.45,-5.0,-5.3,-2.65,-5.05,8303.0,8191.1
5,2015-01-09,8283.4,8288.3,8277.4,8285.55,-5.7,-2.15,-0.6,2.1,8303.0,8191.1
6,2015-01-09,8285.4,8287.65,8278.05,8283.75,2.0,-0.65,0.65,-1.8,8303.0,8191.1
7,2015-01-09,8283.8,8284.25,8273.95,8276.25,-1.6,-3.4,-4.1,-7.5,8303.0,8191.1
8,2015-01-09,8275.95,8283.6,8275.05,8282.0,-7.85,-0.65,1.1,5.75,8303.0,8191.1
9,2015-01-09,8281.8,8287.35,8281.7,8285.5,5.85,3.75,6.65,3.5,8303.0,8191.1


In [13]:
# Cumulative max/min within each day
df["cum_high_so_far"] = df.groupby("date")[high_col].cummax()
df["cum_low_so_far"]  = df.groupby("date")[low_col].cummin()

# Has the day high already appeared by this bar?
df["has_seen_day_high"] = (df["cum_high_so_far"] >= df["day_high"])

# Has the day low already appeared by this bar?
df["has_seen_day_low"] = (df["cum_low_so_far"] <= df["day_low"])

# Has either extreme (high or low) been seen by this bar?
df["has_seen_extreme"] = df["has_seen_day_high"] | df["has_seen_day_low"]

open_col = "open"
high_col = "high"
low_col  = "low"
close_col = "close"

# Sort properly
df_range = df_range.sort_values(["date", dt_col]).reset_index(drop=True)

# Bar index within each day
df_range["bar_index"] = df_range.groupby("date").cumcount() + 1

# Day high and low
df_range["day_high"] = df_range.groupby("date")[high_col].transform("max")
df_range["day_low"]  = df_range.groupby("date")[low_col].transform("min")

# Cumulative high and low within the day
df_range["cum_high_so_far"] = df_range.groupby("date")[high_col].cummax()
df_range["cum_low_so_far"]  = df_range.groupby("date")[low_col].cummin()

# Flags: have we seen final high or low by this bar
df_range["has_seen_day_high"] = df_range["cum_high_so_far"] >= df_range["day_high"]
df_range["has_seen_day_low"]  = df_range["cum_low_so_far"]  <= df_range["day_low"]
df_range["has_seen_extreme"]  = df_range["has_seen_day_high"] | df_range["has_seen_day_low"]

df_range.head()


Unnamed: 0,date,open,high,low,close,open-s,high-s,low-s,close-s,bar_index,day_high,day_low,cum_high_so_far,cum_low_so_far,has_seen_day_high,has_seen_day_low,has_seen_extreme
0,2023-11-01,19064.05,19086.55,19032.6,19074.75,-14.5,-5.1,-42.65,-11.8,1,19096.05,18973.7,19086.55,19032.6,False,False,False
1,2023-11-01,19077.25,19092.35,19067.55,19082.15,13.2,5.8,34.95,7.4,2,19096.05,18973.7,19092.35,19032.6,False,False,False
2,2023-11-01,19082.15,19083.2,19063.15,19070.65,4.9,-9.15,-4.4,-11.5,3,19096.05,18973.7,19092.35,19032.6,False,False,False
3,2023-11-01,19071.25,19096.05,19065.8,19087.75,-10.9,12.85,2.65,17.1,4,19096.05,18973.7,19096.05,19032.6,True,False,True
4,2023-11-01,19088.2,19092.2,19079.25,19085.05,16.95,-3.85,13.45,-2.7,5,19096.05,18973.7,19096.05,19032.6,True,False,True


In [14]:
prob_by_bar = (
    daily_bar_flags
    .groupby("bar_index")
    .agg(
        num_days=("date", "nunique"),
        high_seen_days=("high_seen", "sum"),
        low_seen_days=("low_seen", "sum"),
        extreme_seen_days=("extreme_seen", "sum")
    )
    .reset_index()
)

prob_by_bar["p_high_seen_by_bar"] = prob_by_bar["high_seen_days"] / prob_by_bar["num_days"]
prob_by_bar["p_low_seen_by_bar"]  = prob_by_bar["low_seen_days"]  / prob_by_bar["num_days"]
prob_by_bar["p_extreme_seen_by_bar"] = prob_by_bar["extreme_seen_days"] / prob_by_bar["num_days"]

with pd.option_context('display.max_rows', None):
    print(prob_by_bar[["bar_index", "p_high_seen_by_bar", "p_low_seen_by_bar", "p_extreme_seen_by_bar"]])


NameError: name 'daily_bar_flags' is not defined

In [15]:
# Collapse to one row per (date, bar_index)
daily_bar_flags_range = (
    df_range
    .groupby(["date", "bar_index"])
    .agg(
        extreme_seen=("has_seen_extreme", "max")
    )
    .reset_index()
)

# For each bar_index, compute probability over days
prob_by_bar_range = (
    daily_bar_flags_range
    .groupby("bar_index")
    .agg(
        num_days=("date", "nunique"),
        extreme_seen_days=("extreme_seen", "sum")
    )
    .reset_index()
)

prob_by_bar_range["p_extreme_seen_by_bar"] = (
    prob_by_bar_range["extreme_seen_days"] / prob_by_bar_range["num_days"]
)

prob_by_bar_range.head(20)


Unnamed: 0,bar_index,num_days,extreme_seen_days,p_extreme_seen_by_bar
0,1,34,13,0.382353
1,2,34,16,0.470588
2,3,34,16,0.470588
3,4,34,19,0.558824
4,5,34,20,0.588235
5,6,34,23,0.676471
6,7,34,23,0.676471
7,8,34,23,0.676471
8,9,34,23,0.676471
9,10,34,23,0.676471
