In [1]:
import pandas as pd
import os

# Define the file path
file_path = r"D:\Market Projects\options_data_analyzer\Aug '25\Aug 07 Exp\07 Aug\BEL_EQ.csv"

# Load CSV with pandas
# Using default encoding (utf-8) and comma delimiter
# The file appears to have standard CSV format
df = pd.read_csv(file_path)

# Display basic information about the loaded data
print("Data loaded successfully!")
print(f"File path: {file_path}")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)
print("\nBasic statistics:")
print(df.describe())

Data loaded successfully!
File path: D:\Market Projects\options_data_analyzer\Aug '25\Aug 07 Exp\07 Aug\BEL_EQ.csv
Shape: (18250, 5)
Columns: ['date', 'price', 'qty', 'trnvr', 'cum_trnvr']

First few rows:
                     date   price    qty        trnvr    cum_trnvr
0  2025-08-07 09:15:00 AM  386.85  65740  25431519.00  25431519.00
1  2025-08-07 09:15:01 AM  386.65      0         0.00  25431519.00
2  2025-08-07 09:15:01 AM  386.30      0         0.00  25431519.00
3  2025-08-07 09:15:01 AM  386.30    895    345738.50  25777257.50
4  2025-08-07 09:15:01 AM  386.75   1401    541836.75  26319094.25

Data types:
date          object
price        float64
qty            int64
trnvr        float64
cum_trnvr    float64
dtype: object

Basic statistics:
              price            qty         trnvr     cum_trnvr
count  18250.000000   18250.000000  1.825000e+04  1.825000e+04
mean     386.537181     630.694301  2.439435e+05  2.317732e+09
std        1.570300    2651.792039  1.026096e+06  1.

In [2]:
# Preview first and last 10 rows to check ordering
print("=== FIRST 10 ROWS ===")
print(df.head(10))
print("\n" + "="*50 + "\n")
print("=== LAST 10 ROWS ===")
print(df.tail(10))

# Check if data is in chronological order
print("\n" + "="*50 + "\n")
print("=== CHRONOLOGICAL ORDER CHECK ===")

# Convert date column to datetime if not already
df['date'] = pd.to_datetime(df['date'])

# Check first and last timestamps
first_time = df['date'].iloc[0]
last_time = df['date'].iloc[-1]

print(f"First timestamp: {first_time}")
print(f"Last timestamp: {last_time}")

# Check if chronological (ascending) or reverse chronological (descending)
if first_time < last_time:
    print("✓ Data is in CHRONOLOGICAL order (ascending)")
    print("  - First row: Earliest time")
    print("  - Last row: Latest time")
else:
    print("✗ Data is in REVERSE CHRONOLOGICAL order (descending)")
    print("  - First row: Latest time")
    print("  - Last row: Earliest time")

# Show time range
time_range = last_time - first_time
print(f"\nTotal time range: {time_range}")

=== FIRST 10 ROWS ===
                     date   price    qty        trnvr    cum_trnvr
0  2025-08-07 09:15:00 AM  386.85  65740  25431519.00  25431519.00
1  2025-08-07 09:15:01 AM  386.65      0         0.00  25431519.00
2  2025-08-07 09:15:01 AM  386.30      0         0.00  25431519.00
3  2025-08-07 09:15:01 AM  386.30    895    345738.50  25777257.50
4  2025-08-07 09:15:01 AM  386.75   1401    541836.75  26319094.25
5  2025-08-07 09:15:02 AM  386.80   1795    694306.00  27013400.25
6  2025-08-07 09:15:02 AM  386.95    741    286729.95  27300130.20
7  2025-08-07 09:15:03 AM  386.85      0         0.00  27300130.20
8  2025-08-07 09:15:03 AM  386.50      0         0.00  27300130.20
9  2025-08-07 09:15:03 AM  386.90   2717   1051207.30  28351337.50


=== LAST 10 ROWS ===
                         date   price   qty       trnvr     cum_trnvr
18240  2025-08-07 03:29:31 PM  388.25   226    87744.50  4.446928e+09
18241  2025-08-07 03:29:31 PM  388.10     0        0.00  4.446928e+09
18242  2

In [3]:
# Convert date to datetime64[ns] and extract datetime features
print("=== DATETIME CONVERSION AND FEATURE EXTRACTION ===")

# Convert date column to datetime64[ns]
df['date'] = pd.to_datetime(df['date'])

# Extract additional datetime features
df['date_only'] = df['date'].dt.date
df['time'] = df['date'].dt.time
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
df['second'] = df['date'].dt.second

# Display the new datetime features
print("New datetime features added:")
print(f"  - date_only: {df['date_only'].dtype}")
print(f"  - time: {df['time'].dtype}")
print(f"  - hour: {df['hour'].dtype}")
print(f"  - minute: {df['minute'].dtype}")
print(f"  - second: {df['second'].dtype}")

# Show sample of the enhanced dataframe
print("\n=== SAMPLE DATA WITH NEW FEATURES ===")
print(df[['date', 'date_only', 'time', 'hour', 'minute', 'second', 'price', 'qty']].head(10))

# Verify datetime conversion
print(f"\n=== DATETIME VERIFICATION ===")
print(f"Original date column dtype: {df['date'].dtype}")
print(f"First timestamp: {df['date'].iloc[0]}")
print(f"Last timestamp: {df['date'].iloc[-1]}")
print(f"Total unique dates: {df['date_only'].nunique()}")
print(f"Date range: {df['date_only'].min()} to {df['date_only'].max()}")

=== DATETIME CONVERSION AND FEATURE EXTRACTION ===
New datetime features added:
  - date_only: object
  - time: object
  - hour: int32
  - minute: int32
  - second: int32

=== SAMPLE DATA WITH NEW FEATURES ===
                 date   date_only      time  hour  minute  second   price  \
0 2025-08-07 09:15:00  2025-08-07  09:15:00     9      15       0  386.85   
1 2025-08-07 09:15:01  2025-08-07  09:15:01     9      15       1  386.65   
2 2025-08-07 09:15:01  2025-08-07  09:15:01     9      15       1  386.30   
3 2025-08-07 09:15:01  2025-08-07  09:15:01     9      15       1  386.30   
4 2025-08-07 09:15:01  2025-08-07  09:15:01     9      15       1  386.75   
5 2025-08-07 09:15:02  2025-08-07  09:15:02     9      15       2  386.80   
6 2025-08-07 09:15:02  2025-08-07  09:15:02     9      15       2  386.95   
7 2025-08-07 09:15:03  2025-08-07  09:15:03     9      15       3  386.85   
8 2025-08-07 09:15:03  2025-08-07  09:15:03     9      15       3  386.50   
9 2025-08-07 09:15:0

In [4]:
# Handle zero-quantity trades - remove rows with 0 qty
print("=== HANDLING ZERO-QUANTITY TRADES ===")

# Check current data shape and zero qty count
print(f"Original data shape: {df.shape}")
zero_qty_count = (df['qty'] == 0).sum()
print(f"Rows with zero quantity: {zero_qty_count}")
print(f"Percentage of zero qty rows: {(zero_qty_count/len(df)*100):.2f}%")

# Show sample of zero qty rows before removal
print("\n=== SAMPLE OF ZERO QTY ROWS (BEFORE REMOVAL) ===")
zero_qty_sample = df[df['qty'] == 0][['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].head(5)
print(zero_qty_sample)

# Remove rows with zero quantity
df_clean = df[df['qty'] > 0].copy()

# Reset index after filtering
df_clean = df_clean.reset_index(drop=True)

# Display results after cleaning
print(f"\n=== AFTER CLEANING ===")
print(f"Cleaned data shape: {df_clean.shape}")
print(f"Rows removed: {len(df) - len(df_clean)}")
print(f"Remaining rows: {len(df_clean)}")

# Show sample of cleaned data
print("\n=== SAMPLE OF CLEANED DATA ===")
print(df_clean[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].head(10))

# Verify no zero qty rows remain
remaining_zero_qty = (df_clean['qty'] == 0).sum()
print(f"\nZero qty rows remaining: {remaining_zero_qty}")

# Update the main dataframe reference
df = df_clean
print(f"\n✓ Main dataframe 'df' now contains {len(df)} rows with non-zero quantities")

=== HANDLING ZERO-QUANTITY TRADES ===
Original data shape: (18250, 10)
Rows with zero quantity: 7294
Percentage of zero qty rows: 39.97%

=== SAMPLE OF ZERO QTY ROWS (BEFORE REMOVAL) ===
                  date   price  qty  trnvr    cum_trnvr
1  2025-08-07 09:15:01  386.65    0    0.0  25431519.00
2  2025-08-07 09:15:01  386.30    0    0.0  25431519.00
7  2025-08-07 09:15:03  386.85    0    0.0  27300130.20
8  2025-08-07 09:15:03  386.50    0    0.0  27300130.20
13 2025-08-07 09:15:05  386.45    0    0.0  32995767.85

=== AFTER CLEANING ===
Cleaned data shape: (10956, 10)
Rows removed: 7294
Remaining rows: 10956

=== SAMPLE OF CLEANED DATA ===
                 date   price    qty        trnvr    cum_trnvr
0 2025-08-07 09:15:00  386.85  65740  25431519.00  25431519.00
1 2025-08-07 09:15:01  386.30    895    345738.50  25777257.50
2 2025-08-07 09:15:01  386.75   1401    541836.75  26319094.25
3 2025-08-07 09:15:02  386.80   1795    694306.00  27013400.25
4 2025-08-07 09:15:02  386.95    

In [5]:
# Validate numeric columns for negatives or outliers
print("=== NUMERIC COLUMN VALIDATION ===")

# List of numeric columns to validate
numeric_cols = ['price', 'qty', 'trnvr', 'cum_trnvr']

# Check for negative values
print("=== NEGATIVE VALUE CHECK ===")
for col in numeric_cols:
    negative_count = (df[col] < 0).sum()
    print(f"{col}: {negative_count} negative values")

# Check for zero values (after qty cleaning)
print("\n=== ZERO VALUE CHECK ===")
for col in numeric_cols:
    zero_count = (df[col] == 0).sum()
    print(f"{col}: {zero_count} zero values")

# Statistical summary for outlier detection
print("\n=== STATISTICAL SUMMARY ===")
print(df[numeric_cols].describe())

# Outlier detection using IQR method
print("\n=== OUTLIER DETECTION (IQR METHOD) ===")
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers_lower = (df[col] < lower_bound).sum()
    outliers_upper = (df[col] > upper_bound).sum()
    
    print(f"\n{col}:")
    print(f"  Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
    print(f"  Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
    print(f"  Outliers below lower bound: {outliers_lower}")
    print(f"  Outliers above upper bound: {outliers_upper}")

# Check for extreme values (beyond 3 standard deviations)
print("\n=== EXTREME VALUE CHECK (3 STD DEV) ===")
for col in numeric_cols:
    mean_val = df[col].mean()
    std_val = df[col].std()
    
    lower_3std = mean_val - 3 * std_val
    upper_3std = mean_val + 3 * std_val
    
    extreme_lower = (df[col] < lower_3std).sum()
    extreme_upper = (df[col] > upper_3std).sum()
    
    print(f"\n{col}:")
    print(f"  Mean: {mean_val:.2f}, Std: {std_val:.2f}")
    print(f"  Lower 3σ: {lower_3std:.2f}, Upper 3σ: {upper_3std:.2f}")
    print(f"  Extreme values below: {extreme_lower}")
    print(f"  Extreme values above: {extreme_upper}")

# Show sample of potential outliers
print("\n=== SAMPLE OF POTENTIAL OUTLIERS ===")
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[df[col] > upper_bound]
    if len(outliers) > 0:
        print(f"\n{col} outliers (top 5):")
        print(outliers[['date', col, 'qty', 'trnvr']].head())

# Data quality summary
print("\n=== DATA QUALITY SUMMARY ===")
print(f"Total rows: {len(df)}")
print(f"Columns with potential issues:")
for col in numeric_cols:
    issues = []
    if (df[col] < 0).any():
        issues.append("negative values")
    if (df[col] == 0).any() and col != 'qty':  # qty can legitimately be 0
        issues.append("zero values")
    
    if issues:
        print(f"  {col}: {', '.join(issues)}")
    else:
        print(f"  {col}: ✓ clean")

=== NUMERIC COLUMN VALIDATION ===
=== NEGATIVE VALUE CHECK ===
price: 0 negative values
qty: 0 negative values
trnvr: 0 negative values
cum_trnvr: 0 negative values

=== ZERO VALUE CHECK ===
price: 0 zero values
qty: 0 zero values
trnvr: 0 zero values
cum_trnvr: 0 zero values

=== STATISTICAL SUMMARY ===
              price            qty         trnvr     cum_trnvr
count  10956.000000   10956.000000  1.095600e+04  1.095600e+04
mean     386.561094    1050.581508  4.063499e+05  2.377823e+09
std        1.574324    3357.505716  1.299189e+06  1.165181e+09
min      383.500000       1.000000  3.836000e+02  2.543152e+07
25%      385.150000      26.000000  1.003177e+04  1.490814e+09
50%      386.450000     200.000000  7.732500e+04  2.355502e+09
75%      387.950000     799.000000  3.085895e+05  3.321858e+09
max      390.200000  153858.000000  5.955074e+07  4.451969e+09

=== OUTLIER DETECTION (IQR METHOD) ===

price:
  Q1: 385.15, Q3: 387.95, IQR: 2.80
  Lower bound: 380.95, Upper bound: 392.15


In [6]:
# Ensure sorting by datetime for time-series integrity
print("=== TIME-SERIES INTEGRITY CHECK AND SORTING ===")

# Check current sorting status
print("=== CURRENT SORTING STATUS ===")
print(f"First timestamp: {df['date'].iloc[0]}")
print(f"Last timestamp: {df['date'].iloc[-1]}")

# Check if data is already sorted
is_sorted = df['date'].is_monotonic_increasing
print(f"Data is already sorted chronologically: {is_sorted}")

# Check for any duplicate timestamps
duplicate_timestamps = df['date'].duplicated().sum()
print(f"Duplicate timestamps: {duplicate_timestamps}")

if duplicate_timestamps > 0:
    print("\n=== DUPLICATE TIMESTAMP ANALYSIS ===")
    duplicate_samples = df[df['date'].duplicated(keep=False)].sort_values('date')
    print("Sample duplicate timestamps:")
    print(duplicate_samples[['date', 'price', 'qty', 'trnvr']].head(10))

# Sort the dataframe by datetime
print("\n=== SORTING DATA BY DATETIME ===")
df_sorted = df.sort_values('date').reset_index(drop=True)

# Verify sorting
is_now_sorted = df_sorted['date'].is_monotonic_increasing
print(f"Data is now sorted chronologically: {is_now_sorted}")

# Display sorting verification
print(f"\n=== SORTING VERIFICATION ===")
print("First 5 rows after sorting:")
print(df_sorted[['date', 'price', 'qty', 'trnvr']].head())
print(f"\nLast 5 rows after sorting:")
print(df_sorted[['date', 'price', 'qty', 'trnvr']].tail())

# Check for any time gaps or irregularities
print(f"\n=== TIME SERIES CONTINUITY CHECK ===")
time_diffs = df_sorted['date'].diff().dropna()
print(f"Time differences between consecutive rows:")
print(f"  Min: {time_diffs.min()}")
print(f"  Max: {time_diffs.max()}")
print(f"  Mean: {time_diffs.mean()}")
print(f"  Most common: {time_diffs.mode().iloc[0] if len(time_diffs.mode()) > 0 else 'N/A'}")

# Check for any large time gaps
large_gaps = time_diffs[time_diffs > pd.Timedelta(minutes=5)]
if len(large_gaps) > 0:
    print(f"\n⚠️  Found {len(large_gaps)} time gaps larger than 5 minutes:")
    gap_indices = time_diffs[time_diffs > pd.Timedelta(minutes=5)].index
    for idx in gap_indices[:5]:  # Show first 5 gaps
        gap_start = df_sorted.loc[idx-1, 'date']
        gap_end = df_sorted.loc[idx, 'date']
        gap_duration = gap_end - gap_start
        print(f"  Gap: {gap_start} to {gap_end} (Duration: {gap_duration})")

# Update the main dataframe with sorted version
df = df_sorted
print(f"\n✓ Main dataframe 'df' is now properly sorted chronologically")
print(f"✓ Total rows: {len(df)}")
print(f"✓ Time range: {df['date'].min()} to {df['date'].max()}")

# Final verification
print(f"\n=== FINAL VERIFICATION ===")
print("✓ Data is sorted chronologically")
print("✓ Index is reset and sequential")
print("✓ Ready for time-series analysis")

=== TIME-SERIES INTEGRITY CHECK AND SORTING ===
=== CURRENT SORTING STATUS ===
First timestamp: 2025-08-07 09:15:00
Last timestamp: 2025-08-07 15:29:58
Data is already sorted chronologically: True
Duplicate timestamps: 1961

=== DUPLICATE TIMESTAMP ANALYSIS ===
Sample duplicate timestamps:
                  date   price   qty       trnvr
1  2025-08-07 09:15:01  386.30   895   345738.50
2  2025-08-07 09:15:01  386.75  1401   541836.75
3  2025-08-07 09:15:02  386.80  1795   694306.00
4  2025-08-07 09:15:02  386.95   741   286729.95
5  2025-08-07 09:15:03  386.90  2717  1051207.30
6  2025-08-07 09:15:03  386.80  9068  3507502.40
7  2025-08-07 09:15:04  386.75  1141   441281.75
8  2025-08-07 09:15:04  386.90  1798   695646.20
9  2025-08-07 09:15:05  386.65  2092   808871.80
10 2025-08-07 09:15:05  386.75  1679   649353.25

=== SORTING DATA BY DATETIME ===
Data is now sorted chronologically: True

=== SORTING VERIFICATION ===
First 5 rows after sorting:
                 date   price    qty 

In [7]:
# Get comprehensive descriptive statistics for numeric fields
print("=== COMPREHENSIVE DESCRIPTIVE STATISTICS ===")

# Get basic describe() for all numeric columns
print("=== BASIC DESCRIPTIVE STATISTICS ===")
print(df.describe())

# Get detailed statistics for each numeric column
print("\n" + "="*60)
print("=== DETAILED STATISTICS BY COLUMN ===")

numeric_cols = ['price', 'qty', 'trnvr', 'cum_trnvr']

for col in numeric_cols:
    print(f"\n--- {col.upper()} ---")
    col_stats = df[col].describe()
    
    print(f"Count: {col_stats['count']:,.0f}")
    print(f"Mean: {col_stats['mean']:,.2f}")
    print(f"Std: {col_stats['std']:,.2f}")
    print(f"Min: {col_stats['min']:,.2f}")
    print(f"25%: {col_stats['25%']:,.2f}")
    print(f"50% (Median): {col_stats['50%']:,.2f}")
    print(f"75%: {col_stats['75%']:,.2f}")
    print(f"Max: {col_stats['max']:,.2f}")
    
    # Additional useful statistics
    print(f"Range: {col_stats['max'] - col_stats['min']:,.2f}")
    print(f"IQR: {col_stats['75%'] - col_stats['25%']:,.2f}")
    print(f"Coefficient of Variation: {(col_stats['std']/col_stats['mean']*100):,.2f}%")

# Get statistics for datetime features
print("\n" + "="*60)
print("=== DATETIME FEATURE STATISTICS ===")

print("\n--- HOUR DISTRIBUTION ---")
hour_counts = df['hour'].value_counts().sort_index()
print(hour_counts)

print("\n--- MINUTE DISTRIBUTION (Sample) ---")
minute_counts = df['minute'].value_counts().sort_index().head(20)
print(minute_counts)

# Get statistics for specific time periods
print("\n" + "="*60)
print("=== TIME PERIOD ANALYSIS ===")

# Market hours analysis (assuming 9:15 AM to 3:30 PM)
market_hours = df[(df['hour'] >= 9) & (df['hour'] <= 15)]
print(f"Trades during market hours (9 AM - 3 PM): {len(market_hours):,}")
print(f"Trades outside market hours: {len(df) - len(market_hours):,}")

# Price range analysis
print(f"\n--- PRICE ANALYSIS ---")
print(f"Price range: ₹{df['price'].min():.2f} to ₹{df['price'].max():.2f}")
print(f"Price spread: ₹{df['price'].max() - df['price'].min():.2f}")

# Volume analysis
print(f"\n--- VOLUME ANALYSIS ---")
print(f"Total volume traded: {df['qty'].sum():,}")
print(f"Average trade size: {df['qty'].mean():.0f}")
print(f"Largest single trade: {df['qty'].max():,}")

# Turnover analysis
print(f"\n--- TURNOVER ANALYSIS ---")
print(f"Total turnover: ₹{df['trnvr'].sum():,.2f}")
print(f"Average trade value: ₹{df['trnvr'].mean():,.2f}")
print(f"Largest single trade value: ₹{df['trnvr'].max():,.2f}")

# Display summary table
print("\n" + "="*60)
print("=== SUMMARY TABLE ===")
summary_data = {
    'Metric': ['Total Rows', 'Price Range', 'Total Volume', 'Total Turnover', 'Time Range'],
    'Value': [
        f"{len(df):,}",
        f"₹{df['price'].min():.2f} - ₹{df['price'].max():.2f}",
        f"{df['qty'].sum():,}",
        f"₹{df['trnvr'].sum():,.2f}",
        f"{df['date'].min().strftime('%H:%M:%S')} to {df['date'].max().strftime('%H:%M:%S')}"
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

=== COMPREHENSIVE DESCRIPTIVE STATISTICS ===
=== BASIC DESCRIPTIVE STATISTICS ===
                                date         price            qty  \
count                          10956  10956.000000   10956.000000   
mean   2025-08-07 12:40:58.929810176    386.561094    1050.581508   
min              2025-08-07 09:15:00    383.500000       1.000000   
25%       2025-08-07 11:06:26.500000    385.150000      26.000000   
50%              2025-08-07 12:45:14    386.450000     200.000000   
75%    2025-08-07 14:26:31.249999872    387.950000     799.000000   
max              2025-08-07 15:29:58    390.200000  153858.000000   
std                              NaN      1.574324    3357.505716   

              trnvr     cum_trnvr          hour        minute        second  
count  1.095600e+04  1.095600e+04  10956.000000  10956.000000  10956.000000  
mean   4.063499e+05  2.377823e+09     12.209383     27.926798     29.543173  
min    3.836000e+02  2.543152e+07      9.000000      0.000000 

In [9]:
# Count total unique trading days
print("=== TRADING DAYS ANALYSIS ===")

# Extract unique dates from the datetime column
unique_dates = df['date_only'].unique()
total_trading_days = len(unique_dates)

print(f"Total unique trading days: {total_trading_days}")

# Display all unique trading dates
print(f"\n=== ALL TRADING DATES ===")
for i, date in enumerate(sorted(unique_dates), 1):
    print(f"{i:2d}. {date}")

# Get date range
date_range = f"{min(unique_dates)} to {max(unique_dates)}"
print(f"\nTrading date range: {date_range}")

# Check if all dates are from the same month/year
print(f"\n=== DATE ANALYSIS ===")
if total_trading_days == 1:
    print("Single trading day data")
    print(f"Date: {unique_dates[0]}")
    # Initialize these variables for single day to avoid errors
    months = {unique_dates[0].month}
    years = {unique_dates[0].year}
elif total_trading_days > 1:
    # Check month and year consistency
    months = set(date.month for date in unique_dates)
    years = set(date.year for date in unique_dates)
    
    print(f"Multiple trading days: {total_trading_days}")
    print(f"Months covered: {sorted(months)}")
    print(f"Years covered: {sorted(years)}")
    
    if len(months) == 1:
        month_name = pd.Timestamp(unique_dates[0]).strftime('%B')
        print(f"All dates are from: {month_name} {list(years)[0]}")
    
    if len(years) == 1:
        print(f"All dates are from year: {list(years)[0]}")

# Trading days by month (if multiple months)
if len(months) > 1:
    print(f"\n=== TRADING DAYS BY MONTH ===")
    monthly_counts = {}
    for date in unique_dates:
        month_key = f"{date.year}-{date.month:02d}"
        monthly_counts[month_key] = monthly_counts.get(month_key, 0) + 1
    
    for month_key in sorted(monthly_counts.keys()):
        year, month = month_key.split('-')
        month_name = pd.Timestamp(f"{year}-{month}-01").strftime('%B %Y')
        print(f"{month_name}: {monthly_counts[month_key]} trading days")

# Verify data consistency
print(f"\n=== DATA CONSISTENCY CHECK ===")
print(f"Total rows in dataset: {len(df):,}")
print(f"Average rows per trading day: {len(df)/total_trading_days:.1f}")

# Check for any missing dates in sequence (if multiple days)
if total_trading_days > 1:
    sorted_dates = sorted(unique_dates)
    date_diffs = []
    for i in range(1, len(sorted_dates)):
        diff = (sorted_dates[i] - sorted_dates[i-1]).days
        date_diffs.append(diff)
    
    if any(diff > 1 for diff in date_diffs):
        print(f"\n⚠️  Gaps detected in trading days:")
        for i, diff in enumerate(date_diffs):
            if diff > 1:
                gap_start = sorted_dates[i-1]
                gap_end = sorted_dates[i]
                print(f"  Gap: {gap_start} to {gap_end} ({diff-1} missing days)")
    else:
        print(f"\n✓ No gaps in trading days - consecutive trading days")

print(f"\n=== SUMMARY ===")
print(f"✓ Total unique trading days: {total_trading_days}")
print(f"✓ Date range: {date_range}")
print(f"✓ Ready for daily analysis and aggregation")

=== TRADING DAYS ANALYSIS ===
Total unique trading days: 1

=== ALL TRADING DATES ===
 1. 2025-08-07

Trading date range: 2025-08-07 to 2025-08-07

=== DATE ANALYSIS ===
Single trading day data
Date: 2025-08-07

=== DATA CONSISTENCY CHECK ===
Total rows in dataset: 10,956
Average rows per trading day: 10956.0

=== SUMMARY ===
✓ Total unique trading days: 1
✓ Date range: 2025-08-07 to 2025-08-07
✓ Ready for daily analysis and aggregation


In [10]:
# Identify earliest and latest timestamps
print("=== TIMESTAMP RANGE ANALYSIS ===")

# Get earliest and latest timestamps
earliest_timestamp = df['date'].min()
latest_timestamp = df['date'].max()

print(f"Earliest timestamp: {earliest_timestamp}")
print(f"Latest timestamp: {latest_timestamp}")

# Calculate total time duration
total_duration = latest_timestamp - earliest_timestamp
print(f"Total time duration: {total_duration}")

# Convert duration to more readable format
duration_hours = total_duration.total_seconds() / 3600
duration_minutes = total_duration.total_seconds() / 60

print(f"Duration in hours: {duration_hours:.2f} hours")
print(f"Duration in minutes: {duration_minutes:.0f} minutes")

# Display the actual rows with earliest and latest timestamps
print(f"\n=== EARLIEST TIMESTAMP ROW ===")
earliest_row = df[df['date'] == earliest_timestamp]
print(earliest_row[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].to_string(index=False))

print(f"\n=== LATEST TIMESTAMP ROW ===")
latest_row = df[df['date'] == latest_timestamp]
print(latest_row[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].to_string(index=False))

# Check if timestamps span across different time periods
print(f"\n=== TIME PERIOD ANALYSIS ===")
earliest_hour = earliest_timestamp.hour
earliest_minute = earliest_timestamp.minute
latest_hour = latest_timestamp.hour
latest_minute = latest_timestamp.minute

print(f"Earliest: {earliest_hour:02d}:{earliest_minute:02d}")
print(f"Latest: {latest_hour:02d}:{latest_minute:02d}")

# Market hours analysis (assuming 9:15 AM to 3:30 PM)
market_start = pd.Timestamp('2025-08-07 09:15:00')
market_end = pd.Timestamp('2025-08-07 15:30:00')

print(f"\nMarket hours: 09:15:00 to 15:30:00")
print(f"Data coverage:")

if earliest_timestamp < market_start:
    pre_market_duration = market_start - earliest_timestamp
    print(f"  Pre-market: {earliest_timestamp.strftime('%H:%M:%S')} to {market_start.strftime('%H:%M:%S')} ({pre_market_duration})")

if earliest_timestamp <= market_start and latest_timestamp >= market_end:
    market_duration = market_end - market_start
    print(f"  Market hours: {market_start.strftime('%H:%M:%S')} to {market_end.strftime('%H:%M:%S')} ({market_duration})")

if latest_timestamp > market_end:
    post_market_duration = latest_timestamp - market_end
    print(f"  Post-market: {market_end.strftime('%H:%M:%S')} to {latest_timestamp.strftime('%H:%M:%S')} ({post_market_duration})")

# Check for any gaps in the time series
print(f"\n=== TIME SERIES CONTINUITY ===")
time_diffs = df['date'].diff().dropna()
min_time_diff = time_diffs.min()
max_time_diff = time_diffs.max()

print(f"Minimum time difference between consecutive rows: {min_time_diff}")
print(f"Maximum time difference between consecutive rows: {max_time_diff}")

# Identify any unusually large time gaps
large_gaps = time_diffs[time_diffs > pd.Timedelta(minutes=1)]
if len(large_gaps) > 0:
    print(f"\n⚠️  Found {len(large_gaps)} time gaps larger than 1 minute:")
    gap_indices = large_gaps.index[:5]  # Show first 5 gaps
    for idx in gap_indices:
        gap_start = df.loc[idx-1, 'date']
        gap_end = df.loc[idx, 'date']
        gap_duration = gap_end - gap_start
        print(f"  Gap: {gap_start.strftime('%H:%M:%S')} to {gap_end.strftime('%H:%M:%S')} (Duration: {gap_duration})")

# Summary
print(f"\n=== SUMMARY ===")
print(f"✓ Data spans: {earliest_timestamp.strftime('%Y-%m-%d %H:%M:%S')} to {latest_timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"✓ Total duration: {duration_hours:.2f} hours ({duration_minutes:.0f} minutes)")
print(f"✓ Total rows: {len(df):,}")
print(f"✓ Average frequency: {len(df)/duration_hours:.1f} ticks per hour")

=== TIMESTAMP RANGE ANALYSIS ===
Earliest timestamp: 2025-08-07 09:15:00
Latest timestamp: 2025-08-07 15:29:58
Total time duration: 0 days 06:14:58
Duration in hours: 6.25 hours
Duration in minutes: 375 minutes

=== EARLIEST TIMESTAMP ROW ===
               date  price   qty      trnvr  cum_trnvr
2025-08-07 09:15:00 386.85 65740 25431519.0 25431519.0

=== LATEST TIMESTAMP ROW ===
               date  price  qty    trnvr    cum_trnvr
2025-08-07 15:29:58 388.25 1740 675555.0 4.451969e+09

=== TIME PERIOD ANALYSIS ===
Earliest: 09:15
Latest: 15:29

Market hours: 09:15:00 to 15:30:00
Data coverage:

=== TIME SERIES CONTINUITY ===
Minimum time difference between consecutive rows: 0 days 00:00:00
Maximum time difference between consecutive rows: 0 days 00:00:12

=== SUMMARY ===
✓ Data spans: 2025-08-07 09:15:00 to 2025-08-07 15:29:58
✓ Total duration: 6.25 hours (375 minutes)
✓ Total rows: 10,956
✓ Average frequency: 1753.1 ticks per hour


In [11]:
# Calculate total traded volume and total turnover
print("=== VOLUME AND TURNOVER ANALYSIS ===")

# Calculate totals
total_volume = df['qty'].sum()
total_turnover = df['trnvr'].sum()

print(f"Total traded volume: {total_volume:,}")
print(f"Total turnover: ₹{total_turnover:,.2f}")

# Additional volume and turnover metrics
print(f"\n=== DETAILED METRICS ===")

# Volume analysis
print("--- VOLUME ANALYSIS ---")
avg_trade_size = df['qty'].mean()
median_trade_size = df['qty'].median()
max_trade_size = df['qty'].max()
min_trade_size = df['qty'].min()

print(f"Average trade size: {avg_trade_size:,.0f}")
print(f"Median trade size: {median_trade_size:,.0f}")
print(f"Largest single trade: {max_trade_size:,}")
print(f"Smallest single trade: {min_trade_size:,}")

# Turnover analysis
print(f"\n--- TURNOVER ANALYSIS ---")
avg_trade_value = df['trnvr'].mean()
median_trade_value = df['trnvr'].median()
max_trade_value = df['trnvr'].max()
min_trade_value = df['trnvr'].min()

print(f"Average trade value: ₹{avg_trade_value:,.2f}")
print(f"Median trade value: ₹{median_trade_value:,.2f}")
print(f"Largest single trade value: ₹{max_trade_value:,.2f}")
print(f"Smallest single trade value: ₹{min_trade_value:,.2f}")

# Price analysis
print(f"\n--- PRICE ANALYSIS ---")
avg_price = df['price'].mean()
weighted_avg_price = (df['price'] * df['qty']).sum() / df['qty'].sum()
price_range = df['price'].max() - df['price'].min()

print(f"Simple average price: ₹{avg_price:.2f}")
print(f"Weighted average price (by volume): ₹{weighted_avg_price:.2f}")
print(f"Price range: ₹{price_range:.2f}")

# Volume-weighted metrics
print(f"\n--- VOLUME-WEIGHTED METRICS ---")
vwap = (df['price'] * df['qty']).sum() / df['qty'].sum()
print(f"Volume Weighted Average Price (VWAP): ₹{vwap:.2f}")

# Efficiency metrics
print(f"\n--- EFFICIENCY METRICS ---")
trades_count = len(df)
print(f"Total number of trades: {trades_count:,}")
print(f"Average volume per trade: {total_volume/trades_count:,.0f}")
print(f"Average turnover per trade: ₹{total_turnover/trades_count:,.2f}")

# Time-based analysis
print(f"\n=== TIME-BASED ANALYSIS ===")
earliest_time = df['date'].min()
latest_time = df['date'].max()
duration_hours = (latest_time - earliest_time).total_seconds() / 3600

print(f"Trading duration: {duration_hours:.2f} hours")
print(f"Volume per hour: {total_volume/duration_hours:,.0f}")
print(f"Turnover per hour: ₹{total_turnover/duration_hours:,.2f}")

# Market activity intensity
print(f"\n--- MARKET ACTIVITY INTENSITY ---")
print(f"Trades per hour: {trades_count/duration_hours:.1f}")
print(f"Volume per trade: {total_volume/trades_count:,.0f}")
print(f"Turnover per trade: ₹{total_turnover/trades_count:,.2f}")

# Summary table
print(f"\n" + "="*60)
print("=== SUMMARY TABLE ===")
summary_data = {
    'Metric': ['Total Volume', 'Total Turnover', 'Total Trades', 'Avg Trade Size', 'Avg Trade Value', 'VWAP'],
    'Value': [
        f"{total_volume:,}",
        f"₹{total_turnover:,.2f}",
        f"{trades_count:,}",
        f"{avg_trade_size:,.0f}",
        f"₹{avg_trade_value:,.2f}",
        f"₹{vwap:.2f}"
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

print(f"\n=== KEY INSIGHTS ===")
print(f"✓ Total volume traded: {total_volume:,} shares")
print(f"✓ Total market value: ₹{total_turnover:,.2f}")
print(f"✓ Market activity: {trades_count:,} individual trades")
print(f"✓ Trading efficiency: ₹{total_turnover/total_volume:.2f} per share")

=== VOLUME AND TURNOVER ANALYSIS ===
Total traded volume: 11,510,171
Total turnover: ₹4,451,969,111.70

=== DETAILED METRICS ===
--- VOLUME ANALYSIS ---
Average trade size: 1,051
Median trade size: 200
Largest single trade: 153,858
Smallest single trade: 1

--- TURNOVER ANALYSIS ---
Average trade value: ₹406,349.86
Median trade value: ₹77,325.00
Largest single trade value: ₹59,550,738.90
Smallest single trade value: ₹383.60

--- PRICE ANALYSIS ---
Simple average price: ₹386.56
Weighted average price (by volume): ₹386.79
Price range: ₹6.70

--- VOLUME-WEIGHTED METRICS ---
Volume Weighted Average Price (VWAP): ₹386.79

--- EFFICIENCY METRICS ---
Total number of trades: 10,956
Average volume per trade: 1,051
Average turnover per trade: ₹406,349.86

=== TIME-BASED ANALYSIS ===
Trading duration: 6.25 hours
Volume per hour: 1,841,791
Turnover per hour: ₹712,378,380.39

--- MARKET ACTIVITY INTENSITY ---
Trades per hour: 1753.1
Volume per trade: 1,051
Turnover per trade: ₹406,349.86

=== SUMMA

In [12]:
# Check number of trades with qty > 0 (actual trades vs. zero-qty updates)
print("=== TRADE TYPE ANALYSIS ===")

# Count different types of records
total_records = len(df)
actual_trades = (df['qty'] > 0).sum()
zero_qty_updates = (df['qty'] == 0).sum()

print(f"Total records in dataset: {total_records:,}")
print(f"Actual trades (qty > 0): {actual_trades:,}")
print(f"Zero-quantity updates (qty = 0): {zero_qty_updates:,}")

# Calculate percentages
actual_trades_pct = (actual_trades / total_records) * 100
zero_qty_pct = (zero_qty_updates / total_records) * 100

print(f"\n=== PERCENTAGE BREAKDOWN ===")
print(f"Actual trades: {actual_trades_pct:.2f}%")
print(f"Zero-quantity updates: {zero_qty_pct:.2f}%")

# Analyze the data composition
print(f"\n=== DATA COMPOSITION ANALYSIS ===")
if zero_qty_updates > 0:
    print("⚠️  Dataset contains both actual trades and zero-quantity updates")
    print("   This suggests the data includes bid-ask spread updates")
else:
    print("✓ Dataset contains only actual trades (all qty > 0)")

# Show sample of actual trades
print(f"\n=== SAMPLE OF ACTUAL TRADES (qty > 0) ===")
actual_trades_df = df[df['qty'] > 0]
print(actual_trades_df[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].head(10))

# Show sample of zero-quantity updates (if any exist)
if zero_qty_updates > 0:
    print(f"\n=== SAMPLE OF ZERO-QUANTITY UPDATES (qty = 0) ===")
    zero_qty_df = df[df['qty'] == 0]
    print(zero_qty_df[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].head(10))

# Analyze characteristics of each type
print(f"\n=== CHARACTERISTICS ANALYSIS ===")

# Actual trades characteristics
if actual_trades > 0:
    print("--- ACTUAL TRADES (qty > 0) ---")
    actual_trades_data = df[df['qty'] > 0]
    print(f"  Total volume: {actual_trades_data['qty'].sum():,}")
    print(f"  Total turnover: ₹{actual_trades_data['trnvr'].sum():,.2f}")
    print(f"  Average trade size: {actual_trades_data['qty'].mean():,.0f}")
    print(f"  Average trade value: ₹{actual_trades_data['trnvr'].mean():,.2f}")
    print(f"  Price range: ₹{actual_trades_data['price'].min():.2f} - ₹{actual_trades_data['price'].max():.2f}")

# Zero-quantity updates characteristics (if any exist)
if zero_qty_updates > 0:
    print(f"\n--- ZERO-QUANTITY UPDATES (qty = 0) ---")
    zero_qty_data = df[df['qty'] == 0]
    print(f"  Total records: {len(zero_qty_data):,}")
    print(f"  Price range: ₹{zero_qty_data['price'].min():.2f} - ₹{zero_qty_data['price'].max():.2f}")
    print(f"  Average price: ₹{zero_qty_data['price'].mean():.2f}")
    
    # Check if these are bid-ask spread updates
    if zero_qty_data['trnvr'].sum() == 0:
        print(f"  All have zero turnover (typical bid-ask updates)")
    else:
        print(f"  Some have non-zero turnover (data quality issue)")

# Data quality implications
print(f"\n=== DATA QUALITY IMPLICATIONS ===")
if zero_qty_updates > 0:
    print("⚠️  Mixed data types detected:")
    print("   - Actual trades: Use for volume, turnover, and price analysis")
    print("   - Zero-qty updates: Use for bid-ask spread analysis only")
    print("   - Consider filtering by qty > 0 for trade-based analysis")
else:
    print("✓ Clean dataset with only actual trades")

# Recommendations
print(f"\n=== RECOMMENDATIONS ===")
if zero_qty_updates > 0:
    print("For different types of analysis:")
    print("  📊 Volume/Turnover analysis: Use df[df['qty'] > 0]")
    print("  📈 Price movement analysis: Use df[df['qty'] > 0]")
    print("  🔍 Bid-ask spread analysis: Use df[df['qty'] == 0]")
    print("  📋 Complete market picture: Use full dataset")
else:
    print("  ✓ Dataset is ready for all types of analysis")

# Summary
print(f"\n=== SUMMARY ===")
print(f"✓ Total records: {total_records:,}")
print(f"✓ Actual trades: {actual_trades:,} ({actual_trades_pct:.1f}%)")
print(f"✓ Zero-qty updates: {zero_qty_updates:,} ({zero_qty_pct:.1f}%)")
print(f"✓ Data type: {'Mixed (trades + updates)' if zero_qty_updates > 0 else 'Pure trades only'}")

=== TRADE TYPE ANALYSIS ===
Total records in dataset: 10,956
Actual trades (qty > 0): 10,956
Zero-quantity updates (qty = 0): 0

=== PERCENTAGE BREAKDOWN ===
Actual trades: 100.00%
Zero-quantity updates: 0.00%

=== DATA COMPOSITION ANALYSIS ===
✓ Dataset contains only actual trades (all qty > 0)

=== SAMPLE OF ACTUAL TRADES (qty > 0) ===
                 date   price    qty        trnvr    cum_trnvr
0 2025-08-07 09:15:00  386.85  65740  25431519.00  25431519.00
1 2025-08-07 09:15:01  386.30    895    345738.50  25777257.50
2 2025-08-07 09:15:01  386.75   1401    541836.75  26319094.25
3 2025-08-07 09:15:02  386.80   1795    694306.00  27013400.25
4 2025-08-07 09:15:02  386.95    741    286729.95  27300130.20
5 2025-08-07 09:15:03  386.90   2717   1051207.30  28351337.50
6 2025-08-07 09:15:03  386.80   9068   3507502.40  31858839.90
7 2025-08-07 09:15:04  386.75   1141    441281.75  32300121.65
8 2025-08-07 09:15:04  386.90   1798    695646.20  32995767.85
9 2025-08-07 09:15:05  386.75 

In [13]:
# Create price_change = current price - previous price
print("=== PRICE CHANGE CALCULATION ===")

# Calculate price change (current price - previous price)
df['price_change'] = df['price'].diff()

# Display the first few rows to verify the calculation
print("=== SAMPLE DATA WITH PRICE CHANGE ===")
print(df[['date', 'price', 'price_change', 'qty', 'trnvr']].head(10))

# Basic statistics of price changes
print(f"\n=== PRICE CHANGE STATISTICS ===")
print(f"Total price changes calculated: {len(df['price_change'].dropna())}")
print(f"First price change: {df['price_change'].iloc[1]:.2f}")  # First change is at index 1
print(f"Last price change: {df['price_change'].iloc[-1]:.2f}")

# Statistical summary of price changes
print(f"\n=== PRICE CHANGE DESCRIPTIVE STATISTICS ===")
price_change_stats = df['price_change'].describe()
print(price_change_stats)

# Analyze price change distribution
print(f"\n=== PRICE CHANGE DISTRIBUTION ANALYSIS ===")
positive_changes = (df['price_change'] > 0).sum()
negative_changes = (df['price_change'] < 0).sum()
zero_changes = (df['price_change'] == 0).sum()
total_changes = len(df['price_change'].dropna())

print(f"Positive price changes: {positive_changes:,} ({(positive_changes/total_changes*100):.2f}%)")
print(f"Negative price changes: {negative_changes:,} ({(negative_changes/total_changes*100):.2f}%)")
print(f"No price changes: {zero_changes:,} ({(zero_changes/total_changes*100):.2f}%)")

# Price change magnitude analysis
print(f"\n=== PRICE CHANGE MAGNITUDE ANALYSIS ===")
abs_price_changes = df['price_change'].abs()
print(f"Average absolute price change: ₹{abs_price_changes.mean():.2f}")
print(f"Median absolute price change: ₹{abs_price_changes.median():.2f}")
print(f"Largest price increase: ₹{df['price_change'].max():.2f}")
print(f"Largest price decrease: ₹{df['price_change'].min():.2f}")

# Show examples of different types of price changes
print(f"\n=== EXAMPLES OF PRICE CHANGES ===")

# Largest price increases
print("Top 5 largest price increases:")
largest_increases = df.nlargest(5, 'price_change')[['date', 'price', 'price_change', 'qty']]
print(largest_increases.to_string(index=False))

# Largest price decreases
print(f"\nTop 5 largest price decreases:")
largest_decreases = df.nsmallest(5, 'price_change')[['date', 'price', 'price_change', 'qty']]
print(largest_decreases.to_string(index=False))

# No change examples
if zero_changes > 0:
    print(f"\nSample of rows with no price change:")
    no_change_sample = df[df['price_change'] == 0][['date', 'price', 'price_change', 'qty']].head(5)
    print(no_change_sample.to_string(index=False))

# Price change patterns
print(f"\n=== PRICE CHANGE PATTERNS ===")
print(f"Price changes per hour: {total_changes / ((df['date'].max() - df['date'].min()).total_seconds() / 3600):.1f}")
print(f"Average price change frequency: {total_changes / len(df):.2f} changes per tick")

# Verify calculation integrity
print(f"\n=== CALCULATION VERIFICATION ===")
print(f"✓ Price change column created successfully")
print(f"✓ First row price_change is NaN (no previous price to compare)")
print(f"✓ Total rows: {len(df):,}")
print(f"✓ Price changes calculated: {total_changes:,}")
print(f"✓ Ready for price movement analysis")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"Data types:")
print(df.dtypes)

=== PRICE CHANGE CALCULATION ===
=== SAMPLE DATA WITH PRICE CHANGE ===
                 date   price  price_change    qty        trnvr
0 2025-08-07 09:15:00  386.85           NaN  65740  25431519.00
1 2025-08-07 09:15:01  386.30         -0.55    895    345738.50
2 2025-08-07 09:15:01  386.75          0.45   1401    541836.75
3 2025-08-07 09:15:02  386.80          0.05   1795    694306.00
4 2025-08-07 09:15:02  386.95          0.15    741    286729.95
5 2025-08-07 09:15:03  386.90         -0.05   2717   1051207.30
6 2025-08-07 09:15:03  386.80         -0.10   9068   3507502.40
7 2025-08-07 09:15:04  386.75         -0.05   1141    441281.75
8 2025-08-07 09:15:04  386.90          0.15   1798    695646.20
9 2025-08-07 09:15:05  386.75         -0.15   1679    649353.25

=== PRICE CHANGE STATISTICS ===
Total price changes calculated: 10955
First price change: -0.55
Last price change: 0.00

=== PRICE CHANGE DESCRIPTIVE STATISTICS ===
count    10955.000000
mean         0.000128
std          0.

In [14]:
# Create direction column: "Up", "Down", "No change"
print("=== DIRECTION COLUMN CREATION ===")

# Create direction column based on price_change
df['direction'] = df['price_change'].apply(lambda x: 
    'Up' if x > 0 else 
    'Down' if x < 0 else 
    'No change'
)

# Display the first few rows to verify the direction column
print("=== SAMPLE DATA WITH DIRECTION ===")
print(df[['date', 'price', 'price_change', 'direction', 'qty', 'trnvr']].head(10))

# Count the occurrences of each direction
print(f"\n=== DIRECTION DISTRIBUTION ===")
direction_counts = df['direction'].value_counts()
print(direction_counts)

# Calculate percentages
total_rows = len(df)
print(f"\n=== DIRECTION PERCENTAGES ===")
for direction, count in direction_counts.items():
    percentage = (count / total_rows) * 100
    print(f"{direction}: {count:,} ({percentage:.2f}%)")

# Analyze direction patterns
print(f"\n=== DIRECTION PATTERN ANALYSIS ===")

# Check for consecutive directions
print("Direction sequence analysis:")
consecutive_up = 0
consecutive_down = 0
consecutive_no_change = 0
max_consecutive_up = 0
max_consecutive_down = 0
max_consecutive_no_change = 0

current_up = 0
current_down = 0
current_no_change = 0

for direction in df['direction']:
    if direction == 'Up':
        current_up += 1
        current_down = 0
        current_no_change = 0
        max_consecutive_up = max(max_consecutive_up, current_up)
    elif direction == 'Down':
        current_down += 1
        current_up = 0
        current_no_change = 0
        max_consecutive_down = max(max_consecutive_down, current_down)
    else:  # No change
        current_no_change += 1
        current_up = 0
        current_down = 0
        max_consecutive_no_change = max(max_consecutive_no_change, current_no_change)

print(f"Maximum consecutive 'Up' movements: {max_consecutive_up}")
print(f"Maximum consecutive 'Down' movements: {max_consecutive_down}")
print(f"Maximum consecutive 'No change': {max_consecutive_no_change}")

# Direction by time periods
print(f"\n=== DIRECTION BY TIME PERIODS ===")
df['hour'] = df['date'].dt.hour
hourly_direction = df.groupby('hour')['direction'].value_counts().unstack(fill_value=0)

print("Direction distribution by hour:")
print(hourly_direction)

# Direction transitions (what follows what)
print(f"\n=== DIRECTION TRANSITIONS ===")
transitions = []
for i in range(1, len(df)):
    prev_direction = df['direction'].iloc[i-1]
    curr_direction = df['direction'].iloc[i]
    transitions.append((prev_direction, curr_direction))

transition_counts = pd.Series(transitions).value_counts().head(10)
print("Most common direction transitions:")
for transition, count in transition_counts.items():
    print(f"  {transition[0]} → {transition[1]}: {count:,} times")

# Direction with volume analysis
print(f"\n=== DIRECTION WITH VOLUME ANALYSIS ===")
direction_volume = df.groupby('direction')['qty'].agg(['sum', 'mean', 'count'])
print("Volume analysis by direction:")
print(direction_volume)

# Direction with price change magnitude
print(f"\n=== DIRECTION WITH PRICE CHANGE MAGNITUDE ===")
direction_magnitude = df.groupby('direction')['price_change'].agg(['mean', 'std', 'min', 'max'])
print("Price change magnitude by direction:")
print(direction_magnitude)

# Show examples of each direction
print(f"\n=== EXAMPLES OF EACH DIRECTION ===")

# Up movements
up_examples = df[df['direction'] == 'Up'][['date', 'price', 'price_change', 'qty']].head(3)
print("Sample 'Up' movements:")
print(up_examples.to_string(index=False))

# Down movements
down_examples = df[df['direction'] == 'Down'][['date', 'price', 'price_change', 'qty']].head(3)
print(f"\nSample 'Down' movements:")
print(down_examples.to_string(index=False))

# No change
no_change_examples = df[df['direction'] == 'No change'][['date', 'price', 'price_change', 'qty']].head(3)
print(f"\nSample 'No change':")
print(no_change_examples.to_string(index=False))

# Summary statistics
print(f"\n=== DIRECTION SUMMARY ===")
print(f"✓ Direction column created successfully")
print(f"✓ Total rows: {total_rows:,}")
print(f"✓ Up movements: {direction_counts.get('Up', 0):,}")
print(f"✓ Down movements: {direction_counts.get('Down', 0):,}")
print(f"✓ No change: {direction_counts.get('No change', 0):,}")
print(f"✓ Most common direction: {direction_counts.index[0]}")
print(f"✓ Ready for directional analysis and pattern recognition")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")

=== DIRECTION COLUMN CREATION ===
=== SAMPLE DATA WITH DIRECTION ===
                 date   price  price_change  direction    qty        trnvr
0 2025-08-07 09:15:00  386.85           NaN  No change  65740  25431519.00
1 2025-08-07 09:15:01  386.30         -0.55       Down    895    345738.50
2 2025-08-07 09:15:01  386.75          0.45         Up   1401    541836.75
3 2025-08-07 09:15:02  386.80          0.05         Up   1795    694306.00
4 2025-08-07 09:15:02  386.95          0.15         Up    741    286729.95
5 2025-08-07 09:15:03  386.90         -0.05       Down   2717   1051207.30
6 2025-08-07 09:15:03  386.80         -0.10       Down   9068   3507502.40
7 2025-08-07 09:15:04  386.75         -0.05       Down   1141    441281.75
8 2025-08-07 09:15:04  386.90          0.15         Up   1798    695646.20
9 2025-08-07 09:15:05  386.75         -0.15       Down   1679    649353.25

=== DIRECTION DISTRIBUTION ===
direction
No change    4441
Down         3299
Up           3216
Name: coun

In [15]:
# Calculate rolling averages (1 min, 5 min, 15 min) for price
print("=== ROLLING AVERAGE CALCULATION ===")

# First, ensure the dataframe is sorted by datetime
df = df.sort_values('date').reset_index(drop=True)

# Set datetime as index for time-based rolling operations
df_temp = df.set_index('date')

# Calculate rolling averages at different time intervals
print("Calculating rolling averages...")

# 1-minute rolling average
df_temp['price_1min_avg'] = df_temp['price'].rolling(window='1T', min_periods=1).mean()

# 5-minute rolling average
df_temp['price_5min_avg'] = df_temp['price'].rolling(window='5T', min_periods=1).mean()

# 15-minute rolling average
df_temp['price_15min_avg'] = df_temp['price'].rolling(window='15T', min_periods=1).mean()

# Reset index to get back to normal dataframe format
df = df_temp.reset_index()

# Display sample data with rolling averages
print("=== SAMPLE DATA WITH ROLLING AVERAGES ===")
print(df[['date', 'price', 'price_1min_avg', 'price_5min_avg', 'price_15min_avg', 'qty']].head(15))

# Basic statistics of rolling averages
print(f"\n=== ROLLING AVERAGE STATISTICS ===")
rolling_cols = ['price_1min_avg', 'price_5min_avg', 'price_15min_avg']

for col in rolling_cols:
    print(f"\n{col}:")
    print(f"  Min: ₹{df[col].min():.2f}")
    print(f"  Max: ₹{df[col].min():.2f}")
    print(f"  Mean: ₹{df[col].mean():.2f}")
    print(f"  Std: ₹{df[col].std():.2f}")

# Compare current price vs rolling averages
print(f"\n=== PRICE VS ROLLING AVERAGES ANALYSIS ===")

# Calculate differences from rolling averages
df['price_vs_1min'] = df['price'] - df['price_1min_avg']
df['price_vs_5min'] = df['price'] - df['price_5min_avg']
df['price_vs_15min'] = df['price'] - df['price_15min_avg']

# Show statistics of these differences
print("Price differences from rolling averages:")
print(f"  vs 1-min avg: Mean={df['price_vs_1min'].mean():.2f}, Std={df['price_vs_1min'].std():.2f}")
print(f"  vs 5-min avg: Mean={df['price_vs_5min'].mean():.2f}, Std={df['price_vs_5min'].std():.2f}")
print(f"  vs 15-min avg: Mean={df['price_vs_15min'].mean():.2f}, Std={df['price_vs_15min'].std():.2f}")

# Identify when price is above/below each moving average
print(f"\n=== MOVING AVERAGE CROSSOVER ANALYSIS ===")

# Price position relative to moving averages
df['above_1min'] = df['price'] > df['price_1min_avg']
df['above_5min'] = df['price'] > df['price_5min_avg']
df['above_15min'] = df['price'] > df['price_15min_avg']

# Count how many times price is above each moving average
above_1min_count = df['above_1min'].sum()
above_5min_count = df['above_5min'].sum()
above_15min_count = df['above_15min'].sum()

print(f"Price above 1-min average: {above_1min_count:,} times ({(above_1min_count/len(df)*100):.1f}%)")
print(f"Price above 5-min average: {above_5min_count:,} times ({(above_5min_count/len(df)*100):.1f}%)")
print(f"Price above 15-min average: {above_15min_count:,} times ({(above_15min_count/len(df)*100):.1f}%)")

# Moving average crossover signals
print(f"\n=== MOVING AVERAGE CROSSOVER SIGNALS ===")

# 1-min vs 5-min crossover
df['ma_1min_5min_cross'] = (df['price_1min_avg'] > df['price_5min_avg']).astype(int)
df['ma_1min_5min_signal'] = df['ma_1min_5min_cross'].diff()

# 5-min vs 15-min crossover
df['ma_5min_15min_cross'] = (df['price_5min_avg'] > df['price_15min_avg']).astype(int)
df['ma_5min_15min_signal'] = df['ma_5min_15min_cross'].diff()

# Count crossover signals
bullish_1min_5min = (df['ma_1min_5min_signal'] == 1).sum()
bearish_1min_5min = (df['ma_1min_5min_signal'] == -1).sum()

bullish_5min_15min = (df['ma_5min_15min_signal'] == 1).sum()
bearish_5min_15min = (df['ma_5min_15min_signal'] == -1).sum()

print(f"1-min vs 5-min crossovers:")
print(f"  Bullish (1-min crosses above 5-min): {bullish_1min_5min}")
print(f"  Bearish (1-min crosses below 5-min): {bearish_1min_5min}")

print(f"\n5-min vs 15-min crossovers:")
print(f"  Bullish (5-min crosses above 15-min): {bullish_5min_15min}")
print(f"  Bearish (5-min crosses below 15-min): {bearish_5min_15min}")

# Show sample of crossover signals
print(f"\n=== SAMPLE CROSSOVER SIGNALS ===")
crossover_sample = df[df['ma_1min_5min_signal'] != 0][['date', 'price', 'price_1min_avg', 'price_5min_avg', 'ma_1min_5min_signal']].head(10)
print("Recent 1-min vs 5-min crossovers:")
print(crossover_sample.to_string(index=False))

# Rolling average trends
print(f"\n=== ROLLING AVERAGE TRENDS ===")

# Calculate the slope of each moving average (trend direction)
df['ma_1min_slope'] = df['price_1min_avg'].diff()
df['ma_5min_slope'] = df['price_5min_avg'].diff()
df['ma_15min_slope'] = df['price_15min_avg'].diff()

# Count trending periods
trending_1min_up = (df['ma_1min_slope'] > 0).sum()
trending_1min_down = (df['ma_1min_slope'] < 0).sum()

print(f"1-min moving average trends:")
print(f"  Upward trending periods: {trending_1min_up:,}")
print(f"  Downward trending periods: {trending_1min_down:,}")

# Summary
print(f"\n=== ROLLING AVERAGE SUMMARY ===")
print(f"✓ 1-minute rolling average calculated")
print(f"✓ 5-minute rolling average calculated")
print(f"✓ 15-minute rolling average calculated")
print(f"✓ Crossover signals identified")
print(f"✓ Trend analysis completed")
print(f"✓ Ready for technical analysis and trading signals")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"New columns added:")
new_cols = ['price_1min_avg', 'price_5min_avg', 'price_15min_avg', 'price_vs_1min', 
            'price_vs_5min', 'price_vs_15min', 'above_1min', 'above_5min', 'above_15min',
            'ma_1min_5min_cross', 'ma_1min_5min_signal', 'ma_5min_15min_cross', 
            'ma_5min_15min_signal', 'ma_1min_slope', 'ma_5min_slope', 'ma_15min_slope']
for col in new_cols:
    if col in df.columns:
        print(f"  ✓ {col}")

=== ROLLING AVERAGE CALCULATION ===
Calculating rolling averages...
=== SAMPLE DATA WITH ROLLING AVERAGES ===
                  date   price  price_1min_avg  price_5min_avg  \
0  2025-08-07 09:15:00  386.85      386.850000      386.850000   
1  2025-08-07 09:15:01  386.30      386.575000      386.575000   
2  2025-08-07 09:15:01  386.75      386.633333      386.633333   
3  2025-08-07 09:15:02  386.80      386.675000      386.675000   
4  2025-08-07 09:15:02  386.95      386.730000      386.730000   
5  2025-08-07 09:15:03  386.90      386.758333      386.758333   
6  2025-08-07 09:15:03  386.80      386.764286      386.764286   
7  2025-08-07 09:15:04  386.75      386.762500      386.762500   
8  2025-08-07 09:15:04  386.90      386.777778      386.777778   
9  2025-08-07 09:15:05  386.65      386.765000      386.765000   
10 2025-08-07 09:15:05  386.75      386.763636      386.763636   
11 2025-08-07 09:15:06  386.65      386.754167      386.754167   
12 2025-08-07 09:15:06  386.55  

  df_temp['price_1min_avg'] = df_temp['price'].rolling(window='1T', min_periods=1).mean()
  df_temp['price_5min_avg'] = df_temp['price'].rolling(window='5T', min_periods=1).mean()
  df_temp['price_15min_avg'] = df_temp['price'].rolling(window='15T', min_periods=1).mean()


In [16]:
# Calculate rolling sum of volume over time windows
print("=== ROLLING VOLUME SUM CALCULATION ===")

# Ensure the dataframe is sorted by datetime and has datetime index
df_temp = df.set_index('date')

# Calculate rolling sum of volume at different time intervals
print("Calculating rolling volume sums...")

# 1-minute rolling volume sum
df_temp['volume_1min_sum'] = df_temp['qty'].rolling(window='1T', min_periods=1).sum()

# 5-minute rolling volume sum
df_temp['volume_5min_sum'] = df_temp['qty'].rolling(window='5T', min_periods=1).sum()

# 15-minute rolling volume sum
df_temp['volume_15min_sum'] = df_temp['qty'].rolling(window='15T', min_periods=1).sum()

# 30-minute rolling volume sum
df_temp['volume_30min_sum'] = df_temp['qty'].rolling(window='30T', min_periods=1).sum()

# Reset index to get back to normal dataframe format
df = df_temp.reset_index()

# Display sample data with rolling volume sums
print("=== SAMPLE DATA WITH ROLLING VOLUME SUMS ===")
print(df[['date', 'qty', 'volume_1min_sum', 'volume_5min_sum', 'volume_15min_sum', 'volume_30min_sum']].head(15))

# Basic statistics of rolling volume sums
print(f"\n=== ROLLING VOLUME SUM STATISTICS ===")
volume_sum_cols = ['volume_1min_sum', 'volume_5min_sum', 'volume_15min_sum', 'volume_30min_sum']

for col in volume_sum_cols:
    print(f"\n{col}:")
    print(f"  Min: {df[col].min():,.0f}")
    print(f"  Max: {df[col].max():,.0f}")
    print(f"  Mean: {df[col].mean():,.0f}")
    print(f"  Std: {df[col].std():,.0f}")

# Volume analysis by time windows
print(f"\n=== VOLUME ANALYSIS BY TIME WINDOWS ===")

# Compare volume across different timeframes
print("Volume comparison across timeframes:")
print(f"  1-min average: {df['volume_1min_sum'].mean():,.0f}")
print(f"  5-min average: {df['volume_5min_sum'].mean():,.0f}")
print(f"  15-min average: {df['volume_15min_sum'].mean():,.0f}")
print(f"  30-min average: {df['volume_30min_sum'].mean():,.0f}")

# Volume intensity analysis
print(f"\n=== VOLUME INTENSITY ANALYSIS ===")

# Calculate volume per minute for each window
df['volume_1min_per_min'] = df['volume_1min_sum'] / 1
df['volume_5min_per_min'] = df['volume_5min_sum'] / 5
df['volume_15min_per_min'] = df['volume_15min_sum'] / 15
df['volume_30min_per_min'] = df['volume_30min_sum'] / 30

print("Volume per minute for each window:")
print(f"  1-min window: {df['volume_1min_per_min'].mean():,.0f} per minute")
print(f"  5-min window: {df['volume_5min_per_min'].mean():,.0f} per minute")
print(f"  15-min window: {df['volume_15min_per_min'].mean():,.0f} per minute")
print(f"  30-min window: {df['volume_30min_per_min'].mean():,.0f} per minute")

# Volume spikes detection
print(f"\n=== VOLUME SPIKE DETECTION ===")

# Find periods of unusually high volume (above 2 standard deviations)
for col in volume_sum_cols:
    mean_vol = df[col].mean()
    std_vol = df[col].std()
    threshold = mean_vol + 2 * std_vol
    
    high_volume_periods = (df[col] > threshold).sum()
    print(f"{col}: {high_volume_periods} periods above {threshold:,.0f} (2σ threshold)")

# Show examples of volume spikes
print(f"\n=== EXAMPLES OF VOLUME SPIKES ===")
for col in volume_sum_cols:
    mean_vol = df[col].mean()
    std_vol = df[col].std()
    threshold = mean_vol + 2 * std_vol
    
    spikes = df[df[col] > threshold][['date', col, 'qty', 'price']].head(3)
    if len(spikes) > 0:
        print(f"\n{col} spikes (top 3):")
        print(spikes.to_string(index=False))

# Volume trend analysis
print(f"\n=== VOLUME TREND ANALYSIS ===")

# Calculate volume trends (slopes) for each window
df['volume_1min_trend'] = df['volume_1min_sum'].diff()
df['volume_5min_trend'] = df['volume_5min_sum'].diff()
df['volume_15min_trend'] = df['volume_15min_sum'].diff()
df['volume_30min_trend'] = df['volume_30min_sum'].diff()

# Count increasing vs decreasing volume periods
for col in ['volume_1min_trend', 'volume_5min_trend', 'volume_15min_trend', 'volume_30min_trend']:
    increasing = (df[col] > 0).sum()
    decreasing = (df[col] < 0).sum()
    window_name = col.replace('_trend', '').replace('_', ' ').title()
    print(f"{window_name}:")
    print(f"  Increasing: {increasing:,} periods")
    print(f"  Decreasing: {decreasing:,} periods")

# Volume vs Price correlation
print(f"\n=== VOLUME-PRICE CORRELATION ===")

# Calculate correlation between volume and price for each timeframe
for col in volume_sum_cols:
    correlation = df[col].corr(df['price'])
    print(f"{col} vs Price correlation: {correlation:.4f}")

# Volume vs Price change correlation
print(f"\nVolume vs Price Change correlation:")
for col in volume_sum_cols:
    correlation = df[col].corr(df['price_change'])
    print(f"{col} vs Price Change correlation: {correlation:.4f}")

# Time-based volume analysis
print(f"\n=== TIME-BASED VOLUME ANALYSIS ===")

# Volume by hour of the day
df['hour'] = df['date'].dt.hour
hourly_volume = df.groupby('hour')['qty'].sum()
print("Total volume by hour:")
for hour, volume in hourly_volume.items():
    print(f"  {hour:02d}:00 - {hour:02d}:59: {volume:,}")

# Rolling volume vs cumulative volume comparison
print(f"\n=== ROLLING VS CUMULATIVE VOLUME ===")
print(f"Final cumulative volume: {df['cum_trnvr'].iloc[-1]/df['price'].iloc[-1]:,.0f}")
print(f"Final 30-min rolling volume: {df['volume_30min_sum'].iloc[-1]:,.0f}")
print(f"Rolling volume as % of total: {(df['volume_30min_sum'].iloc[-1]/(df['cum_trnvr'].iloc[-1]/df['price'].iloc[-1])*100):.1f}%")

# Summary
print(f"\n=== ROLLING VOLUME SUMMARY ===")
print(f"✓ 1-minute rolling volume sum calculated")
print(f"✓ 5-minute rolling volume sum calculated")
print(f"✓ 15-minute rolling volume sum calculated")
print(f"✓ 30-minute rolling volume sum calculated")
print(f"✓ Volume intensity analysis completed")
print(f"✓ Volume spike detection implemented")
print(f"✓ Volume-price correlation analyzed")
print(f"✓ Ready for volume-based analysis and trading signals")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"New volume columns added:")
volume_new_cols = ['volume_1min_sum', 'volume_5min_sum', 'volume_15min_sum', 'volume_30min_sum',
                   'volume_1min_per_min', 'volume_5min_per_min', 'volume_15min_per_min', 'volume_30min_per_min',
                   'volume_1min_trend', 'volume_5min_trend', 'volume_15min_trend', 'volume_30min_trend']
for col in volume_new_cols:
    if col in df.columns:
        print(f"  ✓ {col}")

=== ROLLING VOLUME SUM CALCULATION ===
Calculating rolling volume sums...
=== SAMPLE DATA WITH ROLLING VOLUME SUMS ===
                  date    qty  volume_1min_sum  volume_5min_sum  \
0  2025-08-07 09:15:00  65740          65740.0          65740.0   
1  2025-08-07 09:15:01    895          66635.0          66635.0   
2  2025-08-07 09:15:01   1401          68036.0          68036.0   
3  2025-08-07 09:15:02   1795          69831.0          69831.0   
4  2025-08-07 09:15:02    741          70572.0          70572.0   
5  2025-08-07 09:15:03   2717          73289.0          73289.0   
6  2025-08-07 09:15:03   9068          82357.0          82357.0   
7  2025-08-07 09:15:04   1141          83498.0          83498.0   
8  2025-08-07 09:15:04   1798          85296.0          85296.0   
9  2025-08-07 09:15:05   2092          87388.0          87388.0   
10 2025-08-07 09:15:05   1679          89067.0          89067.0   
11 2025-08-07 09:15:06   4519          93586.0          93586.0   
12 2025-08

  df_temp['volume_1min_sum'] = df_temp['qty'].rolling(window='1T', min_periods=1).sum()
  df_temp['volume_5min_sum'] = df_temp['qty'].rolling(window='5T', min_periods=1).sum()
  df_temp['volume_15min_sum'] = df_temp['qty'].rolling(window='15T', min_periods=1).sum()
  df_temp['volume_30min_sum'] = df_temp['qty'].rolling(window='30T', min_periods=1).sum()


In [17]:
# Compute VWAP (Volume Weighted Average Price)
print("=== VWAP CALCULATION ===")

# Calculate VWAP for the entire dataset
print("Calculating VWAP...")

# Method 1: Simple VWAP for entire dataset
total_volume = df['qty'].sum()
total_price_volume = (df['price'] * df['qty']).sum()
vwap_total = total_price_volume / total_volume

print(f"=== OVERALL VWAP ===")
print(f"Total volume: {total_volume:,}")
print(f"Total price × volume: ₹{total_price_volume:,.2f}")
print(f"Overall VWAP: ₹{vwap_total:.2f}")

# Method 2: Rolling VWAP over different time windows
print(f"\n=== ROLLING VWAP CALCULATION ===")

# Ensure datetime index for time-based rolling
df_temp = df.set_index('date')

# Calculate rolling VWAP for different time windows
df_temp['vwap_1min'] = (df_temp['price'] * df_temp['qty']).rolling(window='1T', min_periods=1).sum() / df_temp['qty'].rolling(window='1T', min_periods=1).sum()

df_temp['vwap_5min'] = (df_temp['price'] * df_temp['qty']).rolling(window='5T', min_periods=1).sum() / df_temp['qty'].rolling(window='5T', min_periods=1).sum()

df_temp['vwap_15min'] = (df_temp['price'] * df_temp['qty']).rolling(window='15T', min_periods=1).sum() / df_temp['qty'].rolling(window='15T', min_periods=1).sum()

df_temp['vwap_30min'] = (df_temp['price'] * df_temp['qty']).rolling(window='30T', min_periods=1).sum() / df_temp['qty'].rolling(window='30T', min_periods=1).sum()

# Reset index
df = df_temp.reset_index()

# Display sample data with VWAP values
print("=== SAMPLE DATA WITH VWAP VALUES ===")
print(df[['date', 'price', 'qty', 'vwap_1min', 'vwap_5min', 'vwap_15min', 'vwap_30min']].head(15))

# VWAP Statistics
print(f"\n=== VWAP STATISTICS ===")
vwap_cols = ['vwap_1min', 'vwap_5min', 'vwap_15min', 'vwap_30min']

for col in vwap_cols:
    print(f"\n{col}:")
    print(f"  Min: ₹{df[col].min():.2f}")
    print(f"  Max: ₹{df[col].min():.2f}")
    print(f"  Mean: ₹{df[col].mean():.2f}")
    print(f"  Std: ₹{df[col].std():.2f}")

# Price vs VWAP Analysis
print(f"\n=== PRICE VS VWAP ANALYSIS ===")

# Calculate price position relative to each VWAP
df['price_vs_vwap_1min'] = df['price'] - df['vwap_1min']
df['price_vs_vwap_5min'] = df['price'] - df['vwap_5min']
df['price_vs_vwap_15min'] = df['price'] - df['vwap_15min']
df['price_vs_vwap_30min'] = df['price'] - df['vwap_30min']

# Count how many times price is above/below each VWAP
print("Price position relative to VWAP:")
for i, vwap_type in enumerate(['1min', '5min', '15min', '30min']):
    col = f'price_vs_vwap_{vwap_type}'
    above_count = (df[col] > 0).sum()
    below_count = (df[col] < 0).sum()
    equal_count = (df[col] == 0).sum()
    
    print(f"\n  {vwap_type} VWAP:")
    print(f"    Above: {above_count:,} times ({(above_count/len(df)*100):.1f}%)")
    print(f"    Below: {below_count:,} times ({(below_count/len(df)*100):.1f}%)")
    print(f"    Equal: {equal_count:,} times ({(equal_count/len(df)*100):.1f}%)")

# VWAP as Support/Resistance
print(f"\n=== VWAP AS SUPPORT/RESISTANCE ===")

# Find periods when price bounces off VWAP
for i, vwap_type in enumerate(['1min', '5min', '15min', '30min']):
    vwap_col = f'vwap_{vwap_type}'
    price_vs_col = f'price_vs_vwap_{vwap_type}'
    
    # Find when price is very close to VWAP (within 0.1%)
    close_to_vwap = (abs(df[price_vs_col]) / df[vwap_col] * 100) < 0.1
    close_count = close_to_vwap.sum()
    
    print(f"{vwap_type} VWAP: {close_count:,} times price within 0.1% of VWAP")

# VWAP Crossover Analysis
print(f"\n=== VWAP CROSSOVER ANALYSIS ===")

# Price crossing above/below VWAP
for i, vwap_type in enumerate(['1min', '5min', '15min', '30min']):
    price_vs_col = f'price_vs_vwap_{vwap_type}'
    
    # Create crossover signals
    df[f'above_vwap_{vwap_type}'] = df[price_vs_col] > 0
    df[f'vwap_{vwap_type}_cross'] = df[f'above_vwap_{vwap_type}'].astype(int)
    df[f'vwap_{vwap_type}_signal'] = df[f'vwap_{vwap_type}_cross'].diff()
    
    # Count crossovers
    bullish_crosses = (df[f'vwap_{vwap_type}_signal'] == 1).sum()
    bearish_crosses = (df[f'vwap_{vwap_type}_signal'] == -1).sum()
    
    print(f"{vwap_type} VWAP crossovers:")
    print(f"  Bullish (price crosses above): {bullish_crosses}")
    print(f"  Bearish (price crosses below): {bearish_crosses}")

# VWAP Trend Analysis
print(f"\n=== VWAP TREND ANALYSIS ===")

# Calculate VWAP trends (slopes)
df['vwap_1min_trend'] = df['vwap_1min'].diff()
df['vwap_5min_trend'] = df['vwap_5min'].diff()
df['vwap_15min_trend'] = df['vwap_15min'].diff()
df['vwap_30min_trend'] = df['vwap_30min'].diff()

# Count trending periods
for i, vwap_type in enumerate(['1min', '5min', '15min', '30min']):
    trend_col = f'vwap_{vwap_type}_trend'
    up_trend = (df[trend_col] > 0).sum()
    down_trend = (df[trend_col] < 0).sum()
    
    print(f"{vwap_type} VWAP trends:")
    print(f"  Upward: {up_trend:,} periods")
    print(f"  Downward: {down_trend:,} periods")

# VWAP vs Simple Moving Average Comparison
print(f"\n=== VWAP VS SIMPLE MOVING AVERAGE ===")

# Compare VWAP with price averages
print("VWAP vs Simple Price Average:")
print(f"  Overall VWAP: ₹{vwap_total:.2f}")
print(f"  Simple Price Average: ₹{df['price'].mean():.2f}")
print(f"  Difference: ₹{vwap_total - df['price'].mean():.2f}")

# Show correlation between VWAP and price
for col in vwap_cols:
    correlation = df[col].corr(df['price'])
    print(f"  {col} vs Price correlation: {correlation:.4f}")

# Trading Signals based on VWAP
print(f"\n=== VWAP TRADING SIGNALS ===")

# Generate basic trading signals
df['vwap_signal'] = 'Hold'
df.loc[df['price'] > df['vwap_15min'], 'vwap_signal'] = 'Buy'
df.loc[df['price'] < df['vwap_15min'], 'vwap_signal'] = 'Sell'

signal_counts = df['vwap_signal'].value_counts()
print("VWAP-based trading signals (using 15-min VWAP):")
for signal, count in signal_counts.items():
    print(f"  {signal}: {count:,} times ({(count/len(df)*100):.1f}%)")

# Summary
print(f"\n=== VWAP SUMMARY ===")
print(f"✓ Overall VWAP: ₹{vwap_total:.2f}")
print(f"✓ Rolling VWAP calculated for 1min, 5min, 15min, 30min windows")
print(f"✓ Price vs VWAP analysis completed")
print(f"✓ VWAP crossover signals generated")
print(f"✓ VWAP trend analysis completed")
print(f"✓ Trading signals based on VWAP generated")
print(f"✓ Ready for VWAP-based trading strategies")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"New VWAP columns added:")
vwap_new_cols = ['vwap_1min', 'vwap_5min', 'vwap_15min', 'vwap_30min',
                  'price_vs_vwap_1min', 'price_vs_vwap_5min', 'price_vs_vwap_15min', 'price_vs_vwap_30min',
                  'above_vwap_1min', 'above_vwap_5min', 'above_vwap_15min', 'above_vwap_30min',
                  'vwap_1min_cross', 'vwap_5min_cross', 'vwap_15min_cross', 'vwap_30min_cross',
                  'vwap_1min_signal', 'vwap_5min_signal', 'vwap_15min_signal', 'vwap_30min_signal',
                  'vwap_1min_trend', 'vwap_5min_trend', 'vwap_15min_trend', 'vwap_30min_trend',
                  'vwap_signal']
for col in vwap_new_cols:
    if col in df.columns:
        print(f"  ✓ {col}")

=== VWAP CALCULATION ===
Calculating VWAP...
=== OVERALL VWAP ===
Total volume: 11,510,171
Total price × volume: ₹4,451,969,111.70
Overall VWAP: ₹386.79

=== ROLLING VWAP CALCULATION ===
=== SAMPLE DATA WITH VWAP VALUES ===
                  date   price    qty   vwap_1min   vwap_5min  vwap_15min  \
0  2025-08-07 09:15:00  386.85  65740  386.850000  386.850000  386.850000   
1  2025-08-07 09:15:01  386.30    895  386.842613  386.842613  386.842613   
2  2025-08-07 09:15:01  386.75   1401  386.840706  386.840706  386.840706   
3  2025-08-07 09:15:02  386.80   1795  386.839659  386.839659  386.839659   
4  2025-08-07 09:15:02  386.95    741  386.840818  386.840818  386.840818   
5  2025-08-07 09:15:03  386.90   2717  386.843012  386.843012  386.843012   
6  2025-08-07 09:15:03  386.80   9068  386.838276  386.838276  386.838276   
7  2025-08-07 09:15:04  386.75   1141  386.837070  386.837070  386.837070   
8  2025-08-07 09:15:04  386.90   1798  386.838396  386.838396  386.838396   
9  202

  df_temp['vwap_1min'] = (df_temp['price'] * df_temp['qty']).rolling(window='1T', min_periods=1).sum() / df_temp['qty'].rolling(window='1T', min_periods=1).sum()
  df_temp['vwap_5min'] = (df_temp['price'] * df_temp['qty']).rolling(window='5T', min_periods=1).sum() / df_temp['qty'].rolling(window='5T', min_periods=1).sum()
  df_temp['vwap_15min'] = (df_temp['price'] * df_temp['qty']).rolling(window='15T', min_periods=1).sum() / df_temp['qty'].rolling(window='15T', min_periods=1).sum()
  df_temp['vwap_30min'] = (df_temp['price'] * df_temp['qty']).rolling(window='30T', min_periods=1).sum() / df_temp['qty'].rolling(window='30T', min_periods=1).sum()


In [19]:
# Aggregate trades per minute: avg price, total qty, total trnvr
print("=== MINUTE-BY-MINUTE TRADE AGGREGATION ===")

# Create minute-level aggregation
print("Aggregating trades by minute...")

# Extract minute from datetime for grouping (using 'min' instead of 'T')
df['minute_key'] = df['date'].dt.floor('1min')  # Floor to nearest minute

# Group by minute and aggregate
minute_agg = df.groupby('minute_key').agg({
    'price': ['mean', 'min', 'max', 'first', 'last'],  # Price statistics
    'qty': ['sum', 'count'],                           # Total quantity and trade count
    'trnvr': 'sum'                                     # Total turnover
}).round(2)

# Check the actual column structure before flattening
print(f"Original aggregation columns: {minute_agg.columns.tolist()}")
print(f"Column levels: {minute_agg.columns.nlevels}")

# Flatten column names correctly
minute_agg.columns = ['avg_price', 'min_price', 'max_price', 'open_price', 'close_price', 
                      'total_qty', 'trade_count', 'total_trnvr']

# Reset index to make minute_key a column
minute_agg = minute_agg.reset_index()
minute_agg.rename(columns={'minute_key': 'minute'}, inplace=True)

# Display the aggregated data
print("=== MINUTE-BY-MINUTE AGGREGATED DATA ===")
print(f"Total minutes with trades: {len(minute_agg)}")
print(f"Time range: {minute_agg['minute'].min()} to {minute_agg['minute'].max()}")

# Show first 15 rows
print(f"\n=== FIRST 15 MINUTES ===")
print(minute_agg.head(15).to_string(index=False))

# Show last 15 rows
print(f"\n=== LAST 15 MINUTES ===")
print(minute_agg.tail(15).to_string(index=False))

# Basic statistics of aggregated data
print(f"\n=== AGGREGATED DATA STATISTICS ===")
print("Price Statistics:")
print(f"  Average price range: ₹{minute_agg['avg_price'].min():.2f} - ₹{minute_agg['avg_price'].max():.2f}")
print(f"  Overall average price: ₹{minute_agg['avg_price'].mean():.2f}")
print(f"  Price volatility (std): ₹{minute_agg['avg_price'].std():.2f}")

print(f"\nVolume Statistics:")
print(f"  Total volume across all minutes: {minute_agg['total_qty'].sum():,}")
print(f"  Average volume per minute: {minute_agg['total_qty'].mean():,.0f}")
print(f"  Highest volume minute: {minute_agg['total_qty'].max():,}")
print(f"  Lowest volume minute: {minute_agg['total_qty'].min():,}")

print(f"\nTurnover Statistics:")
print(f"  Total turnover across all minutes: ₹{minute_agg['total_trnvr'].sum():,.2f}")
print(f"  Average turnover per minute: ₹{minute_agg['total_trnvr'].mean():,.2f}")
print(f"  Highest turnover minute: ₹{minute_agg['total_trnvr'].max():,.2f}")
print(f"  Lowest turnover minute: ₹{minute_agg['total_trnvr'].min():,.2f}")

print(f"\nTrade Count Statistics:")
print(f"  Total trades across all minutes: {minute_agg['trade_count'].sum():,}")
print(f"  Average trades per minute: {minute_agg['trade_count'].mean():.1f}")
print(f"  Busiest minute: {minute_agg['trade_count'].max():,} trades")
print(f"  Quietest minute: {minute_agg['trade_count'].min():,} trades")

# Minute-by-minute analysis
print(f"\n=== MINUTE-BY-MINUTE ANALYSIS ===")

# Find highest and lowest volume minutes
highest_volume_minute = minute_agg.loc[minute_agg['total_qty'].idxmax()]
lowest_volume_minute = minute_agg.loc[minute_agg['total_qty'].idxmin()]

print("Highest volume minute:")
print(f"  Time: {highest_volume_minute['minute']}")
print(f"  Volume: {highest_volume_minute['total_qty']:,}")
print(f"  Turnover: ₹{highest_volume_minute['total_trnvr']:,.2f}")
print(f"  Avg Price: ₹{highest_volume_minute['avg_price']:.2f}")
print(f"  Trades: {highest_volume_minute['trade_count']}")

print(f"\nLowest volume minute:")
print(f"  Time: {lowest_volume_minute['minute']}")
print(f"  Volume: {lowest_volume_minute['total_qty']:,}")
print(f"  Turnover: ₹{lowest_volume_minute['total_trnvr']:,.2f}")
print(f"  Avg Price: ₹{lowest_volume_minute['avg_price']:.2f}")
print(f"  Trades: {lowest_volume_minute['trade_count']}")

# Find highest and lowest turnover minutes
highest_turnover_minute = minute_agg.loc[minute_agg['total_trnvr'].idxmax()]
lowest_turnover_minute = minute_agg.loc[minute_agg['total_trnvr'].idxmin()]

print(f"\nHighest turnover minute:")
print(f"  Time: {highest_turnover_minute['minute']}")
print(f"  Turnover: ₹{highest_turnover_minute['total_trnvr']:,.2f}")
print(f"  Volume: {highest_turnover_minute['total_qty']:,}")
print(f"  Avg Price: ₹{highest_turnover_minute['avg_price']:.2f}")

# Time-based patterns
print(f"\n=== TIME-BASED PATTERNS ===")

# Add hour column for hourly analysis
minute_agg['hour'] = minute_agg['minute'].dt.hour

# Hourly volume analysis
hourly_volume = minute_agg.groupby('hour')['total_qty'].sum()
print("Total volume by hour:")
for hour, volume in hourly_volume.items():
    print(f"  {hour:02d}:00 - {hour:02d}:59: {volume:,}")

# Hourly turnover analysis
hourly_turnover = minute_agg.groupby('hour')['total_trnvr'].sum()
print(f"\nTotal turnover by hour:")
for hour, turnover in hourly_turnover.items():
    print(f"  {hour:02d}:00 - {hour:02d}:59: ₹{turnover:,.2f}")

# Price movement analysis per minute
print(f"\n=== PRICE MOVEMENT ANALYSIS PER MINUTE ===")

# Calculate minute-to-minute price changes
minute_agg['price_change'] = minute_agg['close_price'] - minute_agg['open_price']
minute_agg['price_change_pct'] = (minute_agg['price_change'] / minute_agg['open_price']) * 100

# Price change statistics
print(f"Minute-to-minute price changes:")
print(f"  Average change: ₹{minute_agg['price_change'].mean():.2f}")
print(f"  Average change %: {minute_agg['price_change_pct'].mean():.2f}%")
print(f"  Largest increase: ₹{minute_agg['price_change'].max():.2f}")
print(f"  Largest decrease: ₹{minute_agg['price_change'].min():.2f}")

# Volume-weighted price analysis
print(f"\n=== VOLUME-WEIGHTED ANALYSIS ===")

# Calculate VWAP for each minute
minute_agg['minute_vwap'] = minute_agg['total_trnvr'] / minute_agg['total_qty']

# Compare VWAP with average price
minute_agg['vwap_vs_avg_diff'] = minute_agg['minute_vwap'] - minute_agg['avg_price']
print(f"VWAP vs Average Price analysis:")
print(f"  Average difference: ₹{minute_agg['vwap_vs_avg_diff'].mean():.2f}")
print(f"  Max difference: ₹{minute_agg['vwap_vs_avg_diff'].max():.2f}")
print(f"  Min difference: ₹{minute_agg['vwap_vs_avg_diff'].min():.2f}")

# Trading intensity analysis
print(f"\n=== TRADING INTENSITY ANALYSIS ===")

# Calculate volume per trade for each minute
minute_agg['volume_per_trade'] = minute_agg['total_qty'] / minute_agg['trade_count']

print(f"Volume per trade analysis:")
print(f"  Average volume per trade: {minute_agg['volume_per_trade'].mean():,.0f}")
print(f"  Highest volume per trade: {minute_agg['volume_per_trade'].max():,.0f}")
print(f"  Lowest volume per trade: {minute_agg['volume_per_trade'].min():,.0f}")

# Summary table
print(f"\n" + "="*80)
print("=== SUMMARY TABLE ===")
summary_data = {
    'Metric': ['Total Minutes', 'Total Volume', 'Total Turnover', 'Total Trades', 'Avg Price', 'Avg Volume/Min', 'Avg Turnover/Min'],
    'Value': [
        f"{len(minute_agg):,}",
        f"{minute_agg['total_qty'].sum():,}",
        f"₹{minute_agg['total_trnvr'].sum():,.2f}",
        f"{minute_agg['trade_count'].sum():,}",
        f"₹{minute_agg['avg_price'].mean():.2f}",
        f"{minute_agg['total_qty'].mean():,.0f}",
        f"₹{minute_agg['total_trnvr'].mean():,.2f}"
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Save aggregated data
print(f"\n=== DATA EXPORT ===")
print(f"✓ Minute-by-minute aggregation completed")
print(f"✓ {len(minute_agg)} minutes of aggregated data")
print(f"✓ Ready for time-series analysis and visualization")
print(f"✓ Data can be exported to CSV for further analysis")

# Display final aggregated dataframe info
print(f"\n=== AGGREGATED DATAFRAME INFO ===")
print(f"Columns: {list(minute_agg.columns)}")
print(f"Shape: {minute_agg.shape}")
print(f"Data types:")
print(minute_agg.dtypes)

=== MINUTE-BY-MINUTE TRADE AGGREGATION ===
Aggregating trades by minute...
Original aggregation columns: [('price', 'mean'), ('price', 'min'), ('price', 'max'), ('price', 'first'), ('price', 'last'), ('qty', 'sum'), ('qty', 'count'), ('trnvr', 'sum')]
Column levels: 2
=== MINUTE-BY-MINUTE AGGREGATED DATA ===
Total minutes with trades: 375
Time range: 2025-08-07 09:15:00 to 2025-08-07 15:29:00

=== FIRST 15 MINUTES ===
             minute  avg_price  min_price  max_price  open_price  close_price  total_qty  trade_count  total_trnvr
2025-08-07 09:15:00     386.24     385.70     386.95      386.85       386.00     240694          116  93007031.40
2025-08-07 09:16:00     387.07     385.60     387.75      386.00       387.60     176938           94  68447339.50
2025-08-07 09:17:00     388.06     387.55     388.55      387.60       388.55     109071           15  42309419.75
2025-08-07 09:18:00     389.00     388.45     389.45      388.85       389.00     140751           14  54755417.20
202

In [21]:
# Aggregate per hour: price volatility, volume, turnover
print("=== HOURLY AGGREGATION ANALYSIS ===")

# Create hourly aggregation
print("Aggregating data by hour...")

# Extract hour from datetime for grouping
df['hour_key'] = df['date'].dt.hour

# Group by hour and aggregate
hourly_agg = df.groupby('hour_key').agg({
    'price': ['mean', 'min', 'max', 'std', 'first', 'last'],  # Price statistics including volatility
    'qty': ['sum', 'count'],                                   # Total volume and trade count
    'trnvr': 'sum',                                            # Total turnover
    'price_change': ['mean', 'std', 'min', 'max']              # Price change statistics
}).round(2)

# Flatten column names
hourly_agg.columns = [
    'avg_price', 'min_price', 'max_price', 'price_std', 'open_price', 'close_price',
    'total_qty', 'trade_count', 'total_trnvr',
    'avg_price_change', 'price_change_std', 'min_price_change', 'max_price_change'
]

# Reset index to make hour_key a column
hourly_agg = hourly_agg.reset_index()
hourly_agg.rename(columns={'hour_key': 'hour'}, inplace=True)

# Display the hourly aggregation
print(f"\n=== HOURLY AGGREGATION RESULTS ===")
print(f"Shape: {hourly_agg.shape}")
print(hourly_agg)

# Find highest and lowest volume hours
print(f"\n=== VOLUME ANALYSIS BY HOUR ===")
highest_volume_hour = hourly_agg.loc[hourly_agg['total_qty'].idxmax()]
lowest_volume_hour = hourly_agg.loc[hourly_agg['total_qty'].idxmin()]

print("Highest volume hour:")
print(f"  Hour: {int(highest_volume_hour['hour']):02d}:00 - {int(highest_volume_hour['hour']):02d}:59")
print(f"  Volume: {highest_volume_hour['total_qty']:,}")
print(f"  Turnover: ₹{highest_volume_hour['total_trnvr']:,.2f}")
print(f"  Average price: ₹{highest_volume_hour['avg_price']:.2f}")

print("\nLowest volume hour:")
print(f"  Hour: {int(lowest_volume_hour['hour']):02d}:00 - {int(lowest_volume_hour['hour']):02d}:59")
print(f"  Volume: {lowest_volume_hour['total_qty']:,}")
print(f"  Turnover: ₹{lowest_volume_hour['total_trnvr']:,.2f}")
print(f"  Average price: ₹{lowest_volume_hour['avg_price']:.2f}")

# Volatility analysis by hour
print(f"\n=== VOLATILITY ANALYSIS BY HOUR ===")
highest_volatility_hour = hourly_agg.loc[hourly_agg['price_std'].idxmax()]
print(f"Highest volatility hour: {int(highest_volatility_hour['hour']):02d}:00")
print(f"  Price standard deviation: ₹{highest_volatility_hour['price_std']:.2f}")
print(f"  Price range: ₹{highest_volatility_hour['min_price']:.2f} - ₹{highest_volatility_hour['max_price']:.2f}")

# Summary statistics
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Total trading hours: {len(hourly_agg)}")
print(f"Average hourly volume: {hourly_agg['total_qty'].mean():,.0f}")
print(f"Average hourly turnover: ₹{hourly_agg['total_trnvr'].mean():,.2f}")
print(f"Average hourly price volatility: ₹{hourly_agg['price_std'].mean():.2f}")

=== HOURLY AGGREGATION ANALYSIS ===
Aggregating data by hour...

=== HOURLY AGGREGATION RESULTS ===
Shape: (7, 14)
   hour  avg_price  min_price  max_price  price_std  open_price  close_price  \
0     9     388.60     385.60     390.20       1.13      386.85       388.30   
1    10     388.23     387.45     389.25       0.39      388.10       388.10   
2    11     386.51     385.25     388.05       0.65      388.00       385.55   
3    12     385.22     384.40     386.50       0.44      385.70       385.40   
4    13     384.43     383.50     385.60       0.49      385.55       384.75   
5    14     385.97     384.70     387.10       0.63      384.70       387.05   
6    15     387.92     387.00     388.75       0.46      387.35       388.25   

   total_qty  trade_count   total_trnvr  avg_price_change  price_change_std  \
0    2164111         1047  8.408010e+08               0.0              0.12   
1    1436774         1476  5.577489e+08              -0.0              0.09   
2    13

In [22]:
# Aggregate per trading session: OHLC + total volume
print("=== TRADING SESSION OHLC ANALYSIS ===")

# Create trading session aggregation (assuming single day data)
print("Aggregating data by trading session...")

# For single day data, we can create session-level OHLC
# If you have multiple days, you can group by date_only instead
if len(df['date_only'].unique()) == 1:
    # Single trading day - create session OHLC
    session_ohlc = {
        'date': df['date_only'].iloc[0],
        'open': df['price'].iloc[0],           # First price of the day
        'high': df['price'].max(),             # Highest price of the day
        'low': df['price'].min(),              # Lowest price of the day
        'close': df['price'].iloc[-1],         # Last price of the day
        'total_volume': df['qty'].sum(),       # Total volume for the day
        'total_turnover': df['trnvr'].sum(),   # Total turnover for the day
        'trade_count': len(df),                # Total number of trades
        'price_range': df['price'].max() - df['price'].min(),  # High - Low
        'avg_price': df['price'].mean()        # Average price for the day
    }
    
    # Create DataFrame from the session data
    session_df = pd.DataFrame([session_ohlc])
    
    print(f"=== SINGLE TRADING SESSION OHLC ===")
    print(f"Date: {session_df['date'].iloc[0]}")
    print(f"Open: ₹{session_df['open'].iloc[0]:.2f}")
    print(f"High: ₹{session_df['high'].iloc[0]:.2f}")
    print(f"Low: ₹{session_df['low'].iloc[0]:.2f}")
    print(f"Close: ₹{session_df['close'].iloc[0]:.2f}")
    print(f"Price Range: ₹{session_df['price_range'].iloc[0]:.2f}")
    print(f"Total Volume: {session_df['total_volume'].iloc[0]:,}")
    print(f"Total Turnover: ₹{session_df['total_turnover'].iloc[0]:,.2f}")
    print(f"Trade Count: {session_df['trade_count'].iloc[0]:,}")
    print(f"Average Price: ₹{session_df['avg_price'].iloc[0]:.2f}")
    
    # Calculate additional metrics
    print(f"\n=== ADDITIONAL METRICS ===")
    session_df['price_change'] = session_df['close'] - session_df['open']
    session_df['price_change_pct'] = (session_df['price_change'] / session_df['open']) * 100
    
    print(f"Price Change: ₹{session_df['price_change'].iloc[0]:.2f}")
    print(f"Price Change %: {session_df['price_change_pct'].iloc[0]:.2f}%")
    
    # VWAP calculation for the session
    vwap = (df['price'] * df['qty']).sum() / df['qty'].sum()
    print(f"Session VWAP: ₹{vwap:.2f}")
    
    # Display the complete session DataFrame
    print(f"\n=== COMPLETE SESSION DATA ===")
    print(session_df)
    
else:
    # Multiple trading days - group by date
    print("Multiple trading days detected, grouping by date...")
    
    # Group by date and create OHLC for each day
    daily_ohlc = df.groupby('date_only').agg({
        'price': ['first', 'max', 'min', 'last'],  # OHLC
        'qty': 'sum',                               # Total volume
        'trnvr': 'sum',                             # Total turnover
        'qty': 'count'                              # Trade count
    }).round(2)
    
    # Flatten column names
    daily_ohlc.columns = ['open', 'high', 'low', 'close', 'total_volume', 'total_turnover', 'trade_count']
    
    # Reset index
    daily_ohlc = daily_ohlc.reset_index()
    
    # Calculate additional metrics
    daily_ohlc['price_range'] = daily_ohlc['high'] - daily_ohlc['low']
    daily_ohlc['price_change'] = daily_ohlc['close'] - daily_ohlc['open']
    daily_ohlc['price_change_pct'] = (daily_ohlc['price_change'] / daily_ohlc['open']) * 100
    
    print(f"\n=== MULTI-DAY OHLC DATA ===")
    print(daily_ohlc)

=== TRADING SESSION OHLC ANALYSIS ===
Aggregating data by trading session...
=== SINGLE TRADING SESSION OHLC ===
Date: 2025-08-07
Open: ₹386.85
High: ₹390.20
Low: ₹383.50
Close: ₹388.25
Price Range: ₹6.70
Total Volume: 11,510,171
Total Turnover: ₹4,451,969,111.70
Trade Count: 10,956
Average Price: ₹386.56

=== ADDITIONAL METRICS ===
Price Change: ₹1.40
Price Change %: 0.36%
Session VWAP: ₹386.79

=== COMPLETE SESSION DATA ===
         date    open   high    low   close  total_volume  total_turnover  \
0  2025-08-07  386.85  390.2  383.5  388.25      11510171    4.451969e+09   

   trade_count  price_range   avg_price  price_change  price_change_pct  
0        10956          6.7  386.561094           1.4          0.361897  


In [23]:
# Count trades per time interval (activity density)
print("=== TRADE ACTIVITY DENSITY ANALYSIS ===")

# Create different time intervals for analysis
print("Analyzing trade activity density across different time intervals...")

# 1. Minute-by-minute trade count
print("\n=== MINUTE-BY-MINUTE TRADE COUNT ===")
df['minute_key'] = df['date'].dt.floor('1min')
minute_trades = df.groupby('minute_key').size().reset_index(name='trade_count')
minute_trades['time'] = minute_trades['minute_key'].dt.time

print(f"Total minutes with trades: {len(minute_trades)}")
print(f"Average trades per minute: {minute_trades['trade_count'].mean():.1f}")
print(f"Max trades in a minute: {minute_trades['trade_count'].max()}")
print(f"Min trades in a minute: {minute_trades['trade_count'].min()}")

# Display top 10 most active minutes
print(f"\n=== TOP 10 MOST ACTIVE MINUTES ===")
top_active_minutes = minute_trades.nlargest(10, 'trade_count')
for idx, row in top_active_minutes.iterrows():
    print(f"{row['time']}: {row['trade_count']} trades")

# 2. 5-minute interval trade count
print(f"\n=== 5-MINUTE INTERVAL TRADE COUNT ===")
df['five_min_key'] = df['date'].dt.floor('5min')
five_min_trades = df.groupby('five_min_key').size().reset_index(name='trade_count')
five_min_trades['time'] = five_min_trades['five_min_key'].dt.time

print(f"Total 5-minute intervals: {len(five_min_trades)}")
print(f"Average trades per 5-min: {five_min_trades['trade_count'].mean():.1f}")
print(f"Max trades in 5-min: {five_min_trades['trade_count'].max()}")

# 3. 15-minute interval trade count
print(f"\n=== 15-MINUTE INTERVAL TRADE COUNT ===")
df['fifteen_min_key'] = df['date'].dt.floor('15min')
fifteen_min_trades = df.groupby('fifteen_min_key').size().reset_index(name='trade_count')
fifteen_min_trades['time'] = fifteen_min_trades['fifteen_min_key'].dt.time

print(f"Total 15-minute intervals: {len(fifteen_min_trades)}")
print(f"Average trades per 15-min: {fifteen_min_trades['trade_count'].mean():.1f}")
print(f"Max trades in 15-min: {fifteen_min_trades['trade_count'].max()}")

# 4. Hourly trade count
print(f"\n=== HOURLY TRADE COUNT ===")
df['hour_key'] = df['date'].dt.hour
hourly_trades = df.groupby('hour_key').size().reset_index(name='trade_count')
hourly_trades['time_range'] = hourly_trades['hour_key'].apply(lambda x: f"{x:02d}:00-{x:02d}:59")

print(f"Total trading hours: {len(hourly_trades)}")
print(f"Average trades per hour: {hourly_trades['trade_count'].mean():.1f}")
print(f"Max trades in an hour: {hourly_trades['trade_count'].max()}")

# Display hourly breakdown
print(f"\n=== HOURLY BREAKDOWN ===")
for idx, row in hourly_trades.iterrows():
    print(f"{row['time_range']}: {row['trade_count']:,} trades")

# 5. Activity density analysis
print(f"\n=== ACTIVITY DENSITY ANALYSIS ===")

# Find peak activity periods
peak_minute = minute_trades.loc[minute_trades['trade_count'].idxmax()]
peak_five_min = five_min_trades.loc[five_min_trades['trade_count'].idxmax()]
peak_fifteen_min = fifteen_min_trades.loc[fifteen_min_trades['trade_count'].idxmax()]
peak_hour = hourly_trades.loc[hourly_trades['trade_count'].idxmax()]

print(f"Peak activity minute: {peak_minute['time']} ({peak_minute['trade_count']} trades)")
print(f"Peak activity 5-min: {peak_five_min['time']} ({peak_five_min['trade_count']} trades)")
print(f"Peak activity 15-min: {peak_fifteen_min['time']} ({peak_fifteen_min['trade_count']} trades)")
print(f"Peak activity hour: {peak_hour['time_range']} ({peak_hour['trade_count']:,} trades)")

# 6. Quiet periods analysis
print(f"\n=== QUIET PERIODS ANALYSIS ===")
quiet_minutes = minute_trades[minute_trades['trade_count'] == minute_trades['trade_count'].min()]
print(f"Minutes with minimum activity ({minute_trades['trade_count'].min()} trades):")
for idx, row in quiet_minutes.head(5).iterrows():
    print(f"  {row['time']}")

# 7. Summary statistics
print(f"\n=== SUMMARY STATISTICS ===")
total_trades = len(df)
total_minutes = len(minute_trades)
total_hours = len(hourly_trades)

print(f"Total trades: {total_trades:,}")
print(f"Total active minutes: {total_minutes}")
print(f"Total trading hours: {total_hours}")
print(f"Average trades per minute: {total_trades/total_minutes:.1f}")
print(f"Average trades per hour: {total_trades/total_hours:.1f}")

# Display sample of minute-by-minute data
print(f"\n=== SAMPLE MINUTE-BY-MINUTE DATA (First 10 rows) ===")
print(minute_trades.head(10))

=== TRADE ACTIVITY DENSITY ANALYSIS ===
Analyzing trade activity density across different time intervals...

=== MINUTE-BY-MINUTE TRADE COUNT ===
Total minutes with trades: 375
Average trades per minute: 29.2
Max trades in a minute: 116
Min trades in a minute: 9

=== TOP 10 MOST ACTIVE MINUTES ===
09:15:00: 116 trades
15:09:00: 108 trades
09:22:00: 105 trades
15:23:00: 98 trades
09:19:00: 96 trades
15:26:00: 96 trades
09:16:00: 94 trades
09:21:00: 94 trades
15:17:00: 94 trades
12:49:00: 93 trades

=== 5-MINUTE INTERVAL TRADE COUNT ===
Total 5-minute intervals: 75
Average trades per 5-min: 146.1
Max trades in 5-min: 383

=== 15-MINUTE INTERVAL TRADE COUNT ===
Total 15-minute intervals: 25
Average trades per 15-min: 438.2
Max trades in 15-min: 971

=== HOURLY TRADE COUNT ===
Total trading hours: 7
Average trades per hour: 1565.1
Max trades in an hour: 1807

=== HOURLY BREAKDOWN ===
09:00-09:59: 1,047 trades
10:00-10:59: 1,476 trades
11:00-11:59: 1,653 trades
12:00-12:59: 1,807 trades
13:

In [24]:
# Calculate intraday volatility using standard deviation of price per minute/hour
print("=== INTRADAY VOLATILITY ANALYSIS ===")

# Calculate volatility at different time intervals
print("Calculating price volatility across different time intervals...")

# 1. Minute-by-minute volatility
print("\n=== MINUTE-BY-MINUTE VOLATILITY ===")
df['minute_key'] = df['date'].dt.floor('1min')
minute_volatility = df.groupby('minute_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']  # Price statistics including std dev
}).round(4)

# Flatten column names
minute_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
minute_volatility = minute_volatility.reset_index()
minute_volatility['time'] = minute_volatility['minute_key'].dt.time

# Filter out minutes with only 1 trade (std dev = 0 or NaN)
minute_volatility_filtered = minute_volatility[minute_volatility['trade_count'] > 1]

print(f"Total minutes analyzed: {len(minute_volatility)}")
print(f"Minutes with volatility (multiple trades): {len(minute_volatility_filtered)}")
print(f"Average minute volatility: ₹{minute_volatility_filtered['price_std'].mean():.4f}")
print(f"Max minute volatility: ₹{minute_volatility_filtered['price_std'].max():.4f}")

# 2. 5-minute interval volatility
print(f"\n=== 5-MINUTE INTERVAL VOLATILITY ===")
df['five_min_key'] = df['date'].dt.floor('5min')
five_min_volatility = df.groupby('five_min_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']
}).round(4)

five_min_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
five_min_volatility = five_min_volatility.reset_index()
five_min_volatility['time'] = five_min_volatility['five_min_key'].dt.time

five_min_filtered = five_min_volatility[five_min_volatility['trade_count'] > 1]

print(f"Total 5-minute intervals: {len(five_min_volatility)}")
print(f"Intervals with volatility: {len(five_min_filtered)}")
print(f"Average 5-min volatility: ₹{five_min_filtered['price_std'].mean():.4f}")
print(f"Max 5-min volatility: ₹{five_min_filtered['price_std'].max():.4f}")

# 3. 15-minute interval volatility
print(f"\n=== 15-MINUTE INTERVAL VOLATILITY ===")
df['fifteen_min_key'] = df['date'].dt.floor('15min')
fifteen_min_volatility = df.groupby('fifteen_min_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']
}).round(4)

fifteen_min_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
fifteen_min_volatility = fifteen_min_volatility.reset_index()
fifteen_min_volatility['time'] = fifteen_min_volatility['fifteen_min_key'].dt.time

fifteen_min_filtered = fifteen_min_volatility[fifteen_min_volatility['trade_count'] > 1]

print(f"Total 15-minute intervals: {len(fifteen_min_volatility)}")
print(f"Intervals with volatility: {len(fifteen_min_filtered)}")
print(f"Average 15-min volatility: ₹{fifteen_min_volatility['price_std'].mean():.4f}")
print(f"Max 15-min volatility: ₹{fifteen_min_volatility['price_std'].max():.4f}")

# 4. Hourly volatility
print(f"\n=== HOURLY VOLATILITY ===")
df['hour_key'] = df['date'].dt.hour
hourly_volatility = df.groupby('hour_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']
}).round(4)

hourly_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
hourly_volatility = hourly_volatility.reset_index()
hourly_volatility['time_range'] = hourly_volatility['hour_key'].apply(lambda x: f"{x:02d}:00-{x:02d}:59")

print(f"Total trading hours: {len(hourly_volatility)}")
print(f"Average hourly volatility: ₹{hourly_volatility['price_std'].mean():.4f}")
print(f"Max hourly volatility: ₹{hourly_volatility['price_std'].max():.4f}")

# 5. Peak volatility periods identification
print(f"\n=== PEAK VOLATILITY PERIODS ===")

# Find highest volatility periods
highest_vol_minute = minute_volatility_filtered.loc[minute_volatility_filtered['price_std'].idxmax()]
highest_vol_five_min = five_min_filtered.loc[five_min_filtered['price_std'].idxmax()]
highest_vol_fifteen_min = fifteen_min_filtered.loc[fifteen_min_filtered['price_std'].idxmax()]
highest_vol_hour = hourly_volatility.loc[hourly_volatility['price_std'].idxmax()]

print(f"Highest minute volatility: {highest_vol_minute['time']} (₹{highest_vol_minute['price_std']:.4f})")
print(f"  Price range: ₹{highest_vol_minute['min_price']:.2f} - ₹{highest_vol_minute['max_price']:.2f}")
print(f"  Trades: {highest_vol_minute['trade_count']}")

print(f"Highest 5-min volatility: {highest_vol_five_min['time']} (₹{highest_vol_five_min['price_std']:.4f})")
print(f"  Price range: ₹{highest_vol_five_min['min_price']:.2f} - ₹{highest_vol_five_min['max_price']:.2f}")

print(f"Highest 15-min volatility: {highest_vol_fifteen_min['time']} (₹{highest_vol_fifteen_min['price_std']:.4f})")
print(f"  Price range: ₹{highest_vol_fifteen_min['min_price']:.2f} - ₹{highest_vol_fifteen_min['max_price']:.2f}")

print(f"Highest hourly volatility: {highest_vol_hour['time_range']} (₹{highest_vol_hour['price_std']:.4f})")
print(f"  Price range: ₹{highest_vol_hour['min_price']:.2f} - ₹{highest_vol_hour['max_price']:.2f}")

# 6. Volatility distribution analysis
print(f"\n=== VOLATILITY DISTRIBUTION ANALYSIS ===")

# Calculate volatility percentiles
volatility_percentiles = minute_volatility_filtered['price_std'].quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
print("Minute volatility percentiles:")
for p, v in volatility_percentiles.items():
    print(f"  {p*100:2.0f}th percentile: ₹{v:.4f}")

# 7. Summary statistics
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Overall price standard deviation: ₹{df['price'].std():.4f}")
print(f"Overall price range: ₹{df['price'].max() - df['price'].min():.2f}")
print(f"Overall coefficient of variation: {(df['price'].std() / df['price'].mean() * 100):.2f}%")

# Display sample volatility data
print(f"\n=== SAMPLE MINUTE VOLATILITY DATA (First 10 rows) ===")
print(minute_volatility_filtered[['time', 'avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']].head(10))

=== INTRADAY VOLATILITY ANALYSIS ===
Calculating price volatility across different time intervals...

=== MINUTE-BY-MINUTE VOLATILITY ===
Total minutes analyzed: 375
Minutes with volatility (multiple trades): 375
Average minute volatility: ₹0.0957
Max minute volatility: ₹0.7306

=== 5-MINUTE INTERVAL VOLATILITY ===
Total 5-minute intervals: 75
Intervals with volatility: 75
Average 5-min volatility: ₹0.1911
Max 5-min volatility: ₹1.2729

=== 15-MINUTE INTERVAL VOLATILITY ===
Total 15-minute intervals: 25
Intervals with volatility: 25
Average 15-min volatility: ₹0.3198
Max 15-min volatility: ₹1.3852

=== HOURLY VOLATILITY ===
Total trading hours: 7
Average hourly volatility: ₹0.5978
Max hourly volatility: ₹1.1315

=== PEAK VOLATILITY PERIODS ===
Highest minute volatility: 09:16:00 (₹0.7306)
  Price range: ₹385.60 - ₹387.75
  Trades: 94
Highest 5-min volatility: 09:15:00 (₹1.2729)
  Price range: ₹385.60 - ₹389.45
Highest 15-min volatility: 09:15:00 (₹1.3852)
  Price range: ₹385.60 - ₹390.

In [25]:
# Identify top N volatile minutes in the day
print("=== TOP N VOLATILE MINUTES ANALYSIS ===")

# Set N for top volatile minutes (you can change this value)
N = 20  # Change this to see top 10, 15, 25, etc.

print(f"Identifying top {N} most volatile minutes...")

# Calculate minute-by-minute volatility (if not already calculated)
if 'minute_key' not in df.columns:
    df['minute_key'] = df['date'].dt.floor('1min')

minute_volatility = df.groupby('minute_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']
}).round(4)

# Flatten column names
minute_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
minute_volatility = minute_volatility.reset_index()
minute_volatility['time'] = minute_volatility['minute_key'].dt.time

# Filter out minutes with only 1 trade (no meaningful volatility)
minute_volatility_filtered = minute_volatility[minute_volatility['trade_count'] > 1].copy()

# Sort by volatility (price_std) in descending order
top_volatile_minutes = minute_volatility_filtered.nlargest(N, 'price_std').copy()

# Add additional metrics
top_volatile_minutes['price_range'] = top_volatile_minutes['max_price'] - top_volatile_minutes['min_price']
top_volatile_minutes['volatility_rank'] = range(1, len(top_volatile_minutes) + 1)

print(f"\n=== TOP {N} MOST VOLATILE MINUTES ===")
print(f"Total minutes with volatility: {len(minute_volatility_filtered)}")
print(f"Showing top {N} minutes by price standard deviation")

# Display the top N volatile minutes
print(f"\n{'Rank':<4} {'Time':<8} {'Std Dev':<10} {'Price Range':<12} {'Min Price':<10} {'Max Price':<10} {'Trades':<7}")
print("-" * 80)

for idx, row in top_volatile_minutes.iterrows():
    print(f"{row['volatility_rank']:<4} {str(row['time']):<8} ₹{row['price_std']:<9.4f} ₹{row['price_range']:<11.2f} ₹{row['min_price']:<9.2f} ₹{row['max_price']:<9.2f} {row['trade_count']:<7}")

# Detailed analysis of top volatile minutes
print(f"\n=== DETAILED ANALYSIS OF TOP {N} VOLATILE MINUTES ===")

# 1. Time distribution analysis
print("\n--- TIME DISTRIBUTION ---")
top_volatile_minutes['hour'] = top_volatile_minutes['time'].apply(lambda x: x.hour)
hourly_distribution = top_volatile_minutes['hour'].value_counts().sort_index()

print("Hourly distribution of top volatile minutes:")
for hour, count in hourly_distribution.items():
    print(f"  {hour:02d}:00-{hour:02d}:59: {count} minutes")

# 2. Volatility magnitude analysis
print(f"\n--- VOLATILITY MAGNITUDE ANALYSIS ---")
print(f"Highest volatility: ₹{top_volatile_minutes['price_std'].max():.4f}")
print(f"Lowest volatility (in top {N}): ₹{top_volatile_minutes['price_std'].min():.4f}")
print(f"Average volatility (top {N}): ₹{top_volatile_minutes['price_std'].mean():.4f}")
print(f"Median volatility (top {N}): ₹{top_volatile_minutes['price_std'].median():.4f}")

# 3. Price range analysis
print(f"\n--- PRICE RANGE ANALYSIS ---")
print(f"Largest price range: ₹{top_volatile_minutes['price_range'].max():.2f}")
print(f"Smallest price range (in top {N}): ₹{top_volatile_minutes['price_range'].min():.2f}")
print(f"Average price range (top {N}): ₹{top_volatile_minutes['price_range'].mean():.2f}")

# 4. Trade count analysis
print(f"\n--- TRADE COUNT ANALYSIS ---")
print(f"Most trades in volatile minute: {top_volatile_minutes['trade_count'].max()}")
print(f"Least trades in volatile minute: {top_volatile_minutes['trade_count'].min()}")
print(f"Average trades in volatile minutes: {top_volatile_minutes['trade_count'].mean():.1f}")

# 5. Extreme volatility periods
print(f"\n--- EXTREME VOLATILITY PERIODS ---")
extreme_volatility = top_volatile_minutes[top_volatile_minutes['price_std'] > top_volatile_minutes['price_std'].quantile(0.9)]
print(f"Extreme volatility minutes (top 10%): {len(extreme_volatility)}")
for idx, row in extreme_volatility.iterrows():
    print(f"  {row['time']}: ₹{row['price_std']:.4f} std dev, ₹{row['price_range']:.2f} range")

# 6. Comparison with overall statistics
print(f"\n--- COMPARISON WITH OVERALL STATISTICS ---")
overall_std = df['price'].std()
overall_range = df['price'].max() - df['price'].min()

print(f"Overall day volatility: ₹{overall_std:.4f}")
print(f"Overall day price range: ₹{overall_range:.2f}")
print(f"Top {N} volatility vs overall: {top_volatile_minutes['price_std'].mean()/overall_std:.2f}x higher")
print(f"Top {N} price range vs overall: {top_volatile_minutes['price_range'].mean()/overall_range:.2f}x higher")

# 7. Export top volatile minutes data
print(f"\n=== EXPORT DATA ===")
print("Top volatile minutes data is available in 'top_volatile_minutes' DataFrame")
print("You can export this data or use it for further analysis")

# Display summary table
print(f"\n=== SUMMARY TABLE ===")
summary_stats = {
    'Metric': ['Count', 'Avg Std Dev', 'Avg Price Range', 'Avg Trades', 'Time Range'],
    'Value': [
        len(top_volatile_minutes),
        f"₹{top_volatile_minutes['price_std'].mean():.4f}",
        f"₹{top_volatile_minutes['price_range'].mean():.2f}",
        f"{top_volatile_minutes['trade_count'].mean():.1f}",
        f"{top_volatile_minutes['time'].min()} - {top_volatile_minutes['time'].max()}"
    ]
}

summary_df = pd.DataFrame(summary_stats)
print(summary_df.to_string(index=False))

=== TOP N VOLATILE MINUTES ANALYSIS ===
Identifying top 20 most volatile minutes...

=== TOP 20 MOST VOLATILE MINUTES ===
Total minutes with volatility: 375
Showing top 20 minutes by price standard deviation

Rank Time     Std Dev    Price Range  Min Price  Max Price  Trades 
--------------------------------------------------------------------------------
1    09:16:00 ₹0.7306    ₹2.15        ₹385.60    ₹387.75    94     
2    09:17:00 ₹0.3531    ₹1.00        ₹387.55    ₹388.55    15     
3    14:24:00 ₹0.2965    ₹0.95        ₹385.15    ₹386.10    16     
4    13:02:00 ₹0.2861    ₹0.75        ₹384.65    ₹385.40    14     
5    09:18:00 ₹0.2742    ₹1.00        ₹388.45    ₹389.45    14     
6    09:15:00 ₹0.2694    ₹1.25        ₹385.70    ₹386.95    116    
7    09:20:00 ₹0.2465    ₹0.95        ₹388.90    ₹389.85    13     
8    09:23:00 ₹0.2419    ₹0.75        ₹389.10    ₹389.85    14     
9    09:32:00 ₹0.2281    ₹0.70        ₹389.15    ₹389.85    11     
10   10:36:00 ₹0.2203    ₹0.60

In [26]:
# Measure average trade size and related volume metrics
print("=== TRADE SIZE ANALYSIS ===")

# Calculate basic trade size statistics
print("Calculating trade size metrics...")

# Basic statistics
avg_trade_size = df['qty'].mean()
median_trade_size = df['qty'].median()
min_trade_size = df['qty'].min()
max_trade_size = df['qty'].max()
std_trade_size = df['qty'].std()

print(f"=== BASIC TRADE SIZE STATISTICS ===")
print(f"Average trade size: {avg_trade_size:,.0f}")
print(f"Median trade size: {median_trade_size:,.0f}")
print(f"Minimum trade size: {min_trade_size:,}")
print(f"Maximum trade size: {max_trade_size:,}")
print(f"Standard deviation: {std_trade_size:,.0f}")

# Calculate percentiles for trade size distribution
print(f"\n=== TRADE SIZE PERCENTILES ===")
trade_size_percentiles = df['qty'].quantile([0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
for p, v in trade_size_percentiles.items():
    print(f"{p*100:2.0f}th percentile: {v:,.0f}")

# Trade size categories analysis
print(f"\n=== TRADE SIZE CATEGORIES ===")

# Define trade size categories
def categorize_trade_size(qty):
    if qty <= 100:
        return 'Small (≤100)'
    elif qty <= 500:
        return 'Medium (101-500)'
    elif qty <= 1000:
        return 'Large (501-1000)'
    elif qty <= 5000:
        return 'Very Large (1001-5000)'
    else:
        return 'Huge (>5000)'

# Apply categorization
df['trade_size_category'] = df['qty'].apply(categorize_trade_size)

# Count trades in each category
size_category_counts = df['trade_size_category'].value_counts()
size_category_percentages = (size_category_counts / len(df) * 100).round(2)

print("Trade size distribution:")
for category, count in size_category_counts.items():
    percentage = size_category_percentages[category]
    print(f"  {category}: {count:,} trades ({percentage}%)")

# Volume-weighted average trade size
print(f"\n=== VOLUME-WEIGHTED METRICS ===")
volume_weighted_avg = (df['qty'] * df['qty']).sum() / df['qty'].sum()
print(f"Volume-weighted average trade size: {volume_weighted_avg:,.0f}")

# Compare with simple average
print(f"Simple average vs volume-weighted: {avg_trade_size:,.0f} vs {volume_weighted_avg:,.0f}")
print(f"Difference: {volume_weighted_avg - avg_trade_size:,.0f}")

# Trade size by time of day
print(f"\n=== TRADE SIZE BY TIME OF DAY ===")
df['hour'] = df['date'].dt.hour
hourly_trade_sizes = df.groupby('hour')['qty'].agg(['mean', 'median', 'count']).round(0)
hourly_trade_sizes.columns = ['Avg Trade Size', 'Median Trade Size', 'Trade Count']

print("Hourly trade size analysis:")
for hour, row in hourly_trade_sizes.iterrows():
    print(f"  {hour:02d}:00-{hour:02d}:59: Avg={row['Avg Trade Size']:,.0f}, Median={row['Median Trade Size']:,.0f}, Count={row['Trade Count']:,}")

# Find hours with largest and smallest average trade sizes
max_avg_hour = hourly_trade_sizes['Avg Trade Size'].idxmax()
min_avg_hour = hourly_trade_sizes['Avg Trade Size'].idxmin()

print(f"\nHour with largest average trade size: {max_avg_hour:02d}:00 ({hourly_trade_sizes.loc[max_avg_hour, 'Avg Trade Size']:,.0f})")
print(f"Hour with smallest average trade size: {min_avg_hour:02d}:00 ({hourly_trade_sizes.loc[min_avg_hour, 'Avg Trade Size']:,.0f})")

# Trade size outliers analysis
print(f"\n=== OUTLIER ANALYSIS ===")

# Define outliers using IQR method
Q1 = df['qty'].quantile(0.25)
Q3 = df['qty'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

outliers = df[(df['qty'] < lower_bound) | (df['qty'] > upper_bound)]
normal_trades = df[(df['qty'] >= lower_bound) & (df['qty'] <= upper_bound)]

print(f"Outlier threshold (IQR method):")
print(f"  Lower bound: {lower_bound:,.0f}")
print(f"  Upper bound: {upper_bound:,.0f}")
print(f"  Outlier trades: {len(outliers):,} ({(len(outliers)/len(df)*100):.2f}%)")
print(f"  Normal trades: {len(normal_trades):,} ({(len(normal_trades)/len(df)*100):.2f}%)")

if len(outliers) > 0:
    print(f"\nOutlier trade sizes:")
    print(f"  Largest outlier: {outliers['qty'].max():,}")
    print(f"  Smallest outlier: {outliers['qty'].min():,}")
    print(f"  Average outlier size: {outliers['qty'].mean():,.0f}")

# Summary statistics
print(f"\n=== SUMMARY ===")
print(f"Total trades analyzed: {len(df):,}")
print(f"Total volume traded: {df['qty'].sum():,}")
print(f"Average trade size: {avg_trade_size:,.0f}")
print(f"Most common trade size category: {size_category_counts.index[0]}")
print(f"Trade size coefficient of variation: {(std_trade_size/avg_trade_size*100):.1f}%")

# Display sample of different trade sizes
print(f"\n=== SAMPLE TRADES BY SIZE ===")
print("Small trades sample:")
small_trades = df[df['qty'] <= 100][['date', 'price', 'qty', 'trnvr']].head(3)
print(small_trades)

print("\nLarge trades sample:")
large_trades = df[df['qty'] > 1000][['date', 'price', 'qty', 'trnvr']].head(3)
print(large_trades)

=== TRADE SIZE ANALYSIS ===
Calculating trade size metrics...
=== BASIC TRADE SIZE STATISTICS ===
Average trade size: 1,051
Median trade size: 200
Minimum trade size: 1
Maximum trade size: 153,858
Standard deviation: 3,358

=== TRADE SIZE PERCENTILES ===
10th percentile: 4
25th percentile: 26
50th percentile: 200
75th percentile: 799
90th percentile: 2,440
95th percentile: 4,485
99th percentile: 13,820

=== TRADE SIZE CATEGORIES ===
Trade size distribution:
  Small (≤100): 4,440 trades (40.53%)
  Medium (101-500): 2,757 trades (25.16%)
  Very Large (1001-5000): 1,855 trades (16.93%)
  Large (501-1000): 1,408 trades (12.85%)
  Huge (>5000): 496 trades (4.53%)

=== VOLUME-WEIGHTED METRICS ===
Volume-weighted average trade size: 11,780
Simple average vs volume-weighted: 1,051 vs 11,780
Difference: 10,729

=== TRADE SIZE BY TIME OF DAY ===
Hourly trade size analysis:
  09:00-09:59: Avg=2,067, Median=424, Count=1,047.0
  10:00-10:59: Avg=973, Median=180, Count=1,476.0
  11:00-11:59: Avg=811

In [27]:
# Identify unusual trade sizes (top 1% volume)
print("=== UNUSUAL TRADE SIZES ANALYSIS ===")

# Calculate the 99th percentile threshold for unusual trade sizes
print("Identifying top 1% largest trade sizes...")

# Calculate volume percentiles
volume_percentiles = df['qty'].quantile([0.5, 0.75, 0.9, 0.95, 0.99, 0.999])
unusual_threshold = volume_percentiles[0.99]

print(f"=== VOLUME PERCENTILE THRESHOLDS ===")
for p, v in volume_percentiles.items():
    print(f"{p*100:3.1f}th percentile: {v:,.0f}")

print(f"\nUnusual trade size threshold (top 1%): {unusual_threshold:,.0f}")

# Identify unusual trades (top 1%)
unusual_trades = df[df['qty'] >= unusual_threshold].copy()
unusual_trades = unusual_trades.sort_values('qty', ascending=False)

print(f"\n=== UNUSUAL TRADES SUMMARY ===")
print(f"Total unusual trades (top 1%): {len(unusual_trades):,}")
print(f"Percentage of total trades: {(len(unusual_trades)/len(df)*100):.2f}%")
print(f"Total volume in unusual trades: {unusual_trades['qty'].sum():,}")
print(f"Percentage of total volume: {(unusual_trades['qty'].sum()/df['qty'].sum()*100):.2f}%")

# Display all unusual trades
print(f"\n=== ALL UNUSUAL TRADES (TOP 1%) ===")
print(f"{'Rank':<4} {'Time':<12} {'Price':<8} {'Quantity':<10} {'Turnover':<12} {'Cum Turnover':<15}")
print("-" * 80)

for idx, (_, row) in enumerate(unusual_trades.iterrows(), 1):
    print(f"{idx:<4} {row['date'].strftime('%H:%M:%S'):<12} ₹{row['price']:<7.2f} {row['qty']:<10,} ₹{row['trnvr']:<11,.2f} ₹{row['cum_trnvr']:<14,.2f}")

# Detailed analysis of unusual trades
print(f"\n=== DETAILED ANALYSIS OF UNUSUAL TRADES ===")

# 1. Size distribution of unusual trades
print("\n--- SIZE DISTRIBUTION ---")
unusual_trades['size_category'] = unusual_trades['qty'].apply(lambda x: 
    'Large (1K-5K)' if x <= 5000 else
    'Very Large (5K-10K)' if x <= 10000 else
    'Huge (10K-50K)' if x <= 50000 else
    'Massive (>50K)'
)

size_distribution = unusual_trades['size_category'].value_counts()
print("Size distribution of unusual trades:")
for category, count in size_distribution.items():
    print(f"  {category}: {count} trades")

# 2. Time distribution of unusual trades
print(f"\n--- TIME DISTRIBUTION ---")
unusual_trades['hour'] = unusual_trades['date'].dt.hour
hourly_distribution = unusual_trades['hour'].value_counts().sort_index()

print("Hourly distribution of unusual trades:")
for hour, count in hourly_distribution.items():
    print(f"  {hour:02d}:00-{hour:02d}:59: {count} trades")

# 3. Price analysis of unusual trades
print(f"\n--- PRICE ANALYSIS ---")
print(f"Average price of unusual trades: ₹{unusual_trades['price'].mean():.2f}")
print(f"Price range of unusual trades: ₹{unusual_trades['price'].min():.2f} - ₹{unusual_trades['price'].max():.2f}")
print(f"Price standard deviation: ₹{unusual_trades['price'].std():.2f}")

# 4. Impact analysis
print(f"\n--- MARKET IMPACT ANALYSIS ---")
total_volume = df['qty'].sum()
total_turnover = df['trnvr'].sum()

print(f"Unusual trades represent:")
print(f"  Volume: {unusual_trades['qty'].sum():,} / {total_volume:,} = {(unusual_trades['qty'].sum()/total_volume*100):.2f}%")
print(f"  Turnover: ₹{unusual_trades['trnvr'].sum():,.2f} / ₹{total_turnover:,.2f} = {(unusual_trades['trnvr'].sum()/total_turnover*100):.2f}%")

# 5. Largest single trades
print(f"\n--- TOP 5 LARGEST SINGLE TRADES ---")
top_5_trades = unusual_trades.head(5)
for idx, (_, row) in enumerate(top_5_trades.iterrows(), 1):
    print(f"{idx}. {row['date'].strftime('%H:%M:%S')} - {row['qty']:,} shares at ₹{row['price']:.2f}")
    print(f"   Turnover: ₹{row['trnvr']:,.2f}")

# 6. Statistical comparison
print(f"\n--- STATISTICAL COMPARISON ---")
print("Unusual trades vs Normal trades:")
print(f"  Average size: {unusual_trades['qty'].mean():,.0f} vs {df[df['qty'] < unusual_threshold]['qty'].mean():,.0f}")
print(f"  Median size: {unusual_trades['qty'].median():,.0f} vs {df[df['qty'] < unusual_threshold]['qty'].median():,.0f}")
print(f"  Size ratio: {unusual_trades['qty'].mean() / df[df['qty'] < unusual_threshold]['qty'].mean():.1f}x larger")

# 7. Clustering analysis
print(f"\n--- CLUSTERING ANALYSIS ---")
# Check if unusual trades are clustered in time
unusual_trades_sorted = unusual_trades.sort_values('date')
time_diffs = unusual_trades_sorted['date'].diff().dt.total_seconds() / 60  # in minutes

clustered_trades = time_diffs[time_diffs <= 5]  # Within 5 minutes
print(f"Unusual trades within 5 minutes of each other: {len(clustered_trades)}")
print(f"Clustering percentage: {(len(clustered_trades)/len(unusual_trades)*100):.1f}%")

# 8. Export unusual trades data
print(f"\n=== EXPORT DATA ===")
print("Unusual trades data is available in 'unusual_trades' DataFrame")
print("You can export this data for further analysis")

# Summary table
print(f"\n=== SUMMARY TABLE ===")
summary_data = {
    'Metric': ['Count', 'Total Volume', 'Total Turnover', 'Avg Size', 'Largest Trade', 'Time Range'],
    'Value': [
        len(unusual_trades),
        f"{unusual_trades['qty'].sum():,}",
        f"₹{unusual_trades['trnvr'].sum():,.2f}",
        f"{unusual_trades['qty'].mean():,.0f}",
        f"{unusual_trades['qty'].max():,}",
        f"{unusual_trades['date'].min().strftime('%H:%M')} - {unusual_trades['date'].max().strftime('%H:%M')}"
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

=== UNUSUAL TRADE SIZES ANALYSIS ===
Identifying top 1% largest trade sizes...
=== VOLUME PERCENTILE THRESHOLDS ===
50.0th percentile: 200
75.0th percentile: 799
90.0th percentile: 2,440
95.0th percentile: 4,485
99.0th percentile: 13,820
99.9th percentile: 38,091

Unusual trade size threshold (top 1%): 13,820

=== UNUSUAL TRADES SUMMARY ===
Total unusual trades (top 1%): 110
Percentage of total trades: 1.00%
Total volume in unusual trades: 2,781,296
Percentage of total volume: 24.16%

=== ALL UNUSUAL TRADES (TOP 1%) ===
Rank Time         Price    Quantity   Turnover     Cum Turnover   
--------------------------------------------------------------------------------
1    15:01:57     ₹387.05  153,858    ₹59,550,738.90 ₹3,896,603,224.25
2    09:20:41     ₹389.30  67,256     ₹26,182,760.80 ₹344,731,947.85
3    09:15:00     ₹386.85  65,740     ₹25,431,519.00 ₹25,431,519.00 
4    12:08:06     ₹385.05  60,580     ₹23,326,329.00 ₹1,993,683,235.00
5    12:08:12     ₹384.80  55,402     ₹21,318,

In [28]:
# Compute bidirectional price changes (number of up vs. down ticks)
print("=== BIDIRECTIONAL PRICE CHANGES ANALYSIS ===")

# Calculate price changes (if not already calculated)
if 'price_change' not in df.columns:
    df['price_change'] = df['price'].diff()

# Create direction column (if not already created)
if 'direction' not in df.columns:
    df['direction'] = df['price_change'].apply(lambda x: 
        'Up' if x > 0 else 
        'Down' if x < 0 else 
        'No change'
    )

print("Analyzing price movement directions...")

# Count the occurrences of each direction
direction_counts = df['direction'].value_counts()
total_ticks = len(df)

print(f"=== DIRECTION DISTRIBUTION ===")
print(f"Total price ticks: {total_ticks:,}")

for direction, count in direction_counts.items():
    percentage = (count / total_ticks) * 100
    print(f"{direction} ticks: {count:,} ({percentage:.2f}%)")

# Calculate up vs down ratio
up_ticks = direction_counts.get('Up', 0)
down_ticks = direction_counts.get('Down', 0)
no_change_ticks = direction_counts.get('No change', 0)

if down_ticks > 0:
    up_down_ratio = up_ticks / down_ticks
    print(f"\nUp/Down ratio: {up_down_ratio:.3f}")
    print(f"Up ticks are {up_down_ratio:.1f}x more frequent than down ticks" if up_down_ratio > 1 else f"Down ticks are {1/up_down_ratio:.1f}x more frequent than up ticks")

# Direction analysis by time intervals
print(f"\n=== DIRECTION ANALYSIS BY TIME INTERVALS ===")

# 1. Minute-by-minute direction analysis
df['minute_key'] = df['date'].dt.floor('1min')
minute_direction = df.groupby('minute_key')['direction'].value_counts().unstack(fill_value=0)

# Calculate up/down ratio per minute
minute_direction['up_down_ratio'] = minute_direction['Up'] / minute_direction['Down'].replace(0, 1)
minute_direction['total_ticks'] = minute_direction['Up'] + minute_direction['Down'] + minute_direction.get('No change', 0)

print(f"Minute-by-minute direction analysis:")
print(f"Minutes with up bias (>1.2 ratio): {(minute_direction['up_down_ratio'] > 1.2).sum()}")
print(f"Minutes with down bias (<0.8 ratio): {(minute_direction['up_down_ratio'] < 0.8).sum()}")
print(f"Minutes with neutral bias (0.8-1.2 ratio): ((minute_direction['up_down_ratio'] >= 0.8) & (minute_direction['up_down_ratio'] <= 1.2)).sum()")

# 2. Hourly direction analysis
df['hour_key'] = df['date'].dt.hour
hourly_direction = df.groupby('hour_key')['direction'].value_counts().unstack(fill_value=0)

# Calculate up/down ratio per hour
hourly_direction['up_down_ratio'] = hourly_direction['Up'] / hourly_direction['Down'].replace(0, 1)
hourly_direction['total_ticks'] = hourly_direction['Up'] + hourly_direction['Down'] + hourly_direction.get('No change', 0)

print(f"\nHourly direction analysis:")
for hour in hourly_direction.index:
    up_count = hourly_direction.loc[hour, 'Up']
    down_count = hourly_direction.loc[hour, 'Down']
    ratio = hourly_direction.loc[hour, 'up_down_ratio']
    total = hourly_direction.loc[hour, 'total_ticks']
    
    bias = "Up" if ratio > 1.2 else "Down" if ratio < 0.8 else "Neutral"
    print(f"  {hour:02d}:00-{hour:02d}:59: Up={up_count}, Down={down_count}, Ratio={ratio:.2f} ({bias})")

# 3. Consecutive direction streaks
print(f"\n=== CONSECUTIVE DIRECTION STREAKS ===")

def find_streaks(directions):
    """Find consecutive streaks of the same direction"""
    streaks = []
    current_streak = 1
    current_direction = directions.iloc[0]
    
    for i in range(1, len(directions)):
        if directions.iloc[i] == current_direction:
            current_streak += 1
        else:
            if current_direction != 'No change':  # Only count meaningful streaks
                streaks.append((current_direction, current_streak))
            current_streak = 1
            current_direction = directions.iloc[i]
    
    # Add the last streak
    if current_direction != 'No change':
        streaks.append((current_direction, current_streak))
    
    return streaks

# Find streaks in the data
streaks = find_streaks(df['direction'])

# Analyze streaks
up_streaks = [s for s in streaks if s[0] == 'Up']
down_streaks = [s for s in streaks if s[0] == 'Down']

print(f"Up streaks: {len(up_streaks)}")
print(f"Down streaks: {len(down_streaks)}")

if up_streaks:
    print(f"Longest up streak: {max(up_streaks, key=lambda x: x[1])[1]} ticks")
    print(f"Average up streak: {sum(s[1] for s in up_streaks) / len(up_streaks):.1f} ticks")

if down_streaks:
    print(f"Longest down streak: {max(down_streaks, key=lambda x: x[1])[1]} ticks")
    print(f"Average down streak: {sum(s[1] for s in down_streaks) / len(down_streaks):.1f} ticks")

# 4. Direction change frequency
print(f"\n=== DIRECTION CHANGE FREQUENCY ===")
direction_changes = (df['direction'] != df['direction'].shift()).sum()
print(f"Total direction changes: {direction_changes}")
print(f"Average ticks per direction change: {total_ticks / direction_changes:.1f}")

# 5. Price change magnitude analysis
print(f"\n=== PRICE CHANGE MAGNITUDE BY DIRECTION ===")
up_changes = df[df['direction'] == 'Up']['price_change']
down_changes = df[df['direction'] == 'Down']['price_change']

if len(up_changes) > 0:
    print(f"Up ticks:")
    print(f"  Average change: ₹{up_changes.mean():.4f}")
    print(f"  Median change: ₹{up_changes.median():.4f}")
    print(f"  Max change: ₹{up_changes.max():.4f}")

if len(down_changes) > 0:
    print(f"Down ticks:")
    print(f"  Average change: ₹{down_changes.mean():.4f}")
    print(f"  Median change: ₹{down_changes.median():.4f}")
    print(f"  Max change: ₹{down_changes.min():.4f}")

# 6. Direction bias summary
print(f"\n=== DIRECTION BIAS SUMMARY ===")
print(f"Overall bias: {'Upward' if up_ticks > down_ticks else 'Downward' if down_ticks > up_ticks else 'Neutral'}")
print(f"Up ticks: {up_ticks:,} ({up_ticks/total_ticks*100:.1f}%)")
print(f"Down ticks: {down_ticks:,} ({down_ticks/total_ticks*100:.1f}%)")
print(f"No change: {no_change_ticks:,} ({no_change_ticks/total_ticks*100:.1f}%)")

# 7. Export direction data
print(f"\n=== EXPORT DATA ===")
print("Direction analysis data is available in:")
print("- 'minute_direction': Minute-by-minute direction breakdown")
print("- 'hourly_direction': Hourly direction breakdown")
print("- 'streaks': List of consecutive direction streaks")

# Display sample of minute direction data
print(f"\n=== SAMPLE MINUTE DIRECTION DATA (First 10 rows) ===")
print(minute_direction[['Up', 'Down', 'up_down_ratio', 'total_ticks']].head(10))

=== BIDIRECTIONAL PRICE CHANGES ANALYSIS ===
Analyzing price movement directions...
=== DIRECTION DISTRIBUTION ===
Total price ticks: 10,956
No change ticks: 4,441 (40.53%)
Down ticks: 3,299 (30.11%)
Up ticks: 3,216 (29.35%)

Up/Down ratio: 0.975
Down ticks are 1.0x more frequent than up ticks

=== DIRECTION ANALYSIS BY TIME INTERVALS ===
Minute-by-minute direction analysis:
Minutes with up bias (>1.2 ratio): 88
Minutes with down bias (<0.8 ratio): 103
Minutes with neutral bias (0.8-1.2 ratio): ((minute_direction['up_down_ratio'] >= 0.8) & (minute_direction['up_down_ratio'] <= 1.2)).sum()

Hourly direction analysis:
  09:00-09:59: Up=336, Down=348, Ratio=0.97 (Neutral)
  10:00-10:59: Up=467, Down=478, Ratio=0.98 (Neutral)
  11:00-11:59: Up=485, Down=538, Ratio=0.90 (Neutral)
  12:00-12:59: Up=544, Down=565, Ratio=0.96 (Neutral)
  13:00-13:59: Up=455, Down=473, Ratio=0.96 (Neutral)
  14:00-14:59: Up=450, Down=428, Ratio=1.05 (Neutral)
  15:00-15:59: Up=479, Down=469, Ratio=1.02 (Neutral

In [29]:
# Identify spikes in price > X% within Y seconds
print("=== PRICE SPIKE DETECTION ANALYSIS ===")

# Set parameters for spike detection (you can adjust these values)
X_PERCENT = 0.5  # Minimum percentage change to consider as a spike
Y_SECONDS = 60   # Time window in seconds to look for spikes

print(f"Detecting price spikes > {X_PERCENT}% within {Y_SECONDS} seconds...")

# Ensure data is sorted by datetime
df = df.sort_values('date').reset_index(drop=True)

# Calculate percentage price changes over different time windows
print("Calculating percentage price changes...")

# Method 1: Rolling percentage change over Y seconds
df['price_pct_change'] = df['price'].pct_change() * 100

# Method 2: Percentage change from Y seconds ago
df['price_pct_change_rolling'] = df['price'].pct_change(periods=Y_SECONDS) * 100

# Method 3: Maximum percentage change within rolling Y-second window
def max_pct_change_in_window(prices, window_seconds):
    """Calculate maximum percentage change within a time window"""
    if len(prices) < 2:
        return 0
    
    max_pct_change = 0
    for i in range(len(prices)):
        for j in range(i+1, min(i+window_seconds+1, len(prices))):
            pct_change = abs((prices.iloc[j] - prices.iloc[i]) / prices.iloc[i]) * 100
            max_pct_change = max(max_pct_change, pct_change)
    
    return max_pct_change

# Calculate rolling maximum percentage change
print("Calculating rolling maximum percentage changes...")
df['max_pct_change_window'] = df['price'].rolling(window=Y_SECONDS, min_periods=1).apply(
    lambda x: max_pct_change_in_window(x, Y_SECONDS), raw=False
)

# Identify spikes based on different criteria
print("Identifying price spikes...")

# Criterion 1: Simple percentage change threshold
spikes_simple = df[abs(df['price_pct_change']) > X_PERCENT].copy()
spikes_simple['spike_type'] = spikes_simple['price_pct_change'].apply(
    lambda x: 'Up' if x > 0 else 'Down'
)

# Criterion 2: Rolling window percentage change threshold
spikes_rolling = df[abs(df['price_pct_change_rolling']) > X_PERCENT].copy()
spikes_rolling['spike_type'] = spikes_rolling['price_pct_change_rolling'].apply(
    lambda x: 'Up' if x > 0 else 'Down'
)

# Criterion 3: Maximum percentage change within window threshold
spikes_window = df[df['max_pct_change_window'] > X_PERCENT].copy()

print(f"\n=== SPIKE DETECTION RESULTS ===")
print(f"Simple percentage change spikes: {len(spikes_simple):,}")
print(f"Rolling window spikes: {len(spikes_rolling):,}")
print(f"Window maximum spikes: {len(spikes_window):,}")

# Analyze simple percentage change spikes
if len(spikes_simple) > 0:
    print(f"\n=== SIMPLE PERCENTAGE CHANGE SPIKES (>{X_PERCENT}%) ===")
    
    # Group spikes by type
    up_spikes = spikes_simple[spikes_simple['spike_type'] == 'Up']
    down_spikes = spikes_simple[spikes_simple['spike_type'] == 'Down']
    
    print(f"Up spikes: {len(up_spikes):,}")
    print(f"Down spikes: {len(down_spikes):,}")
    
    # Display top spikes
    print(f"\nTop 10 largest percentage changes:")
    top_spikes = spikes_simple.nlargest(10, 'price_pct_change')
    print(f"{'Rank':<4} {'Time':<12} {'Price':<8} {'Pct Change':<12} {'Type':<6} {'Qty':<8}")
    print("-" * 70)
    
    for idx, (_, row) in enumerate(top_spikes.iterrows(), 1):
        print(f"{idx:<4} {row['date'].strftime('%H:%M:%S'):<12} ₹{row['price']:<7.2f} {row['price_pct_change']:<11.2f}% {row['spike_type']:<6} {row['qty']:<8,}")

# Analyze rolling window spikes
if len(spikes_rolling) > 0:
    print(f"\n=== ROLLING WINDOW SPIKE ANALYSIS (>{X_PERCENT}% in {Y_SECONDS}s) ===")
    
    # Find the largest rolling window spikes
    largest_rolling_spikes = spikes_rolling.nlargest(10, 'price_pct_change_rolling')
    
    print(f"Top 10 largest rolling window changes:")
    print(f"{'Rank':<4} {'Time':<12} {'Price':<8} {'Pct Change':<12} {'Type':<6}")
    print("-" * 60)
    
    for idx, (_, row) in enumerate(largest_rolling_spikes.iterrows(), 1):
        print(f"{idx:<4} {row['date'].strftime('%H:%M:%S'):<12} ₹{row['price']:<7.2f} {row['price_pct_change_rolling']:<11.2f}% {row['spike_type']:<6}")

# Analyze window maximum spikes
if len(spikes_window) > 0:
    print(f"\n=== WINDOW MAXIMUM SPIKE ANALYSIS (>{X_PERCENT}% max in {Y_SECONDS}s) ===")
    
    # Find the largest window maximum spikes
    largest_window_spikes = spikes_window.nlargest(10, 'max_pct_change_window')
    
    print(f"Top 10 largest window maximum changes:")
    print(f"{'Rank':<4} {'Time':<12} {'Price':<8} {'Max Pct Change':<15}")
    print("-" * 60)
    
    for idx, (_, row) in enumerate(largest_window_spikes.iterrows(), 1):
        print(f"{idx:<4} {row['date'].strftime('%H:%M:%S'):<12} ₹{row['price']:<7.2f} {row['max_pct_change_window']:<14.2f}%")

# Time-based spike analysis
print(f"\n=== TIME-BASED SPIKE ANALYSIS ===")

# Group spikes by hour
if len(spikes_simple) > 0:
    spikes_simple['hour'] = spikes_simple['date'].dt.hour
    hourly_spikes = spikes_simple.groupby('hour').size()
    
    print("Hourly distribution of spikes:")
    for hour, count in hourly_spikes.items():
        print(f"  {hour:02d}:00-{hour:02d}:59: {count} spikes")

# Spike clustering analysis
print(f"\n=== SPIKE CLUSTERING ANALYSIS ===")

if len(spikes_simple) > 1:
    # Check if spikes are clustered in time
    spikes_sorted = spikes_simple.sort_values('date')
    time_diffs = spikes_sorted['date'].diff().dt.total_seconds()
    
    clustered_spikes = time_diffs[time_diffs <= 300]  # Within 5 minutes
    print(f"Spikes within 5 minutes of each other: {len(clustered_spikes)}")
    print(f"Clustering percentage: {(len(clustered_spikes)/len(spikes_simple)*100):.1f}%")

# Summary statistics
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Total price ticks analyzed: {len(df):,}")
print(f"Spike threshold: {X_PERCENT}%")
print(f"Time window: {Y_SECONDS} seconds")
print(f"Total spikes detected: {len(spikes_simple):,}")
print(f"Spike frequency: {len(spikes_simple)/len(df)*100:.2f}%")

if len(spikes_simple) > 0:
    print(f"Largest spike: {spikes_simple['price_pct_change'].max():.2f}%")
    print(f"Average spike magnitude: {spikes_simple['price_pct_change'].abs().mean():.2f}%")

# Export spike data
print(f"\n=== EXPORT DATA ===")
print("Spike data is available in:")
print("- 'spikes_simple': Simple percentage change spikes")
print("- 'spikes_rolling': Rolling window spikes")
print("- 'spikes_window': Window maximum spikes")

# Display sample of spike data
if len(spikes_simple) > 0:
    print(f"\n=== SAMPLE SPIKE DATA (First 5 rows) ===")
    sample_spikes = spikes_simple[['date', 'price', 'price_pct_change', 'spike_type', 'qty']].head(5)
    print(sample_spikes)

=== PRICE SPIKE DETECTION ANALYSIS ===
Detecting price spikes > 0.5% within 60 seconds...
Calculating percentage price changes...
Calculating rolling maximum percentage changes...
Identifying price spikes...

=== SPIKE DETECTION RESULTS ===
Simple percentage change spikes: 0
Rolling window spikes: 8
Window maximum spikes: 58

=== ROLLING WINDOW SPIKE ANALYSIS (>0.5% in 60s) ===
Top 10 largest rolling window changes:
Rank Time         Price    Pct Change   Type  
------------------------------------------------------------
1    09:16:50     ₹387.75  0.56       % Up    
2    09:16:51     ₹387.75  0.54       % Up    
3    09:18:23     ₹389.45  0.54       % Up    
4    09:16:48     ₹387.60  0.52       % Up    
5    09:16:45     ₹387.75  0.52       % Up    
6    09:16:44     ₹387.75  0.52       % Up    
7    09:16:47     ₹387.60  0.51       % Up    
8    09:16:46     ₹387.60  0.51       % Up    

=== WINDOW MAXIMUM SPIKE ANALYSIS (>0.5% max in 60s) ===
Top 10 largest window maximum changes:

In [31]:
# Detect volume bursts: qty > 3× rolling average
print("=== VOLUME BURST DETECTION ANALYSIS ===")

# Set parameters for volume burst detection
MULTIPLIER = 3.0  # Volume burst threshold multiplier
ROLLING_WINDOW = 20  # Rolling average window size

print(f"Detecting volume bursts where qty > {MULTIPLIER}× rolling average...")
print(f"Rolling average window: {ROLLING_WINDOW} trades")

# Ensure data is sorted by datetime
df = df.sort_values('date').reset_index(drop=True)

# Calculate rolling average of quantity
print("Calculating rolling average of quantity...")
df['qty_rolling_avg'] = df['qty'].rolling(window=ROLLING_WINDOW, min_periods=1).mean()

# Calculate rolling standard deviation for additional context
df['qty_rolling_std'] = df['qty'].rolling(window=ROLLING_WINDOW, min_periods=1).std()

# Calculate volume burst threshold
df['volume_burst_threshold'] = df['qty_rolling_avg'] * MULTIPLIER

# Identify volume bursts
volume_bursts = df[df['qty'] > df['volume_burst_threshold']].copy()
volume_bursts = volume_bursts.sort_values('qty', ascending=False)

print(f"\n=== VOLUME BURST DETECTION RESULTS ===")
print(f"Total volume bursts detected: {len(volume_bursts):,}")
print(f"Percentage of total trades: {(len(volume_bursts)/len(df)*100):.2f}%")

if len(volume_bursts) > 0:
    # Calculate burst statistics
    total_burst_volume = volume_bursts['qty'].sum()
    total_burst_turnover = volume_bursts['trnvr'].sum()
    
    print(f"Total volume in bursts: {total_burst_volume:,}")
    print(f"Percentage of total volume: {(total_burst_volume/df['qty'].sum()*100):.2f}%")
    print(f"Total turnover in bursts: ₹{total_burst_turnover:,.2f}")
    print(f"Percentage of total turnover: {(total_burst_turnover/df['trnvr'].sum()*100):.2f}%")

# Display all volume bursts
if len(volume_bursts) > 0:
    print(f"\n=== ALL VOLUME BURSTS (>{MULTIPLIER}× rolling average) ===")
    print(f"{'Rank':<4} {'Time':<12} {'Price':<8} {'Quantity':<10} {'Rolling Avg':<12} {'Multiple':<10} {'Turnover':<12}")
    print("-" * 90)
    
    for idx, (_, row) in enumerate(volume_bursts.iterrows(), 1):
        multiple = row['qty'] / row['qty_rolling_avg']
        print(f"{idx:<4} {row['date'].strftime('%H:%M:%S'):<12} ₹{row['price']:<7.2f} {row['qty']:<10,} {row['qty_rolling_avg']:<11,.0f} {multiple:<9.1f}x ₹{row['trnvr']:<11,.2f}")

# Detailed analysis of volume bursts
if len(volume_bursts) > 0:
    print(f"\n=== DETAILED VOLUME BURST ANALYSIS ===")
    
    # 1. Burst magnitude analysis
    print("\n--- BURST MAGNITUDE ANALYSIS ---")
    # Calculate multiple directly in the DataFrame
    volume_bursts['burst_multiple'] = volume_bursts['qty'] / volume_bursts['qty_rolling_avg']
    
    print(f"Burst magnitude statistics:")
    print(f"  Average multiple: {volume_bursts['burst_multiple'].mean():.1f}x")
    print(f"  Median multiple: {volume_bursts['burst_multiple'].median():.1f}x")
    print(f"  Maximum multiple: {volume_bursts['burst_multiple'].max():.1f}x")
    print(f"  Minimum multiple: {volume_bursts['burst_multiple'].min():.1f}x")
    
    # 2. Burst size categorization
    print(f"\n--- BURST SIZE CATEGORIZATION ---")
    
    def categorize_burst_size(multiple):
        if multiple <= 5:
            return 'Moderate (3-5x)'
        elif multiple <= 10:
            return 'High (5-10x)'
        elif multiple <= 20:
            return 'Very High (10-20x)'
        else:
            return 'Extreme (>20x)'
    
    volume_bursts['burst_category'] = volume_bursts['burst_multiple'].apply(categorize_burst_size)
    burst_categories = volume_bursts['burst_category'].value_counts()
    
    print("Burst size distribution:")
    for category, count in burst_categories.items():
        print(f"  {category}: {count} bursts")
    
    # 3. Time distribution of bursts
    print(f"\n--- TIME DISTRIBUTION OF BURSTS ---")
    volume_bursts['hour'] = volume_bursts['date'].dt.hour
    hourly_bursts = volume_bursts['hour'].value_counts().sort_index()
    
    print("Hourly distribution of volume bursts:")
    for hour, count in hourly_bursts.items():
        print(f"  {hour:02d}:00-{hour:02d}:59: {count} bursts")
    
    # 4. Price analysis during bursts
    print(f"\n--- PRICE ANALYSIS DURING BURSTS ---")
    print(f"Average price during bursts: ₹{volume_bursts['price'].mean():.2f}")
    print(f"Price range during bursts: ₹{volume_bursts['price'].min():.2f} - ₹{volume_bursts['price'].max():.2f}")
    
    # Check if bursts coincide with price movements
    if 'price_change' in df.columns:
        burst_price_changes = volume_bursts['price_change'].dropna()
        if len(burst_price_changes) > 0:
            print(f"Average price change during bursts: ₹{burst_price_changes.mean():.4f}")
            print(f"Price change range during bursts: ₹{burst_price_changes.min():.4f} - ₹{burst_price_changes.max():.4f}")
    
    # 5. Burst clustering analysis
    print(f"\n--- BURST CLUSTERING ANALYSIS ---")
    if len(volume_bursts) > 1:
        bursts_sorted = volume_bursts.sort_values('date')
        time_diffs = bursts_sorted['date'].diff().dt.total_seconds() / 60  # in minutes
        
        clustered_bursts = time_diffs[time_diffs <= 5]  # Within 5 minutes
        print(f"Bursts within 5 minutes of each other: {len(clustered_bursts)}")
        print(f"Clustering percentage: {(len(clustered_bursts)/len(volume_bursts)*100):.1f}%")
    
    # 6. Top volume bursts
    print(f"\n--- TOP 10 VOLUME BURSTS BY MULTIPLE ---")
    # Use the burst_multiple column directly from the DataFrame
    top_bursts = volume_bursts.nlargest(10, 'burst_multiple')
    
    print(f"{'Rank':<4} {'Time':<12} {'Quantity':<10} {'Multiple':<10} {'Rolling Avg':<12} {'Price':<8}")
    print("-" * 70)
    
    for idx, (_, row) in enumerate(top_bursts.iterrows(), 1):
        print(f"{idx:<4} {row['date'].strftime('%H:%M:%S'):<12} {row['qty']:<10,} {row['burst_multiple']:<9.1f}x {row['qty_rolling_avg']:<11,.0f} ₹{row['price']:<7.2f}")

# Comparison with overall statistics
print(f"\n=== COMPARISON WITH OVERALL STATISTICS ===")
overall_avg_qty = df['qty'].mean()
overall_median_qty = df['qty'].median()

print(f"Overall average quantity: {overall_avg_qty:,.0f}")
print(f"Overall median quantity: {overall_median_qty:,.0f}")

if len(volume_bursts) > 0:
    burst_avg_qty = volume_bursts['qty'].mean()
    print(f"Average quantity during bursts: {burst_avg_qty:,.0f}")
    print(f"Burst average vs overall average: {burst_avg_qty/overall_avg_qty:.1f}x")

# Rolling average analysis
print(f"\n=== ROLLING AVERAGE ANALYSIS ===")
print(f"Rolling average window size: {ROLLING_WINDOW} trades")
print(f"Average rolling average: {df['qty_rolling_avg'].mean():,.0f}")
print(f"Rolling average range: {df['qty_rolling_avg'].min():,.0f} - {df['qty_rolling_avg'].max():,.0f}")

# Export volume burst data
print(f"\n=== EXPORT DATA ===")
print("Volume burst data is available in 'volume_bursts' DataFrame")
print("Additional columns added:")
print("- 'qty_rolling_avg': Rolling average of quantity")
print("- 'qty_rolling_std': Rolling standard deviation of quantity")
print("- 'volume_burst_threshold': Burst detection threshold")
print("- 'burst_multiple': Multiple of rolling average")
print("- 'burst_category': Categorization of burst magnitude")

# Display sample of volume burst data
if len(volume_bursts) > 0:
    print(f"\n=== SAMPLE VOLUME BURST DATA (First 5 rows) ===")
    sample_bursts = volume_bursts[['date', 'price', 'qty', 'qty_rolling_avg', 'burst_multiple', 'trnvr']].head(5)
    print(sample_bursts)

=== VOLUME BURST DETECTION ANALYSIS ===
Detecting volume bursts where qty > 3.0× rolling average...
Rolling average window: 20 trades
Calculating rolling average of quantity...

=== VOLUME BURST DETECTION RESULTS ===
Total volume bursts detected: 1,008
Percentage of total trades: 9.20%
Total volume in bursts: 6,070,933
Percentage of total volume: 52.74%
Total turnover in bursts: ₹2,347,871,874.70
Percentage of total turnover: 52.74%

=== ALL VOLUME BURSTS (>3.0× rolling average) ===
Rank Time         Price    Quantity   Rolling Avg  Multiple   Turnover    
------------------------------------------------------------------------------------------
1    15:01:57     ₹387.05  153,858    7,855       19.6     x ₹59,550,738.90
2    09:20:41     ₹389.30  67,256     7,164       9.4      x ₹26,182,760.80
3    12:08:06     ₹385.05  60,580     4,271       14.2     x ₹23,326,329.00
4    12:08:12     ₹384.80  55,402     6,283       8.8      x ₹21,318,689.60
5    14:40:45     ₹385.95  54,602     4,60

In [33]:
# Flag periods with zero trading for long durations (possible breaks)
print("=== ZERO TRADING DURATION ANALYSIS ===")

# Set parameters for break detection
MIN_BREAK_DURATION_MINUTES = 5  # Minimum duration to consider as a break
MARKET_HOURS = (9, 15)  # Market hours (9 AM to 3 PM)

print(f"Detecting trading breaks longer than {MIN_BREAK_DURATION_MINUTES} minutes...")
print(f"Market hours: {MARKET_HOURS[0]:02d}:00 - {MARKET_HOURS[1]:02d}:59")

# Ensure data is sorted by datetime
df = df.sort_values('date').reset_index(drop=True)

# Calculate time differences between consecutive trades
print("Calculating time gaps between trades...")
df['next_trade_time'] = df['date'].shift(-1)
df['time_gap_minutes'] = (df['next_trade_time'] - df['date']).dt.total_seconds() / 60

# Identify potential breaks (gaps longer than threshold)
potential_breaks = df[df['time_gap_minutes'] > MIN_BREAK_DURATION_MINUTES].copy()
potential_breaks = potential_breaks.sort_values('time_gap_minutes', ascending=False)

print(f"\n=== TRADING BREAK DETECTION RESULTS ===")
print(f"Total potential breaks detected: {len(potential_breaks):,}")
print(f"Break threshold: {MIN_BREAK_DURATION_MINUTES} minutes")

if len(potential_breaks) > 0:
    # Calculate break statistics
    total_break_time = potential_breaks['time_gap_minutes'].sum()
    avg_break_duration = potential_breaks['time_gap_minutes'].mean()
    max_break_duration = potential_breaks['time_gap_minutes'].max()
    
    print(f"Total break time: {total_break_time:.1f} minutes")
    print(f"Average break duration: {avg_break_duration:.1f} minutes")
    print(f"Longest break: {max_break_duration:.1f} minutes")

# Display all potential breaks
if len(potential_breaks) > 0:
    print(f"\n=== ALL TRADING BREAKS (>{MIN_BREAK_DURATION_MINUTES} minutes) ===")
    print(f"{'Rank':<4} {'Start Time':<12} {'End Time':<12} {'Duration':<10} {'Start Price':<12} {'End Price':<12} {'Price Change':<12}")
    print("-" * 90)
    
    for idx, (_, row) in enumerate(potential_breaks.iterrows(), 1):
        start_time = row['date']
        end_time = row['next_trade_time']
        duration = row['time_gap_minutes']
        start_price = row['price']
        
        # Find the price at the end of the break
        end_price = df[df['date'] == end_time]['price'].iloc[0] if not df[df['date'] == end_time].empty else start_price
        price_change = end_price - start_price
        price_change_pct = (price_change / start_price) * 100 if start_price != 0 else 0
        
        print(f"{idx:<4} {start_time.strftime('%H:%M:%S'):<12} {end_time.strftime('%H:%M:%S'):<12} {duration:<9.1f}m ₹{start_price:<11.2f} ₹{end_price:<11.2f} ₹{price_change:<11.2f} ({price_change_pct:+.2f}%)")

# Detailed analysis of trading breaks
if len(potential_breaks) > 0:
    print(f"\n=== DETAILED BREAK ANALYSIS ===")
    
    # 1. Break duration categorization
    print("\n--- BREAK DURATION CATEGORIZATION ---")
    
    def categorize_break_duration(minutes):
        if minutes <= 10:
            return 'Short (5-10 min)'
        elif minutes <= 30:
            return 'Medium (10-30 min)'
        elif minutes <= 60:
            return 'Long (30-60 min)'
        else:
            return 'Very Long (>60 min)'
    
    potential_breaks['break_category'] = potential_breaks['time_gap_minutes'].apply(categorize_break_duration)
    break_categories = potential_breaks['break_category'].value_counts()
    
    print("Break duration distribution:")
    for category, count in break_categories.items():
        print(f"  {category}: {count} breaks")
    
    # 2. Time distribution of breaks
    print(f"\n--- TIME DISTRIBUTION OF BREAKS ---")
    potential_breaks['hour'] = potential_breaks['date'].dt.hour
    hourly_breaks = potential_breaks['hour'].value_counts().sort_index()
    
    print("Hourly distribution of trading breaks:")
    for hour, count in hourly_breaks.items():
        print(f"  {hour:02d}:00-{hour:02d}:59: {count} breaks")
    
    # 3. Break impact analysis
    print(f"\n--- BREAK IMPACT ANALYSIS ---")
    
    # Calculate price changes across breaks
    break_impacts = []
    for idx, (_, row) in enumerate(potential_breaks.iterrows()):
        start_time = row['date']
        end_time = row['next_trade_time']
        duration = row['time_gap_minutes']
        start_price = row['price']
        
        # Find the price at the end of the break
        end_price = df[df['date'] == end_time]['price'].iloc[0] if not df[df['date'] == end_time].empty else start_price
        price_change = end_price - start_price
        price_change_pct = (price_change / start_price) * 100 if start_price != 0 else 0
        
        break_impacts.append({
            'start_time': start_time,
            'end_time': end_time,
            'duration': duration,
            'start_price': start_price,
            'end_price': end_price,
            'price_change': price_change,
            'price_change_pct': price_change_pct
        })
    
    break_impacts_df = pd.DataFrame(break_impacts)
    
    if len(break_impacts_df) > 0:
        print(f"Price change statistics across breaks:")
        print(f"  Average price change: ₹{break_impacts_df['price_change'].mean():.4f}")
        print(f"  Average percentage change: {break_impacts_df['price_change_pct'].mean():.2f}%")
        print(f"  Largest price increase: ₹{break_impacts_df['price_change'].max():.4f}")
        print(f"  Largest price decrease: ₹{break_impacts_df['price_change'].min():.4f}")
        
        # Count positive vs negative price changes
        positive_changes = (break_impacts_df['price_change'] > 0).sum()
        negative_changes = (break_impacts_df['price_change'] < 0).sum()
        no_changes = (break_impacts_df['price_change'] == 0).sum()
        
        print(f"\nPrice change direction across breaks:")
        print(f"  Price increases: {positive_changes} breaks")
        print(f"  Price decreases: {negative_changes} breaks")
        print(f"  No change: {no_changes} breaks")
    
    # 4. Longest breaks analysis
    print(f"\n--- LONGEST BREAKS ANALYSIS ---")
    longest_breaks = potential_breaks.nlargest(5, 'time_gap_minutes')
    
    print("Top 5 longest trading breaks:")
    for idx, (_, row) in enumerate(longest_breaks.iterrows(), 1):
        print(f"{idx}. {row['date'].strftime('%H:%M:%S')} - {row['next_trade_time'].strftime('%H:%M:%S')} ({row['time_gap_minutes']:.1f} minutes)")
    
    # 5. Break clustering analysis
    print(f"\n--- BREAK CLUSTERING ANALYSIS ---")
    if len(potential_breaks) > 1:
        breaks_sorted = potential_breaks.sort_values('date')
        break_time_diffs = breaks_sorted['date'].diff().dt.total_seconds() / 60  # in minutes
        
        clustered_breaks = break_time_diffs[break_time_diffs <= 30]  # Within 30 minutes
        print(f"Breaks within 30 minutes of each other: {len(clustered_breaks)}")
        print(f"Clustering percentage: {(len(clustered_breaks)/len(potential_breaks)*100):.1f}%")

# Market hours analysis
print(f"\n=== MARKET HOURS ANALYSIS ===")
market_start = pd.Timestamp(df['date'].dt.date.iloc[0]).replace(hour=MARKET_HOURS[0], minute=0, second=0)
market_end = pd.Timestamp(df['date'].dt.date.iloc[0]).replace(hour=MARKET_HOURS[1], minute=59, second=59)

total_market_minutes = (market_end - market_start).total_seconds() / 60
trading_minutes = len(df)  # Assuming 1 tick = 1 minute (approximate)
break_minutes = total_market_minutes - trading_minutes

print(f"Market hours: {market_start.strftime('%H:%M')} - {market_end.strftime('%H:%M')}")
print(f"Total market minutes: {total_market_minutes:.0f}")
print(f"Trading minutes: {trading_minutes}")
print(f"Break minutes: {break_minutes:.0f}")
print(f"Break percentage: {(break_minutes/total_market_minutes*100):.1f}%")

# Summary statistics
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Total trades analyzed: {len(df):,}")
print(f"Total potential breaks: {len(potential_breaks):,}")
print(f"Break detection threshold: {MIN_BREAK_DURATION_MINUTES} minutes")

if len(potential_breaks) > 0:
    print(f"Average break duration: {potential_breaks['time_gap_minutes'].mean():.1f} minutes")
    print(f"Total break time: {potential_breaks['time_gap_minutes'].sum():.1f} minutes")
    print(f"Break frequency: {len(potential_breaks)/len(df)*100:.2f}%")

# Export break data
print(f"\n=== EXPORT DATA ===")
print("Trading break data is available in:")
print("- 'potential_breaks': DataFrame with all detected breaks")
print("- 'break_impacts_df': DataFrame with price impact analysis")

# Display sample of break data
if len(potential_breaks) > 0:
    print(f"\n=== SAMPLE BREAK DATA (First 5 rows) ===")
    sample_breaks = potential_breaks[['date', 'next_trade_time', 'time_gap_minutes', 'price', 'break_category']].head(5)
    print(sample_breaks)

=== ZERO TRADING DURATION ANALYSIS ===
Detecting trading breaks longer than 5 minutes...
Market hours: 09:00 - 15:59
Calculating time gaps between trades...

=== TRADING BREAK DETECTION RESULTS ===
Total potential breaks detected: 0
Break threshold: 5 minutes

=== MARKET HOURS ANALYSIS ===
Market hours: 09:00 - 15:59
Total market minutes: 420
Trading minutes: 10956
Break minutes: -10536
Break percentage: -2508.7%

=== SUMMARY STATISTICS ===
Total trades analyzed: 10,956
Total potential breaks: 0
Break detection threshold: 5 minutes

=== EXPORT DATA ===
Trading break data is available in:
- 'potential_breaks': DataFrame with all detected breaks
- 'break_impacts_df': DataFrame with price impact analysis
