In [1]:
import pandas as pd
import os

# Define the file path
file_path = r"D:\Market Projects\options_data_analyzer\Aug '25\Aug 07 Exp\07 Aug\BEL_EQ.csv"

# Load CSV with pandas
# Using default encoding (utf-8) and comma delimiter
# The file appears to have standard CSV format
df = pd.read_csv(file_path)

# Display basic information about the loaded data
print("Data loaded successfully!")
print(f"File path: {file_path}")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("\nFirst few rows:")
print(df.head())
print("\nData types:")
print(df.dtypes)
print("\nBasic statistics:")
print(df.describe())

Data loaded successfully!
File path: D:\Market Projects\options_data_analyzer\Aug '25\Aug 07 Exp\07 Aug\BEL_EQ.csv
Shape: (18250, 5)
Columns: ['date', 'price', 'qty', 'trnvr', 'cum_trnvr']

First few rows:
                     date   price    qty        trnvr    cum_trnvr
0  2025-08-07 09:15:00 AM  386.85  65740  25431519.00  25431519.00
1  2025-08-07 09:15:01 AM  386.65      0         0.00  25431519.00
2  2025-08-07 09:15:01 AM  386.30      0         0.00  25431519.00
3  2025-08-07 09:15:01 AM  386.30    895    345738.50  25777257.50
4  2025-08-07 09:15:01 AM  386.75   1401    541836.75  26319094.25

Data types:
date          object
price        float64
qty            int64
trnvr        float64
cum_trnvr    float64
dtype: object

Basic statistics:
              price            qty         trnvr     cum_trnvr
count  18250.000000   18250.000000  1.825000e+04  1.825000e+04
mean     386.537181     630.694301  2.439435e+05  2.317732e+09
std        1.570300    2651.792039  1.026096e+06  1.

In [2]:
# Preview first and last 10 rows to check ordering
print("=== FIRST 10 ROWS ===")
print(df.head(10))
print("\n" + "="*50 + "\n")
print("=== LAST 10 ROWS ===")
print(df.tail(10))

# Check if data is in chronological order
print("\n" + "="*50 + "\n")
print("=== CHRONOLOGICAL ORDER CHECK ===")

# Convert date column to datetime if not already
df['date'] = pd.to_datetime(df['date'])

# Check first and last timestamps
first_time = df['date'].iloc[0]
last_time = df['date'].iloc[-1]

print(f"First timestamp: {first_time}")
print(f"Last timestamp: {last_time}")

# Check if chronological (ascending) or reverse chronological (descending)
if first_time < last_time:
    print("✓ Data is in CHRONOLOGICAL order (ascending)")
    print("  - First row: Earliest time")
    print("  - Last row: Latest time")
else:
    print("✗ Data is in REVERSE CHRONOLOGICAL order (descending)")
    print("  - First row: Latest time")
    print("  - Last row: Earliest time")

# Show time range
time_range = last_time - first_time
print(f"\nTotal time range: {time_range}")

=== FIRST 10 ROWS ===
                     date   price    qty        trnvr    cum_trnvr
0  2025-08-07 09:15:00 AM  386.85  65740  25431519.00  25431519.00
1  2025-08-07 09:15:01 AM  386.65      0         0.00  25431519.00
2  2025-08-07 09:15:01 AM  386.30      0         0.00  25431519.00
3  2025-08-07 09:15:01 AM  386.30    895    345738.50  25777257.50
4  2025-08-07 09:15:01 AM  386.75   1401    541836.75  26319094.25
5  2025-08-07 09:15:02 AM  386.80   1795    694306.00  27013400.25
6  2025-08-07 09:15:02 AM  386.95    741    286729.95  27300130.20
7  2025-08-07 09:15:03 AM  386.85      0         0.00  27300130.20
8  2025-08-07 09:15:03 AM  386.50      0         0.00  27300130.20
9  2025-08-07 09:15:03 AM  386.90   2717   1051207.30  28351337.50


=== LAST 10 ROWS ===
                         date   price   qty       trnvr     cum_trnvr
18240  2025-08-07 03:29:31 PM  388.25   226    87744.50  4.446928e+09
18241  2025-08-07 03:29:31 PM  388.10     0        0.00  4.446928e+09
18242  2

In [3]:
# Convert date to datetime64[ns] and extract datetime features
print("=== DATETIME CONVERSION AND FEATURE EXTRACTION ===")

# Convert date column to datetime64[ns]
df['date'] = pd.to_datetime(df['date'])

# Extract additional datetime features
df['date_only'] = df['date'].dt.date
df['time'] = df['date'].dt.time
df['hour'] = df['date'].dt.hour
df['minute'] = df['date'].dt.minute
df['second'] = df['date'].dt.second

# Display the new datetime features
print("New datetime features added:")
print(f"  - date_only: {df['date_only'].dtype}")
print(f"  - time: {df['time'].dtype}")
print(f"  - hour: {df['hour'].dtype}")
print(f"  - minute: {df['minute'].dtype}")
print(f"  - second: {df['second'].dtype}")

# Show sample of the enhanced dataframe
print("\n=== SAMPLE DATA WITH NEW FEATURES ===")
print(df[['date', 'date_only', 'time', 'hour', 'minute', 'second', 'price', 'qty']].head(10))

# Verify datetime conversion
print(f"\n=== DATETIME VERIFICATION ===")
print(f"Original date column dtype: {df['date'].dtype}")
print(f"First timestamp: {df['date'].iloc[0]}")
print(f"Last timestamp: {df['date'].iloc[-1]}")
print(f"Total unique dates: {df['date_only'].nunique()}")
print(f"Date range: {df['date_only'].min()} to {df['date_only'].max()}")

=== DATETIME CONVERSION AND FEATURE EXTRACTION ===
New datetime features added:
  - date_only: object
  - time: object
  - hour: int32
  - minute: int32
  - second: int32

=== SAMPLE DATA WITH NEW FEATURES ===
                 date   date_only      time  hour  minute  second   price  \
0 2025-08-07 09:15:00  2025-08-07  09:15:00     9      15       0  386.85   
1 2025-08-07 09:15:01  2025-08-07  09:15:01     9      15       1  386.65   
2 2025-08-07 09:15:01  2025-08-07  09:15:01     9      15       1  386.30   
3 2025-08-07 09:15:01  2025-08-07  09:15:01     9      15       1  386.30   
4 2025-08-07 09:15:01  2025-08-07  09:15:01     9      15       1  386.75   
5 2025-08-07 09:15:02  2025-08-07  09:15:02     9      15       2  386.80   
6 2025-08-07 09:15:02  2025-08-07  09:15:02     9      15       2  386.95   
7 2025-08-07 09:15:03  2025-08-07  09:15:03     9      15       3  386.85   
8 2025-08-07 09:15:03  2025-08-07  09:15:03     9      15       3  386.50   
9 2025-08-07 09:15:0

In [4]:
# Handle zero-quantity trades - remove rows with 0 qty
print("=== HANDLING ZERO-QUANTITY TRADES ===")

# Check current data shape and zero qty count
print(f"Original data shape: {df.shape}")
zero_qty_count = (df['qty'] == 0).sum()
print(f"Rows with zero quantity: {zero_qty_count}")
print(f"Percentage of zero qty rows: {(zero_qty_count/len(df)*100):.2f}%")

# Show sample of zero qty rows before removal
print("\n=== SAMPLE OF ZERO QTY ROWS (BEFORE REMOVAL) ===")
zero_qty_sample = df[df['qty'] == 0][['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].head(5)
print(zero_qty_sample)

# Remove rows with zero quantity
df_clean = df[df['qty'] > 0].copy()

# Reset index after filtering
df_clean = df_clean.reset_index(drop=True)

# Display results after cleaning
print(f"\n=== AFTER CLEANING ===")
print(f"Cleaned data shape: {df_clean.shape}")
print(f"Rows removed: {len(df) - len(df_clean)}")
print(f"Remaining rows: {len(df_clean)}")

# Show sample of cleaned data
print("\n=== SAMPLE OF CLEANED DATA ===")
print(df_clean[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].head(10))

# Verify no zero qty rows remain
remaining_zero_qty = (df_clean['qty'] == 0).sum()
print(f"\nZero qty rows remaining: {remaining_zero_qty}")

# Update the main dataframe reference
df = df_clean
print(f"\n✓ Main dataframe 'df' now contains {len(df)} rows with non-zero quantities")

=== HANDLING ZERO-QUANTITY TRADES ===
Original data shape: (18250, 10)
Rows with zero quantity: 7294
Percentage of zero qty rows: 39.97%

=== SAMPLE OF ZERO QTY ROWS (BEFORE REMOVAL) ===
                  date   price  qty  trnvr    cum_trnvr
1  2025-08-07 09:15:01  386.65    0    0.0  25431519.00
2  2025-08-07 09:15:01  386.30    0    0.0  25431519.00
7  2025-08-07 09:15:03  386.85    0    0.0  27300130.20
8  2025-08-07 09:15:03  386.50    0    0.0  27300130.20
13 2025-08-07 09:15:05  386.45    0    0.0  32995767.85

=== AFTER CLEANING ===
Cleaned data shape: (10956, 10)
Rows removed: 7294
Remaining rows: 10956

=== SAMPLE OF CLEANED DATA ===
                 date   price    qty        trnvr    cum_trnvr
0 2025-08-07 09:15:00  386.85  65740  25431519.00  25431519.00
1 2025-08-07 09:15:01  386.30    895    345738.50  25777257.50
2 2025-08-07 09:15:01  386.75   1401    541836.75  26319094.25
3 2025-08-07 09:15:02  386.80   1795    694306.00  27013400.25
4 2025-08-07 09:15:02  386.95    

In [5]:
# Validate numeric columns for negatives or outliers
print("=== NUMERIC COLUMN VALIDATION ===")

# List of numeric columns to validate
numeric_cols = ['price', 'qty', 'trnvr', 'cum_trnvr']

# Check for negative values
print("=== NEGATIVE VALUE CHECK ===")
for col in numeric_cols:
    negative_count = (df[col] < 0).sum()
    print(f"{col}: {negative_count} negative values")

# Check for zero values (after qty cleaning)
print("\n=== ZERO VALUE CHECK ===")
for col in numeric_cols:
    zero_count = (df[col] == 0).sum()
    print(f"{col}: {zero_count} zero values")

# Statistical summary for outlier detection
print("\n=== STATISTICAL SUMMARY ===")
print(df[numeric_cols].describe())

# Outlier detection using IQR method
print("\n=== OUTLIER DETECTION (IQR METHOD) ===")
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers_lower = (df[col] < lower_bound).sum()
    outliers_upper = (df[col] > upper_bound).sum()
    
    print(f"\n{col}:")
    print(f"  Q1: {Q1:.2f}, Q3: {Q3:.2f}, IQR: {IQR:.2f}")
    print(f"  Lower bound: {lower_bound:.2f}, Upper bound: {upper_bound:.2f}")
    print(f"  Outliers below lower bound: {outliers_lower}")
    print(f"  Outliers above upper bound: {outliers_upper}")

# Check for extreme values (beyond 3 standard deviations)
print("\n=== EXTREME VALUE CHECK (3 STD DEV) ===")
for col in numeric_cols:
    mean_val = df[col].mean()
    std_val = df[col].std()
    
    lower_3std = mean_val - 3 * std_val
    upper_3std = mean_val + 3 * std_val
    
    extreme_lower = (df[col] < lower_3std).sum()
    extreme_upper = (df[col] > upper_3std).sum()
    
    print(f"\n{col}:")
    print(f"  Mean: {mean_val:.2f}, Std: {std_val:.2f}")
    print(f"  Lower 3σ: {lower_3std:.2f}, Upper 3σ: {upper_3std:.2f}")
    print(f"  Extreme values below: {extreme_lower}")
    print(f"  Extreme values above: {extreme_upper}")

# Show sample of potential outliers
print("\n=== SAMPLE OF POTENTIAL OUTLIERS ===")
for col in numeric_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df[df[col] > upper_bound]
    if len(outliers) > 0:
        print(f"\n{col} outliers (top 5):")
        print(outliers[['date', col, 'qty', 'trnvr']].head())

# Data quality summary
print("\n=== DATA QUALITY SUMMARY ===")
print(f"Total rows: {len(df)}")
print(f"Columns with potential issues:")
for col in numeric_cols:
    issues = []
    if (df[col] < 0).any():
        issues.append("negative values")
    if (df[col] == 0).any() and col != 'qty':  # qty can legitimately be 0
        issues.append("zero values")
    
    if issues:
        print(f"  {col}: {', '.join(issues)}")
    else:
        print(f"  {col}: ✓ clean")

=== NUMERIC COLUMN VALIDATION ===
=== NEGATIVE VALUE CHECK ===
price: 0 negative values
qty: 0 negative values
trnvr: 0 negative values
cum_trnvr: 0 negative values

=== ZERO VALUE CHECK ===
price: 0 zero values
qty: 0 zero values
trnvr: 0 zero values
cum_trnvr: 0 zero values

=== STATISTICAL SUMMARY ===
              price            qty         trnvr     cum_trnvr
count  10956.000000   10956.000000  1.095600e+04  1.095600e+04
mean     386.561094    1050.581508  4.063499e+05  2.377823e+09
std        1.574324    3357.505716  1.299189e+06  1.165181e+09
min      383.500000       1.000000  3.836000e+02  2.543152e+07
25%      385.150000      26.000000  1.003177e+04  1.490814e+09
50%      386.450000     200.000000  7.732500e+04  2.355502e+09
75%      387.950000     799.000000  3.085895e+05  3.321858e+09
max      390.200000  153858.000000  5.955074e+07  4.451969e+09

=== OUTLIER DETECTION (IQR METHOD) ===

price:
  Q1: 385.15, Q3: 387.95, IQR: 2.80
  Lower bound: 380.95, Upper bound: 392.15


In [6]:
# Ensure sorting by datetime for time-series integrity
print("=== TIME-SERIES INTEGRITY CHECK AND SORTING ===")

# Check current sorting status
print("=== CURRENT SORTING STATUS ===")
print(f"First timestamp: {df['date'].iloc[0]}")
print(f"Last timestamp: {df['date'].iloc[-1]}")

# Check if data is already sorted
is_sorted = df['date'].is_monotonic_increasing
print(f"Data is already sorted chronologically: {is_sorted}")

# Check for any duplicate timestamps
duplicate_timestamps = df['date'].duplicated().sum()
print(f"Duplicate timestamps: {duplicate_timestamps}")

if duplicate_timestamps > 0:
    print("\n=== DUPLICATE TIMESTAMP ANALYSIS ===")
    duplicate_samples = df[df['date'].duplicated(keep=False)].sort_values('date')
    print("Sample duplicate timestamps:")
    print(duplicate_samples[['date', 'price', 'qty', 'trnvr']].head(10))

# Sort the dataframe by datetime
print("\n=== SORTING DATA BY DATETIME ===")
df_sorted = df.sort_values('date').reset_index(drop=True)

# Verify sorting
is_now_sorted = df_sorted['date'].is_monotonic_increasing
print(f"Data is now sorted chronologically: {is_now_sorted}")

# Display sorting verification
print(f"\n=== SORTING VERIFICATION ===")
print("First 5 rows after sorting:")
print(df_sorted[['date', 'price', 'qty', 'trnvr']].head())
print(f"\nLast 5 rows after sorting:")
print(df_sorted[['date', 'price', 'qty', 'trnvr']].tail())

# Check for any time gaps or irregularities
print(f"\n=== TIME SERIES CONTINUITY CHECK ===")
time_diffs = df_sorted['date'].diff().dropna()
print(f"Time differences between consecutive rows:")
print(f"  Min: {time_diffs.min()}")
print(f"  Max: {time_diffs.max()}")
print(f"  Mean: {time_diffs.mean()}")
print(f"  Most common: {time_diffs.mode().iloc[0] if len(time_diffs.mode()) > 0 else 'N/A'}")

# Check for any large time gaps
large_gaps = time_diffs[time_diffs > pd.Timedelta(minutes=5)]
if len(large_gaps) > 0:
    print(f"\n⚠️  Found {len(large_gaps)} time gaps larger than 5 minutes:")
    gap_indices = time_diffs[time_diffs > pd.Timedelta(minutes=5)].index
    for idx in gap_indices[:5]:  # Show first 5 gaps
        gap_start = df_sorted.loc[idx-1, 'date']
        gap_end = df_sorted.loc[idx, 'date']
        gap_duration = gap_end - gap_start
        print(f"  Gap: {gap_start} to {gap_end} (Duration: {gap_duration})")

# Update the main dataframe with sorted version
df = df_sorted
print(f"\n✓ Main dataframe 'df' is now properly sorted chronologically")
print(f"✓ Total rows: {len(df)}")
print(f"✓ Time range: {df['date'].min()} to {df['date'].max()}")

# Final verification
print(f"\n=== FINAL VERIFICATION ===")
print("✓ Data is sorted chronologically")
print("✓ Index is reset and sequential")
print("✓ Ready for time-series analysis")

=== TIME-SERIES INTEGRITY CHECK AND SORTING ===
=== CURRENT SORTING STATUS ===
First timestamp: 2025-08-07 09:15:00
Last timestamp: 2025-08-07 15:29:58
Data is already sorted chronologically: True
Duplicate timestamps: 1961

=== DUPLICATE TIMESTAMP ANALYSIS ===
Sample duplicate timestamps:
                  date   price   qty       trnvr
1  2025-08-07 09:15:01  386.30   895   345738.50
2  2025-08-07 09:15:01  386.75  1401   541836.75
3  2025-08-07 09:15:02  386.80  1795   694306.00
4  2025-08-07 09:15:02  386.95   741   286729.95
5  2025-08-07 09:15:03  386.90  2717  1051207.30
6  2025-08-07 09:15:03  386.80  9068  3507502.40
7  2025-08-07 09:15:04  386.75  1141   441281.75
8  2025-08-07 09:15:04  386.90  1798   695646.20
9  2025-08-07 09:15:05  386.65  2092   808871.80
10 2025-08-07 09:15:05  386.75  1679   649353.25

=== SORTING DATA BY DATETIME ===
Data is now sorted chronologically: True

=== SORTING VERIFICATION ===
First 5 rows after sorting:
                 date   price    qty 

In [7]:
# Get comprehensive descriptive statistics for numeric fields
print("=== COMPREHENSIVE DESCRIPTIVE STATISTICS ===")

# Get basic describe() for all numeric columns
print("=== BASIC DESCRIPTIVE STATISTICS ===")
print(df.describe())

# Get detailed statistics for each numeric column
print("\n" + "="*60)
print("=== DETAILED STATISTICS BY COLUMN ===")

numeric_cols = ['price', 'qty', 'trnvr', 'cum_trnvr']

for col in numeric_cols:
    print(f"\n--- {col.upper()} ---")
    col_stats = df[col].describe()
    
    print(f"Count: {col_stats['count']:,.0f}")
    print(f"Mean: {col_stats['mean']:,.2f}")
    print(f"Std: {col_stats['std']:,.2f}")
    print(f"Min: {col_stats['min']:,.2f}")
    print(f"25%: {col_stats['25%']:,.2f}")
    print(f"50% (Median): {col_stats['50%']:,.2f}")
    print(f"75%: {col_stats['75%']:,.2f}")
    print(f"Max: {col_stats['max']:,.2f}")
    
    # Additional useful statistics
    print(f"Range: {col_stats['max'] - col_stats['min']:,.2f}")
    print(f"IQR: {col_stats['75%'] - col_stats['25%']:,.2f}")
    print(f"Coefficient of Variation: {(col_stats['std']/col_stats['mean']*100):,.2f}%")

# Get statistics for datetime features
print("\n" + "="*60)
print("=== DATETIME FEATURE STATISTICS ===")

print("\n--- HOUR DISTRIBUTION ---")
hour_counts = df['hour'].value_counts().sort_index()
print(hour_counts)

print("\n--- MINUTE DISTRIBUTION (Sample) ---")
minute_counts = df['minute'].value_counts().sort_index().head(20)
print(minute_counts)

# Get statistics for specific time periods
print("\n" + "="*60)
print("=== TIME PERIOD ANALYSIS ===")

# Market hours analysis (assuming 9:15 AM to 3:30 PM)
market_hours = df[(df['hour'] >= 9) & (df['hour'] <= 15)]
print(f"Trades during market hours (9 AM - 3 PM): {len(market_hours):,}")
print(f"Trades outside market hours: {len(df) - len(market_hours):,}")

# Price range analysis
print(f"\n--- PRICE ANALYSIS ---")
print(f"Price range: ₹{df['price'].min():.2f} to ₹{df['price'].max():.2f}")
print(f"Price spread: ₹{df['price'].max() - df['price'].min():.2f}")

# Volume analysis
print(f"\n--- VOLUME ANALYSIS ---")
print(f"Total volume traded: {df['qty'].sum():,}")
print(f"Average trade size: {df['qty'].mean():.0f}")
print(f"Largest single trade: {df['qty'].max():,}")

# Turnover analysis
print(f"\n--- TURNOVER ANALYSIS ---")
print(f"Total turnover: ₹{df['trnvr'].sum():,.2f}")
print(f"Average trade value: ₹{df['trnvr'].mean():,.2f}")
print(f"Largest single trade value: ₹{df['trnvr'].max():,.2f}")

# Display summary table
print("\n" + "="*60)
print("=== SUMMARY TABLE ===")
summary_data = {
    'Metric': ['Total Rows', 'Price Range', 'Total Volume', 'Total Turnover', 'Time Range'],
    'Value': [
        f"{len(df):,}",
        f"₹{df['price'].min():.2f} - ₹{df['price'].max():.2f}",
        f"{df['qty'].sum():,}",
        f"₹{df['trnvr'].sum():,.2f}",
        f"{df['date'].min().strftime('%H:%M:%S')} to {df['date'].max().strftime('%H:%M:%S')}"
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

=== COMPREHENSIVE DESCRIPTIVE STATISTICS ===
=== BASIC DESCRIPTIVE STATISTICS ===
                                date         price            qty  \
count                          10956  10956.000000   10956.000000   
mean   2025-08-07 12:40:58.929810176    386.561094    1050.581508   
min              2025-08-07 09:15:00    383.500000       1.000000   
25%       2025-08-07 11:06:26.500000    385.150000      26.000000   
50%              2025-08-07 12:45:14    386.450000     200.000000   
75%    2025-08-07 14:26:31.249999872    387.950000     799.000000   
max              2025-08-07 15:29:58    390.200000  153858.000000   
std                              NaN      1.574324    3357.505716   

              trnvr     cum_trnvr          hour        minute        second  
count  1.095600e+04  1.095600e+04  10956.000000  10956.000000  10956.000000  
mean   4.063499e+05  2.377823e+09     12.209383     27.926798     29.543173  
min    3.836000e+02  2.543152e+07      9.000000      0.000000 

In [9]:
# Count total unique trading days
print("=== TRADING DAYS ANALYSIS ===")

# Extract unique dates from the datetime column
unique_dates = df['date_only'].unique()
total_trading_days = len(unique_dates)

print(f"Total unique trading days: {total_trading_days}")

# Display all unique trading dates
print(f"\n=== ALL TRADING DATES ===")
for i, date in enumerate(sorted(unique_dates), 1):
    print(f"{i:2d}. {date}")

# Get date range
date_range = f"{min(unique_dates)} to {max(unique_dates)}"
print(f"\nTrading date range: {date_range}")

# Check if all dates are from the same month/year
print(f"\n=== DATE ANALYSIS ===")
if total_trading_days == 1:
    print("Single trading day data")
    print(f"Date: {unique_dates[0]}")
    # Initialize these variables for single day to avoid errors
    months = {unique_dates[0].month}
    years = {unique_dates[0].year}
elif total_trading_days > 1:
    # Check month and year consistency
    months = set(date.month for date in unique_dates)
    years = set(date.year for date in unique_dates)
    
    print(f"Multiple trading days: {total_trading_days}")
    print(f"Months covered: {sorted(months)}")
    print(f"Years covered: {sorted(years)}")
    
    if len(months) == 1:
        month_name = pd.Timestamp(unique_dates[0]).strftime('%B')
        print(f"All dates are from: {month_name} {list(years)[0]}")
    
    if len(years) == 1:
        print(f"All dates are from year: {list(years)[0]}")

# Trading days by month (if multiple months)
if len(months) > 1:
    print(f"\n=== TRADING DAYS BY MONTH ===")
    monthly_counts = {}
    for date in unique_dates:
        month_key = f"{date.year}-{date.month:02d}"
        monthly_counts[month_key] = monthly_counts.get(month_key, 0) + 1
    
    for month_key in sorted(monthly_counts.keys()):
        year, month = month_key.split('-')
        month_name = pd.Timestamp(f"{year}-{month}-01").strftime('%B %Y')
        print(f"{month_name}: {monthly_counts[month_key]} trading days")

# Verify data consistency
print(f"\n=== DATA CONSISTENCY CHECK ===")
print(f"Total rows in dataset: {len(df):,}")
print(f"Average rows per trading day: {len(df)/total_trading_days:.1f}")

# Check for any missing dates in sequence (if multiple days)
if total_trading_days > 1:
    sorted_dates = sorted(unique_dates)
    date_diffs = []
    for i in range(1, len(sorted_dates)):
        diff = (sorted_dates[i] - sorted_dates[i-1]).days
        date_diffs.append(diff)
    
    if any(diff > 1 for diff in date_diffs):
        print(f"\n⚠️  Gaps detected in trading days:")
        for i, diff in enumerate(date_diffs):
            if diff > 1:
                gap_start = sorted_dates[i-1]
                gap_end = sorted_dates[i]
                print(f"  Gap: {gap_start} to {gap_end} ({diff-1} missing days)")
    else:
        print(f"\n✓ No gaps in trading days - consecutive trading days")

print(f"\n=== SUMMARY ===")
print(f"✓ Total unique trading days: {total_trading_days}")
print(f"✓ Date range: {date_range}")
print(f"✓ Ready for daily analysis and aggregation")

=== TRADING DAYS ANALYSIS ===
Total unique trading days: 1

=== ALL TRADING DATES ===
 1. 2025-08-07

Trading date range: 2025-08-07 to 2025-08-07

=== DATE ANALYSIS ===
Single trading day data
Date: 2025-08-07

=== DATA CONSISTENCY CHECK ===
Total rows in dataset: 10,956
Average rows per trading day: 10956.0

=== SUMMARY ===
✓ Total unique trading days: 1
✓ Date range: 2025-08-07 to 2025-08-07
✓ Ready for daily analysis and aggregation


In [10]:
# Identify earliest and latest timestamps
print("=== TIMESTAMP RANGE ANALYSIS ===")

# Get earliest and latest timestamps
earliest_timestamp = df['date'].min()
latest_timestamp = df['date'].max()

print(f"Earliest timestamp: {earliest_timestamp}")
print(f"Latest timestamp: {latest_timestamp}")

# Calculate total time duration
total_duration = latest_timestamp - earliest_timestamp
print(f"Total time duration: {total_duration}")

# Convert duration to more readable format
duration_hours = total_duration.total_seconds() / 3600
duration_minutes = total_duration.total_seconds() / 60

print(f"Duration in hours: {duration_hours:.2f} hours")
print(f"Duration in minutes: {duration_minutes:.0f} minutes")

# Display the actual rows with earliest and latest timestamps
print(f"\n=== EARLIEST TIMESTAMP ROW ===")
earliest_row = df[df['date'] == earliest_timestamp]
print(earliest_row[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].to_string(index=False))

print(f"\n=== LATEST TIMESTAMP ROW ===")
latest_row = df[df['date'] == latest_timestamp]
print(latest_row[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].to_string(index=False))

# Check if timestamps span across different time periods
print(f"\n=== TIME PERIOD ANALYSIS ===")
earliest_hour = earliest_timestamp.hour
earliest_minute = earliest_timestamp.minute
latest_hour = latest_timestamp.hour
latest_minute = latest_timestamp.minute

print(f"Earliest: {earliest_hour:02d}:{earliest_minute:02d}")
print(f"Latest: {latest_hour:02d}:{latest_minute:02d}")

# Market hours analysis (assuming 9:15 AM to 3:30 PM)
market_start = pd.Timestamp('2025-08-07 09:15:00')
market_end = pd.Timestamp('2025-08-07 15:30:00')

print(f"\nMarket hours: 09:15:00 to 15:30:00")
print(f"Data coverage:")

if earliest_timestamp < market_start:
    pre_market_duration = market_start - earliest_timestamp
    print(f"  Pre-market: {earliest_timestamp.strftime('%H:%M:%S')} to {market_start.strftime('%H:%M:%S')} ({pre_market_duration})")

if earliest_timestamp <= market_start and latest_timestamp >= market_end:
    market_duration = market_end - market_start
    print(f"  Market hours: {market_start.strftime('%H:%M:%S')} to {market_end.strftime('%H:%M:%S')} ({market_duration})")

if latest_timestamp > market_end:
    post_market_duration = latest_timestamp - market_end
    print(f"  Post-market: {market_end.strftime('%H:%M:%S')} to {latest_timestamp.strftime('%H:%M:%S')} ({post_market_duration})")

# Check for any gaps in the time series
print(f"\n=== TIME SERIES CONTINUITY ===")
time_diffs = df['date'].diff().dropna()
min_time_diff = time_diffs.min()
max_time_diff = time_diffs.max()

print(f"Minimum time difference between consecutive rows: {min_time_diff}")
print(f"Maximum time difference between consecutive rows: {max_time_diff}")

# Identify any unusually large time gaps
large_gaps = time_diffs[time_diffs > pd.Timedelta(minutes=1)]
if len(large_gaps) > 0:
    print(f"\n⚠️  Found {len(large_gaps)} time gaps larger than 1 minute:")
    gap_indices = large_gaps.index[:5]  # Show first 5 gaps
    for idx in gap_indices:
        gap_start = df.loc[idx-1, 'date']
        gap_end = df.loc[idx, 'date']
        gap_duration = gap_end - gap_start
        print(f"  Gap: {gap_start.strftime('%H:%M:%S')} to {gap_end.strftime('%H:%M:%S')} (Duration: {gap_duration})")

# Summary
print(f"\n=== SUMMARY ===")
print(f"✓ Data spans: {earliest_timestamp.strftime('%Y-%m-%d %H:%M:%S')} to {latest_timestamp.strftime('%Y-%m-%d %H:%M:%S')}")
print(f"✓ Total duration: {duration_hours:.2f} hours ({duration_minutes:.0f} minutes)")
print(f"✓ Total rows: {len(df):,}")
print(f"✓ Average frequency: {len(df)/duration_hours:.1f} ticks per hour")

=== TIMESTAMP RANGE ANALYSIS ===
Earliest timestamp: 2025-08-07 09:15:00
Latest timestamp: 2025-08-07 15:29:58
Total time duration: 0 days 06:14:58
Duration in hours: 6.25 hours
Duration in minutes: 375 minutes

=== EARLIEST TIMESTAMP ROW ===
               date  price   qty      trnvr  cum_trnvr
2025-08-07 09:15:00 386.85 65740 25431519.0 25431519.0

=== LATEST TIMESTAMP ROW ===
               date  price  qty    trnvr    cum_trnvr
2025-08-07 15:29:58 388.25 1740 675555.0 4.451969e+09

=== TIME PERIOD ANALYSIS ===
Earliest: 09:15
Latest: 15:29

Market hours: 09:15:00 to 15:30:00
Data coverage:

=== TIME SERIES CONTINUITY ===
Minimum time difference between consecutive rows: 0 days 00:00:00
Maximum time difference between consecutive rows: 0 days 00:00:12

=== SUMMARY ===
✓ Data spans: 2025-08-07 09:15:00 to 2025-08-07 15:29:58
✓ Total duration: 6.25 hours (375 minutes)
✓ Total rows: 10,956
✓ Average frequency: 1753.1 ticks per hour


In [11]:
# Calculate total traded volume and total turnover
print("=== VOLUME AND TURNOVER ANALYSIS ===")

# Calculate totals
total_volume = df['qty'].sum()
total_turnover = df['trnvr'].sum()

print(f"Total traded volume: {total_volume:,}")
print(f"Total turnover: ₹{total_turnover:,.2f}")

# Additional volume and turnover metrics
print(f"\n=== DETAILED METRICS ===")

# Volume analysis
print("--- VOLUME ANALYSIS ---")
avg_trade_size = df['qty'].mean()
median_trade_size = df['qty'].median()
max_trade_size = df['qty'].max()
min_trade_size = df['qty'].min()

print(f"Average trade size: {avg_trade_size:,.0f}")
print(f"Median trade size: {median_trade_size:,.0f}")
print(f"Largest single trade: {max_trade_size:,}")
print(f"Smallest single trade: {min_trade_size:,}")

# Turnover analysis
print(f"\n--- TURNOVER ANALYSIS ---")
avg_trade_value = df['trnvr'].mean()
median_trade_value = df['trnvr'].median()
max_trade_value = df['trnvr'].max()
min_trade_value = df['trnvr'].min()

print(f"Average trade value: ₹{avg_trade_value:,.2f}")
print(f"Median trade value: ₹{median_trade_value:,.2f}")
print(f"Largest single trade value: ₹{max_trade_value:,.2f}")
print(f"Smallest single trade value: ₹{min_trade_value:,.2f}")

# Price analysis
print(f"\n--- PRICE ANALYSIS ---")
avg_price = df['price'].mean()
weighted_avg_price = (df['price'] * df['qty']).sum() / df['qty'].sum()
price_range = df['price'].max() - df['price'].min()

print(f"Simple average price: ₹{avg_price:.2f}")
print(f"Weighted average price (by volume): ₹{weighted_avg_price:.2f}")
print(f"Price range: ₹{price_range:.2f}")

# Volume-weighted metrics
print(f"\n--- VOLUME-WEIGHTED METRICS ---")
vwap = (df['price'] * df['qty']).sum() / df['qty'].sum()
print(f"Volume Weighted Average Price (VWAP): ₹{vwap:.2f}")

# Efficiency metrics
print(f"\n--- EFFICIENCY METRICS ---")
trades_count = len(df)
print(f"Total number of trades: {trades_count:,}")
print(f"Average volume per trade: {total_volume/trades_count:,.0f}")
print(f"Average turnover per trade: ₹{total_turnover/trades_count:,.2f}")

# Time-based analysis
print(f"\n=== TIME-BASED ANALYSIS ===")
earliest_time = df['date'].min()
latest_time = df['date'].max()
duration_hours = (latest_time - earliest_time).total_seconds() / 3600

print(f"Trading duration: {duration_hours:.2f} hours")
print(f"Volume per hour: {total_volume/duration_hours:,.0f}")
print(f"Turnover per hour: ₹{total_turnover/duration_hours:,.2f}")

# Market activity intensity
print(f"\n--- MARKET ACTIVITY INTENSITY ---")
print(f"Trades per hour: {trades_count/duration_hours:.1f}")
print(f"Volume per trade: {total_volume/trades_count:,.0f}")
print(f"Turnover per trade: ₹{total_turnover/trades_count:,.2f}")

# Summary table
print(f"\n" + "="*60)
print("=== SUMMARY TABLE ===")
summary_data = {
    'Metric': ['Total Volume', 'Total Turnover', 'Total Trades', 'Avg Trade Size', 'Avg Trade Value', 'VWAP'],
    'Value': [
        f"{total_volume:,}",
        f"₹{total_turnover:,.2f}",
        f"{trades_count:,}",
        f"{avg_trade_size:,.0f}",
        f"₹{avg_trade_value:,.2f}",
        f"₹{vwap:.2f}"
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

print(f"\n=== KEY INSIGHTS ===")
print(f"✓ Total volume traded: {total_volume:,} shares")
print(f"✓ Total market value: ₹{total_turnover:,.2f}")
print(f"✓ Market activity: {trades_count:,} individual trades")
print(f"✓ Trading efficiency: ₹{total_turnover/total_volume:.2f} per share")

=== VOLUME AND TURNOVER ANALYSIS ===
Total traded volume: 11,510,171
Total turnover: ₹4,451,969,111.70

=== DETAILED METRICS ===
--- VOLUME ANALYSIS ---
Average trade size: 1,051
Median trade size: 200
Largest single trade: 153,858
Smallest single trade: 1

--- TURNOVER ANALYSIS ---
Average trade value: ₹406,349.86
Median trade value: ₹77,325.00
Largest single trade value: ₹59,550,738.90
Smallest single trade value: ₹383.60

--- PRICE ANALYSIS ---
Simple average price: ₹386.56
Weighted average price (by volume): ₹386.79
Price range: ₹6.70

--- VOLUME-WEIGHTED METRICS ---
Volume Weighted Average Price (VWAP): ₹386.79

--- EFFICIENCY METRICS ---
Total number of trades: 10,956
Average volume per trade: 1,051
Average turnover per trade: ₹406,349.86

=== TIME-BASED ANALYSIS ===
Trading duration: 6.25 hours
Volume per hour: 1,841,791
Turnover per hour: ₹712,378,380.39

--- MARKET ACTIVITY INTENSITY ---
Trades per hour: 1753.1
Volume per trade: 1,051
Turnover per trade: ₹406,349.86

=== SUMMA

In [12]:
# Check number of trades with qty > 0 (actual trades vs. zero-qty updates)
print("=== TRADE TYPE ANALYSIS ===")

# Count different types of records
total_records = len(df)
actual_trades = (df['qty'] > 0).sum()
zero_qty_updates = (df['qty'] == 0).sum()

print(f"Total records in dataset: {total_records:,}")
print(f"Actual trades (qty > 0): {actual_trades:,}")
print(f"Zero-quantity updates (qty = 0): {zero_qty_updates:,}")

# Calculate percentages
actual_trades_pct = (actual_trades / total_records) * 100
zero_qty_pct = (zero_qty_updates / total_records) * 100

print(f"\n=== PERCENTAGE BREAKDOWN ===")
print(f"Actual trades: {actual_trades_pct:.2f}%")
print(f"Zero-quantity updates: {zero_qty_pct:.2f}%")

# Analyze the data composition
print(f"\n=== DATA COMPOSITION ANALYSIS ===")
if zero_qty_updates > 0:
    print("⚠️  Dataset contains both actual trades and zero-quantity updates")
    print("   This suggests the data includes bid-ask spread updates")
else:
    print("✓ Dataset contains only actual trades (all qty > 0)")

# Show sample of actual trades
print(f"\n=== SAMPLE OF ACTUAL TRADES (qty > 0) ===")
actual_trades_df = df[df['qty'] > 0]
print(actual_trades_df[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].head(10))

# Show sample of zero-quantity updates (if any exist)
if zero_qty_updates > 0:
    print(f"\n=== SAMPLE OF ZERO-QUANTITY UPDATES (qty = 0) ===")
    zero_qty_df = df[df['qty'] == 0]
    print(zero_qty_df[['date', 'price', 'qty', 'trnvr', 'cum_trnvr']].head(10))

# Analyze characteristics of each type
print(f"\n=== CHARACTERISTICS ANALYSIS ===")

# Actual trades characteristics
if actual_trades > 0:
    print("--- ACTUAL TRADES (qty > 0) ---")
    actual_trades_data = df[df['qty'] > 0]
    print(f"  Total volume: {actual_trades_data['qty'].sum():,}")
    print(f"  Total turnover: ₹{actual_trades_data['trnvr'].sum():,.2f}")
    print(f"  Average trade size: {actual_trades_data['qty'].mean():,.0f}")
    print(f"  Average trade value: ₹{actual_trades_data['trnvr'].mean():,.2f}")
    print(f"  Price range: ₹{actual_trades_data['price'].min():.2f} - ₹{actual_trades_data['price'].max():.2f}")

# Zero-quantity updates characteristics (if any exist)
if zero_qty_updates > 0:
    print(f"\n--- ZERO-QUANTITY UPDATES (qty = 0) ---")
    zero_qty_data = df[df['qty'] == 0]
    print(f"  Total records: {len(zero_qty_data):,}")
    print(f"  Price range: ₹{zero_qty_data['price'].min():.2f} - ₹{zero_qty_data['price'].max():.2f}")
    print(f"  Average price: ₹{zero_qty_data['price'].mean():.2f}")
    
    # Check if these are bid-ask spread updates
    if zero_qty_data['trnvr'].sum() == 0:
        print(f"  All have zero turnover (typical bid-ask updates)")
    else:
        print(f"  Some have non-zero turnover (data quality issue)")

# Data quality implications
print(f"\n=== DATA QUALITY IMPLICATIONS ===")
if zero_qty_updates > 0:
    print("⚠️  Mixed data types detected:")
    print("   - Actual trades: Use for volume, turnover, and price analysis")
    print("   - Zero-qty updates: Use for bid-ask spread analysis only")
    print("   - Consider filtering by qty > 0 for trade-based analysis")
else:
    print("✓ Clean dataset with only actual trades")

# Recommendations
print(f"\n=== RECOMMENDATIONS ===")
if zero_qty_updates > 0:
    print("For different types of analysis:")
    print("  📊 Volume/Turnover analysis: Use df[df['qty'] > 0]")
    print("  📈 Price movement analysis: Use df[df['qty'] > 0]")
    print("  🔍 Bid-ask spread analysis: Use df[df['qty'] == 0]")
    print("  📋 Complete market picture: Use full dataset")
else:
    print("  ✓ Dataset is ready for all types of analysis")

# Summary
print(f"\n=== SUMMARY ===")
print(f"✓ Total records: {total_records:,}")
print(f"✓ Actual trades: {actual_trades:,} ({actual_trades_pct:.1f}%)")
print(f"✓ Zero-qty updates: {zero_qty_updates:,} ({zero_qty_pct:.1f}%)")
print(f"✓ Data type: {'Mixed (trades + updates)' if zero_qty_updates > 0 else 'Pure trades only'}")

=== TRADE TYPE ANALYSIS ===
Total records in dataset: 10,956
Actual trades (qty > 0): 10,956
Zero-quantity updates (qty = 0): 0

=== PERCENTAGE BREAKDOWN ===
Actual trades: 100.00%
Zero-quantity updates: 0.00%

=== DATA COMPOSITION ANALYSIS ===
✓ Dataset contains only actual trades (all qty > 0)

=== SAMPLE OF ACTUAL TRADES (qty > 0) ===
                 date   price    qty        trnvr    cum_trnvr
0 2025-08-07 09:15:00  386.85  65740  25431519.00  25431519.00
1 2025-08-07 09:15:01  386.30    895    345738.50  25777257.50
2 2025-08-07 09:15:01  386.75   1401    541836.75  26319094.25
3 2025-08-07 09:15:02  386.80   1795    694306.00  27013400.25
4 2025-08-07 09:15:02  386.95    741    286729.95  27300130.20
5 2025-08-07 09:15:03  386.90   2717   1051207.30  28351337.50
6 2025-08-07 09:15:03  386.80   9068   3507502.40  31858839.90
7 2025-08-07 09:15:04  386.75   1141    441281.75  32300121.65
8 2025-08-07 09:15:04  386.90   1798    695646.20  32995767.85
9 2025-08-07 09:15:05  386.75 

In [13]:
# Create price_change = current price - previous price
print("=== PRICE CHANGE CALCULATION ===")

# Calculate price change (current price - previous price)
df['price_change'] = df['price'].diff()

# Display the first few rows to verify the calculation
print("=== SAMPLE DATA WITH PRICE CHANGE ===")
print(df[['date', 'price', 'price_change', 'qty', 'trnvr']].head(10))

# Basic statistics of price changes
print(f"\n=== PRICE CHANGE STATISTICS ===")
print(f"Total price changes calculated: {len(df['price_change'].dropna())}")
print(f"First price change: {df['price_change'].iloc[1]:.2f}")  # First change is at index 1
print(f"Last price change: {df['price_change'].iloc[-1]:.2f}")

# Statistical summary of price changes
print(f"\n=== PRICE CHANGE DESCRIPTIVE STATISTICS ===")
price_change_stats = df['price_change'].describe()
print(price_change_stats)

# Analyze price change distribution
print(f"\n=== PRICE CHANGE DISTRIBUTION ANALYSIS ===")
positive_changes = (df['price_change'] > 0).sum()
negative_changes = (df['price_change'] < 0).sum()
zero_changes = (df['price_change'] == 0).sum()
total_changes = len(df['price_change'].dropna())

print(f"Positive price changes: {positive_changes:,} ({(positive_changes/total_changes*100):.2f}%)")
print(f"Negative price changes: {negative_changes:,} ({(negative_changes/total_changes*100):.2f}%)")
print(f"No price changes: {zero_changes:,} ({(zero_changes/total_changes*100):.2f}%)")

# Price change magnitude analysis
print(f"\n=== PRICE CHANGE MAGNITUDE ANALYSIS ===")
abs_price_changes = df['price_change'].abs()
print(f"Average absolute price change: ₹{abs_price_changes.mean():.2f}")
print(f"Median absolute price change: ₹{abs_price_changes.median():.2f}")
print(f"Largest price increase: ₹{df['price_change'].max():.2f}")
print(f"Largest price decrease: ₹{df['price_change'].min():.2f}")

# Show examples of different types of price changes
print(f"\n=== EXAMPLES OF PRICE CHANGES ===")

# Largest price increases
print("Top 5 largest price increases:")
largest_increases = df.nlargest(5, 'price_change')[['date', 'price', 'price_change', 'qty']]
print(largest_increases.to_string(index=False))

# Largest price decreases
print(f"\nTop 5 largest price decreases:")
largest_decreases = df.nsmallest(5, 'price_change')[['date', 'price', 'price_change', 'qty']]
print(largest_decreases.to_string(index=False))

# No change examples
if zero_changes > 0:
    print(f"\nSample of rows with no price change:")
    no_change_sample = df[df['price_change'] == 0][['date', 'price', 'price_change', 'qty']].head(5)
    print(no_change_sample.to_string(index=False))

# Price change patterns
print(f"\n=== PRICE CHANGE PATTERNS ===")
print(f"Price changes per hour: {total_changes / ((df['date'].max() - df['date'].min()).total_seconds() / 3600):.1f}")
print(f"Average price change frequency: {total_changes / len(df):.2f} changes per tick")

# Verify calculation integrity
print(f"\n=== CALCULATION VERIFICATION ===")
print(f"✓ Price change column created successfully")
print(f"✓ First row price_change is NaN (no previous price to compare)")
print(f"✓ Total rows: {len(df):,}")
print(f"✓ Price changes calculated: {total_changes:,}")
print(f"✓ Ready for price movement analysis")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"Data types:")
print(df.dtypes)

=== PRICE CHANGE CALCULATION ===
=== SAMPLE DATA WITH PRICE CHANGE ===
                 date   price  price_change    qty        trnvr
0 2025-08-07 09:15:00  386.85           NaN  65740  25431519.00
1 2025-08-07 09:15:01  386.30         -0.55    895    345738.50
2 2025-08-07 09:15:01  386.75          0.45   1401    541836.75
3 2025-08-07 09:15:02  386.80          0.05   1795    694306.00
4 2025-08-07 09:15:02  386.95          0.15    741    286729.95
5 2025-08-07 09:15:03  386.90         -0.05   2717   1051207.30
6 2025-08-07 09:15:03  386.80         -0.10   9068   3507502.40
7 2025-08-07 09:15:04  386.75         -0.05   1141    441281.75
8 2025-08-07 09:15:04  386.90          0.15   1798    695646.20
9 2025-08-07 09:15:05  386.75         -0.15   1679    649353.25

=== PRICE CHANGE STATISTICS ===
Total price changes calculated: 10955
First price change: -0.55
Last price change: 0.00

=== PRICE CHANGE DESCRIPTIVE STATISTICS ===
count    10955.000000
mean         0.000128
std          0.

In [14]:
# Create direction column: "Up", "Down", "No change"
print("=== DIRECTION COLUMN CREATION ===")

# Create direction column based on price_change
df['direction'] = df['price_change'].apply(lambda x: 
    'Up' if x > 0 else 
    'Down' if x < 0 else 
    'No change'
)

# Display the first few rows to verify the direction column
print("=== SAMPLE DATA WITH DIRECTION ===")
print(df[['date', 'price', 'price_change', 'direction', 'qty', 'trnvr']].head(10))

# Count the occurrences of each direction
print(f"\n=== DIRECTION DISTRIBUTION ===")
direction_counts = df['direction'].value_counts()
print(direction_counts)

# Calculate percentages
total_rows = len(df)
print(f"\n=== DIRECTION PERCENTAGES ===")
for direction, count in direction_counts.items():
    percentage = (count / total_rows) * 100
    print(f"{direction}: {count:,} ({percentage:.2f}%)")

# Analyze direction patterns
print(f"\n=== DIRECTION PATTERN ANALYSIS ===")

# Check for consecutive directions
print("Direction sequence analysis:")
consecutive_up = 0
consecutive_down = 0
consecutive_no_change = 0
max_consecutive_up = 0
max_consecutive_down = 0
max_consecutive_no_change = 0

current_up = 0
current_down = 0
current_no_change = 0

for direction in df['direction']:
    if direction == 'Up':
        current_up += 1
        current_down = 0
        current_no_change = 0
        max_consecutive_up = max(max_consecutive_up, current_up)
    elif direction == 'Down':
        current_down += 1
        current_up = 0
        current_no_change = 0
        max_consecutive_down = max(max_consecutive_down, current_down)
    else:  # No change
        current_no_change += 1
        current_up = 0
        current_down = 0
        max_consecutive_no_change = max(max_consecutive_no_change, current_no_change)

print(f"Maximum consecutive 'Up' movements: {max_consecutive_up}")
print(f"Maximum consecutive 'Down' movements: {max_consecutive_down}")
print(f"Maximum consecutive 'No change': {max_consecutive_no_change}")

# Direction by time periods
print(f"\n=== DIRECTION BY TIME PERIODS ===")
df['hour'] = df['date'].dt.hour
hourly_direction = df.groupby('hour')['direction'].value_counts().unstack(fill_value=0)

print("Direction distribution by hour:")
print(hourly_direction)

# Direction transitions (what follows what)
print(f"\n=== DIRECTION TRANSITIONS ===")
transitions = []
for i in range(1, len(df)):
    prev_direction = df['direction'].iloc[i-1]
    curr_direction = df['direction'].iloc[i]
    transitions.append((prev_direction, curr_direction))

transition_counts = pd.Series(transitions).value_counts().head(10)
print("Most common direction transitions:")
for transition, count in transition_counts.items():
    print(f"  {transition[0]} → {transition[1]}: {count:,} times")

# Direction with volume analysis
print(f"\n=== DIRECTION WITH VOLUME ANALYSIS ===")
direction_volume = df.groupby('direction')['qty'].agg(['sum', 'mean', 'count'])
print("Volume analysis by direction:")
print(direction_volume)

# Direction with price change magnitude
print(f"\n=== DIRECTION WITH PRICE CHANGE MAGNITUDE ===")
direction_magnitude = df.groupby('direction')['price_change'].agg(['mean', 'std', 'min', 'max'])
print("Price change magnitude by direction:")
print(direction_magnitude)

# Show examples of each direction
print(f"\n=== EXAMPLES OF EACH DIRECTION ===")

# Up movements
up_examples = df[df['direction'] == 'Up'][['date', 'price', 'price_change', 'qty']].head(3)
print("Sample 'Up' movements:")
print(up_examples.to_string(index=False))

# Down movements
down_examples = df[df['direction'] == 'Down'][['date', 'price', 'price_change', 'qty']].head(3)
print(f"\nSample 'Down' movements:")
print(down_examples.to_string(index=False))

# No change
no_change_examples = df[df['direction'] == 'No change'][['date', 'price', 'price_change', 'qty']].head(3)
print(f"\nSample 'No change':")
print(no_change_examples.to_string(index=False))

# Summary statistics
print(f"\n=== DIRECTION SUMMARY ===")
print(f"✓ Direction column created successfully")
print(f"✓ Total rows: {total_rows:,}")
print(f"✓ Up movements: {direction_counts.get('Up', 0):,}")
print(f"✓ Down movements: {direction_counts.get('Down', 0):,}")
print(f"✓ No change: {direction_counts.get('No change', 0):,}")
print(f"✓ Most common direction: {direction_counts.index[0]}")
print(f"✓ Ready for directional analysis and pattern recognition")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")

=== DIRECTION COLUMN CREATION ===
=== SAMPLE DATA WITH DIRECTION ===
                 date   price  price_change  direction    qty        trnvr
0 2025-08-07 09:15:00  386.85           NaN  No change  65740  25431519.00
1 2025-08-07 09:15:01  386.30         -0.55       Down    895    345738.50
2 2025-08-07 09:15:01  386.75          0.45         Up   1401    541836.75
3 2025-08-07 09:15:02  386.80          0.05         Up   1795    694306.00
4 2025-08-07 09:15:02  386.95          0.15         Up    741    286729.95
5 2025-08-07 09:15:03  386.90         -0.05       Down   2717   1051207.30
6 2025-08-07 09:15:03  386.80         -0.10       Down   9068   3507502.40
7 2025-08-07 09:15:04  386.75         -0.05       Down   1141    441281.75
8 2025-08-07 09:15:04  386.90          0.15         Up   1798    695646.20
9 2025-08-07 09:15:05  386.75         -0.15       Down   1679    649353.25

=== DIRECTION DISTRIBUTION ===
direction
No change    4441
Down         3299
Up           3216
Name: coun

In [15]:
# Calculate rolling averages (1 min, 5 min, 15 min) for price
print("=== ROLLING AVERAGE CALCULATION ===")

# First, ensure the dataframe is sorted by datetime
df = df.sort_values('date').reset_index(drop=True)

# Set datetime as index for time-based rolling operations
df_temp = df.set_index('date')

# Calculate rolling averages at different time intervals
print("Calculating rolling averages...")

# 1-minute rolling average
df_temp['price_1min_avg'] = df_temp['price'].rolling(window='1T', min_periods=1).mean()

# 5-minute rolling average
df_temp['price_5min_avg'] = df_temp['price'].rolling(window='5T', min_periods=1).mean()

# 15-minute rolling average
df_temp['price_15min_avg'] = df_temp['price'].rolling(window='15T', min_periods=1).mean()

# Reset index to get back to normal dataframe format
df = df_temp.reset_index()

# Display sample data with rolling averages
print("=== SAMPLE DATA WITH ROLLING AVERAGES ===")
print(df[['date', 'price', 'price_1min_avg', 'price_5min_avg', 'price_15min_avg', 'qty']].head(15))

# Basic statistics of rolling averages
print(f"\n=== ROLLING AVERAGE STATISTICS ===")
rolling_cols = ['price_1min_avg', 'price_5min_avg', 'price_15min_avg']

for col in rolling_cols:
    print(f"\n{col}:")
    print(f"  Min: ₹{df[col].min():.2f}")
    print(f"  Max: ₹{df[col].min():.2f}")
    print(f"  Mean: ₹{df[col].mean():.2f}")
    print(f"  Std: ₹{df[col].std():.2f}")

# Compare current price vs rolling averages
print(f"\n=== PRICE VS ROLLING AVERAGES ANALYSIS ===")

# Calculate differences from rolling averages
df['price_vs_1min'] = df['price'] - df['price_1min_avg']
df['price_vs_5min'] = df['price'] - df['price_5min_avg']
df['price_vs_15min'] = df['price'] - df['price_15min_avg']

# Show statistics of these differences
print("Price differences from rolling averages:")
print(f"  vs 1-min avg: Mean={df['price_vs_1min'].mean():.2f}, Std={df['price_vs_1min'].std():.2f}")
print(f"  vs 5-min avg: Mean={df['price_vs_5min'].mean():.2f}, Std={df['price_vs_5min'].std():.2f}")
print(f"  vs 15-min avg: Mean={df['price_vs_15min'].mean():.2f}, Std={df['price_vs_15min'].std():.2f}")

# Identify when price is above/below each moving average
print(f"\n=== MOVING AVERAGE CROSSOVER ANALYSIS ===")

# Price position relative to moving averages
df['above_1min'] = df['price'] > df['price_1min_avg']
df['above_5min'] = df['price'] > df['price_5min_avg']
df['above_15min'] = df['price'] > df['price_15min_avg']

# Count how many times price is above each moving average
above_1min_count = df['above_1min'].sum()
above_5min_count = df['above_5min'].sum()
above_15min_count = df['above_15min'].sum()

print(f"Price above 1-min average: {above_1min_count:,} times ({(above_1min_count/len(df)*100):.1f}%)")
print(f"Price above 5-min average: {above_5min_count:,} times ({(above_5min_count/len(df)*100):.1f}%)")
print(f"Price above 15-min average: {above_15min_count:,} times ({(above_15min_count/len(df)*100):.1f}%)")

# Moving average crossover signals
print(f"\n=== MOVING AVERAGE CROSSOVER SIGNALS ===")

# 1-min vs 5-min crossover
df['ma_1min_5min_cross'] = (df['price_1min_avg'] > df['price_5min_avg']).astype(int)
df['ma_1min_5min_signal'] = df['ma_1min_5min_cross'].diff()

# 5-min vs 15-min crossover
df['ma_5min_15min_cross'] = (df['price_5min_avg'] > df['price_15min_avg']).astype(int)
df['ma_5min_15min_signal'] = df['ma_5min_15min_cross'].diff()

# Count crossover signals
bullish_1min_5min = (df['ma_1min_5min_signal'] == 1).sum()
bearish_1min_5min = (df['ma_1min_5min_signal'] == -1).sum()

bullish_5min_15min = (df['ma_5min_15min_signal'] == 1).sum()
bearish_5min_15min = (df['ma_5min_15min_signal'] == -1).sum()

print(f"1-min vs 5-min crossovers:")
print(f"  Bullish (1-min crosses above 5-min): {bullish_1min_5min}")
print(f"  Bearish (1-min crosses below 5-min): {bearish_1min_5min}")

print(f"\n5-min vs 15-min crossovers:")
print(f"  Bullish (5-min crosses above 15-min): {bullish_5min_15min}")
print(f"  Bearish (5-min crosses below 15-min): {bearish_5min_15min}")

# Show sample of crossover signals
print(f"\n=== SAMPLE CROSSOVER SIGNALS ===")
crossover_sample = df[df['ma_1min_5min_signal'] != 0][['date', 'price', 'price_1min_avg', 'price_5min_avg', 'ma_1min_5min_signal']].head(10)
print("Recent 1-min vs 5-min crossovers:")
print(crossover_sample.to_string(index=False))

# Rolling average trends
print(f"\n=== ROLLING AVERAGE TRENDS ===")

# Calculate the slope of each moving average (trend direction)
df['ma_1min_slope'] = df['price_1min_avg'].diff()
df['ma_5min_slope'] = df['price_5min_avg'].diff()
df['ma_15min_slope'] = df['price_15min_avg'].diff()

# Count trending periods
trending_1min_up = (df['ma_1min_slope'] > 0).sum()
trending_1min_down = (df['ma_1min_slope'] < 0).sum()

print(f"1-min moving average trends:")
print(f"  Upward trending periods: {trending_1min_up:,}")
print(f"  Downward trending periods: {trending_1min_down:,}")

# Summary
print(f"\n=== ROLLING AVERAGE SUMMARY ===")
print(f"✓ 1-minute rolling average calculated")
print(f"✓ 5-minute rolling average calculated")
print(f"✓ 15-minute rolling average calculated")
print(f"✓ Crossover signals identified")
print(f"✓ Trend analysis completed")
print(f"✓ Ready for technical analysis and trading signals")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"New columns added:")
new_cols = ['price_1min_avg', 'price_5min_avg', 'price_15min_avg', 'price_vs_1min', 
            'price_vs_5min', 'price_vs_15min', 'above_1min', 'above_5min', 'above_15min',
            'ma_1min_5min_cross', 'ma_1min_5min_signal', 'ma_5min_15min_cross', 
            'ma_5min_15min_signal', 'ma_1min_slope', 'ma_5min_slope', 'ma_15min_slope']
for col in new_cols:
    if col in df.columns:
        print(f"  ✓ {col}")

=== ROLLING AVERAGE CALCULATION ===
Calculating rolling averages...
=== SAMPLE DATA WITH ROLLING AVERAGES ===
                  date   price  price_1min_avg  price_5min_avg  \
0  2025-08-07 09:15:00  386.85      386.850000      386.850000   
1  2025-08-07 09:15:01  386.30      386.575000      386.575000   
2  2025-08-07 09:15:01  386.75      386.633333      386.633333   
3  2025-08-07 09:15:02  386.80      386.675000      386.675000   
4  2025-08-07 09:15:02  386.95      386.730000      386.730000   
5  2025-08-07 09:15:03  386.90      386.758333      386.758333   
6  2025-08-07 09:15:03  386.80      386.764286      386.764286   
7  2025-08-07 09:15:04  386.75      386.762500      386.762500   
8  2025-08-07 09:15:04  386.90      386.777778      386.777778   
9  2025-08-07 09:15:05  386.65      386.765000      386.765000   
10 2025-08-07 09:15:05  386.75      386.763636      386.763636   
11 2025-08-07 09:15:06  386.65      386.754167      386.754167   
12 2025-08-07 09:15:06  386.55  

  df_temp['price_1min_avg'] = df_temp['price'].rolling(window='1T', min_periods=1).mean()
  df_temp['price_5min_avg'] = df_temp['price'].rolling(window='5T', min_periods=1).mean()
  df_temp['price_15min_avg'] = df_temp['price'].rolling(window='15T', min_periods=1).mean()


In [16]:
# Calculate rolling sum of volume over time windows
print("=== ROLLING VOLUME SUM CALCULATION ===")

# Ensure the dataframe is sorted by datetime and has datetime index
df_temp = df.set_index('date')

# Calculate rolling sum of volume at different time intervals
print("Calculating rolling volume sums...")

# 1-minute rolling volume sum
df_temp['volume_1min_sum'] = df_temp['qty'].rolling(window='1T', min_periods=1).sum()

# 5-minute rolling volume sum
df_temp['volume_5min_sum'] = df_temp['qty'].rolling(window='5T', min_periods=1).sum()

# 15-minute rolling volume sum
df_temp['volume_15min_sum'] = df_temp['qty'].rolling(window='15T', min_periods=1).sum()

# 30-minute rolling volume sum
df_temp['volume_30min_sum'] = df_temp['qty'].rolling(window='30T', min_periods=1).sum()

# Reset index to get back to normal dataframe format
df = df_temp.reset_index()

# Display sample data with rolling volume sums
print("=== SAMPLE DATA WITH ROLLING VOLUME SUMS ===")
print(df[['date', 'qty', 'volume_1min_sum', 'volume_5min_sum', 'volume_15min_sum', 'volume_30min_sum']].head(15))

# Basic statistics of rolling volume sums
print(f"\n=== ROLLING VOLUME SUM STATISTICS ===")
volume_sum_cols = ['volume_1min_sum', 'volume_5min_sum', 'volume_15min_sum', 'volume_30min_sum']

for col in volume_sum_cols:
    print(f"\n{col}:")
    print(f"  Min: {df[col].min():,.0f}")
    print(f"  Max: {df[col].max():,.0f}")
    print(f"  Mean: {df[col].mean():,.0f}")
    print(f"  Std: {df[col].std():,.0f}")

# Volume analysis by time windows
print(f"\n=== VOLUME ANALYSIS BY TIME WINDOWS ===")

# Compare volume across different timeframes
print("Volume comparison across timeframes:")
print(f"  1-min average: {df['volume_1min_sum'].mean():,.0f}")
print(f"  5-min average: {df['volume_5min_sum'].mean():,.0f}")
print(f"  15-min average: {df['volume_15min_sum'].mean():,.0f}")
print(f"  30-min average: {df['volume_30min_sum'].mean():,.0f}")

# Volume intensity analysis
print(f"\n=== VOLUME INTENSITY ANALYSIS ===")

# Calculate volume per minute for each window
df['volume_1min_per_min'] = df['volume_1min_sum'] / 1
df['volume_5min_per_min'] = df['volume_5min_sum'] / 5
df['volume_15min_per_min'] = df['volume_15min_sum'] / 15
df['volume_30min_per_min'] = df['volume_30min_sum'] / 30

print("Volume per minute for each window:")
print(f"  1-min window: {df['volume_1min_per_min'].mean():,.0f} per minute")
print(f"  5-min window: {df['volume_5min_per_min'].mean():,.0f} per minute")
print(f"  15-min window: {df['volume_15min_per_min'].mean():,.0f} per minute")
print(f"  30-min window: {df['volume_30min_per_min'].mean():,.0f} per minute")

# Volume spikes detection
print(f"\n=== VOLUME SPIKE DETECTION ===")

# Find periods of unusually high volume (above 2 standard deviations)
for col in volume_sum_cols:
    mean_vol = df[col].mean()
    std_vol = df[col].std()
    threshold = mean_vol + 2 * std_vol
    
    high_volume_periods = (df[col] > threshold).sum()
    print(f"{col}: {high_volume_periods} periods above {threshold:,.0f} (2σ threshold)")

# Show examples of volume spikes
print(f"\n=== EXAMPLES OF VOLUME SPIKES ===")
for col in volume_sum_cols:
    mean_vol = df[col].mean()
    std_vol = df[col].std()
    threshold = mean_vol + 2 * std_vol
    
    spikes = df[df[col] > threshold][['date', col, 'qty', 'price']].head(3)
    if len(spikes) > 0:
        print(f"\n{col} spikes (top 3):")
        print(spikes.to_string(index=False))

# Volume trend analysis
print(f"\n=== VOLUME TREND ANALYSIS ===")

# Calculate volume trends (slopes) for each window
df['volume_1min_trend'] = df['volume_1min_sum'].diff()
df['volume_5min_trend'] = df['volume_5min_sum'].diff()
df['volume_15min_trend'] = df['volume_15min_sum'].diff()
df['volume_30min_trend'] = df['volume_30min_sum'].diff()

# Count increasing vs decreasing volume periods
for col in ['volume_1min_trend', 'volume_5min_trend', 'volume_15min_trend', 'volume_30min_trend']:
    increasing = (df[col] > 0).sum()
    decreasing = (df[col] < 0).sum()
    window_name = col.replace('_trend', '').replace('_', ' ').title()
    print(f"{window_name}:")
    print(f"  Increasing: {increasing:,} periods")
    print(f"  Decreasing: {decreasing:,} periods")

# Volume vs Price correlation
print(f"\n=== VOLUME-PRICE CORRELATION ===")

# Calculate correlation between volume and price for each timeframe
for col in volume_sum_cols:
    correlation = df[col].corr(df['price'])
    print(f"{col} vs Price correlation: {correlation:.4f}")

# Volume vs Price change correlation
print(f"\nVolume vs Price Change correlation:")
for col in volume_sum_cols:
    correlation = df[col].corr(df['price_change'])
    print(f"{col} vs Price Change correlation: {correlation:.4f}")

# Time-based volume analysis
print(f"\n=== TIME-BASED VOLUME ANALYSIS ===")

# Volume by hour of the day
df['hour'] = df['date'].dt.hour
hourly_volume = df.groupby('hour')['qty'].sum()
print("Total volume by hour:")
for hour, volume in hourly_volume.items():
    print(f"  {hour:02d}:00 - {hour:02d}:59: {volume:,}")

# Rolling volume vs cumulative volume comparison
print(f"\n=== ROLLING VS CUMULATIVE VOLUME ===")
print(f"Final cumulative volume: {df['cum_trnvr'].iloc[-1]/df['price'].iloc[-1]:,.0f}")
print(f"Final 30-min rolling volume: {df['volume_30min_sum'].iloc[-1]:,.0f}")
print(f"Rolling volume as % of total: {(df['volume_30min_sum'].iloc[-1]/(df['cum_trnvr'].iloc[-1]/df['price'].iloc[-1])*100):.1f}%")

# Summary
print(f"\n=== ROLLING VOLUME SUMMARY ===")
print(f"✓ 1-minute rolling volume sum calculated")
print(f"✓ 5-minute rolling volume sum calculated")
print(f"✓ 15-minute rolling volume sum calculated")
print(f"✓ 30-minute rolling volume sum calculated")
print(f"✓ Volume intensity analysis completed")
print(f"✓ Volume spike detection implemented")
print(f"✓ Volume-price correlation analyzed")
print(f"✓ Ready for volume-based analysis and trading signals")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"New volume columns added:")
volume_new_cols = ['volume_1min_sum', 'volume_5min_sum', 'volume_15min_sum', 'volume_30min_sum',
                   'volume_1min_per_min', 'volume_5min_per_min', 'volume_15min_per_min', 'volume_30min_per_min',
                   'volume_1min_trend', 'volume_5min_trend', 'volume_15min_trend', 'volume_30min_trend']
for col in volume_new_cols:
    if col in df.columns:
        print(f"  ✓ {col}")

=== ROLLING VOLUME SUM CALCULATION ===
Calculating rolling volume sums...
=== SAMPLE DATA WITH ROLLING VOLUME SUMS ===
                  date    qty  volume_1min_sum  volume_5min_sum  \
0  2025-08-07 09:15:00  65740          65740.0          65740.0   
1  2025-08-07 09:15:01    895          66635.0          66635.0   
2  2025-08-07 09:15:01   1401          68036.0          68036.0   
3  2025-08-07 09:15:02   1795          69831.0          69831.0   
4  2025-08-07 09:15:02    741          70572.0          70572.0   
5  2025-08-07 09:15:03   2717          73289.0          73289.0   
6  2025-08-07 09:15:03   9068          82357.0          82357.0   
7  2025-08-07 09:15:04   1141          83498.0          83498.0   
8  2025-08-07 09:15:04   1798          85296.0          85296.0   
9  2025-08-07 09:15:05   2092          87388.0          87388.0   
10 2025-08-07 09:15:05   1679          89067.0          89067.0   
11 2025-08-07 09:15:06   4519          93586.0          93586.0   
12 2025-08

  df_temp['volume_1min_sum'] = df_temp['qty'].rolling(window='1T', min_periods=1).sum()
  df_temp['volume_5min_sum'] = df_temp['qty'].rolling(window='5T', min_periods=1).sum()
  df_temp['volume_15min_sum'] = df_temp['qty'].rolling(window='15T', min_periods=1).sum()
  df_temp['volume_30min_sum'] = df_temp['qty'].rolling(window='30T', min_periods=1).sum()


In [17]:
# Compute VWAP (Volume Weighted Average Price)
print("=== VWAP CALCULATION ===")

# Calculate VWAP for the entire dataset
print("Calculating VWAP...")

# Method 1: Simple VWAP for entire dataset
total_volume = df['qty'].sum()
total_price_volume = (df['price'] * df['qty']).sum()
vwap_total = total_price_volume / total_volume

print(f"=== OVERALL VWAP ===")
print(f"Total volume: {total_volume:,}")
print(f"Total price × volume: ₹{total_price_volume:,.2f}")
print(f"Overall VWAP: ₹{vwap_total:.2f}")

# Method 2: Rolling VWAP over different time windows
print(f"\n=== ROLLING VWAP CALCULATION ===")

# Ensure datetime index for time-based rolling
df_temp = df.set_index('date')

# Calculate rolling VWAP for different time windows
df_temp['vwap_1min'] = (df_temp['price'] * df_temp['qty']).rolling(window='1T', min_periods=1).sum() / df_temp['qty'].rolling(window='1T', min_periods=1).sum()

df_temp['vwap_5min'] = (df_temp['price'] * df_temp['qty']).rolling(window='5T', min_periods=1).sum() / df_temp['qty'].rolling(window='5T', min_periods=1).sum()

df_temp['vwap_15min'] = (df_temp['price'] * df_temp['qty']).rolling(window='15T', min_periods=1).sum() / df_temp['qty'].rolling(window='15T', min_periods=1).sum()

df_temp['vwap_30min'] = (df_temp['price'] * df_temp['qty']).rolling(window='30T', min_periods=1).sum() / df_temp['qty'].rolling(window='30T', min_periods=1).sum()

# Reset index
df = df_temp.reset_index()

# Display sample data with VWAP values
print("=== SAMPLE DATA WITH VWAP VALUES ===")
print(df[['date', 'price', 'qty', 'vwap_1min', 'vwap_5min', 'vwap_15min', 'vwap_30min']].head(15))

# VWAP Statistics
print(f"\n=== VWAP STATISTICS ===")
vwap_cols = ['vwap_1min', 'vwap_5min', 'vwap_15min', 'vwap_30min']

for col in vwap_cols:
    print(f"\n{col}:")
    print(f"  Min: ₹{df[col].min():.2f}")
    print(f"  Max: ₹{df[col].min():.2f}")
    print(f"  Mean: ₹{df[col].mean():.2f}")
    print(f"  Std: ₹{df[col].std():.2f}")

# Price vs VWAP Analysis
print(f"\n=== PRICE VS VWAP ANALYSIS ===")

# Calculate price position relative to each VWAP
df['price_vs_vwap_1min'] = df['price'] - df['vwap_1min']
df['price_vs_vwap_5min'] = df['price'] - df['vwap_5min']
df['price_vs_vwap_15min'] = df['price'] - df['vwap_15min']
df['price_vs_vwap_30min'] = df['price'] - df['vwap_30min']

# Count how many times price is above/below each VWAP
print("Price position relative to VWAP:")
for i, vwap_type in enumerate(['1min', '5min', '15min', '30min']):
    col = f'price_vs_vwap_{vwap_type}'
    above_count = (df[col] > 0).sum()
    below_count = (df[col] < 0).sum()
    equal_count = (df[col] == 0).sum()
    
    print(f"\n  {vwap_type} VWAP:")
    print(f"    Above: {above_count:,} times ({(above_count/len(df)*100):.1f}%)")
    print(f"    Below: {below_count:,} times ({(below_count/len(df)*100):.1f}%)")
    print(f"    Equal: {equal_count:,} times ({(equal_count/len(df)*100):.1f}%)")

# VWAP as Support/Resistance
print(f"\n=== VWAP AS SUPPORT/RESISTANCE ===")

# Find periods when price bounces off VWAP
for i, vwap_type in enumerate(['1min', '5min', '15min', '30min']):
    vwap_col = f'vwap_{vwap_type}'
    price_vs_col = f'price_vs_vwap_{vwap_type}'
    
    # Find when price is very close to VWAP (within 0.1%)
    close_to_vwap = (abs(df[price_vs_col]) / df[vwap_col] * 100) < 0.1
    close_count = close_to_vwap.sum()
    
    print(f"{vwap_type} VWAP: {close_count:,} times price within 0.1% of VWAP")

# VWAP Crossover Analysis
print(f"\n=== VWAP CROSSOVER ANALYSIS ===")

# Price crossing above/below VWAP
for i, vwap_type in enumerate(['1min', '5min', '15min', '30min']):
    price_vs_col = f'price_vs_vwap_{vwap_type}'
    
    # Create crossover signals
    df[f'above_vwap_{vwap_type}'] = df[price_vs_col] > 0
    df[f'vwap_{vwap_type}_cross'] = df[f'above_vwap_{vwap_type}'].astype(int)
    df[f'vwap_{vwap_type}_signal'] = df[f'vwap_{vwap_type}_cross'].diff()
    
    # Count crossovers
    bullish_crosses = (df[f'vwap_{vwap_type}_signal'] == 1).sum()
    bearish_crosses = (df[f'vwap_{vwap_type}_signal'] == -1).sum()
    
    print(f"{vwap_type} VWAP crossovers:")
    print(f"  Bullish (price crosses above): {bullish_crosses}")
    print(f"  Bearish (price crosses below): {bearish_crosses}")

# VWAP Trend Analysis
print(f"\n=== VWAP TREND ANALYSIS ===")

# Calculate VWAP trends (slopes)
df['vwap_1min_trend'] = df['vwap_1min'].diff()
df['vwap_5min_trend'] = df['vwap_5min'].diff()
df['vwap_15min_trend'] = df['vwap_15min'].diff()
df['vwap_30min_trend'] = df['vwap_30min'].diff()

# Count trending periods
for i, vwap_type in enumerate(['1min', '5min', '15min', '30min']):
    trend_col = f'vwap_{vwap_type}_trend'
    up_trend = (df[trend_col] > 0).sum()
    down_trend = (df[trend_col] < 0).sum()
    
    print(f"{vwap_type} VWAP trends:")
    print(f"  Upward: {up_trend:,} periods")
    print(f"  Downward: {down_trend:,} periods")

# VWAP vs Simple Moving Average Comparison
print(f"\n=== VWAP VS SIMPLE MOVING AVERAGE ===")

# Compare VWAP with price averages
print("VWAP vs Simple Price Average:")
print(f"  Overall VWAP: ₹{vwap_total:.2f}")
print(f"  Simple Price Average: ₹{df['price'].mean():.2f}")
print(f"  Difference: ₹{vwap_total - df['price'].mean():.2f}")

# Show correlation between VWAP and price
for col in vwap_cols:
    correlation = df[col].corr(df['price'])
    print(f"  {col} vs Price correlation: {correlation:.4f}")

# Trading Signals based on VWAP
print(f"\n=== VWAP TRADING SIGNALS ===")

# Generate basic trading signals
df['vwap_signal'] = 'Hold'
df.loc[df['price'] > df['vwap_15min'], 'vwap_signal'] = 'Buy'
df.loc[df['price'] < df['vwap_15min'], 'vwap_signal'] = 'Sell'

signal_counts = df['vwap_signal'].value_counts()
print("VWAP-based trading signals (using 15-min VWAP):")
for signal, count in signal_counts.items():
    print(f"  {signal}: {count:,} times ({(count/len(df)*100):.1f}%)")

# Summary
print(f"\n=== VWAP SUMMARY ===")
print(f"✓ Overall VWAP: ₹{vwap_total:.2f}")
print(f"✓ Rolling VWAP calculated for 1min, 5min, 15min, 30min windows")
print(f"✓ Price vs VWAP analysis completed")
print(f"✓ VWAP crossover signals generated")
print(f"✓ VWAP trend analysis completed")
print(f"✓ Trading signals based on VWAP generated")
print(f"✓ Ready for VWAP-based trading strategies")

# Display final dataframe info
print(f"\n=== UPDATED DATAFRAME INFO ===")
print(f"Columns: {list(df.columns)}")
print(f"Shape: {df.shape}")
print(f"New VWAP columns added:")
vwap_new_cols = ['vwap_1min', 'vwap_5min', 'vwap_15min', 'vwap_30min',
                  'price_vs_vwap_1min', 'price_vs_vwap_5min', 'price_vs_vwap_15min', 'price_vs_vwap_30min',
                  'above_vwap_1min', 'above_vwap_5min', 'above_vwap_15min', 'above_vwap_30min',
                  'vwap_1min_cross', 'vwap_5min_cross', 'vwap_15min_cross', 'vwap_30min_cross',
                  'vwap_1min_signal', 'vwap_5min_signal', 'vwap_15min_signal', 'vwap_30min_signal',
                  'vwap_1min_trend', 'vwap_5min_trend', 'vwap_15min_trend', 'vwap_30min_trend',
                  'vwap_signal']
for col in vwap_new_cols:
    if col in df.columns:
        print(f"  ✓ {col}")

=== VWAP CALCULATION ===
Calculating VWAP...
=== OVERALL VWAP ===
Total volume: 11,510,171
Total price × volume: ₹4,451,969,111.70
Overall VWAP: ₹386.79

=== ROLLING VWAP CALCULATION ===
=== SAMPLE DATA WITH VWAP VALUES ===
                  date   price    qty   vwap_1min   vwap_5min  vwap_15min  \
0  2025-08-07 09:15:00  386.85  65740  386.850000  386.850000  386.850000   
1  2025-08-07 09:15:01  386.30    895  386.842613  386.842613  386.842613   
2  2025-08-07 09:15:01  386.75   1401  386.840706  386.840706  386.840706   
3  2025-08-07 09:15:02  386.80   1795  386.839659  386.839659  386.839659   
4  2025-08-07 09:15:02  386.95    741  386.840818  386.840818  386.840818   
5  2025-08-07 09:15:03  386.90   2717  386.843012  386.843012  386.843012   
6  2025-08-07 09:15:03  386.80   9068  386.838276  386.838276  386.838276   
7  2025-08-07 09:15:04  386.75   1141  386.837070  386.837070  386.837070   
8  2025-08-07 09:15:04  386.90   1798  386.838396  386.838396  386.838396   
9  202

  df_temp['vwap_1min'] = (df_temp['price'] * df_temp['qty']).rolling(window='1T', min_periods=1).sum() / df_temp['qty'].rolling(window='1T', min_periods=1).sum()
  df_temp['vwap_5min'] = (df_temp['price'] * df_temp['qty']).rolling(window='5T', min_periods=1).sum() / df_temp['qty'].rolling(window='5T', min_periods=1).sum()
  df_temp['vwap_15min'] = (df_temp['price'] * df_temp['qty']).rolling(window='15T', min_periods=1).sum() / df_temp['qty'].rolling(window='15T', min_periods=1).sum()
  df_temp['vwap_30min'] = (df_temp['price'] * df_temp['qty']).rolling(window='30T', min_periods=1).sum() / df_temp['qty'].rolling(window='30T', min_periods=1).sum()


In [19]:
# Aggregate trades per minute: avg price, total qty, total trnvr
print("=== MINUTE-BY-MINUTE TRADE AGGREGATION ===")

# Create minute-level aggregation
print("Aggregating trades by minute...")

# Extract minute from datetime for grouping (using 'min' instead of 'T')
df['minute_key'] = df['date'].dt.floor('1min')  # Floor to nearest minute

# Group by minute and aggregate
minute_agg = df.groupby('minute_key').agg({
    'price': ['mean', 'min', 'max', 'first', 'last'],  # Price statistics
    'qty': ['sum', 'count'],                           # Total quantity and trade count
    'trnvr': 'sum'                                     # Total turnover
}).round(2)

# Check the actual column structure before flattening
print(f"Original aggregation columns: {minute_agg.columns.tolist()}")
print(f"Column levels: {minute_agg.columns.nlevels}")

# Flatten column names correctly
minute_agg.columns = ['avg_price', 'min_price', 'max_price', 'open_price', 'close_price', 
                      'total_qty', 'trade_count', 'total_trnvr']

# Reset index to make minute_key a column
minute_agg = minute_agg.reset_index()
minute_agg.rename(columns={'minute_key': 'minute'}, inplace=True)

# Display the aggregated data
print("=== MINUTE-BY-MINUTE AGGREGATED DATA ===")
print(f"Total minutes with trades: {len(minute_agg)}")
print(f"Time range: {minute_agg['minute'].min()} to {minute_agg['minute'].max()}")

# Show first 15 rows
print(f"\n=== FIRST 15 MINUTES ===")
print(minute_agg.head(15).to_string(index=False))

# Show last 15 rows
print(f"\n=== LAST 15 MINUTES ===")
print(minute_agg.tail(15).to_string(index=False))

# Basic statistics of aggregated data
print(f"\n=== AGGREGATED DATA STATISTICS ===")
print("Price Statistics:")
print(f"  Average price range: ₹{minute_agg['avg_price'].min():.2f} - ₹{minute_agg['avg_price'].max():.2f}")
print(f"  Overall average price: ₹{minute_agg['avg_price'].mean():.2f}")
print(f"  Price volatility (std): ₹{minute_agg['avg_price'].std():.2f}")

print(f"\nVolume Statistics:")
print(f"  Total volume across all minutes: {minute_agg['total_qty'].sum():,}")
print(f"  Average volume per minute: {minute_agg['total_qty'].mean():,.0f}")
print(f"  Highest volume minute: {minute_agg['total_qty'].max():,}")
print(f"  Lowest volume minute: {minute_agg['total_qty'].min():,}")

print(f"\nTurnover Statistics:")
print(f"  Total turnover across all minutes: ₹{minute_agg['total_trnvr'].sum():,.2f}")
print(f"  Average turnover per minute: ₹{minute_agg['total_trnvr'].mean():,.2f}")
print(f"  Highest turnover minute: ₹{minute_agg['total_trnvr'].max():,.2f}")
print(f"  Lowest turnover minute: ₹{minute_agg['total_trnvr'].min():,.2f}")

print(f"\nTrade Count Statistics:")
print(f"  Total trades across all minutes: {minute_agg['trade_count'].sum():,}")
print(f"  Average trades per minute: {minute_agg['trade_count'].mean():.1f}")
print(f"  Busiest minute: {minute_agg['trade_count'].max():,} trades")
print(f"  Quietest minute: {minute_agg['trade_count'].min():,} trades")

# Minute-by-minute analysis
print(f"\n=== MINUTE-BY-MINUTE ANALYSIS ===")

# Find highest and lowest volume minutes
highest_volume_minute = minute_agg.loc[minute_agg['total_qty'].idxmax()]
lowest_volume_minute = minute_agg.loc[minute_agg['total_qty'].idxmin()]

print("Highest volume minute:")
print(f"  Time: {highest_volume_minute['minute']}")
print(f"  Volume: {highest_volume_minute['total_qty']:,}")
print(f"  Turnover: ₹{highest_volume_minute['total_trnvr']:,.2f}")
print(f"  Avg Price: ₹{highest_volume_minute['avg_price']:.2f}")
print(f"  Trades: {highest_volume_minute['trade_count']}")

print(f"\nLowest volume minute:")
print(f"  Time: {lowest_volume_minute['minute']}")
print(f"  Volume: {lowest_volume_minute['total_qty']:,}")
print(f"  Turnover: ₹{lowest_volume_minute['total_trnvr']:,.2f}")
print(f"  Avg Price: ₹{lowest_volume_minute['avg_price']:.2f}")
print(f"  Trades: {lowest_volume_minute['trade_count']}")

# Find highest and lowest turnover minutes
highest_turnover_minute = minute_agg.loc[minute_agg['total_trnvr'].idxmax()]
lowest_turnover_minute = minute_agg.loc[minute_agg['total_trnvr'].idxmin()]

print(f"\nHighest turnover minute:")
print(f"  Time: {highest_turnover_minute['minute']}")
print(f"  Turnover: ₹{highest_turnover_minute['total_trnvr']:,.2f}")
print(f"  Volume: {highest_turnover_minute['total_qty']:,}")
print(f"  Avg Price: ₹{highest_turnover_minute['avg_price']:.2f}")

# Time-based patterns
print(f"\n=== TIME-BASED PATTERNS ===")

# Add hour column for hourly analysis
minute_agg['hour'] = minute_agg['minute'].dt.hour

# Hourly volume analysis
hourly_volume = minute_agg.groupby('hour')['total_qty'].sum()
print("Total volume by hour:")
for hour, volume in hourly_volume.items():
    print(f"  {hour:02d}:00 - {hour:02d}:59: {volume:,}")

# Hourly turnover analysis
hourly_turnover = minute_agg.groupby('hour')['total_trnvr'].sum()
print(f"\nTotal turnover by hour:")
for hour, turnover in hourly_turnover.items():
    print(f"  {hour:02d}:00 - {hour:02d}:59: ₹{turnover:,.2f}")

# Price movement analysis per minute
print(f"\n=== PRICE MOVEMENT ANALYSIS PER MINUTE ===")

# Calculate minute-to-minute price changes
minute_agg['price_change'] = minute_agg['close_price'] - minute_agg['open_price']
minute_agg['price_change_pct'] = (minute_agg['price_change'] / minute_agg['open_price']) * 100

# Price change statistics
print(f"Minute-to-minute price changes:")
print(f"  Average change: ₹{minute_agg['price_change'].mean():.2f}")
print(f"  Average change %: {minute_agg['price_change_pct'].mean():.2f}%")
print(f"  Largest increase: ₹{minute_agg['price_change'].max():.2f}")
print(f"  Largest decrease: ₹{minute_agg['price_change'].min():.2f}")

# Volume-weighted price analysis
print(f"\n=== VOLUME-WEIGHTED ANALYSIS ===")

# Calculate VWAP for each minute
minute_agg['minute_vwap'] = minute_agg['total_trnvr'] / minute_agg['total_qty']

# Compare VWAP with average price
minute_agg['vwap_vs_avg_diff'] = minute_agg['minute_vwap'] - minute_agg['avg_price']
print(f"VWAP vs Average Price analysis:")
print(f"  Average difference: ₹{minute_agg['vwap_vs_avg_diff'].mean():.2f}")
print(f"  Max difference: ₹{minute_agg['vwap_vs_avg_diff'].max():.2f}")
print(f"  Min difference: ₹{minute_agg['vwap_vs_avg_diff'].min():.2f}")

# Trading intensity analysis
print(f"\n=== TRADING INTENSITY ANALYSIS ===")

# Calculate volume per trade for each minute
minute_agg['volume_per_trade'] = minute_agg['total_qty'] / minute_agg['trade_count']

print(f"Volume per trade analysis:")
print(f"  Average volume per trade: {minute_agg['volume_per_trade'].mean():,.0f}")
print(f"  Highest volume per trade: {minute_agg['volume_per_trade'].max():,.0f}")
print(f"  Lowest volume per trade: {minute_agg['volume_per_trade'].min():,.0f}")

# Summary table
print(f"\n" + "="*80)
print("=== SUMMARY TABLE ===")
summary_data = {
    'Metric': ['Total Minutes', 'Total Volume', 'Total Turnover', 'Total Trades', 'Avg Price', 'Avg Volume/Min', 'Avg Turnover/Min'],
    'Value': [
        f"{len(minute_agg):,}",
        f"{minute_agg['total_qty'].sum():,}",
        f"₹{minute_agg['total_trnvr'].sum():,.2f}",
        f"{minute_agg['trade_count'].sum():,}",
        f"₹{minute_agg['avg_price'].mean():.2f}",
        f"{minute_agg['total_qty'].mean():,.0f}",
        f"₹{minute_agg['total_trnvr'].mean():,.2f}"
    ]
}

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Save aggregated data
print(f"\n=== DATA EXPORT ===")
print(f"✓ Minute-by-minute aggregation completed")
print(f"✓ {len(minute_agg)} minutes of aggregated data")
print(f"✓ Ready for time-series analysis and visualization")
print(f"✓ Data can be exported to CSV for further analysis")

# Display final aggregated dataframe info
print(f"\n=== AGGREGATED DATAFRAME INFO ===")
print(f"Columns: {list(minute_agg.columns)}")
print(f"Shape: {minute_agg.shape}")
print(f"Data types:")
print(minute_agg.dtypes)

=== MINUTE-BY-MINUTE TRADE AGGREGATION ===
Aggregating trades by minute...
Original aggregation columns: [('price', 'mean'), ('price', 'min'), ('price', 'max'), ('price', 'first'), ('price', 'last'), ('qty', 'sum'), ('qty', 'count'), ('trnvr', 'sum')]
Column levels: 2
=== MINUTE-BY-MINUTE AGGREGATED DATA ===
Total minutes with trades: 375
Time range: 2025-08-07 09:15:00 to 2025-08-07 15:29:00

=== FIRST 15 MINUTES ===
             minute  avg_price  min_price  max_price  open_price  close_price  total_qty  trade_count  total_trnvr
2025-08-07 09:15:00     386.24     385.70     386.95      386.85       386.00     240694          116  93007031.40
2025-08-07 09:16:00     387.07     385.60     387.75      386.00       387.60     176938           94  68447339.50
2025-08-07 09:17:00     388.06     387.55     388.55      387.60       388.55     109071           15  42309419.75
2025-08-07 09:18:00     389.00     388.45     389.45      388.85       389.00     140751           14  54755417.20
202

In [21]:
# Aggregate per hour: price volatility, volume, turnover
print("=== HOURLY AGGREGATION ANALYSIS ===")

# Create hourly aggregation
print("Aggregating data by hour...")

# Extract hour from datetime for grouping
df['hour_key'] = df['date'].dt.hour

# Group by hour and aggregate
hourly_agg = df.groupby('hour_key').agg({
    'price': ['mean', 'min', 'max', 'std', 'first', 'last'],  # Price statistics including volatility
    'qty': ['sum', 'count'],                                   # Total volume and trade count
    'trnvr': 'sum',                                            # Total turnover
    'price_change': ['mean', 'std', 'min', 'max']              # Price change statistics
}).round(2)

# Flatten column names
hourly_agg.columns = [
    'avg_price', 'min_price', 'max_price', 'price_std', 'open_price', 'close_price',
    'total_qty', 'trade_count', 'total_trnvr',
    'avg_price_change', 'price_change_std', 'min_price_change', 'max_price_change'
]

# Reset index to make hour_key a column
hourly_agg = hourly_agg.reset_index()
hourly_agg.rename(columns={'hour_key': 'hour'}, inplace=True)

# Display the hourly aggregation
print(f"\n=== HOURLY AGGREGATION RESULTS ===")
print(f"Shape: {hourly_agg.shape}")
print(hourly_agg)

# Find highest and lowest volume hours
print(f"\n=== VOLUME ANALYSIS BY HOUR ===")
highest_volume_hour = hourly_agg.loc[hourly_agg['total_qty'].idxmax()]
lowest_volume_hour = hourly_agg.loc[hourly_agg['total_qty'].idxmin()]

print("Highest volume hour:")
print(f"  Hour: {int(highest_volume_hour['hour']):02d}:00 - {int(highest_volume_hour['hour']):02d}:59")
print(f"  Volume: {highest_volume_hour['total_qty']:,}")
print(f"  Turnover: ₹{highest_volume_hour['total_trnvr']:,.2f}")
print(f"  Average price: ₹{highest_volume_hour['avg_price']:.2f}")

print("\nLowest volume hour:")
print(f"  Hour: {int(lowest_volume_hour['hour']):02d}:00 - {int(lowest_volume_hour['hour']):02d}:59")
print(f"  Volume: {lowest_volume_hour['total_qty']:,}")
print(f"  Turnover: ₹{lowest_volume_hour['total_trnvr']:,.2f}")
print(f"  Average price: ₹{lowest_volume_hour['avg_price']:.2f}")

# Volatility analysis by hour
print(f"\n=== VOLATILITY ANALYSIS BY HOUR ===")
highest_volatility_hour = hourly_agg.loc[hourly_agg['price_std'].idxmax()]
print(f"Highest volatility hour: {int(highest_volatility_hour['hour']):02d}:00")
print(f"  Price standard deviation: ₹{highest_volatility_hour['price_std']:.2f}")
print(f"  Price range: ₹{highest_volatility_hour['min_price']:.2f} - ₹{highest_volatility_hour['max_price']:.2f}")

# Summary statistics
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Total trading hours: {len(hourly_agg)}")
print(f"Average hourly volume: {hourly_agg['total_qty'].mean():,.0f}")
print(f"Average hourly turnover: ₹{hourly_agg['total_trnvr'].mean():,.2f}")
print(f"Average hourly price volatility: ₹{hourly_agg['price_std'].mean():.2f}")

=== HOURLY AGGREGATION ANALYSIS ===
Aggregating data by hour...

=== HOURLY AGGREGATION RESULTS ===
Shape: (7, 14)
   hour  avg_price  min_price  max_price  price_std  open_price  close_price  \
0     9     388.60     385.60     390.20       1.13      386.85       388.30   
1    10     388.23     387.45     389.25       0.39      388.10       388.10   
2    11     386.51     385.25     388.05       0.65      388.00       385.55   
3    12     385.22     384.40     386.50       0.44      385.70       385.40   
4    13     384.43     383.50     385.60       0.49      385.55       384.75   
5    14     385.97     384.70     387.10       0.63      384.70       387.05   
6    15     387.92     387.00     388.75       0.46      387.35       388.25   

   total_qty  trade_count   total_trnvr  avg_price_change  price_change_std  \
0    2164111         1047  8.408010e+08               0.0              0.12   
1    1436774         1476  5.577489e+08              -0.0              0.09   
2    13

In [22]:
# Aggregate per trading session: OHLC + total volume
print("=== TRADING SESSION OHLC ANALYSIS ===")

# Create trading session aggregation (assuming single day data)
print("Aggregating data by trading session...")

# For single day data, we can create session-level OHLC
# If you have multiple days, you can group by date_only instead
if len(df['date_only'].unique()) == 1:
    # Single trading day - create session OHLC
    session_ohlc = {
        'date': df['date_only'].iloc[0],
        'open': df['price'].iloc[0],           # First price of the day
        'high': df['price'].max(),             # Highest price of the day
        'low': df['price'].min(),              # Lowest price of the day
        'close': df['price'].iloc[-1],         # Last price of the day
        'total_volume': df['qty'].sum(),       # Total volume for the day
        'total_turnover': df['trnvr'].sum(),   # Total turnover for the day
        'trade_count': len(df),                # Total number of trades
        'price_range': df['price'].max() - df['price'].min(),  # High - Low
        'avg_price': df['price'].mean()        # Average price for the day
    }
    
    # Create DataFrame from the session data
    session_df = pd.DataFrame([session_ohlc])
    
    print(f"=== SINGLE TRADING SESSION OHLC ===")
    print(f"Date: {session_df['date'].iloc[0]}")
    print(f"Open: ₹{session_df['open'].iloc[0]:.2f}")
    print(f"High: ₹{session_df['high'].iloc[0]:.2f}")
    print(f"Low: ₹{session_df['low'].iloc[0]:.2f}")
    print(f"Close: ₹{session_df['close'].iloc[0]:.2f}")
    print(f"Price Range: ₹{session_df['price_range'].iloc[0]:.2f}")
    print(f"Total Volume: {session_df['total_volume'].iloc[0]:,}")
    print(f"Total Turnover: ₹{session_df['total_turnover'].iloc[0]:,.2f}")
    print(f"Trade Count: {session_df['trade_count'].iloc[0]:,}")
    print(f"Average Price: ₹{session_df['avg_price'].iloc[0]:.2f}")
    
    # Calculate additional metrics
    print(f"\n=== ADDITIONAL METRICS ===")
    session_df['price_change'] = session_df['close'] - session_df['open']
    session_df['price_change_pct'] = (session_df['price_change'] / session_df['open']) * 100
    
    print(f"Price Change: ₹{session_df['price_change'].iloc[0]:.2f}")
    print(f"Price Change %: {session_df['price_change_pct'].iloc[0]:.2f}%")
    
    # VWAP calculation for the session
    vwap = (df['price'] * df['qty']).sum() / df['qty'].sum()
    print(f"Session VWAP: ₹{vwap:.2f}")
    
    # Display the complete session DataFrame
    print(f"\n=== COMPLETE SESSION DATA ===")
    print(session_df)
    
else:
    # Multiple trading days - group by date
    print("Multiple trading days detected, grouping by date...")
    
    # Group by date and create OHLC for each day
    daily_ohlc = df.groupby('date_only').agg({
        'price': ['first', 'max', 'min', 'last'],  # OHLC
        'qty': 'sum',                               # Total volume
        'trnvr': 'sum',                             # Total turnover
        'qty': 'count'                              # Trade count
    }).round(2)
    
    # Flatten column names
    daily_ohlc.columns = ['open', 'high', 'low', 'close', 'total_volume', 'total_turnover', 'trade_count']
    
    # Reset index
    daily_ohlc = daily_ohlc.reset_index()
    
    # Calculate additional metrics
    daily_ohlc['price_range'] = daily_ohlc['high'] - daily_ohlc['low']
    daily_ohlc['price_change'] = daily_ohlc['close'] - daily_ohlc['open']
    daily_ohlc['price_change_pct'] = (daily_ohlc['price_change'] / daily_ohlc['open']) * 100
    
    print(f"\n=== MULTI-DAY OHLC DATA ===")
    print(daily_ohlc)

=== TRADING SESSION OHLC ANALYSIS ===
Aggregating data by trading session...
=== SINGLE TRADING SESSION OHLC ===
Date: 2025-08-07
Open: ₹386.85
High: ₹390.20
Low: ₹383.50
Close: ₹388.25
Price Range: ₹6.70
Total Volume: 11,510,171
Total Turnover: ₹4,451,969,111.70
Trade Count: 10,956
Average Price: ₹386.56

=== ADDITIONAL METRICS ===
Price Change: ₹1.40
Price Change %: 0.36%
Session VWAP: ₹386.79

=== COMPLETE SESSION DATA ===
         date    open   high    low   close  total_volume  total_turnover  \
0  2025-08-07  386.85  390.2  383.5  388.25      11510171    4.451969e+09   

   trade_count  price_range   avg_price  price_change  price_change_pct  
0        10956          6.7  386.561094           1.4          0.361897  


In [23]:
# Count trades per time interval (activity density)
print("=== TRADE ACTIVITY DENSITY ANALYSIS ===")

# Create different time intervals for analysis
print("Analyzing trade activity density across different time intervals...")

# 1. Minute-by-minute trade count
print("\n=== MINUTE-BY-MINUTE TRADE COUNT ===")
df['minute_key'] = df['date'].dt.floor('1min')
minute_trades = df.groupby('minute_key').size().reset_index(name='trade_count')
minute_trades['time'] = minute_trades['minute_key'].dt.time

print(f"Total minutes with trades: {len(minute_trades)}")
print(f"Average trades per minute: {minute_trades['trade_count'].mean():.1f}")
print(f"Max trades in a minute: {minute_trades['trade_count'].max()}")
print(f"Min trades in a minute: {minute_trades['trade_count'].min()}")

# Display top 10 most active minutes
print(f"\n=== TOP 10 MOST ACTIVE MINUTES ===")
top_active_minutes = minute_trades.nlargest(10, 'trade_count')
for idx, row in top_active_minutes.iterrows():
    print(f"{row['time']}: {row['trade_count']} trades")

# 2. 5-minute interval trade count
print(f"\n=== 5-MINUTE INTERVAL TRADE COUNT ===")
df['five_min_key'] = df['date'].dt.floor('5min')
five_min_trades = df.groupby('five_min_key').size().reset_index(name='trade_count')
five_min_trades['time'] = five_min_trades['five_min_key'].dt.time

print(f"Total 5-minute intervals: {len(five_min_trades)}")
print(f"Average trades per 5-min: {five_min_trades['trade_count'].mean():.1f}")
print(f"Max trades in 5-min: {five_min_trades['trade_count'].max()}")

# 3. 15-minute interval trade count
print(f"\n=== 15-MINUTE INTERVAL TRADE COUNT ===")
df['fifteen_min_key'] = df['date'].dt.floor('15min')
fifteen_min_trades = df.groupby('fifteen_min_key').size().reset_index(name='trade_count')
fifteen_min_trades['time'] = fifteen_min_trades['fifteen_min_key'].dt.time

print(f"Total 15-minute intervals: {len(fifteen_min_trades)}")
print(f"Average trades per 15-min: {fifteen_min_trades['trade_count'].mean():.1f}")
print(f"Max trades in 15-min: {fifteen_min_trades['trade_count'].max()}")

# 4. Hourly trade count
print(f"\n=== HOURLY TRADE COUNT ===")
df['hour_key'] = df['date'].dt.hour
hourly_trades = df.groupby('hour_key').size().reset_index(name='trade_count')
hourly_trades['time_range'] = hourly_trades['hour_key'].apply(lambda x: f"{x:02d}:00-{x:02d}:59")

print(f"Total trading hours: {len(hourly_trades)}")
print(f"Average trades per hour: {hourly_trades['trade_count'].mean():.1f}")
print(f"Max trades in an hour: {hourly_trades['trade_count'].max()}")

# Display hourly breakdown
print(f"\n=== HOURLY BREAKDOWN ===")
for idx, row in hourly_trades.iterrows():
    print(f"{row['time_range']}: {row['trade_count']:,} trades")

# 5. Activity density analysis
print(f"\n=== ACTIVITY DENSITY ANALYSIS ===")

# Find peak activity periods
peak_minute = minute_trades.loc[minute_trades['trade_count'].idxmax()]
peak_five_min = five_min_trades.loc[five_min_trades['trade_count'].idxmax()]
peak_fifteen_min = fifteen_min_trades.loc[fifteen_min_trades['trade_count'].idxmax()]
peak_hour = hourly_trades.loc[hourly_trades['trade_count'].idxmax()]

print(f"Peak activity minute: {peak_minute['time']} ({peak_minute['trade_count']} trades)")
print(f"Peak activity 5-min: {peak_five_min['time']} ({peak_five_min['trade_count']} trades)")
print(f"Peak activity 15-min: {peak_fifteen_min['time']} ({peak_fifteen_min['trade_count']} trades)")
print(f"Peak activity hour: {peak_hour['time_range']} ({peak_hour['trade_count']:,} trades)")

# 6. Quiet periods analysis
print(f"\n=== QUIET PERIODS ANALYSIS ===")
quiet_minutes = minute_trades[minute_trades['trade_count'] == minute_trades['trade_count'].min()]
print(f"Minutes with minimum activity ({minute_trades['trade_count'].min()} trades):")
for idx, row in quiet_minutes.head(5).iterrows():
    print(f"  {row['time']}")

# 7. Summary statistics
print(f"\n=== SUMMARY STATISTICS ===")
total_trades = len(df)
total_minutes = len(minute_trades)
total_hours = len(hourly_trades)

print(f"Total trades: {total_trades:,}")
print(f"Total active minutes: {total_minutes}")
print(f"Total trading hours: {total_hours}")
print(f"Average trades per minute: {total_trades/total_minutes:.1f}")
print(f"Average trades per hour: {total_trades/total_hours:.1f}")

# Display sample of minute-by-minute data
print(f"\n=== SAMPLE MINUTE-BY-MINUTE DATA (First 10 rows) ===")
print(minute_trades.head(10))

=== TRADE ACTIVITY DENSITY ANALYSIS ===
Analyzing trade activity density across different time intervals...

=== MINUTE-BY-MINUTE TRADE COUNT ===
Total minutes with trades: 375
Average trades per minute: 29.2
Max trades in a minute: 116
Min trades in a minute: 9

=== TOP 10 MOST ACTIVE MINUTES ===
09:15:00: 116 trades
15:09:00: 108 trades
09:22:00: 105 trades
15:23:00: 98 trades
09:19:00: 96 trades
15:26:00: 96 trades
09:16:00: 94 trades
09:21:00: 94 trades
15:17:00: 94 trades
12:49:00: 93 trades

=== 5-MINUTE INTERVAL TRADE COUNT ===
Total 5-minute intervals: 75
Average trades per 5-min: 146.1
Max trades in 5-min: 383

=== 15-MINUTE INTERVAL TRADE COUNT ===
Total 15-minute intervals: 25
Average trades per 15-min: 438.2
Max trades in 15-min: 971

=== HOURLY TRADE COUNT ===
Total trading hours: 7
Average trades per hour: 1565.1
Max trades in an hour: 1807

=== HOURLY BREAKDOWN ===
09:00-09:59: 1,047 trades
10:00-10:59: 1,476 trades
11:00-11:59: 1,653 trades
12:00-12:59: 1,807 trades
13:

In [24]:
# Calculate intraday volatility using standard deviation of price per minute/hour
print("=== INTRADAY VOLATILITY ANALYSIS ===")

# Calculate volatility at different time intervals
print("Calculating price volatility across different time intervals...")

# 1. Minute-by-minute volatility
print("\n=== MINUTE-BY-MINUTE VOLATILITY ===")
df['minute_key'] = df['date'].dt.floor('1min')
minute_volatility = df.groupby('minute_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']  # Price statistics including std dev
}).round(4)

# Flatten column names
minute_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
minute_volatility = minute_volatility.reset_index()
minute_volatility['time'] = minute_volatility['minute_key'].dt.time

# Filter out minutes with only 1 trade (std dev = 0 or NaN)
minute_volatility_filtered = minute_volatility[minute_volatility['trade_count'] > 1]

print(f"Total minutes analyzed: {len(minute_volatility)}")
print(f"Minutes with volatility (multiple trades): {len(minute_volatility_filtered)}")
print(f"Average minute volatility: ₹{minute_volatility_filtered['price_std'].mean():.4f}")
print(f"Max minute volatility: ₹{minute_volatility_filtered['price_std'].max():.4f}")

# 2. 5-minute interval volatility
print(f"\n=== 5-MINUTE INTERVAL VOLATILITY ===")
df['five_min_key'] = df['date'].dt.floor('5min')
five_min_volatility = df.groupby('five_min_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']
}).round(4)

five_min_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
five_min_volatility = five_min_volatility.reset_index()
five_min_volatility['time'] = five_min_volatility['five_min_key'].dt.time

five_min_filtered = five_min_volatility[five_min_volatility['trade_count'] > 1]

print(f"Total 5-minute intervals: {len(five_min_volatility)}")
print(f"Intervals with volatility: {len(five_min_filtered)}")
print(f"Average 5-min volatility: ₹{five_min_filtered['price_std'].mean():.4f}")
print(f"Max 5-min volatility: ₹{five_min_filtered['price_std'].max():.4f}")

# 3. 15-minute interval volatility
print(f"\n=== 15-MINUTE INTERVAL VOLATILITY ===")
df['fifteen_min_key'] = df['date'].dt.floor('15min')
fifteen_min_volatility = df.groupby('fifteen_min_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']
}).round(4)

fifteen_min_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
fifteen_min_volatility = fifteen_min_volatility.reset_index()
fifteen_min_volatility['time'] = fifteen_min_volatility['fifteen_min_key'].dt.time

fifteen_min_filtered = fifteen_min_volatility[fifteen_min_volatility['trade_count'] > 1]

print(f"Total 15-minute intervals: {len(fifteen_min_volatility)}")
print(f"Intervals with volatility: {len(fifteen_min_filtered)}")
print(f"Average 15-min volatility: ₹{fifteen_min_volatility['price_std'].mean():.4f}")
print(f"Max 15-min volatility: ₹{fifteen_min_volatility['price_std'].max():.4f}")

# 4. Hourly volatility
print(f"\n=== HOURLY VOLATILITY ===")
df['hour_key'] = df['date'].dt.hour
hourly_volatility = df.groupby('hour_key').agg({
    'price': ['mean', 'std', 'min', 'max', 'count']
}).round(4)

hourly_volatility.columns = ['avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']
hourly_volatility = hourly_volatility.reset_index()
hourly_volatility['time_range'] = hourly_volatility['hour_key'].apply(lambda x: f"{x:02d}:00-{x:02d}:59")

print(f"Total trading hours: {len(hourly_volatility)}")
print(f"Average hourly volatility: ₹{hourly_volatility['price_std'].mean():.4f}")
print(f"Max hourly volatility: ₹{hourly_volatility['price_std'].max():.4f}")

# 5. Peak volatility periods identification
print(f"\n=== PEAK VOLATILITY PERIODS ===")

# Find highest volatility periods
highest_vol_minute = minute_volatility_filtered.loc[minute_volatility_filtered['price_std'].idxmax()]
highest_vol_five_min = five_min_filtered.loc[five_min_filtered['price_std'].idxmax()]
highest_vol_fifteen_min = fifteen_min_filtered.loc[fifteen_min_filtered['price_std'].idxmax()]
highest_vol_hour = hourly_volatility.loc[hourly_volatility['price_std'].idxmax()]

print(f"Highest minute volatility: {highest_vol_minute['time']} (₹{highest_vol_minute['price_std']:.4f})")
print(f"  Price range: ₹{highest_vol_minute['min_price']:.2f} - ₹{highest_vol_minute['max_price']:.2f}")
print(f"  Trades: {highest_vol_minute['trade_count']}")

print(f"Highest 5-min volatility: {highest_vol_five_min['time']} (₹{highest_vol_five_min['price_std']:.4f})")
print(f"  Price range: ₹{highest_vol_five_min['min_price']:.2f} - ₹{highest_vol_five_min['max_price']:.2f}")

print(f"Highest 15-min volatility: {highest_vol_fifteen_min['time']} (₹{highest_vol_fifteen_min['price_std']:.4f})")
print(f"  Price range: ₹{highest_vol_fifteen_min['min_price']:.2f} - ₹{highest_vol_fifteen_min['max_price']:.2f}")

print(f"Highest hourly volatility: {highest_vol_hour['time_range']} (₹{highest_vol_hour['price_std']:.4f})")
print(f"  Price range: ₹{highest_vol_hour['min_price']:.2f} - ₹{highest_vol_hour['max_price']:.2f}")

# 6. Volatility distribution analysis
print(f"\n=== VOLATILITY DISTRIBUTION ANALYSIS ===")

# Calculate volatility percentiles
volatility_percentiles = minute_volatility_filtered['price_std'].quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99])
print("Minute volatility percentiles:")
for p, v in volatility_percentiles.items():
    print(f"  {p*100:2.0f}th percentile: ₹{v:.4f}")

# 7. Summary statistics
print(f"\n=== SUMMARY STATISTICS ===")
print(f"Overall price standard deviation: ₹{df['price'].std():.4f}")
print(f"Overall price range: ₹{df['price'].max() - df['price'].min():.2f}")
print(f"Overall coefficient of variation: {(df['price'].std() / df['price'].mean() * 100):.2f}%")

# Display sample volatility data
print(f"\n=== SAMPLE MINUTE VOLATILITY DATA (First 10 rows) ===")
print(minute_volatility_filtered[['time', 'avg_price', 'price_std', 'min_price', 'max_price', 'trade_count']].head(10))

=== INTRADAY VOLATILITY ANALYSIS ===
Calculating price volatility across different time intervals...

=== MINUTE-BY-MINUTE VOLATILITY ===
Total minutes analyzed: 375
Minutes with volatility (multiple trades): 375
Average minute volatility: ₹0.0957
Max minute volatility: ₹0.7306

=== 5-MINUTE INTERVAL VOLATILITY ===
Total 5-minute intervals: 75
Intervals with volatility: 75
Average 5-min volatility: ₹0.1911
Max 5-min volatility: ₹1.2729

=== 15-MINUTE INTERVAL VOLATILITY ===
Total 15-minute intervals: 25
Intervals with volatility: 25
Average 15-min volatility: ₹0.3198
Max 15-min volatility: ₹1.3852

=== HOURLY VOLATILITY ===
Total trading hours: 7
Average hourly volatility: ₹0.5978
Max hourly volatility: ₹1.1315

=== PEAK VOLATILITY PERIODS ===
Highest minute volatility: 09:16:00 (₹0.7306)
  Price range: ₹385.60 - ₹387.75
  Trades: 94
Highest 5-min volatility: 09:15:00 (₹1.2729)
  Price range: ₹385.60 - ₹389.45
Highest 15-min volatility: 09:15:00 (₹1.3852)
  Price range: ₹385.60 - ₹390.