In [1]:
# Check package installation status programmatically

import importlib
import pkg_resources

# Method 1: Using importlib to check if the module can be imported
try:
    module = importlib.import_module('src')
    print(f"✓ 'src' package is importable")
    print(f"  Path: {module.__file__}")
except ImportError:
    print("✗ 'src' package is not importable")

# Method 2: Using pkg_resources to check installed packages
try:
    package = pkg_resources.get_distribution('row-match-recognize')
    print(f"✓ 'row-match-recognize' package is installed")
    print(f"  Version: {package.version}")
    print(f"  Location: {package.location}")
    print(f"  Editable: {package.location == '/home/monierashraf/Desktop/llm/Row_match_recognize'}")
except pkg_resources.DistributionNotFound:
    print("✗ 'row-match-recognize' package is not installed")

# Try importing match_recognize directly
try:
    from src import match_recognize
    print(f"✓ 'match_recognize' function is importable from 'src'")
except ImportError:
    print("✗ 'match_recognize' function is not importable from 'src'")

  import pkg_resources


✓ 'src' package is importable
  Path: /home/monierashraf/anaconda3/lib/python3.12/site-packages/src/__init__.py
✓ 'row-match-recognize' package is installed
  Version: 0.1.0
  Location: /home/monierashraf/anaconda3/lib/python3.12/site-packages
  Editable: False
✓ 'match_recognize' function is importable from 'src'


In [1]:
# Simple direct import - no path manipulation needed!
from match_recognize import match_recognize
print("✅ Success: from match_recognize import match_recognize")

# Test that it works
print("✅ Function available as: match_recognize()")
print("✅ Module location:", match_recognize.__module__)
print(f"✅ Function type: {type(match_recognize)}")

print("\n🎉 Clean and simple - just like any other Python package!")
print("📦 Package installed and ready to use!")

# Example usage
print("\n📖 Usage example:")
print("   from match_recognize import match_recognize")
print("   result = match_recognize(sql_query, dataframe)")

ModuleNotFoundError: No module named 'match_recognize'

In [2]:
!pip install -e .


Obtaining file:///home/monierashraf/Desktop/llm/Row_match_recognize/Examples%20Match_reocginze
[31mERROR: file:///home/monierashraf/Desktop/llm/Row_match_recognize/Examples%20Match_reocginze does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m[31mERROR: file:///home/monierashraf/Desktop/llm/Row_match_recognize/Examples%20Match_reocginze does not appear to be a Python project: neither 'setup.py' nor 'pyproject.toml' found.[0m[31m
[0m

In [2]:
from match_recognize import match_recognize

# Example 1

# Direct Usage Examples

Now you can use `match_recognize()` directly without any prefixes! The function is imported and ready to use.

In [3]:
import pandas as pd
# match_recognize is already imported in the previous cell, so we can use it directly

# Sample data for testing
data = {
    'customer_id': [1, 1, 1, 1, 2, 2, 2, 3, 3, 3],
    'order_date': pd.to_datetime([
        '2023-01-01', '2023-01-05', '2023-01-10', '2023-01-15',
        '2023-02-01', '2023-02-05', '2023-02-10',
        '2023-03-01', '2023-03-05', '2023-03-10'
    ]),
    'price': [100, 80, 60, 40, 200, 150, 100, 300, 250, 200]
}

df = pd.DataFrame(data)
print("Sample data:")
print(df)

# Simple pattern matching - find declining price patterns
sql = """
SELECT customer_id, start_date, end_date, bottom_price
FROM memory.default.orders
MATCH_RECOGNIZE (
    PARTITION BY customer_id
    ORDER BY order_date
    MEASURES
        A.order_date AS start_date,
        C.order_date AS end_date,
        C.price AS bottom_price
    PATTERN (A B C)
    DEFINE
        B AS B.price < A.price,
        C AS C.price < B.price
);
"""

print("\nSQL Query:")
print(sql)

# Execute the pattern matching
result = match_recognize(sql, df)
print(f"\nResult shape: {result.shape}")
print("\nMatched patterns:")
print(result)

Sample data:
   customer_id order_date  price
0            1 2023-01-01    100
1            1 2023-01-05     80
2            1 2023-01-10     60
3            1 2023-01-15     40
4            2 2023-02-01    200
5            2 2023-02-05    150
6            2 2023-02-10    100
7            3 2023-03-01    300
8            3 2023-03-05    250
9            3 2023-03-10    200

SQL Query:

SELECT customer_id, start_date, end_date, bottom_price
FROM memory.default.orders
MATCH_RECOGNIZE (
    PARTITION BY customer_id
    ORDER BY order_date
    MEASURES
        A.order_date AS start_date,
        C.order_date AS end_date,
        C.price AS bottom_price
    PATTERN (A B C)
    DEFINE
        B AS B.price < A.price,
        C AS C.price < B.price
);


Result shape: (3, 4)

Matched patterns:
   customer_id start_date   end_date  bottom_price
0            1 2023-01-01 2023-01-10            60
1            2 2023-02-01 2023-02-10           100
2            3 2023-03-01 2023-03-10           200


# check_fix

In [2]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Simple test case to debug PREV function issue
data = [
    ('cust_1', '2020-05-11', 100),
    ('cust_1', '2020-05-12', 200),
    ('cust_2', '2020-05-13',   8),
    ('cust_1', '2020-05-14', 100),
    ('cust_2', '2020-05-15',   4),
    ('cust_1', '2020-05-16',  50),
    ('cust_1', '2020-05-17', 100),
    ('cust_2', '2020-05-18',   6),
]
# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'order_date', 'price'])
df['order_date'] = pd.to_datetime(df['order_date'])

print("Test data:")
print(df)
print()

# Test simple query without PREV first
query_simple = """
SELECT customer_id, start_price, final_price, start_date, final_date
    FROM memory.default.orders
        MATCH_RECOGNIZE (
            PARTITION BY customer_id
            ORDER BY order_date
            MEASURES
                START.price AS start_price,
                LAST(DOWN.price) AS final_price,
                START.order_date AS start_date,
                LAST(DOWN.order_date) AS final_date
            ONE ROW PER MATCH
            AFTER MATCH SKIP PAST LAST ROW
            PATTERN (START DOWN+)
            DEFINE
                DOWN AS price < 150
            );
"""

print("Testing simple query without PREV:")
try:
    result = match_recognize(query_simple, df)
    print(result)
except Exception as e:
    print(f"Error: {e}")
    import traceback
    traceback.print_exc()
print()

Test data:
  customer_id order_date  price
0      cust_1 2020-05-11    100
1      cust_1 2020-05-12    200
2      cust_2 2020-05-13      8
3      cust_1 2020-05-14    100
4      cust_2 2020-05-15      4
5      cust_1 2020-05-16     50
6      cust_1 2020-05-17    100
7      cust_2 2020-05-18      6

Testing simple query without PREV:
  customer_id  start_price  final_price start_date final_date
0      cust_1          200          100 2020-05-12 2020-05-17
1      cust_2            8            6 2020-05-13 2020-05-18



In [3]:
import pandas as pd
from src.executor.match_recognize import match_recognize
import pandas as pd

# Define the data
data = [
    ('cust_1', '2020-05-11', 100),
    ('cust_1', '2020-05-12', 200),
    ('cust_2', '2020-05-13',   8),
    ('cust_1', '2020-05-14', 100),
    ('cust_2', '2020-05-15',   4),
    ('cust_1', '2020-05-16',  50),
    ('cust_1', '2020-05-17', 100),
    ('cust_2', '2020-05-18',   6),
]

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'order_date', 'price'])

# Convert order_date column to datetime
df['order_date'] = pd.to_datetime(df['order_date'])

# Display the DataFrame
print(df)

query_basic = """
SELECT customer_id, start_date, end_date, bottom_price
FROM memory.default.orders
MATCH_RECOGNIZE (
PARTITION BY customer_id
ORDER BY order_date
MEASURES
A.order_date AS start_date,
C.order_date AS end_date,
C.price AS bottom_price
PATTERN (A B C)
DEFINE
B AS B.price < A.price,
C AS C.price < B.price
);


"""

print("Test 1:")
output_df = match_recognize(query_basic, df)
print(output_df)
print("\n")

  customer_id order_date  price
0      cust_1 2020-05-11    100
1      cust_1 2020-05-12    200
2      cust_2 2020-05-13      8
3      cust_1 2020-05-14    100
4      cust_2 2020-05-15      4
5      cust_1 2020-05-16     50
6      cust_1 2020-05-17    100
7      cust_2 2020-05-18      6
Test 1:
  customer_id start_date   end_date  bottom_price
0      cust_1 2020-05-12 2020-05-16            50




In [4]:
import pandas as pd
from src.executor.match_recognize import match_recognize
import pandas as pd

# Define the data
data = [
    ('cust_1', '2020-05-11', 100),
    ('cust_1', '2020-05-12', 200),
    ('cust_2', '2020-05-13',   8),
    ('cust_1', '2020-05-14', 100),
    ('cust_2', '2020-05-15',   4),
    ('cust_1', '2020-05-16',  50),
    ('cust_1', '2020-05-17', 100),
    ('cust_2', '2020-05-18',   6),
]

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'order_date', 'price'])

# Convert order_date column to datetime
df['order_date'] = pd.to_datetime(df['order_date'])

# Display the DataFrame
print(df)

query_basic = """
SELECT
customer_id,
start_price,
last_b_price AS peak_price,
last_c_price AS bottom_price,
last_d_price AS end_price,
start_date,
end_date
FROM memory.default.orders
MATCH_RECOGNIZE (
PARTITION BY customer_id
ORDER BY order_date
MEASURES
A.price AS start_price,
LAST(B.price) AS last_b_price,
LAST(C.price) AS last_c_price,
LAST(D.price) AS last_d_price,
A.order_date AS start_date,
LAST(D.order_date) AS end_date
PATTERN (A B+ C+ D+)
DEFINE
B AS B.price > PREV(price),
C AS C.price < PREV(price),
D AS D.price > PREV(price)
);

"""

print("Test 1:")
output_df = match_recognize(query_basic, df)
print(output_df)
print("\n")



  customer_id order_date  price
0      cust_1 2020-05-11    100
1      cust_1 2020-05-12    200
2      cust_2 2020-05-13      8
3      cust_1 2020-05-14    100
4      cust_2 2020-05-15      4
5      cust_1 2020-05-16     50
6      cust_1 2020-05-17    100
7      cust_2 2020-05-18      6
Test 1:
  customer_id  start_price  peak_price  bottom_price  end_price start_date  \
0      cust_1          100         200            50        100 2020-05-11   

    end_date  
0 2020-05-17  




In [5]:
import pandas as pd
from src.executor.match_recognize import match_recognize
import pandas as pd

# Define the data
data = [
    ('cust_1', '2020-05-11', 100),
    ('cust_1', '2020-05-12', 200),
    ('cust_2', '2020-05-13',   8),
    ('cust_1', '2020-05-14', 100),
    ('cust_2', '2020-05-15',   4),
    ('cust_1', '2020-05-16',  50),
    ('cust_1', '2020-05-17', 100),
    ('cust_2', '2020-05-18',   6),
]

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'order_date', 'price'])

# Convert order_date column to datetime
df['order_date'] = pd.to_datetime(df['order_date'])

# Display the DataFrame
print(df)

query_basic = """
SELECT
customer_id,
a_date AS first_date,
b_date AS second_date,
a_price AS first_price,
b_price AS second_price
FROM memory.default.orders
MATCH_RECOGNIZE (
PARTITION BY customer_id
ORDER BY order_date
MEASURES
A.order_date AS a_date,
B.order_date AS b_date,
A.price AS a_price,
B.price AS b_price
PATTERN (^A B)
DEFINE
A AS price < 150,
B AS price > PREV(price)
);

"""

print("Test 1:")
output_df = match_recognize(query_basic, df)
print(output_df)
print("\n")



  customer_id order_date  price
0      cust_1 2020-05-11    100
1      cust_1 2020-05-12    200
2      cust_2 2020-05-13      8
3      cust_1 2020-05-14    100
4      cust_2 2020-05-15      4
5      cust_1 2020-05-16     50
6      cust_1 2020-05-17    100
7      cust_2 2020-05-18      6
Test 1:
  customer_id first_date second_date  first_price  second_price
0      cust_1 2020-05-11  2020-05-12          100           200




In [6]:
import pandas as pd
from src.executor.match_recognize import match_recognize
import pandas as pd

# Define the data
data = [
    ('cust_1', '2020-05-11', 100),
    ('cust_1', '2020-05-12', 200),
    ('cust_2', '2020-05-13',   8),
    ('cust_1', '2020-05-14', 100),
    ('cust_2', '2020-05-15',   4),
    ('cust_1', '2020-05-16',  50),
    ('cust_1', '2020-05-17', 100),
    ('cust_2', '2020-05-18',   6),
]

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'order_date', 'price'])

# Convert order_date column to datetime
df['order_date'] = pd.to_datetime(df['order_date'])

# Display the DataFrame
print(df)

query_basic = """
SELECT
customer_id,
run_start,
run_end
FROM memory.default.orders
MATCH_RECOGNIZE (
PARTITION BY customer_id
ORDER BY order_date
MEASURES
A.order_date AS run_start,
LAST(B.order_date) AS run_end
PATTERN (A B+)
DEFINE
A AS price >= 100,
B AS price > PREV(price)
);

"""

print("Test 1:")
output_df = match_recognize(query_basic, df)
print(output_df)
print("\n")

  customer_id order_date  price
0      cust_1 2020-05-11    100
1      cust_1 2020-05-12    200
2      cust_2 2020-05-13      8
3      cust_1 2020-05-14    100
4      cust_2 2020-05-15      4
5      cust_1 2020-05-16     50
6      cust_1 2020-05-17    100
7      cust_2 2020-05-18      6
Test 1:
  customer_id  run_start    run_end
0      cust_1 2020-05-11 2020-05-12




In [7]:
import pandas as pd
from src.executor.match_recognize import match_recognize
import pandas as pd

# Define the data
data = [
    ('cust_1', '2020-05-11', 100),
    ('cust_1', '2020-05-12', 200),
    ('cust_2', '2020-05-13',   8),
    ('cust_1', '2020-05-14', 100),
    ('cust_2', '2020-05-15',   4),
    ('cust_1', '2020-05-16',  50),
    ('cust_1', '2020-05-17', 100),
    ('cust_2', '2020-05-18',   6),
]

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'order_date', 'price'])

# Convert order_date column to datetime
df['order_date'] = pd.to_datetime(df['order_date'])

# Display the DataFrame
print(df)

query_basic = """
SELECT
customer_id,
first_a_date AS a_date,
first_b_date AS b_date,
first_c_date AS c_date
FROM memory.default.orders
MATCH_RECOGNIZE (
PARTITION BY customer_id
ORDER BY order_date
MEASURES
A.order_date AS first_a_date,
B.order_date AS first_b_date,
C.order_date AS first_c_date
PATTERN (PERMUTE(A, B, C))
DEFINE
A AS price = 100,
B AS price = 50,
C AS price = 200
);

"""

print("Test 1:")
output_df = match_recognize(query_basic, df)
print(output_df)
print("\n")



  customer_id order_date  price
0      cust_1 2020-05-11    100
1      cust_1 2020-05-12    200
2      cust_2 2020-05-13      8
3      cust_1 2020-05-14    100
4      cust_2 2020-05-15      4
5      cust_1 2020-05-16     50
6      cust_1 2020-05-17    100
7      cust_2 2020-05-18      6
Test 1:
  customer_id     a_date     b_date     c_date
0      cust_1 2020-05-14 2020-05-16 2020-05-12




In [8]:
import pandas as pd
from src.executor.match_recognize import match_recognize
import pandas as pd

# Define the data
data = [
    ('cust_1', '2020-05-11', 100),
    ('cust_1', '2020-05-12', 200),
    ('cust_2', '2020-05-13',   8),
    ('cust_1', '2020-05-14', 100),
    ('cust_2', '2020-05-15',   4),
    ('cust_1', '2020-05-16',  50),
    ('cust_1', '2020-05-17', 100),
    ('cust_2', '2020-05-18',   6),
]

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'order_date', 'price'])

# Convert order_date column to datetime
df['order_date'] = pd.to_datetime(df['order_date'])

# Display the DataFrame
print(df)

query_basic = """
SELECT
  customer_id,
  start_price,
  last_b_price    AS peak_price,
  last_c_price    AS bottom_price,
  last_d_price    AS end_price,
  start_date,
  end_date
FROM memory.default.orders
MATCH_RECOGNIZE (
  PARTITION BY customer_id
  ORDER BY order_date
  MEASURES
    A.price               AS start_price,
    LAST(B.price)         AS last_b_price,
    LAST(C.price)         AS last_c_price,
    LAST(D.price)         AS last_d_price,
    A.order_date          AS start_date,
    LAST(D.order_date)    AS end_date
  PATTERN (A B+ C+ D+)
  DEFINE
    B AS B.price > PREV(price),
    C AS C.price < PREV(price),
    D AS D.price > PREV(price)
) AS t;

"""

print("Test 1:")
output_df = match_recognize(query_basic, df)
print(output_df)
print("\n")



  customer_id order_date  price
0      cust_1 2020-05-11    100
1      cust_1 2020-05-12    200
2      cust_2 2020-05-13      8
3      cust_1 2020-05-14    100
4      cust_2 2020-05-15      4
5      cust_1 2020-05-16     50
6      cust_1 2020-05-17    100
7      cust_2 2020-05-18      6
Test 1:




  customer_id  start_price  peak_price  bottom_price  end_price start_date  \
0      cust_1          100         200            50        100 2020-05-11   

    end_date  
0 2020-05-17  




In [9]:

import pandas as pd
from src.executor.match_recognize import match_recognize
import pandas as pd

# Define the data
data = [
    ('cust_1', '2020-05-11', 100),
    ('cust_1', '2020-05-12', 200),
    ('cust_2', '2020-05-13',   8),
    ('cust_1', '2020-05-14', 100),
    ('cust_2', '2020-05-15',   4),
    ('cust_1', '2020-05-16',  50),
    ('cust_1', '2020-05-17', 100),
    ('cust_2', '2020-05-18',   6),
]

# Create DataFrame
df = pd.DataFrame(data, columns=['customer_id', 'order_date', 'price'])

# Convert order_date column to datetime
df['order_date'] = pd.to_datetime(df['order_date'])

# Display the DataFrame
print(df)

query_basic_permute = """
SELECT customer_id, start_price, bottom_price, final_price, start_date, final_date
    FROM memory.default.orders
        MATCH_RECOGNIZE (
            PARTITION BY customer_id
            ORDER BY order_date
            MEASURES
                START.price AS start_price,
                LAST(DOWN.price) AS bottom_price,
                LAST(UP.price) AS final_price,
                START.order_date AS start_date,
                LAST(UP.order_date) AS final_date
            ONE ROW PER MATCH
            AFTER MATCH SKIP PAST LAST ROW
            PATTERN (START DOWN+ UP+)
            DEFINE
                DOWN AS price < PREV(price),
                UP AS price > PREV(price)
            );
"""

print("Test 1: Basic PERMUTE - Should match all sequences with A, B, C in any order")
output_df = match_recognize(query_basic_permute, df)
print(output_df)
print("\n")

  customer_id order_date  price
0      cust_1 2020-05-11    100
1      cust_1 2020-05-12    200
2      cust_2 2020-05-13      8
3      cust_1 2020-05-14    100
4      cust_2 2020-05-15      4
5      cust_1 2020-05-16     50
6      cust_1 2020-05-17    100
7      cust_2 2020-05-18      6
Test 1: Basic PERMUTE - Should match all sequences with A, B, C in any order
  customer_id  start_price  bottom_price  final_price start_date final_date
0      cust_1          200            50          100 2020-05-12 2020-05-17
1      cust_2            8             4            6 2020-05-13 2020-05-18




# Example #2

In [10]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different permutation patterns
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE Patterns\n")

# Test 1: Basic PERMUTE - Match any order of A, B, C
query_basic_permute = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        A.value AS a_value,
        B.value AS b_value,
        C.value AS c_value
    ONE ROW PER MATCH
    PATTERN (PERMUTE(A, B, C))
    DEFINE 
        A AS event_type = 'start',
        B AS event_type = 'middle',
        C AS event_type = 'end'
);
"""

print("Test 1: Basic PERMUTE - Should match all sequences with A, B, C in any order")
output_df = match_recognize(query_basic_permute, df)
print(output_df)
print("\n")

Testing PERMUTE Patterns

Test 1: Basic PERMUTE - Should match all sequences with A, B, C in any order
   seq pattern_var  match_num  a_value  b_value  c_value
0    1           C          1      100      200      300
1    2           C          1      150      250      350
2    3           B          1      175      375      275
3    4           A          1      425      325      225




In [11]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different permutation patterns
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE Patterns\n")

# Test 1: Basic PERMUTE - Match any order of A, B, C
query_basic_permute = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        A.value AS a_value,
        B.value AS b_value,
        C.value AS c_value
  
    PATTERN (PERMUTE(A, B, C))
    DEFINE 
        A AS event_type = 'start',
        B AS event_type = 'middle',
        C AS event_type = 'end'
);
"""

print("Test 1: Basic PERMUTE - Should match all sequences with A, B, C in any order")
output_df = match_recognize(query_basic_permute, df)
print(output_df)
print("\n")

Testing PERMUTE Patterns

Test 1: Basic PERMUTE - Should match all sequences with A, B, C in any order
   seq pattern_var  match_num  a_value  b_value  c_value
0    1           C          1      100      200      300
1    2           C          1      150      250      350
2    3           B          1      175      375      275
3    4           A          1      425      325      225




In [12]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different permutation patterns
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE Patterns\n")
# Test 2: PERMUTE with Quantifier
query_permute_quantifier = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        FIRST(A.value) AS first_a_value,
        LAST(C.value) AS last_c_value
    ONE ROW PER MATCH
    PATTERN (PERMUTE(A, B, C)+)
    DEFINE 
        A AS event_type = 'start',
        B AS event_type = 'middle',
        C AS event_type = 'end'
);
"""

print("Test 2: PERMUTE with Quantifier - Should match one or more occurrences of permutations")
output_df = match_recognize(query_permute_quantifier, df)
print(output_df)
print("\n")

Testing PERMUTE Patterns

Test 2: PERMUTE with Quantifier - Should match one or more occurrences of permutations
   seq pattern_var  match_num  first_a_value  last_c_value
0    1           C          1            100           300
1    2           C          1            150           350
2    3           B          1            175           275
3    4           A          1            425           225




In [13]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different permutation patterns
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE Patterns\n")
# Test 3: PERMUTE with ALL ROWS PER MATCH
query_permute_all_rows = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        RUNNING LAST(A.value) AS running_a_value
    ALL ROWS PER MATCH
    PATTERN (PERMUTE(A, B, C))
    DEFINE 
        A AS event_type = 'start',
        B AS event_type = 'middle',
        C AS event_type = 'end'
);
"""

print("Test 3: PERMUTE with ALL ROWS PER MATCH - Shows all matched rows")
output_df = match_recognize(query_permute_all_rows, df)
print(output_df)
print("\n")


Testing PERMUTE Patterns

Test 3: PERMUTE with ALL ROWS PER MATCH - Shows all matched rows
    seq  step pattern_var  match_num running_a_value event_type  value  id
0     1     1           A          1             100      start    100   1
1     1     2           B          1             100     middle    200   2
2     1     3           C          1             100        end    300   3
3     2     1           B          1            None     middle    250   4
4     2     2           A          1             150      start    150   5
5     2     3           C          1             150        end    350   6
6     3     1           A          1             175      start    175   7
7     3     2           C          1             175        end    275   8
8     3     3           B          1             175     middle    375   9
9     4     1           C          1            None        end    225  10
10    4     2           B          1            None     middle    325  11
11    4  

# Check_fix


In [1]:

import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different permutation patterns
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE Patterns\n")
# Test 4: PERMUTE with Subset Variables
query_permute_subset = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        X.value AS x_value,
        Y.value AS y_value
    ONE ROW PER MATCH
    PATTERN (PERMUTE(A, B, C))
    SUBSET
        X = (A, B),
        Y = (B, C)
    DEFINE 
        A AS event_type = 'start',
        B AS event_type = 'middle',
        C AS event_type = 'end'
);
"""

print("Test 4: PERMUTE with Subset Variables - Using subset groupings")
output_df = match_recognize(query_permute_subset, df)
print(output_df)
print("\n")


Testing PERMUTE Patterns

Test 4: PERMUTE with Subset Variables - Using subset groupings
   seq pattern_var  match_num  x_value  y_value
0    1           C          1      200      300
1    2           C          1      150      350
2    3           B          1      375      375
3    4           A          1      425      325




In [1]:

import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different permutation patterns
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE Patterns\n")
# Test 5: Nested PERMUTE patterns
query_nested_permute = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        A.value AS a_value,
        B.value AS b_value,
        C.value AS c_value
    ONE ROW PER MATCH
    PATTERN (PERMUTE(A, PERMUTE(B, C)))
    DEFINE 
        A AS event_type = 'start',
        B AS event_type = 'middle',
        C AS event_type = 'end'
);
"""

print("Test 5: Nested PERMUTE - Testing nested permutation patterns")
output_df = match_recognize(query_nested_permute, df)
print(output_df)
print("\n")

Testing PERMUTE Patterns

Test 5: Nested PERMUTE - Testing nested permutation patterns
   seq pattern_var  match_num  a_value  b_value  c_value
0    1           C          1      100      200      300
1    3           B          1      175      375      275
2    4           A          1      425      325      225




In [16]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different permutation patterns
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE Patterns\n")
# Test 6: PERMUTE with Complex Conditions
query_permute_complex = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        A.value AS start_value,
        B.value AS middle_value,
        C.value AS end_value
    ONE ROW PER MATCH
    PATTERN (PERMUTE(A, B, C))
    DEFINE 
        A AS event_type = 'start' AND A.value < NEXT(A.value),
        B AS event_type = 'middle' AND B.value > PREV(B.value),
        C AS event_type = 'end' AND C.value > FIRST(A.value)
);
"""

print("Test 6: PERMUTE with Complex Conditions - Testing complex pattern definitions")
output_df = match_recognize(query_permute_complex, df)
print(output_df)
print("\n")

Testing PERMUTE Patterns

Test 6: PERMUTE with Complex Conditions - Testing complex pattern definitions


   seq pattern_var  match_num  start_value  middle_value  end_value
0    1           C          1          100           200        300
1    3           B          1          175           375        275




In [17]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different permutation patterns
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE Patterns\n")
# Test 6: PERMUTE with Complex Conditions
query_permute_complex = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        A.value AS start_value,
        B.value AS middle_value,
        C.value AS end_value
    ONE ROW PER MATCH
    PATTERN (PERMUTE(A, B, C))
    DEFINE 
        A AS event_type = 'start' AND A.value < NEXT(A.value),
        B AS event_type = 'middle' AND B.value > PREV(B.value),
        C AS event_type = 'end' AND C.value > FIRST(A.value)
);
"""

print("Test 6: PERMUTE with Complex Conditions - Testing complex pattern definitions")
output_df = match_recognize(query_permute_complex, df)
print(output_df)
print("\n")

Testing PERMUTE Patterns

Test 6: PERMUTE with Complex Conditions - Testing complex pattern definitions
   seq pattern_var  match_num  start_value  middle_value  end_value
0    1           C          1          100           200        300
1    3           B          1          175           375        275




In [18]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data for PERMUTE with subset variables
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE with Subset Variables - Trino Compatibility\n")


# Test 7: PERMUTE with Edge Cases
query_permute_edge_cases = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        A.value AS a_value,
        LAST(B.value) AS last_b_value,
        FIRST(C.value) AS first_c_value
    one ROW PER MATCH
    PATTERN (PERMUTE(A, B?, C?))
    DEFINE 
        A AS event_type = 'start',
        B AS event_type = 'middle',
        C AS event_type = 'end'
);
"""

print("Test 7: PERMUTE with Edge Cases - Testing optional elements")
output_df = match_recognize(query_permute_edge_cases, df)
print(output_df)

Testing PERMUTE with Subset Variables - Trino Compatibility

Test 7: PERMUTE with Edge Cases - Testing optional elements
   seq pattern_var  match_num  a_value  last_b_value  first_c_value
0    1           C          1      100         200.0            300
1    2           C          1      150         250.0            350
2    3           C          1      175           NaN            275
3    4           A          1      425         325.0            225


In [19]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data for PERMUTE with subset variables
data = [
    # Sequence 1: Has A-B-C pattern
    {"id": 1, "seq": 1, "step": 1, "event_type": "start", "value": 100},  # A
    {"id": 2, "seq": 1, "step": 2, "event_type": "middle", "value": 200}, # B
    {"id": 3, "seq": 1, "step": 3, "event_type": "end", "value": 300},    # C
    
    # Sequence 2: Has B-A-C pattern
    {"id": 4, "seq": 2, "step": 1, "event_type": "middle", "value": 250}, # B
    {"id": 5, "seq": 2, "step": 2, "event_type": "start", "value": 150},  # A
    {"id": 6, "seq": 2, "step": 3, "event_type": "end", "value": 350},    # C
    
    # Sequence 3: Has A-C-B pattern
    {"id": 7, "seq": 3, "step": 1, "event_type": "start", "value": 175},  # A
    {"id": 8, "seq": 3, "step": 2, "event_type": "end", "value": 275},    # C
    {"id": 9, "seq": 3, "step": 3, "event_type": "middle", "value": 375}, # B
    
    # Sequence 4: Has C-B-A pattern
    {"id": 10, "seq": 4, "step": 1, "event_type": "end", "value": 225},   # C
    {"id": 11, "seq": 4, "step": 2, "event_type": "middle", "value": 325}, # B
    {"id": 12, "seq": 4, "step": 3, "event_type": "start", "value": 425},  # A
]

df = pd.DataFrame(data)

print("Testing PERMUTE with Subset Variables - Trino Compatibility\n")


# Test 7: PERMUTE with Edge Cases
query_permute_edge_cases = """
SELECT * FROM memory.default.op2 MATCH_RECOGNIZE(
    PARTITION BY seq
    ORDER BY step
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        A.value AS a_value,
        LAST(B.value) AS last_b_value,
        FIRST(C.value) AS first_c_value
    all ROWs PER MATCH
    PATTERN (PERMUTE(A, B?, C?))
    DEFINE 
        A AS event_type = 'start',
        B AS event_type = 'middle',
        C AS event_type = 'end'
);
"""

print("Test 7: PERMUTE with Edge Cases - Testing optional elements")
output_df = match_recognize(query_permute_edge_cases, df)
print(output_df)

Testing PERMUTE with Subset Variables - Trino Compatibility

Test 7: PERMUTE with Edge Cases - Testing optional elements
    seq  step pattern_var  match_num a_value last_b_value  first_c_value  \
0     1     1           A          1     100         None            300   
1     1     2           B          1     100          200            300   
2     1     3           C          1     100          200            300   
3     2     1           B          1    None          250            350   
4     2     2           A          1     150          250            350   
5     2     3           C          1     150          250            350   
6     3     1           A          1     175         None            275   
7     3     2           C          1     175         None            275   
8     4     1           C          1    None         None            225   
9     4     2           B          1    None          325            225   
10    4     3           A          1     42

# Example 3


In [20]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Validation query with ALL ROWS PER MATCH
query = """
    SELECT * FROM memory.default.employees MATCH_RECOGNIZE (
        PARTITION BY department, region
        ORDER BY hire_date
        MEASURES 
            salary AS current_salary,
            RUNNING SUM(salary) AS running_sum,
            MATCH_NUMBER() AS match_num
        ALL ROWS PER MATCH
        PATTERN (A+)
        DEFINE A AS salary > 1000
    );
    """
    
data = [
    {"id": 1, "name": "Alice",   "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
    {"id": 2, "name": "Bob",     "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
    {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
    {"id": 4, "name": "Diana",   "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
]
    
output_df = match_recognize(query, pd.DataFrame(data))
print("Match Recognize Output:")
print(output_df)


Match Recognize Output:
  department region   hire_date  current_salary  running_sum  match_num  \
0      Sales   West  2021-01-01            1200         1200          1   
1      Sales   West  2021-01-02            1300         2500          1   
2      Sales   West  2021-01-04            1100         1100          2   

   salary   name  id  
0    1200  Alice   1  
1    1300    Bob   2  
2    1100  Diana   4  


In [21]:
import pandas as pd
from src.executor.match_recognize import match_recognize
# Use an absolute import for match_recognize.

query = """
    SELECT * FROM memory.default.employees MATCH_RECOGNIZE (
        PARTITION BY department, region
        ORDER BY hire_date
        MEASURES salary AS avg_salary
        PATTERN (A+)
        DEFINE A AS salary > 1000
    );
    """
    
data = [
        {"id": 1, "name": "Alice",   "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
        {"id": 2, "name": "Bob",     "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
        {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
        {"id": 4, "name": "Diana",   "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
    ]
    
output_df = match_recognize(query, pd.DataFrame(data))
print("Match Recognize Output:")
print(output_df)


Match Recognize Output:
  department region  avg_salary
0      Sales   West        1300
1      Sales   West        1100


In [22]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Validation query with ALL ROWS PER MATCH
query = """
    SELECT * FROM memory.default.employees MATCH_RECOGNIZE (
        PARTITION BY department, region
        ORDER BY hire_date
        MEASURES 
            salary AS current_salary,
            RUNNING SUM(salary) AS running_sum,
            MATCH_NUMBER() AS match_num
        ALL ROWS PER MATCH
        PATTERN (A*)
        DEFINE A AS salary > 1000
    );
    """
    
data = [
    {"id": 1, "name": "Alice",   "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
    {"id": 2, "name": "Bob",     "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
    {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
    {"id": 4, "name": "Diana",   "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
]
    
output_df = match_recognize(query, pd.DataFrame(data))
print("Match Recognize Output:")
print(output_df)


Match Recognize Output:
  department region   hire_date current_salary running_sum  match_num  salary  \
0      Sales   West  2021-01-01           1200        1200          1    1200   
1      Sales   West  2021-01-02           1300        2500          1    1300   
2      Sales   West  2021-01-03           None        None          2     900   
3      Sales   West  2021-01-04           1100        1100          3    1100   

      name  id  
0    Alice   1  
1      Bob   2  
2  Charlie   3  
3    Diana   4  


In [23]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Validation query with ALL ROWS PER MATCH
query = """
SELECT *
FROM memory.default.employees 
MATCH_RECOGNIZE (
  PARTITION BY department, region
  ORDER BY hire_date
  MEASURES 
    A.salary AS starting_salary,
    LAST(C.salary) AS ending_salary,
    MATCH_NUMBER() AS match_num
  ONE ROW PER MATCH
  AFTER MATCH SKIP PAST LAST ROW
  PATTERN (A B+ C+)
  DEFINE 
    A AS salary > 1000,
    B AS salary < 1000,
    C AS salary > 1000
);


    """
    
data = [
    {"id": 1, "name": "Alice",   "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
    {"id": 2, "name": "Bob",     "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
    {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
    {"id": 4, "name": "Diana",   "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
]
    
output_df = match_recognize(query, pd.DataFrame(data))
print("Match Recognize Output:")
print(output_df)


Match Recognize Output:
  department region  starting_salary  ending_salary  match_num
0      Sales   West             1300           1100          1


In [24]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Example query with comprehensive CLASSIFIER usage
query = """
SELECT * FROM memory.default.employees  MATCH_RECOGNIZE(
    PARTITION BY department, region
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        salary AS current_salary,
        RUNNING SUM(salary) AS running_sum
    ALL ROWS PER MATCH
    PATTERN (A C* {- B+ -} C+)
    DEFINE 
        A AS salary > 1000,
        B AS salary < 1000,
        C AS salary > 1000
);

"""

data = [
    {"id": 1, "name": "Alice", "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
    {"id": 2, "name": "Bob",   "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
    {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
    {"id": 4, "name": "Diana", "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
]


output_df = match_recognize(query, pd.DataFrame(data))
print("Match Recognize Output:")
print(output_df)

Match Recognize Output:
  department region   hire_date pattern_var  current_salary  running_sum  \
0      Sales   West  2021-01-01           A            1200         1200   
1      Sales   West  2021-01-02           C            1300         2500   
2      Sales   West  2021-01-04           C            1100         4500   

   salary   name  id  
0    1200  Alice   1  
1    1300    Bob   2  
2    1100  Diana   4  


In [25]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Example query with comprehensive CLASSIFIER usage
query = """
SELECT * FROM memory.default.employees  MATCH_RECOGNIZE(
    PARTITION BY department, region
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        salary AS current_salary,
        RUNNING SUM(salary) AS running_sum
    ALL ROWS PER MATCH
    PATTERN (A {- B+ -} C+)
    DEFINE 
        A AS salary > 1000,
        B AS salary < 1000,
        C AS salary > 1000
);

"""

data = [
    {"id": 1, "name": "Alice", "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
    {"id": 2, "name": "Bob",   "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
    {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
    {"id": 4, "name": "Diana", "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
]


output_df = match_recognize(query, pd.DataFrame(data))
print("Match Recognize Output:")
print(output_df)

Match Recognize Output:
  department region   hire_date pattern_var  current_salary  running_sum  \
0      Sales   West  2021-01-02           A            1300         1300   
1      Sales   West  2021-01-04           C            1100         3300   

   salary   name  id  
0    1300    Bob   2  
1    1100  Diana   4  


In [26]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Example query with comprehensive CLASSIFIER usage
query = """
SELECT * FROM  memory.default.employees MATCH_RECOGNIZE(
    PARTITION BY department, region
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        CLASSIFIER(A) AS is_a_var,
        CLASSIFIER(C) AS is_c_var,
        salary AS current_salary,
        RUNNING SUM(salary) AS running_sum
    ONE ROW PER MATCH
    PATTERN (A {- B+ -} C+)
    DEFINE 
        A AS salary > 1000,
        B AS salary < 1000,
        C AS salary > 1000
);
"""

data = [
    {"id": 1, "name": "Alice", "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
    {"id": 2, "name": "Bob",   "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
    {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
    {"id": 4, "name": "Diana", "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
]


output_df = match_recognize(query, pd.DataFrame(data))
print("Match Recognize Output:")
print(output_df)

Match Recognize Output:
  department region pattern_var is_a_var is_c_var  current_salary  running_sum
0      Sales   West           C        A        C            1100         3300


In [27]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Example query with comprehensive CLASSIFIER usage
query = """
SELECT * FROM memory.default.employees MATCH_RECOGNIZE(
    PARTITION BY department
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num
    ONE ROW PER MATCH
    PATTERN (^A+)
    DEFINE 
        A AS salary > 1000
);
"""

data = [
    {"id": 1, "name": "Alice", "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
    {"id": 2, "name": "Bob",   "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
    {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
    {"id": 4, "name": "Diana", "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
]


output_df = match_recognize(query, pd.DataFrame(data))
print("Match Recognize Output:")
print(output_df)

Match Recognize Output:
  department pattern_var  match_num
0      Sales           A          1


# Example 4


In [28]:
import pandas as pd
from src.executor.match_recognize import match_recognize

# Create test data with different departments to test partition behavior
data = [
    # Sales department - First row has high salary
    {"id": 1, "name": "Alice", "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
    {"id": 2, "name": "Bob",   "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
    {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
    {"id": 4, "name": "Diana", "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
    
    # Marketing department - Last row has high salary
    {"id": 5, "name": "Eve", "department": "Marketing", "region": "East", "hire_date": "2021-01-01", "salary": 900},
    {"id": 6, "name": "Frank", "department": "Marketing", "region": "East", "hire_date": "2021-01-02", "salary": 950},
    {"id": 7, "name": "Grace", "department": "Marketing", "region": "East", "hire_date": "2021-01-03", "salary": 980},
    {"id": 8, "name": "Henry", "department": "Marketing", "region": "East", "hire_date": "2021-01-04", "salary": 1200},
    
    # IT department - All rows have high salary
    {"id": 9, "name": "Ivy", "department": "IT", "region": "North", "hire_date": "2021-01-01", "salary": 1500},
    {"id": 10, "name": "Jack", "department": "IT", "region": "North", "hire_date": "2021-01-02", "salary": 1600},
    {"id": 11, "name": "Kate", "department": "IT", "region": "North", "hire_date": "2021-01-03", "salary": 1700},
    {"id": 12, "name": "Leo", "department": "IT", "region": "North", "hire_date": "2021-01-04", "salary": 1800},
    
    # HR department - No rows have high salary
    {"id": 13, "name": "Mike", "department": "HR", "region": "South", "hire_date": "2021-01-01", "salary": 950},
    {"id": 14, "name": "Nina", "department": "HR", "region": "South", "hire_date": "2021-01-02", "salary": 980},
    {"id": 15, "name": "Oscar", "department": "HR", "region": "South", "hire_date": "2021-01-03", "salary": 990},
    {"id": 16, "name": "Pam", "department": "HR", "region": "South", "hire_date": "2021-01-04", "salary": 995},
]

df = pd.DataFrame(data)

print("Testing Pattern Anchors\n")

# Test 1: Start anchor (^) - Should match patterns starting at the beginning of a partition
query_start_anchor = """
SELECT * FROM memory.default.employee_data MATCH_RECOGNIZE(
    PARTITION BY department
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num
    ONE ROW PER MATCH
    PATTERN (^A+)
    DEFINE 
        A AS salary > 1000
);
"""

print("Test 1: Start Anchor (^) - Should only match departments where first employee has salary > 1000")
output_df = match_recognize(query_start_anchor, df)
print(output_df)
print("\n")


Testing Pattern Anchors

Test 1: Start Anchor (^) - Should only match departments where first employee has salary > 1000
  department pattern_var  match_num
0         IT           A          1
1      Sales           A          1




In [29]:

# Test 2: End anchor ($) - Should match patterns ending at the end of a partition
query_end_anchor = """
SELECT * FROM memory.default.employee_data MATCH_RECOGNIZE(
    PARTITION BY department
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num
    ONE ROW PER MATCH
    PATTERN (A+$)
    DEFINE 
        A AS salary > 1000
);
"""

print("Test 2: End Anchor ($) - Should only match departments where last employee has salary > 1000")
output_df = match_recognize(query_end_anchor, df)
print(output_df)
print("\n")



Test 2: End Anchor ($) - Should only match departments where last employee has salary > 1000
  department pattern_var  match_num
0         IT           A          1
1  Marketing           A          1
2      Sales           A          1




In [30]:
# Test 3: Both anchors (^$) - Should match patterns spanning the entire partition
query_both_anchors = """
SELECT * FROM memory.default.employee_data MATCH_RECOGNIZE(
    PARTITION BY department
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num
    ONE ROW PER MATCH
    PATTERN (^A+$)
    DEFINE 
        A AS salary > 1000
);
"""

print("Test 3: Both Anchors (^$) - Should only match departments where ALL employees have salary > 1000")
output_df = match_recognize(query_both_anchors, df)
print(output_df)
print("\n")


Test 3: Both Anchors (^$) - Should only match departments where ALL employees have salary > 1000
  department pattern_var  match_num
0         IT           A          1




In [31]:

# Test 4: Start anchor with ALL ROWS PER MATCH to see the actual matched rows
query_start_all_rows = """
SELECT * FROM memory.default.employee_data MATCH_RECOGNIZE(
    PARTITION BY department
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num
    ALL ROWS PER MATCH
    PATTERN (^A+)
    DEFINE 
        A AS salary > 1000
);
"""

print("Test 4: Start Anchor (^) with ALL ROWS PER MATCH - Shows matched rows")
output_df = match_recognize(query_start_all_rows, df)
print(output_df)


Test 4: Start Anchor (^) with ALL ROWS PER MATCH - Shows matched rows
  department   hire_date pattern_var  match_num  salary   name  id region
0         IT  2021-01-01           A          1    1500    Ivy   9  North
1         IT  2021-01-02           A          1    1600   Jack  10  North
2         IT  2021-01-03           A          1    1700   Kate  11  North
3         IT  2021-01-04           A          1    1800    Leo  12  North
4      Sales  2021-01-01           A          1    1200  Alice   1   West
5      Sales  2021-01-02           A          1    1300    Bob   2   West


In [32]:
# Test PERMUTE functionality
query_permute = """
SELECT * FROM memory.default.employee_data MATCH_RECOGNIZE(
    PARTITION BY department
    ORDER BY hire_date
    MEASURES 
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num
    ONE ROW PER MATCH
    PATTERN (PERMUTE(A, B))
    DEFINE 
        A AS salary > 1200,
        B AS salary < 1000
);
"""

print("Test PERMUTE - Should match both orderings of A and B")
output_df = match_recognize(query_permute, df)
print(output_df)
print("\n")


Test PERMUTE - Should match both orderings of A and B
  department pattern_var  match_num
0      Sales           B          1


