## Import Required Libraries

In [2]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd
import numpy as np
import json

## Load Data and Normalize the Ball's Coordinates from Event Data


In [3]:
# Load the extracted 10-event sequences (the "overlapped" file).
# Alternative input kept from the original cell:
#   df = pd.read_csv('./extracted_sequences/all_matches_10events.csv')
df = pd.read_csv('./extracted_sequences/all_matches_10_events_overlapped.csv')

# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,match_id,sequence_id,num_events,coordinates_json
0,10502,2.0_subset0,10,"[{""x"": -9.38, ""y"": 35.11, ""z"": 1.53}, {""x"": 12..."
1,10502,2.0_subset1,10,"[{""x"": 12.77, ""y"": 27.69, ""z"": 0.04}, {""x"": 12..."
2,10502,2.0_subset2,10,"[{""x"": 12.55, ""y"": 14.14, ""z"": 0.24}, {""x"": 19..."
3,10502,2.0_subset3,10,"[{""x"": 19.53, ""y"": -3.98, ""z"": 0.21}, {""x"": 7...."
4,10502,2.0_subset4,10,"[{""x"": 7.18, ""y"": -5.03, ""z"": 0.26}, {""x"": -16..."


In [4]:

# Decode each row's JSON string into a list of point dicts.
df['coordinates'] = df['coordinates_json'].apply(json.loads)

# Pool the x, y and z values of every point in every sequence so that each
# axis is scaled over the whole dataset rather than per sequence.
all_x, all_y, all_z = [], [], []
for sequence in df['coordinates']:
    all_x.extend(point['x'] for point in sequence)
    all_y.extend(point['y'] for point in sequence)
    all_z.extend(point.get('z', 0) for point in sequence)  # a missing 'z' counts as 0

# One MinMaxScaler per axis. fit() returns the scaler itself, so each scaler
# is created and fitted in a single expression on an (n_points, 1) column.
scaler_x = MinMaxScaler().fit(np.array(all_x).reshape(-1, 1))
scaler_y = MinMaxScaler().fit(np.array(all_y).reshape(-1, 1))
scaler_z = MinMaxScaler().fit(np.array(all_z).reshape(-1, 1))

# Normalize each coordinate in each sequence
def normalize_coordinates(coords_list):
    """Scale every point of one sequence with the fitted per-axis scalers.

    Parameters
    ----------
    coords_list : list of dict
        Points with 'x' and 'y' keys and an optional 'z' key (a missing 'z'
        is treated as 0).

    Returns
    -------
    list of dict
        One {'x', 'y', 'z'} dict per input point, in the same order, holding
        the scaled values. An empty input yields an empty list.

    Notes
    -----
    Uses the module-level ``scaler_x``, ``scaler_y`` and ``scaler_z``, which
    must already be fitted. Each axis is transformed with one call per
    sequence instead of one call per point; the scaled values are the same,
    but the per-call overhead is paid three times per sequence rather than
    three times per point.
    """
    if not coords_list:
        # transform() rejects an array with zero samples, so short-circuit.
        return []

    # Column vectors of shape (n_points, 1), the layout the scalers were fitted on.
    xs = np.array([point['x'] for point in coords_list]).reshape(-1, 1)
    ys = np.array([point['y'] for point in coords_list]).reshape(-1, 1)
    zs = np.array([point.get('z', 0) for point in coords_list]).reshape(-1, 1)

    norm_x = scaler_x.transform(xs)[:, 0]
    norm_y = scaler_y.transform(ys)[:, 0]
    norm_z = scaler_z.transform(zs)[:, 0]

    return [
        {'x': x, 'y': y, 'z': z}
        for x, y, z in zip(norm_x, norm_y, norm_z)
    ]

# Store the scaled points of every sequence in a new column, then preview the table.
df['normalized_coordinates'] = df['coordinates'].apply(normalize_coordinates)
df.head()

Unnamed: 0,match_id,sequence_id,num_events,coordinates_json,coordinates,normalized_coordinates
0,10502,2.0_subset0,10,"[{""x"": -9.38, ""y"": 35.11, ""z"": 1.53}, {""x"": 12...","[{'x': -9.38, 'y': 35.11, 'z': 1.53}, {'x': 12...","[{'x': 0.43274656784300514, 'y': 0.93784951703..."
1,10502,2.0_subset1,10,"[{""x"": 12.77, ""y"": 27.69, ""z"": 0.04}, {""x"": 12...","[{'x': 12.77, 'y': 27.69, 'z': 0.04}, {'x': 12...","[{'x': 0.6193043038827593, 'y': 0.843543467208..."
2,10502,2.0_subset2,10,"[{""x"": 12.55, ""y"": 14.14, ""z"": 0.24}, {""x"": 19...","[{'x': 12.55, 'y': 14.14, 'z': 0.24}, {'x': 19...","[{'x': 0.6174513602290912, 'y': 0.671326893746..."
3,10502,2.0_subset3,10,"[{""x"": 19.53, ""y"": -3.98, ""z"": 0.21}, {""x"": 7....","[{'x': 19.53, 'y': -3.98, 'z': 0.21}, {'x': 7....","[{'x': 0.6762402088772846, 'y': 0.441026944585..."
4,10502,2.0_subset4,10,"[{""x"": 7.18, ""y"": -5.03, ""z"": 0.26}, {""x"": -16...","[{'x': 7.18, 'y': -5.03, 'z': 0.26}, {'x': -16...","[{'x': 0.5722226901372863, 'y': 0.427681748856..."


In [6]:


from fastdtw import fastdtw
from scipy.spatial.distance import euclidean

# Divisor applied to every raw DTW distance further down in this cell.
Number_of_events = 10
# Create sequence identifiers (matchid_sequenceid)
df['sequence_identifier'] = df['match_id'].astype(str) + '_' + df['sequence_id'].astype(str)

# Convert normalized coordinates to numpy arrays for DTW
def coords_to_array(coords_list):
    """Stack a sequence's points into an array with one [x, y, z] row per point."""
    rows = []
    for point in coords_list:
        rows.append([point['x'], point['y'], point['z']])
    return np.array(rows)

# Convert every sequence's normalized points into a NumPy array for the DTW step.
df['coord_array'] = df['normalized_coordinates'].apply(coords_to_array)

print(f"Total sequences: {len(df)}")
# NOTE(review): a bare expression is only rendered when it is the last statement
# of a cell; if more code follows in the same cell this preview is not shown —
# confirm the intended cell boundaries.
df[['sequence_identifier', 'match_id', 'sequence_id']].head()
# Pairwise DTW distances between all sequences.
sequence_ids = df['sequence_identifier'].values
coord_arrays = df['coord_array'].values
n_sequences = len(df)

# Square matrix; the diagonal stays 0 because a sequence is never compared with itself.
dtw_matrix = np.zeros((n_sequences, n_sequences))

print(f"Computing DTW distances for {n_sequences} sequences...")
print(f"Total comparisons needed: {n_sequences * (n_sequences - 1) // 2}")

# NOTE(review): the recorded run lists 46682 sequences, i.e. roughly 1.09 billion
# fastdtw calls and a 46682 x 46682 float64 matrix (~17 GB), and it ended in a
# KeyboardInterrupt. Consider sub-sampling or caching before re-running.
# Only pairs with j > i are computed; each result is mirrored into both triangles.
for i, seq_i in enumerate(coord_arrays):
    if i % 10 == 0:
        print(f"Processing sequence {i+1}/{n_sequences}...")

    for j in range(i + 1, n_sequences):
        distance, _ = fastdtw(seq_i, coord_arrays[j], dist=euclidean)

        # Scale the raw distance by the event count and store it symmetrically.
        dtw_matrix[i, j] = dtw_matrix[j, i] = distance / Number_of_events

print("DTW computation complete!")

# Create DataFrame with sequence identifiers
# (rows and columns are both labelled with the same identifiers).
dtw_df = pd.DataFrame(dtw_matrix, 
                      index=sequence_ids, 
                      columns=sequence_ids)

print(f"\nDTW Matrix shape: {dtw_df.shape}")
# NOTE(review): not the last statement of the cell as shown here, so this
# preview is probably not rendered — confirm the intended cell boundaries.
dtw_df.head()
# Save DTW distance matrix to CSV
# (to_csv writes the index by default, so the identifiers become the first column).
output_file = './extracted_sequences/dtw_distance_matrix_10_events_overlapped.csv'
dtw_df.to_csv(output_file)
print(f"Saved DTW distance matrix to: {output_file}")
print(f"Matrix size: {dtw_df.shape}")
print(f"\nSample distances:")
# Top-left 5 x 5 corner of the matrix as a quick sanity check.
print(dtw_df.iloc[:5, :5])

Total sequences: 46682
Unique base sequences: 4422
Computing DTW distances for 46682 sequences...
Processing sequence 1/46682...
Processing sequence 11/46682...
Processing sequence 21/46682...
Processing sequence 31/46682...
Processing sequence 41/46682...


Total sequences: 46682
Unique base sequences: 4422
Computing DTW distances for 46682 sequences...
Processing sequence 1/46682...
Processing sequence 11/46682...
Processing sequence 21/46682...
Processing sequence 31/46682...
Processing sequence 41/46682...


KeyboardInterrupt: 

In [3]:
# Default DTW distance below which two sequences are considered similar.
Threshold = 0.2
def find_similar_sequences(sequence_id, dtw_matrix_df, threshold=Threshold):
    """
    Find all sequences similar to a given sequence based on DTW distance.

    Parameters:
    -----------
    sequence_id : str
        The sequence identifier (format: matchid_sequenceid)
    dtw_matrix_df : DataFrame
        The DTW distance matrix; columns are sequence identifiers and, when
        the matrix was built or loaded with its index, so are the row labels.
    threshold : float
        The DTW distance threshold for similarity (strictly-less-than).

    Returns:
    --------
    DataFrame with columns 'sequence_id' and 'dtw_distance', sorted ascending
    by distance, or None (after printing an error) when `sequence_id` is not
    a column of the matrix.

    Notes:
    ------
    The query sequence is excluded by label, so a *different* sequence whose
    distance is exactly 0 (an identical trajectory) is kept as the best
    match. If the row labels do not contain `sequence_id` (e.g. the matrix
    was read without its index column), the function falls back to dropping
    every zero distance, as it did previously.
    """
    if sequence_id not in dtw_matrix_df.columns:
        print(f"Error: Sequence '{sequence_id}' not found in DTW matrix")
        return None

    # Distances from the target sequence to every row of the matrix.
    distances = dtw_matrix_df[sequence_id]

    if sequence_id in distances.index:
        # Remove only the query itself; identical-but-distinct sequences stay.
        others = distances.drop(labels=[sequence_id])
    else:
        # Row labels are not identifiers: self can only be recognised by its 0 distance.
        others = distances[distances > 0]

    similar = others[others < threshold]

    # Most similar first; a stable sort keeps tied distances in matrix order.
    similar_sorted = similar.sort_values(kind='mergesort')

    result_df = pd.DataFrame({
        'sequence_id': similar_sorted.index,
        'dtw_distance': similar_sorted.values
    })

    return result_df



# Load a previously saved DTW matrix; the first CSV column holds the row labels.
# NOTE(review): this is the "non_overlapped_normalized_direction" matrix, not the
# "10_events_overlapped" file written earlier in the notebook — confirm this is intended.
dtw_df = pd.read_csv('./extracted_sequences/dtw_distance_matrix_10_events_non_overlapped_normalized_direction.csv', index_col=0)
target_sequence = "10517_148.0"  # Change this to your desired sequence

if target_sequence not in dtw_df.columns:
    # Unknown identifier: show a few valid ones to help pick another.
    print(f"Sequence '{target_sequence}' not found!")
    print("\nAvailable sequences:")
    print(dtw_df.columns.tolist()[:10], "...")
else:
    print(f"Analyzing sequence: {target_sequence}\n")

    similar = find_similar_sequences(target_sequence, dtw_df, threshold=Threshold)

    if similar is None or len(similar) == 0:
        # Nothing under the threshold: report the closest non-zero distance instead.
        print(f"No similar sequences found with DTW < {Threshold}")
        print("\nTry increasing the threshold or check DTW matrix values:")
        target_distances = dtw_df[target_sequence]
        print(f"Min distance to other sequences: {target_distances[target_distances > 0].min():.4f}")
    else:
        print(f"\nTop 10 most similar sequences:")
        print(similar.head(10))

        # Summary of the distances that fell under the threshold.
        matched_distances = similar['dtw_distance']
        print(f"\n--- Statistics ---")
        print(f"Total similar sequences (DTW < {Threshold}): {len(similar)}")
        print(f"Most similar distance: {matched_distances.min():.4f}")
        print(f"Least similar (within threshold): {matched_distances.max():.4f}")
        print(f"Average distance: {matched_distances.mean():.4f}")

Analyzing sequence: 10517_148.0


Top 10 most similar sequences:
          sequence_id  dtw_distance
0  3827_216.0_subset0      0.104072
1  3855_108.0_subset1      0.107107
2  3822_100.0_subset0      0.115943
3   3855_18.0_subset1      0.116479
4           3848_80.0      0.116808
5  10503_45.0_subset0      0.118090
6  3827_130.0_subset1      0.118429
7  3856_280.0_subset0      0.122109
8   3854_24.0_subset5      0.122616
9  3833_240.0_subset0      0.125162

--- Statistics ---
Total similar sequences (DTW < 0.2): 353
Most similar distance: 0.1041
Least similar (within threshold): 0.1996
Average distance: 0.1728
