This notebook implements the method described in the research paper. The approach involves two main stages:

Feature Engineering:

Filtering features:
Selecting the most informative events based on a relevance metric.
Extracting sets of features: Mining recurring sets of events using an estimator of the Longest Common Subsequence (LCSS) algorithm.

Classification:
Building an ensemble classifier based on Naive Bayes, where each classifier corresponds to a time window before the incident.

In [1]:
import pandas as pd
import numpy as np
import ast
from collections import Counter
from itertools import combinations
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm

In [2]:
df = pd.read_csv('sncb_data_challenge.csv', delimiter=';')
df.head()

Unnamed: 0.1,Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
0,0,4432881,"[609, 609, 609, 609, 609, 609, 609, 609, 609, ...","[2744, 4004, 2852, 4110, 2854, 4396, 1132, 414...","[-5510, -5510, -5507, -5507, -5506, -5506, -55...",50.876601,4.718143,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",4
1,1,4432943,"[526, 526, 526, 526, 526, 526, 526, 526, 526, ...","[2744, 4148, 4394, 1566, 1570, 4396, 3634, 412...","[-8573, -8573, -8032, -8032, -8032, -7859, -61...",51.037435,4.431218,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 29.1,...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",13
2,2,4432955,"[592, 592, 592, 592, 592, 592, 592, 592, 592, ...","[4394, 1566, 1570, 4114, 4168, 4168, 4156, 406...","[-12291, -12291, -12291, -10932, -10932, -1091...",50.864083,4.162115,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, False, True, Tr...",14
3,3,4433021,"[576, 576, 576, 576, 576, 576, 576, 576, 576, ...","[4066, 4066, 4066, 4066, 4068, 2742, 4026, 270...","[-14351, -14204, -13890, -13383, -12739, -1243...",51.18322,4.276025,"[0.0, 0.0, 0.0, 0.015625, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",2
4,4,4433129,"[634, 634, 634, 634, 634, 634, 634, 634, 634, ...","[4002, 4032, 4028, 2852, 4026, 4110, 2742, 285...","[-224, -224, -223, -222, -222, -222, -220, -22...",50.818727,3.253601,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",14


In [3]:
# Convert events_sequence from string to list
df['events_sequence'] = df['events_sequence'].apply(ast.literal_eval)

In [4]:
# Total occurrences of each event across all classes
total_event_counts = Counter()
# Occurrences of each event within each class
event_counts_per_class = {}

for incident_type in df['incident_type'].unique():
    event_counts_per_class[incident_type] = Counter()

for idx, row in df.iterrows():
    events = set(row['events_sequence'])  # Use set to avoid duplicate events in the same sequence
    incident_type = row['incident_type']
    total_event_counts.update(events)
    event_counts_per_class[incident_type].update(events)

In [5]:
# Compute r for each event
relevance_metric = {}

for event in total_event_counts:
    max_h_in_class = max([event_counts_per_class[cls][event] for cls in event_counts_per_class])
    h_in_all_classes = total_event_counts[event]
    relevance_metric[event] = max_h_in_class / h_in_all_classes

In [23]:
# Set initial threshold
tr = 0.5

# Select events with r >= tr
selected_events = [event for event, r in relevance_metric.items() if r >= tr]

print(f"Number of events selected with r >= {tr}: {len(selected_events)}")

Number of events selected with r >= 0.5: 635


In [24]:
# Events not selected
remaining_events = [event for event in total_event_counts if event not in selected_events]

# Placeholder for events that pass the OaT procedure
additional_events = []

In [25]:
def evaluate_classifier(events_subset):
    # Prepare the data
    df['filtered_events'] = df['events_sequence'].apply(lambda x: [e for e in x if e in events_subset])
    
    # Convert event lists to strings for vectorization
    df['events_str'] = df['filtered_events'].apply(lambda x: ' '.join(map(str, x)))
    
    # Vectorize the events
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df['events_str'])
    y = df['incident_type']
    
    # Perform stratified 4-fold cross-validation
    skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)
    f1_scores = []
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        clf = MultinomialNB()
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
        f1_scores.append(f1)
    
    avg_f1_score = np.mean(f1_scores)
    return avg_f1_score

In [26]:
# Current set of events
current_events = selected_events.copy()

# Evaluate classifier with current events
current_f1_score = evaluate_classifier(current_events)
print(f"Initial F1-score with selected events: {current_f1_score:.4f}")

# Iterate over remaining events
for event in tqdm(remaining_events, desc='OaT Procedure'):
    temp_events = current_events + [event]
    f1 = evaluate_classifier(temp_events)
    if f1 >= 0.6:
        current_events.append(event)
        additional_events.append(event)
        current_f1_score = f1  # Update the current F1-score

print(f"Final F1-score after OaT: {current_f1_score:.4f}")
print(f"Number of additional events added: {len(additional_events)}")

Initial F1-score with selected events: 0.6309


OaT Procedure: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 282/282 [08:00<00:00,  1.70s/it]

Final F1-score after OaT: 0.6006
Number of additional events added: 241





In [28]:
# Count event frequencies
event_counts = Counter([event for seq in df['events_sequence'] for event in seq])

# Define a minimum frequency threshold
min_event_frequency = 10  # Adjust based on dataset

# Filter out rare events
frequent_events = set([event for event, count in event_counts.items() if count >= min_event_frequency])

# Update events_sequence by keeping only frequent events
df['filtered_events_sequence'] = df['events_sequence'].apply(lambda seq: [event for event in seq if event in frequent_events])

In [29]:
# Prepare the data for frequent itemset mining
transactions = df['filtered_events_sequence'].tolist()

# Remove empty transactions
transactions = [t for t in transactions if t]

# Use TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(transactions).transform(transactions)
df_te = pd.DataFrame(te_ary, columns=te.columns_)

In [None]:
from mlxtend.frequent_patterns import fpgrowth
# Apply the FP-Growth algorithm to find frequent itemsets
min_support = 0.05  # Adjust as needed based on dataset size

frequent_itemsets = fpgrowth(df_te, min_support=min_support, use_colnames=True)

print(f"Number of frequent itemsets found: {len(frequent_itemsets)}")

In [None]:
# Convert frequent itemsets into a list of sets
frequent_sets = frequent_itemsets['itemsets'].tolist()

# Function to check if a frequent set is present in a transaction
def check_itemset_presence(transaction, itemset):
    return itemset.issubset(set(map(str, transaction)))

# Create new features for each frequent itemset
for idx, itemset in enumerate(frequent_sets):
    feature_name = f'itemset_{idx}'
    df[feature_name] = df['events_sequence'].apply(lambda x: int(check_itemset_presence(x, itemset)))

print("New features added based on frequent itemsets.")


In [None]:
# Define time windows in minutes (for example purposes)
time_windows = [5, 10, 15, 20, 30, 60]  # Adjust based on data granularity

# Assume each event in events_sequence has an associated timestamp
# For this example, we'll simulate event times (in minutes before the incident)
import random

def assign_event_times(events_sequence):
    return [(e, random.randint(0, 120)) for e in events_sequence]  # Events within 2 hours before incident

df['events_with_times'] = df['events_sequence'].apply(assign_event_times)


In [None]:
# Initialize a list to hold classifiers and their corresponding data
classifiers = []
window_features = []

for window in time_windows:
    # Extract events within the window
    def filter_events_by_time(events_with_times):
        return [str(e[0]) for e in events_with_times if e[1] <= window]
    
    df[f'events_window_{window}'] = df['events_with_times'].apply(filter_events_by_time)
    
    # Convert to string for vectorization
    df[f'events_window_{window}_str'] = df[f'events_window_{window}'].apply(lambda x: ' '.join(x))
    
    # Vectorize the events
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(df[f'events_window_{window}_str'])
    y = df['incident_type']
    
    # Store the vectorizer and data
    window_features.append({'window': window, 'vectorizer': vectorizer, 'X': X, 'y': y})


In [None]:
# Train classifiers for each window
for wf in window_features:
    X = wf['X']
    y = wf['y']
    clf = MultinomialNB(alpha=1e-6)  # Smoothing parameter λ
    clf.fit(X, y)
    wf['classifier'] = clf


In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# Prepare the final predictions
y_true = df['incident_type']
y_pred = []

# Cross-validation setup
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
f1_scores = []

for train_index, test_index in skf.split(df, y_true):
    y_test_fold = y_true.iloc[test_index]
    y_pred_fold = []
    
    # For each sample in the test fold
    for idx in test_index:
        prediction_made = False
        # Go through classifiers in order of time windows
        for wf in window_features:
            # Get the features for this sample
            events_str = df.iloc[idx][f'events_window_{wf["window"]}_str']
            X_sample = wf['vectorizer'].transform([events_str])
            # Predict
            clf = wf['classifier']
            y_pred_sample = clf.predict(X_sample)
            # Check if the classifier can make a prediction
            if X_sample.nnz != 0:
                y_pred_fold.append(y_pred_sample[0])
                prediction_made = True
                break  # Exit the loop once a prediction is made
        if not prediction_made:
            # If no classifier could make a prediction, use the most frequent class as a fallback
            y_pred_fold.append(y_true.mode()[0])
    
    # Compute F1-score for this fold
    f1 = f1_score(y_test_fold, y_pred_fold, average='weighted', zero_division=0)
    f1_scores.append(f1)

print(f"Average F1-score of the ensemble classifier: {np.mean(f1_scores):.4f}")