# Decision algorithm


In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np
from collections import Counter
import ast

# Visualization
import plotly.graph_objects as go
import plotly.express as px

# Machine learning model selection, preprocessing, and evaluation
from sklearn.model_selection import train_test_split, cross_val_score, LeaveOneOut, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, make_scorer, f1_score
from sklearn.preprocessing import LabelEncoder

# Machine learning classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from xgboost import XGBClassifier

# Imbalanced data handling
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

<div class="alert alert-block alert-info"><b>Tip: </b> Use blue boxes for Tips and notes. If it's a note, you don't have to include the word "Note". </div> 

<div class="alert alert-block alert-warning"><b>Example: </b> Use yellow boxes for examples that are not inside code cells, or use for mathematical formulas if needed. </div>

<div class="alert alert-block alert-success"><b>Up to you: </b>Use green boxes sparingly, and only for some specific purpose that the other boxes can't cover. For example, if you have a lot of related content to link to, maybe you decide to use green boxes for related links from each section of a notebook. </div>

<div class="alert alert-block alert-danger"><b>Just don't: </b>In general, just avoid the red boxes. </div>

## 1. Data processing

### 1.1. Data loading

In [2]:
# Read the semicolon-separated challenge file and preview two random rows
df = pd.read_csv('data/sncb_data_challenge.csv', sep=';')
df.sample(n=2)

Unnamed: 0.1,Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
348,348,4450241,"[1003, 1003, 1003, 1003, 1003, 1003, 1003, 100...","[4004, 2852, 4110, 2854, 4026, 4092, 4094, 270...","[-13732, -13730, -13730, -13728, -13726, -1372...",50.488103,5.507671,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",13
198,198,4442059,"[1040, 1040, 1040, 1040, 1040, 1040, 1040, 104...","[4068, 3658, 4068, 3658, 4066, 2744, 4026, 415...","[-14299, -14269, -14048, -14016, -13767, -1355...",50.104083,5.035728,"[1.7, 0.0, 1.4, 0.0, 1.5, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",14


### 1.2. Events filter

Following the same idea described in the paper, we remove the events that are too common. The criterion used is: an event is discarded when it appears in more than 85% of the event sequences.

In [3]:
# For every event type, count the number of rows whose sequence contains it at least once
event_counter = Counter()
for raw_sequence in df['events_sequence']:
    # The column stores the textual form of a list; the set keeps one occurrence per row
    event_counter.update(set(ast.literal_eval(raw_sequence)))
events_types_dict = dict(event_counter)

# Same counts, ordered from the most to the least widespread event
sorted_dict = dict(event_counter.most_common())

# Tabulate the counts, add the share of rows (in %) containing each event and
# store the event identifiers as strings
sorted_events_perc_df = (
    pd.DataFrame(list(sorted_dict.items()), columns=['event_type', 'frequency'])
    .assign(
        percentage=lambda counts: counts['frequency'] / len(df) * 100,
        event_type=lambda counts: counts['event_type'].astype(str),
    )
)

In [4]:
# Keep the event types that appear in at most 85% of the sequences, as integers
is_low_frequency = sorted_events_perc_df['percentage'] <= 85
events_low_frequency = [int(event) for event in sorted_events_perc_df.loc[is_low_frequency, 'event_type']]

In [5]:
# Membership is tested against a set: `event in <list>` is a linear scan, and doing it for
# every event of every sequence made this filter needlessly quadratic.
low_frequency_lookup = set(events_low_frequency)

# Parse each stored sequence, keep only the low-frequency events (order and repetitions are
# preserved) and store the filtered list back as text.
df['clean_events_sequence'] = (
    df['events_sequence']
    .apply(ast.literal_eval)
    .apply(lambda events: [event for event in events if event in low_frequency_lookup])
    .astype(str)
)

### 1.3. Binary sequences

In this step, each event type in the list of events is turned into a binary column of the dataframe (whether or not the event occurs in the sequence). These variables are the predictors we are going to use to predict the incident type.

In [7]:
# Step 1: Convert each string list into an actual Python list.
# ast.literal_eval also copes with an empty sequence ('[]'); the previous
# strip/split/int parsing raised ValueError on it (int('')). Values that are already
# lists are passed through unchanged.
df['clean_events_sequence'] = df['clean_events_sequence'].apply(
    lambda sequence: ast.literal_eval(sequence) if isinstance(sequence, str) else sequence
)

# Step 2: Convert each list of events into a binary-encoded DataFrame.
# explode() yields one row per event (index = original row label); empty lists produce a
# NaN placeholder that is dropped. The cast to float keeps the 'event_<id>.0' column names
# shown in the preview further down.
exploded_events = df['clean_events_sequence'].explode().dropna().astype(float)
binary_encoded_df = (
    pd.get_dummies(exploded_events, prefix='event')
    .groupby(level=0)
    .max()
    # Rows without any remaining event get an explicit all-False line instead of NaN
    .reindex(df.index, fill_value=False)
)

# Step 3: Concatenate the binary-encoded columns back to the original DataFrame
# (remaining missing values in the other columns are replaced by 0, as before)
df = pd.concat([df, binary_encoded_df], axis=1).fillna(0)

In [9]:
# Columns removed before modelling, so that only the target and the binary event
# indicators remain in the dataframe
columns_to_discard = [
    'incident_id',
    'vehicles_sequence',
    'Unnamed: 0',
    'events_sequence',
    'seconds_to_incident_sequence',
    'approx_lat',
    'approx_lon',
    'train_kph_sequence',
    'dj_ac_state_sequence',
    'dj_dc_state_sequence',
    'clean_events_sequence',
]
df = df.drop(columns=columns_to_discard)

In [10]:
# Preview five random rows of the encoded dataset
df.sample(n=5)

Unnamed: 0,incident_type,event_10.0,event_12.0,event_28.0,event_30.0,event_42.0,event_52.0,event_60.0,event_64.0,event_66.0,...,event_4394.0,event_4396.0,event_4406.0,event_4408.0,event_4410.0,event_4412.0,event_4414.0,event_4416.0,event_4418.0,event_4420.0
169,9,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,False,False,False,False
194,99,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
155,14,False,False,False,False,False,False,False,False,False,...,True,True,False,False,False,False,False,False,False,False
118,13,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
135,13,False,False,False,False,False,False,False,False,False,...,True,False,True,True,True,True,False,False,False,False


<div class="alert alert-block alert-info">Hence, for the models we are going to use the dataset with this format, only considering the binary events as predictors. </div> 

## 2. Models: Naive approach

### 2.1. Decision tree

In [12]:
# Target: the incident type; predictors: every other column of the dataframe
y = df['incident_type']
X = df.drop(columns=['incident_type'])

# Hold out 30% of the rows for evaluation, with a fixed seed
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [13]:
# Decision tree with a fixed seed and otherwise default settings, fitted on the training split
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [14]:
# Predict the incident type of the held-out rows
y_pred = clf.predict(X_test)

# Evaluate the model.
# zero_division=0 reports undefined precision/recall (classes that are never predicted or
# absent from the test split) as 0.0, exactly as before, but without emitting an
# UndefinedMetricWarning for each of them.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.5723684210526315
Classification Report:
               precision    recall  f1-score   support

           2       0.65      0.71      0.68        31
           3       0.00      0.00      0.00         0
           4       0.47      0.47      0.47        19
           6       0.00      0.00      0.00         1
           9       0.42      0.34      0.38        32
          11       0.50      0.17      0.25        12
          13       0.75      0.76      0.76       102
          14       0.51      0.59      0.55        49
          16       0.00      0.00      0.00         0
          17       0.00      0.00      0.00         3
          99       0.43      0.42      0.43        55

    accuracy                           0.57       304
   macro avg       0.34      0.32      0.32       304
weighted avg       0.57      0.57      0.57       304



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.2. Random forest

In [16]:
# Target: the incident type; predictors: every other column of the dataframe
y = df['incident_type']
X = df.drop(columns=['incident_type'])

# Same 70/30 split and seed as for the decision tree
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [17]:
# Random forest of 100 trees with a fixed seed, fitted on the training split
rf_clf = RandomForestClassifier(random_state=42, n_estimators=100)
rf_clf.fit(X=X_train, y=y_train)

In [18]:
# Predict the incident type of the held-out rows
y_pred = rf_clf.predict(X_test)

# Evaluate the model.
# zero_division=0 reports undefined precision/recall (classes that are never predicted)
# as 0.0, exactly as before, but without emitting an UndefinedMetricWarning for each of them.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

Accuracy: 0.6743421052631579
Classification Report:
               precision    recall  f1-score   support

           2       0.60      0.94      0.73        31
           4       0.79      0.58      0.67        19
           6       0.00      0.00      0.00         1
           9       0.85      0.34      0.49        32
          11       0.00      0.00      0.00        12
          13       0.69      0.93      0.79       102
          14       0.69      0.71      0.70        49
          17       0.00      0.00      0.00         3
          99       0.60      0.44      0.51        55

    accuracy                           0.67       304
   macro avg       0.47      0.44      0.43       304
weighted avg       0.65      0.67      0.64       304



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### 2.3. XGBoost

In [36]:
# Target: the incident type; predictors: every other column of the dataframe
y = df['incident_type']
X = df.drop(columns=['incident_type'])

# Learn the mapping from incident types to consecutive integer codes, then apply it;
# the fitted encoder is kept to decode the predictions later on
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [37]:
# 70/30 split on the encoded target, with a fixed seed
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

In [38]:
# `use_label_encoder` is not passed any more: the installed XGBoost ignores it and only
# answers with a 'Parameters: { "use_label_encoder" } are not used.' warning.
# The multi-class log-loss is used as evaluation metric and the seed is fixed.
xgb_clf = XGBClassifier(eval_metric='mlogloss', random_state=42)
xgb_clf.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.



In [39]:
# Predict on the test set (the model outputs encoded class codes)
y_pred_encoded = xgb_clf.predict(X_test)

# Decode both the predictions and the test labels back to the original incident types,
# so that the report below is expressed in the original labels
y_pred = le.inverse_transform(y_pred_encoded)
y_test_original = le.inverse_transform(y_test)

# Evaluate the model.
# zero_division=0 reports undefined precision/recall (classes that are never predicted)
# as 0.0, exactly as before, but without emitting UndefinedMetricWarning.
print("Accuracy:", accuracy_score(y_test_original, y_pred))
print("Classification Report:\n", classification_report(y_test_original, y_pred, zero_division=0))

Accuracy: 0.6973684210526315
Classification Report:
               precision    recall  f1-score   support

           2       0.71      0.77      0.74        31
           4       0.69      0.58      0.63        19
           6       0.00      0.00      0.00         1
           9       0.68      0.47      0.56        32
          11       1.00      0.25      0.40        12
          13       0.78      0.91      0.84       102
          14       0.69      0.76      0.72        49
          17       0.00      0.00      0.00         3
          99       0.59      0.53      0.56        55

    accuracy                           0.70       304
   macro avg       0.57      0.47      0.49       304
weighted avg       0.70      0.70      0.69       304



Overall Accuracy: **0.697** (or ~70% accuracy) This means that the model correctly classified about 70% of the instances in the test set. 
Some observations:
- Class 2: Precision is **0.71**, recall is 0.77, and the F1-score is **0.74** with 31 instances in the test set. This indicates that the model performs fairly well for this class, with 71% precision and 77% recall.
- Class 6: Precision, recall, and F1-score are all 0.00, with only one instance in the test set. This suggests the model was unable to correctly classify any examples of this class, likely due to having very few examples.
- Class 13: Precision is **0.78**, recall is **0.91**, and the F1-score is **0.84** with 102 instances. This indicates that the model performs strongly on this class, capturing 91% of instances with high precision.
- Class 17: All metrics are 0.00 with 3 instances, meaning the model didn’t correctly classify any instances for this class.



Weighted Average:
- Weighted Precision: 0.70
- Weighted Recall: 0.70
- Weighted F1-Score: 0.69

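The weighted average accounts for the support (number of instances) of each class, so it places more emphasis on classes with more instances. On imbalanced data it should be read together with the macro average (macro F1-score of 0.49 here), which weights every class equally and therefore exposes the weak performance on the rare classes.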

Class Imbalance: The low recall for some classes (e.g., 6, 11, and 17) could indicate an issue with class imbalance, where the model has trouble recognizing these underrepresented classes.

In summary:

The model performs well on the more frequent classes (e.g., 13 and 14) but struggles with less frequent classes.
The overall accuracy and weighted F1-score of about 70% suggest a reasonable model fit, though improving recall for underrepresented classes could further enhance performance.

## 3. Improving model: XGBoost

In [29]:
# GridSearchCV is already imported in the import cell at the top of the notebook,
# so it is not imported again here.

# Define the parameter grid (3 * 4 * 3 * 3 * 2 * 3 = 648 candidate combinations)
param_grid = {
    'n_estimators': [10, 25, 50],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [1, 2, 3],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Initialize the classifier.
# `use_label_encoder` is not passed any more: the installed XGBoost ignores it and only
# answers with a 'Parameters: { "use_label_encoder" } are not used.' warning.
xgb_clf = XGBClassifier(eval_metric='mlogloss', random_state=42)

# Set up GridSearchCV: exhaustive search scored by accuracy with 3-fold cross-validation,
# using all available cores
grid_search = GridSearchCV(estimator=xgb_clf, param_grid=param_grid, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)

In [30]:
# Target: the incident type; predictors: every other column of the dataframe
y = df['incident_type']
X = df.drop(columns=['incident_type'])

# Learn the mapping from incident types to consecutive integer codes, then apply it;
# the fitted encoder is kept to decode the predictions later on
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# 70/30 split on the encoded target, with a fixed seed
split_options = {'test_size': 0.3, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, **split_options)

# Run the grid search on the training split only
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 648 candidates, totalling 1944 fits


Parameters: { "use_label_encoder" } are not used.



In [31]:
# Display the best parameters found by grid search
print("Best parameters:", grid_search.best_params_)

# Evaluate the best model on the held-out test split (predictions are encoded class codes)
best_xgb_clf = grid_search.best_estimator_
y_pred_encoded = best_xgb_clf.predict(X_test)

# Decode both the predictions and the test labels back to the original incident types,
# so that the report below is expressed in the original labels
y_pred = le.inverse_transform(y_pred_encoded)
y_test_original = le.inverse_transform(y_test)

# Evaluate the model.
# zero_division=0 reports undefined precision/recall (classes that are never predicted)
# as 0.0, exactly as before, but without emitting an UndefinedMetricWarning for each of them.
print("Accuracy:", accuracy_score(y_test_original, y_pred))
print("Classification Report:\n", classification_report(y_test_original, y_pred, zero_division=0))

Best parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 50, 'subsample': 0.8}
Accuracy: 0.7269736842105263
Classification Report:
               precision    recall  f1-score   support

           2       0.72      0.90      0.80        31
           4       0.76      0.68      0.72        19
           6       0.00      0.00      0.00         1
           9       0.82      0.44      0.57        32
          11       1.00      0.17      0.29        12
          13       0.79      0.90      0.84       102
          14       0.70      0.78      0.74        49
          17       0.00      0.00      0.00         3
          99       0.61      0.62      0.61        55

    accuracy                           0.73       304
   macro avg       0.60      0.50      0.51       304
weighted avg       0.73      0.73      0.71       304



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
