<a href="https://colab.research.google.com/github/stef4k/train-maintenance-data-mining/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification


In [10]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
from collections import Counter
import ast


#from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, LeaveOneOut
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

Manually remove the first ';' from the first row in csv file

In [11]:
df = pd.read_csv('sncb_data_challenge.csv', delimiter=';', index_col=0)
df.sample(2)

Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
634,4462103,"[516, 516, 516, 516, 516, 516, 516, 516, 516, ...","[2956, 2956, 2956, 2956, 2956, 2956, 2956, 295...","[-14396, -14371, -14358, -14342, -14331, -1430...",50.634509,4.621612,"[17.3, 73.0, 84.8, 91.5, 89.1, 86.2, 86.3, 88....","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",99
723,4466677,"[568, 568, 568, 568, 568, 568, 568, 568, 568, ...","[4002, 4032, 4026, 4028, 2852, 4110, 2854, 249...","[-11543, -11543, -11541, -11541, -11540, -1154...",50.949979,5.072721,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",9


Now I will analyze the percentage of each event type appearing at least once in an event sequence:

In [12]:
events_types_dict = {}
for events_sequence in df['events_sequence']:
    row_list = ast.literal_eval(events_sequence) #transforming string into actual list
    unique_events = set(row_list)
    for event in unique_events:
        if not events_types_dict.get(event):
            events_types_dict[event] = 0
        events_types_dict[event] += 1
sorted_dict = dict(sorted(events_types_dict.items(), key=lambda item: item[1], reverse=True))
# Convert the sorted dictionary to a DataFrame
sorted_events_perc_df = pd.DataFrame(list(sorted_dict.items()), columns=['event_type', 'frequency'])
sorted_events_perc_df['percentage'] = sorted_events_perc_df['frequency'] / df.shape[0] * 100
# Cast the 'event_type' column to string
sorted_events_perc_df['event_type'] = sorted_events_perc_df['event_type'].astype(str)

We save in a list all event codes that appear in less than 85% of the event sequences:

In [13]:
events_low_frequency = list(map(int, list(sorted_events_perc_df[sorted_events_perc_df.percentage<=85].event_type)))

## Text preprocessing
Before we start with text classification we need to clean the sequences of events. As seen one value of `events_sequence` contains commas and brackets even though it is a string

In [14]:
df.events_sequence.iloc[0]

'[2744, 4004, 2852, 4110, 2854, 4396, 1132, 4140, 4148, 2708, 4026, 1032, 1082, 4152, 4030, 4018, 4168, 4156, 4394, 152, 2742, 4410, 4406, 4068, 4408, 4412, 4066, 2744, 4026, 4148, 4168, 4140, 3986, 2744, 4002, 2852, 4110, 2854, 4148, 2708, 4026, 4140, 4152, 4030, 4018, 4140, 4168, 4156, 2852, 2854, 4124, 2858, 2658, 2688, 3254, 3254, 3254, 2970, 4082, 4090, 4092, 2982, 3236, 4100, 2702, 4394, 1250, 2970, 2980, 2970, 2980, 2970, 2982, 2970, 2982, 4168, 4140, 3986, 2742, 4004, 2852, 4110, 2854, 2982, 2708, 4026, 4030, 4018, 4148, 4140, 4152, 4168, 4156, 4120, 2858, 2658, 2688, 3254, 3254, 2970, 2982, 2708, 2970, 2982, 4100, 2702, 1250, 4394, 2744, 4026, 4148, 2970, 2980, 4168, 4140, 4168, 3986, 2744, 4002, 2852, 4110, 2854, 2980, 2708, 4026, 4148, 2552, 4168, 4140, 4152, 4030, 4018, 4026, 4140, 4168, 4156, 2970, 2982, 2708, 2970, 4082, 4092, 4090, 4084, 4094, 4090, 3236, 2982, 4100, 2702, 1250, 4394, 4168, 4140, 3986, 2744, 4004, 2852, 4110, 2854, 2982, 2708, 4026, 4140, 4030, 4018, 414

Also, as observed before some event types are so common they do not actually bring a lot of value (as mentioned in the paper as well). We remove those common event types

The steps to clean the event sequences are:
- keep non-common event types mentioned in list `events_low_frequency`
- remove symbols: [] , and store sequences of events as a string without brackets and commas:

In [15]:
df['clean_events_sequence'] = df.events_sequence.apply(ast.literal_eval).apply(lambda x: [i for i in x if i in events_low_frequency]).astype(str)\
                .replace(r'[\[\],]', '', regex=True)

## Text classification

Now we try to experiment using text techniques to transform the list events sequence:

In [16]:
df['incident_type'].value_counts()
df = df[~df["incident_type"].isin([7, 16, 3, 6, 17])].copy()
df.reset_index(drop=True, inplace=True)

In [17]:
target = df['incident_type'].copy() # target column separated
#le = LabelEncoder()
#target = le.fit_transform(target)
X_train, X_test, y_train, y_test = train_test_split(df.clean_events_sequence, target, test_size=0.2,  random_state=7)

Since the dataset is imbalanced we will use different strategies to battle that. Here we set a new sampling strategy based on a basic script:

In [18]:
# Define custom sampling strategy based on class distribution
# Each non-majority class will have equal samples to 15% of the majority class plus their previous samples
class_counts = pd.Series(y_train).value_counts()
max_class_count = max(class_counts.values)
sampling_strategy = {class_counts.index[i]: int(max_class_count * 0.15) + class_counts.values[i]
                     for i in range(len(pd.Series(y_train).value_counts().index)) if class_counts.values[i] < max_class_count}
sampling_strategy

{99: 183, 14: 157, 9: 131, 2: 129, 4: 97, 11: 59}

Starting with CountVectorizer:
- Tokenization: Splits text into individual words (tokens).
- Builds a Vocabulary: Creates a dictionary of unique words (tokens) from the entire corpus.
- Counts the Occurrence: Calculates the frequency (count) of each word in each document.
- Transforms Text into a Sparse Matrix: Returns a matrix of shape (n_samples, n_features), where n_samples is the number of documents and n_features is the number of unique words in the vocabulary.

  We firstly set the sampling strategy for SMOTE:

Now we set the pipeline to be used:

In [19]:
text_clf = Pipeline([
                    ('vect', CountVectorizer()),
                     #('decision_tree', DecisionTreeClassifier()),
                    ('smote', SMOTE(sampling_strategy=sampling_strategy, random_state=1, k_neighbors=2)),
                    ('extra_trees', ExtraTreesClassifier()),
                    #('random_forest', RandomForestClassifier())
                    ])

Training the model:

In [20]:
text_clf.fit(X_train, y_train)

Print the results for the particular split of test data:

In [21]:
clf_predict = text_clf.predict(X_test)
print(classification_report(y_test, clf_predict, zero_division=1))

              precision    recall  f1-score   support

           2       0.79      0.82      0.81        28
           4       0.68      0.79      0.73        19
           9       0.91      0.42      0.57        24
          11       1.00      0.00      0.00         5
          13       0.64      0.89      0.74        61
          14       0.73      0.53      0.62        30
          99       0.54      0.50      0.52        30

    accuracy                           0.68       197
   macro avg       0.75      0.56      0.57       197
weighted avg       0.70      0.68      0.66       197



In [22]:
vec = TfidfVectorizer()
embeddings = vec.fit_transform(X_train).toarray()
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=1, k_neighbors=2)
trees = ExtraTreesClassifier()
X_train_res, y_train_res = smote.fit_resample(embeddings, y_train)

trees.fit(X_train_res, y_train_res)
predictions = trees.predict(vec.transform(X_test).toarray())
print(classification_report(y_test, predictions, zero_division=1))

              precision    recall  f1-score   support

           2       0.79      0.79      0.79        28
           4       0.92      0.63      0.75        19
           9       0.83      0.42      0.56        24
          11       1.00      0.00      0.00         5
          13       0.61      0.89      0.72        61
          14       0.81      0.57      0.67        30
          99       0.46      0.53      0.49        30

    accuracy                           0.66       197
   macro avg       0.77      0.55      0.57       197
weighted avg       0.71      0.66      0.65       197



In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X_train.values)


In [52]:
vectorizer = TfidfVectorizer()
embeddings = vectorizer.fit_transform(X_train)
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=1, k_neighbors=2)
X_train_res, y_train_res = smote.fit_resample(embeddings, y_train)
le = LabelEncoder()
y_train_res_e = le.fit_transform(y_train_res)
y_test_e = le.transform(y_test)
dtrain = xgb.DMatrix(data=X_train_res, label=y_train_res_e)
dtest = xgb.DMatrix(data=vectorizer.transform(X_test), label=y_test_e)
num_classes = len(np.unique(target))
params = {
    'objective': 'multi:softmax',   # Cambiar a 'multi:softprob' si necesitas probabilidades en lugar de etiquetas
    'num_class': num_classes,       # Número de clases
    'max_depth': 4,                 # Profundidad máxima del árbol
    'learning_rate': 0.1,           # Tasa de aprendizaje
    'n_estimators': 100,            # Número de árboles
    'eval_metric': 'mlogloss'       # Métrica para clasificación multiclase
}
bst = xgb.train(params, dtrain)
predictions = bst.predict(dtest)
accuracy = accuracy_score(y_test_e, predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")

NameError: name 'LabelEncoder' is not defined

In [None]:
!pip install -U imbalanced-learn



In [105]:
df["incident_type"].value_counts()

Unnamed: 0_level_0,count
incident_type,Unnamed: 1_level_1
13,318
99,175
14,149
2,119
9,117
4,78
11,26


In [36]:
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from gensim.models import Word2Vec
from scipy.optimize import minimize
from sklearn.base import TransformerMixin
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN, RandomOverSampler
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from scipy.sparse import csr_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from xgboost import XGBClassifier
from tqdm import tqdm
from scipy.optimize import differential_evolution
import numpy as np
import pandas as pd
from copy import deepcopy

In [50]:
class Word2VecVectorizer(TransformerMixin):
    def __init__(self, size=100, window=5, min_count=1, workers=4):
        self.size = size
        self.window = window
        self.min_count = min_count
        self.workers = workers
        self.w2v_model = None

    def fit(self, X, y=None):
        sentences = [sentence.split() for sentence in X]
        self.w2v_model = Word2Vec(sentences, vector_size=self.size, window=self.window,
                                  min_count=self.min_count, workers=self.workers)
        return self

    def transform(self, X, y=None):
        transformed_data = np.array([
            np.mean([self.w2v_model.wv[word] for word in sentence.split() if word in self.w2v_model.wv]
                    or [np.zeros(self.size)], axis=0)
            for sentence in X
        ])
        return csr_matrix(transformed_data)

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X, y)

class Experiment:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=0.2, random_state=7, stratify=y)
        self.trained_models = {}  # To store trained models
        self.results = []
        self.sampling_strategies = {
            "SMOTE": SMOTE(sampling_strategy='auto', random_state=1, k_neighbors=3),
            "Borderline-SMOTE": BorderlineSMOTE(sampling_strategy='auto', random_state=1, k_neighbors=3),
            "ADASYN": ADASYN(sampling_strategy='auto', random_state=1, n_neighbors=3),
            "RandomOversampler": RandomOverSampler(sampling_strategy='auto', random_state=1),
            "SMOTE-ENN": SMOTEENN(sampling_strategy='auto', random_state=1),
            "SMOTE-Tomek": SMOTETomek(sampling_strategy='auto', random_state=1)
        }

        self.vectorizers = {
            "TFIDF": TfidfVectorizer(),
            "Count": CountVectorizer(),
            "Word2Vec": Word2VecVectorizer(size=100, window=5, min_count=1)
        }
        self.classifiers = {
            'LogisticRegression': LogisticRegression(),
            'DecisionTree': DecisionTreeClassifier(),
            'RandomForest': RandomForestClassifier(),
            'ExtraTreesClassifier': ExtraTreesClassifier(),
            'GradientBoostingClassifier': GradientBoostingClassifier(),
            'AdaBoostClassifier': AdaBoostClassifier(),
            'GaussianNB': GaussianNB(),
            'KNN': KNeighborsClassifier(),
            'SVM': SVC(probability=True),
            'XGBoost': XGBClassifier(),
        }

    def test(self, model, model_name, vectorizer, vectorizer_name, sampler, sampler_name):
        """Test a model with Stratified K-Fold and return an array with metric results."""
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
        accuracies, recalls, precisions, f1s = [], [], [], []

        for train_index, test_index in skf.split(self.X, self.y):
            X_train, X_test = self.X[train_index], self.X[test_index]
            y_train, y_test = self.y[train_index], self.y[test_index]

            X_train = vectorizer.fit_transform(X_train).toarray()
            X_test = vectorizer.transform(X_test).toarray()

            X_resampled, y_resampled = sampler.fit_resample(X_train, y_train)

            model.fit(X_resampled, y_resampled)

            y_pred = model.predict(X_test)

            # Collect metrics for each fold
            accuracies.append(accuracy_score(y_test, y_pred))
            recalls.append(recall_score(y_test, y_pred, average='weighted'))
            precisions.append(precision_score(y_test, y_pred, average='weighted'))
            f1s.append(f1_score(y_test, y_pred, average='weighted'))

        # Return average metrics as a result array
        return [
            model_name, vectorizer_name, sampler_name,
            np.mean(accuracies), np.std(accuracies),
            np.mean(recalls), np.std(recalls),
            np.mean(precisions), np.std(precisions),
            np.mean(f1s), np.std(f1s)
        ]

    def run(self):
        results = []

        # Iterate over vectorizers, samplers, and classifiers
        for vect_name, ovectorizer in self.vectorizers.items():
            vectorizer = deepcopy(ovectorizer)
            for samp_name, osampler in self.sampling_strategies.items():
                sampler = deepcopy(osampler)
                for clf_name, omodel in tqdm(self.classifiers.items()):
                    model = deepcopy(omodel)
                    # Run the test function for each combination and collect results
                    print(f"\n=== Vectorizer: {vect_name} ===")
                    print(f"\n=== Sampling Strategy: {samp_name} ===")
                    print(f"\n=== Classifier: {clf_name} ===\n")
                    result = self.test(
                        model=model,
                        model_name=clf_name,
                        vectorizer=vectorizer,
                        vectorizer_name=vect_name,
                        sampler=sampler,
                        sampler_name=samp_name
                    )
                    # Store the trained model and add result to the results list
                    results.append(result)

                vectorizer = deepcopy(ovectorizer)
                sampler = deepcopy(osampler)
                model = deepcopy(omodel)
                X_train = vectorizer.fit_transform(self.X_train).toarray()
                X_resampled, y_resampled = sampler.fit_resample(self.X_train, self.y_train)
                model.fit(X_resampled, y_resampled)
                self.trained_models[(vect_name, samp_name, clf_name)] = model

        # Convert results to a DataFrame for better readability
        columns = [
            'Model', 'Vectorizer', 'Sampler',
            'Accuracy Mean', 'Accuracy Std',
            'Recall Mean', 'Recall Std',
            'Precision Mean', 'Precision Std',
            'F1 Mean', 'F1 Std'
        ]
        results_df = pd.DataFrame(results, columns=columns)
        self.results = results_df
        return results_df

    def create_ensemble(self):
        # Gather predictions from all models
        predictions = {}
        for (vect_name, samp_name, clf_name), model in self.trained_models.items():
            vectorizer = deepcopy(self.vectorizers[vect_name])
            X_test_vect = vectorizer.transform(self.X_test)
            predictions[(vect_name, samp_name, clf_name)] = model.predict(X_test_vect)

        # Convert predictions to a 2D array
        pred_matrix = np.array(list(predictions.values())).T

        # Define fitness function for genetic algorithm
        def fitness(weights):
            weighted_pred = np.average(pred_matrix, axis=1, weights=weights)
            final_pred = (weighted_pred > 0.5).astype(int)
            return -f1_score(self.y_test, final_pred, average='weighted')

        # Genetic algorithm for weight optimization
        bounds = [(0, 1)] * pred_matrix.shape[1]
        result = differential_evolution(fitness, bounds)

        # Final ensemble prediction using optimized weights
        optimized_weights = result.x
        final_weighted_pred = np.average(pred_matrix, axis=1, weights=optimized_weights)
        ensemble_pred = (final_weighted_pred > 0.5).astype(int)

        # Calculate final F1 score
        ensemble_f1 = f1_score(self.y_test, ensemble_pred, average='weighted')
        return ensemble_pred, ensemble_f1

le = LabelEncoder()
target = le.fit_transform(target)
exp = Experiment(df.clean_events_sequence, target)
results = exp.run()

  0%|          | 0/10 [00:00<?, ?it/s]


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: LogisticRegression ===



 10%|█         | 1/10 [00:03<00:28,  3.17s/it]


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: DecisionTree ===



 20%|██        | 2/10 [00:05<00:21,  2.72s/it]


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: RandomForest ===



 30%|███       | 3/10 [00:11<00:30,  4.32s/it]


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: ExtraTreesClassifier ===



 40%|████      | 4/10 [00:17<00:28,  4.69s/it]


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: GradientBoostingClassifier ===






=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: AdaBoostClassifier ===



 60%|██████    | 6/10 [05:28<05:09, 77.33s/it] 


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: GaussianNB ===



 70%|███████   | 7/10 [05:29<02:36, 52.28s/it]


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: KNN ===



 80%|████████  | 8/10 [05:30<01:11, 35.88s/it]


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: SVM ===



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
 90%|█████████ | 9/10 [05:58<00:33, 33.52s/it]


=== Vectorizer: TFIDF ===

=== Sampling Strategy: SMOTE ===

=== Classifier: XGBoost ===



100%|██████████| 10/10 [07:02<00:00, 42.22s/it]


ValueError: could not convert string to float: '2684 2682 2682 2682 2682 2892 4054 2736 4020 4016 4028 3354 4024 3506 4056 4032 3634 2740 4030 4018 4126 2682 3982 4054 2736 3354 4020 4028 2740 4396 2740 4030 4020 2972 3234 2976 4100 4396 3008 3980 4180 2682 4396 2682 4180 4016 4020 3364 150 3354 4396 4028 3354 3354 4028 3354 4016 3354 1266 3354 4028 4028 3354 4016 4020 148 148 4396 4020 2682 3354 4396 148 3620 2682 3982 4054 2686 2736 4020 4028 3354 3354 4028 4032 4056 3354 4054 4016 3354 1266 3354 4028 4028 3354 4016 4020'

In [49]:
exp.trained_models[('TFIDF', 'SMOTE', 'LogisticRegression')].coef_.shape

(7, 100)

In [39]:
results.to_csv('results.csv')

In [None]:
vectorizer = TfidfVectorizer()
embeddings = vectorizer.fit_transform(df.clean_events_sequence)
class_counts = pd.Series(target).value_counts()
max_class_count = max(class_counts.values)
sampling_strategy = {class_counts.index[i]: int(max_class_count * 0.15) + class_counts.values[i]
                     for i in range(len(pd.Series(target).value_counts().index)) if class_counts.values[i] < max_class_count}
smote = SMOTE(sampling_strategy=sampling_strategy, random_state=1, k_neighbors=2)
X_res, y_res = smote.fit_resample(embeddings, target)
le = LabelEncoder()
y_res_e = le.fit_transform(y_res)


In [None]:
experiment = SCNB()
X_train_res, y_train_res_e = pre_process_data(X_train, y_train)
experiment.train_classifiers(X_train_res, y_train_res_e)
X_res, y_res = pre_process_data(df.clean_events_sequence, target)
experiment.evaluate_models(X_res, y_res)

All classifiers trained successfully.
Evaluation complete.


Unnamed: 0,Model,Accuracy Mean,Accuracy Std,Accuracy Median,Recall Mean,Recall Std,Recall Median,Precision Mean,Precision Std,Precision Median
0,LogisticRegression,0.742848,0.046359,0.735294,0.742848,0.046359,0.735294,0.756782,0.045676,0.753547
1,DecisionTree,0.68728,0.096798,0.650327,0.68728,0.096798,0.650327,0.696068,0.093068,0.659223
2,RandomForest,0.812945,0.079332,0.764706,0.812945,0.079332,0.764706,0.822176,0.073905,0.777173
3,GaussianNB,0.680684,0.044453,0.653595,0.680684,0.044453,0.653595,0.705803,0.038073,0.696893
4,KNN,0.638828,0.068662,0.611111,0.638828,0.068662,0.611111,0.646472,0.071641,0.630387
5,SVM,0.79128,0.040835,0.781046,0.79128,0.040835,0.781046,0.799608,0.03545,0.791265
6,XGBoost,0.81164,0.085974,0.77451,0.81164,0.085974,0.77451,0.817087,0.083693,0.783971


## Cross validation

Now we calculate the cross validation:

In [None]:
class_counts = target.value_counts()
max_class_count = max(class_counts.values)
sampling_strategy_cross_val = {class_counts.index[i]: int(max_class_count * 0.15) + class_counts.values[i]
                     for i in range(len(y_train.value_counts().index)) if class_counts.values[i] < max_class_count}
cross_val_clf = Pipeline([
                    ('vect', CountVectorizer()),
                    ('smote', SMOTE(sampling_strategy=sampling_strategy_cross_val, random_state=1, k_neighbors=2)),
                    ('extra_trees', ExtraTreesClassifier()),
                    ])

In [None]:
scores = cross_val_score(cross_val_clf, df.clean_events_sequence.sample(frac=1, random_state=1), target.sample(frac=1, random_state=1),
                        cv=4, scoring='accuracy',n_jobs = -1)
scores.mean()

0.6409043854696029

Create a custom scoring f1 function with zero_division parameter for cross validation to avoid nan values:

In [None]:
# Create a custom scoring function with zero_division parameter
def custom_f1_score(y_true, y_pred):
    return f1_score(y_true, y_pred, average='weighted', zero_division=0)

# Wrap the custom scoring function using make_scorer
f1_scorer = make_scorer(custom_f1_score)

In [None]:
scores = cross_val_score(cross_val_clf, df.clean_events_sequence.sample(frac=1, random_state=1), target.sample(frac=1, random_state=1),
                        cv=4, scoring=f1_scorer,n_jobs = -1)
print(scores)
print(scores.mean())

[0.61823676 0.61774323 0.65055584 0.57616849]
0.615676079863001


F1 is calculated as:
$$ F1 Score= 2×\frac{Precision×Recall}{Precision+Recall}
​
$$
There are some minority classes with no correct predictions ($recall=0$) resulting in a null value for the whole f1 score when using a non-custom f1 scorer

## GridsearchCV
Now we use gridsearchCV to find the optimal parameters

In [None]:
class_counts = target.value_counts()
max_class_count = max(class_counts.values)
sampling_strategy_grid = {class_counts.index[i]: int(max_class_count * 0.15) + class_counts.values[i]
                     for i in range(len(y_train.value_counts().index)) if class_counts.values[i] < max_class_count}
sampling_strategy_grid

{99: 222,
 14: 196,
 2: 166,
 9: 164,
 4: 125,
 11: 73,
 17: 57,
 6: 53,
 3: 52,
 16: 51,
 7: 51}

In [None]:
grid_clf = Pipeline([
                    ('vect', CountVectorizer()),
                    ('smote', SMOTE(sampling_strategy=sampling_strategy_grid, random_state=1, k_neighbors=2)),
                    ('extra_trees', ExtraTreesClassifier()),
                    ])

In [None]:
# Define the parameter grid for GridSearchCV 15%
param_grid = {
    'vect__max_features': [500, 1000],       # Example parameter for CountVectorizer
    'vect__ngram_range': [(1, 1), (1, 2), (1,3)],   # Unigrams, bigrams, trigrams
    'extra_trees__n_estimators': [100, 200, 300, 400],        # Number of trees in ExtraTrees
    'extra_trees__max_depth': [None, 10]        # Depth of each tree
}

# Cross-validation strategy set here to replicate results
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Define GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(grid_clf, param_grid, cv=cv, scoring=f1_scorer, n_jobs=-1)

# Fit GridSearchCV to the data
grid_search.fit(df.clean_events_sequence, target)

# Output the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score F1:", grid_search.best_score_)
print("Accuracy:", str(np.mean(cross_val_score(grid_search.best_estimator_, df.clean_events_sequence, target, cv=cv, scoring='accuracy'))))

Best Parameters: {'extra_trees__max_depth': None, 'extra_trees__n_estimators': 300, 'vect__max_features': 1000, 'vect__ngram_range': (1, 1)}
Best Score F1: 0.6406886229518796
Accuracy: 0.6548246439550787


In [None]:
! pip install hmmlearn umap-learn umap-learn[plot] xgboost imbalanced-learn



In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from gensim.models import Word2Vec
from hmmlearn import hmm
import umap
import umap.plot
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import ast
import xgboost as xgb
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report


In [None]:
df = pd.read_csv('sncb_data_challenge.csv', delimiter=';')
df.sample(5)

Unnamed: 0.1,Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
895,895,4604847,"[537, 537, 537, 537, 537, 537, 537, 537, 537, ...","[3658, 4068, 3658, 4068, 3658, 4068, 3658, 406...","[-14388, -14257, -14222, -14127, -14090, -1381...",50.782892,4.421971,"[0.0, 0.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.3, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",2
384,384,4451781,"[609, 609, 609, 609, 609, 609, 609, 609, 609, ...","[4068, 3658, 4068, 3658, 4066, 3658, 4068, 365...","[-14137, -14078, -13904, -13892, -13519, -1258...",50.911038,4.151967,"[0.2, 0.0, 0.1, 0.0, 0.1, 0.0, 0.1, 0.0, 0.2, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",9
690,690,4465347,"[638, 638, 638, 638, 638, 638, 638, 638, 638, ...","[2742, 4002, 4110, 2708, 4026, 4148, 4140, 412...","[-8280, -8280, -8278, -8275, -8275, -8275, -82...",50.854381,2.737718,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",13
970,970,4610465,"[529, 529, 529, 529, 529, 529, 529, 529, 529, ...","[4066, 4068, 4068, 3658, 4068, 3658, 4068, 365...","[-14271, -14121, -13619, -13600, -13028, -1292...",50.720086,4.397469,"[1.6, 2.7, 3.2, 0.0, 0.1, 0.0, 1.8, 0.0, 1.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",9
389,389,4451923,"[637, 637, 637, 637, 637, 637, 637, 637, 637, ...","[3636, 3658, 2956, 2956, 4066, 3636, 3658, 295...","[-14397, -14397, -14342, -14284, -14241, -1422...",50.805012,4.600712,"[0.0, 0.0, 31.7, 25.0, 1.9, 0.0, 0.0, 22.0, 27...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",13


In [None]:
events_list = []
events_pre_incident = []
events_post_incident = []

for i, (events, seconds_to_incident_sequence, vehicles_sequence, train_kph_sequence, dj_ac_state_sequence, dj_dc_state_sequence) in tqdm(enumerate(zip(df["events_sequence"],
                                                                                                                                                      df["seconds_to_incident_sequence"],
                                                                                                                                                      df["vehicles_sequence"],
                                                                                                                                                      df["train_kph_sequence"],
                                                                                                                                                      df["dj_ac_state_sequence"],
                                                                                                                                                      df["dj_dc_state_sequence"])), total=len(df)):
    events = ast.literal_eval(events)
    seconds_to_incident_sequence = ast.literal_eval(seconds_to_incident_sequence)
    vehicles_sequence = ast.literal_eval(vehicles_sequence)
    train_kph_sequence = ast.literal_eval(train_kph_sequence)
    dj_ac_state_sequence = ast.literal_eval(dj_ac_state_sequence)
    dj_dc_state_sequence = ast.literal_eval(dj_dc_state_sequence)


    pre_incidents = []
    pre_incidents_vehicles = []
    pre_incidents_kph = []
    pre_incidents_ac = []
    pre_incidents_dc = []


    post_incidents = []
    post_incidents_vehicles = []
    post_incidents_kph = []
    post_incidents_ac = []
    post_incidents_dc = []

    event_seq = []
    prev_event = 0

    for event, time_to_incident, vehicle, kph, ac, dc in zip(events, seconds_to_incident_sequence, vehicles_sequence, train_kph_sequence, dj_ac_state_sequence, dj_dc_state_sequence):
      #if event != prev_event:
      event_seq.append(str(event))
      if time_to_incident <= 0:
          pre_incidents.append(str(event))
          pre_incidents_vehicles.append(str(vehicle))
          pre_incidents_kph.append(str(kph))
          pre_incidents_ac.append(str(ac))
          pre_incidents_dc.append(str(dc))
      else:
          post_incidents.append(str(event))
          post_incidents_vehicles.append(str(vehicle))
          post_incidents_kph.append(str(kph))
          post_incidents_ac.append(str(ac))
          post_incidents_dc.append(str(dc))
      #prev_event = event

    # Append the pre and post incident lists to the main lists
    events_pre_incident.append(pre_incidents)
    events_post_incident.append(post_incidents)
    events_list.append(event_seq)

100%|██████████| 1011/1011 [00:15<00:00, 64.88it/s] 


In [None]:
from itertools import combinations
tuples_post = []
tuples_pre = []

for post_events in tqdm(events_post_incident):
  prev_post_event = 0
  for post_event in post_events:
    if prev_post_event != 0:
      tup = [prev_post_event, post_event]
      prev_post_event = post_event
      if tup not in tuples_post:
        tuples_post.append(tup)
    else:
      prev_post_event = post_event

for pre_events in tqdm(events_pre_incident):
  prev_pre_event = 0
  for pre_event in pre_events:
    if prev_pre_event != 0:
      tup = [prev_pre_event, pre_event]
      prev_pre_event = pre_event
      if tup not in tuples_pre:
        tuples_pre.append(tup)
    else:
      prev_pre_event = pre_event

100%|██████████| 1011/1011 [00:08<00:00, 123.91it/s]
100%|██████████| 1011/1011 [00:10<00:00, 97.33it/s]


In [None]:
c = 0
for post in tuples_post:
  if post in tuples_pre:
    c += 1
c/len(tuples_post)

0.4953556864521976

In [None]:
c = 0
for pre in tuples_pre:
  if pre in tuples_post:
    c += 1
c/len(tuples_pre)

0.4728078711212023

In [None]:
cleaned_events_pre_incident = []
tuples_removed = 0

for prev_events in tqdm(events_pre_incident):
  cleaned_events = []
  prev_pre_event = 0
  for pre_event in prev_events:
    if prev_pre_event != 0:
      tup = [prev_pre_event, pre_event]
      if tup not in tuples_post:
        cleaned_events.append(prev_pre_event)
      else:
        tuples_removed += 1
    else:
      prev_pre_event = pre_event
  cleaned_events.append(pre_event)
  cleaned_events_pre_incident.append(cleaned_events)
cleaned_events_pre_incident[0]

100%|██████████| 1011/1011 [01:07<00:00, 14.95it/s]


In [None]:
from tqdm import tqdm

cleaned_events_pre_incident = []
tuples_removed = 0

for prev_events in tqdm(events_pre_incident):
    cleaned_events = []
    prev_pre_event = None  # Usa None para identificar el primer elemento

    for pre_event in prev_events:
        if prev_pre_event is not None:
            tup = [prev_pre_event, pre_event]

            if tup not in tuples_post:
                cleaned_events.append(prev_pre_event)
            else:
                tuples_removed += 1

        prev_pre_event = pre_event

    if prev_pre_event is not None:
        cleaned_events.append(prev_pre_event)
    cleaned_events_pre_incident.append(cleaned_events)

cleaned_events_pre_incident[0]


100%|██████████| 1011/1011 [00:16<00:00, 60.09it/s]


['1132',
 '4026',
 '1082',
 '2742',
 '4092',
 '2982',
 '1250',
 '1250',
 '2982',
 '4394',
 '2708',
 '3036',
 '3986']

In [None]:
tuples_removed

191311

In [None]:
df.groupby("incident_type").count()["incident_id"]

Unnamed: 0_level_0,incident_id
incident_type,Unnamed: 1_level_1
2,119
3,5
4,78
6,6
7,4
9,117
11,26
13,318
14,149
16,4


In [None]:
lens = []
for events in cleaned_events_pre_incident:
  lens.append(len(events))
print(np.mean(lens))

9.22650840751731


In [None]:
vector_size = 4
word2vec = Word2Vec(sentences=cleaned_events_pre_incident, vector_size=vector_size, window=9, sg=1, min_count=4, workers=-1)
embeddings = []
labels = df["incident_type"]
actual_events = []

for events in tqdm(cleaned_events_pre_incident):
  embedding = np.zeros(vector_size)
  denominator = 0
  for event in events:
    if event in word2vec.wv:
      embedding += word2vec.wv[event]
      denominator += 1

  embeddings.append(embedding)

100%|██████████| 1011/1011 [00:00<00:00, 24517.17it/s]
