<a href="https://colab.research.google.com/github/stef4k/train-maintenance-data-mining/blob/main/text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text classification


In [None]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import numpy as np
from collections import Counter
import ast


#from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score, LeaveOneOut
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

Manually remove the first ';' from the first row in csv file

In [None]:
df = pd.read_csv('sncb_data_challenge.csv', delimiter=';')
df.sample(2)

Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
525,4457555,"[604, 604, 604, 604, 604, 604, 604, 604, 604, ...","[2434, 4002, 4032, 2852, 4110, 2854, 4028, 402...","[-13421, -13421, -13421, -13418, -13418, -1341...",50.936962,5.311587,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[False, False, False, False, False, False, Fal...",99
473,4455349,"[702, 702, 702, 702, 702, 702, 702, 702, 702, ...","[4066, 4068, 4124, 3634, 2682, 3620, 4148, 412...","[-12509, -12509, -11526, -11525, -11522, -1152...",50.903678,4.388016,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",99


Now I will analyze the percentage of each event type appearing at least once in an event sequence:

In [None]:
events_types_dict = {}
for events_sequence in df['events_sequence']:
    row_list = ast.literal_eval(events_sequence) #transforming string into actual list
    unique_events = set(row_list)
    for event in unique_events:
        if not events_types_dict.get(event):
            events_types_dict[event] = 0
        events_types_dict[event] += 1
sorted_dict = dict(sorted(events_types_dict.items(), key=lambda item: item[1], reverse=True))
# Convert the sorted dictionary to a DataFrame
sorted_events_perc_df = pd.DataFrame(list(sorted_dict.items()), columns=['event_type', 'frequency'])
sorted_events_perc_df['percentage'] = sorted_events_perc_df['frequency'] / df.shape[0] * 100
# Cast the 'event_type' column to string
sorted_events_perc_df['event_type'] = sorted_events_perc_df['event_type'].astype(str)

We save in a list all event codes that appear in less than 85% of the event sequences:

In [None]:
events_low_frequency = list(map(int, list(sorted_events_perc_df[sorted_events_perc_df.percentage<=85].event_type)))

## Text preprocessing
Before we start with text classification we need to clean the sequences of events. As seen one value of `events_sequence` contains commas and brackets even though it is a string

In [None]:
df.events_sequence.iloc[0]

'[2744, 4004, 2852, 4110, 2854, 4396, 1132, 4140, 4148, 2708, 4026, 1032, 1082, 4152, 4030, 4018, 4168, 4156, 4394, 152, 2742, 4410, 4406, 4068, 4408, 4412, 4066, 2744, 4026, 4148, 4168, 4140, 3986, 2744, 4002, 2852, 4110, 2854, 4148, 2708, 4026, 4140, 4152, 4030, 4018, 4140, 4168, 4156, 2852, 2854, 4124, 2858, 2658, 2688, 3254, 3254, 3254, 2970, 4082, 4090, 4092, 2982, 3236, 4100, 2702, 4394, 1250, 2970, 2980, 2970, 2980, 2970, 2982, 2970, 2982, 4168, 4140, 3986, 2742, 4004, 2852, 4110, 2854, 2982, 2708, 4026, 4030, 4018, 4148, 4140, 4152, 4168, 4156, 4120, 2858, 2658, 2688, 3254, 3254, 2970, 2982, 2708, 2970, 2982, 4100, 2702, 1250, 4394, 2744, 4026, 4148, 2970, 2980, 4168, 4140, 4168, 3986, 2744, 4002, 2852, 4110, 2854, 2980, 2708, 4026, 4148, 2552, 4168, 4140, 4152, 4030, 4018, 4026, 4140, 4168, 4156, 2970, 2982, 2708, 2970, 4082, 4092, 4090, 4084, 4094, 4090, 3236, 2982, 4100, 2702, 1250, 4394, 4168, 4140, 3986, 2744, 4004, 2852, 4110, 2854, 2982, 2708, 4026, 4140, 4030, 4018, 414

Also, as observed before some event types are so common they do not actually bring a lot of value (as mentioned in the paper as well). We remove those common event types

The steps to clean the event sequences are:
- keep non-common event types mentioned in list `events_low_frequency`
- remove symbols: [] , and store sequences of events as a string without brackets and commas:

In [None]:
df['clean_events_sequence'] = df.events_sequence.apply(ast.literal_eval).apply(lambda x: [i for i in x if i in events_low_frequency]).astype(str)\
                .replace(r'[\[\],]', '', regex=True)

## Text classification

Now we try to experiment using text techniques to transform the list events sequence:

In [None]:
target = df['incident_type'].copy() # target column separated
X_train, X_test, y_train, y_test = train_test_split(df.clean_events_sequence, target, test_size=0.2,  random_state=7)

Since the dataset is imbalanced we will use different strategies to battle that. Here we set a new sampling strategy based on a basic script:

In [None]:
# Define custom sampling strategy based on class distribution
# Each non-majority class will have equal samples to 15% of the majority class plus their previous samples
class_counts = y_train.value_counts()
max_class_count = max(class_counts.values)
sampling_strategy = {class_counts.index[i]: int(max_class_count * 0.15) + class_counts.values[i]
                     for i in range(len(y_train.value_counts().index)) if class_counts.values[i] < max_class_count}
sampling_strategy

{99: 167,
 14: 155,
 2: 135,
 9: 135,
 4: 105,
 11: 58,
 17: 47,
 6: 45,
 3: 43,
 7: 43,
 16: 43}

Starting with CountVectorizer:
- Tokenization: Splits text into individual words (tokens).
- Builds a Vocabulary: Creates a dictionary of unique words (tokens) from the entire corpus.
- Counts the Occurrence: Calculates the frequency (count) of each word in each document.
- Transforms Text into a Sparse Matrix: Returns a matrix of shape (n_samples, n_features), where n_samples is the number of documents and n_features is the number of unique words in the vocabulary.

  We firstly set the sampling strategy for SMOTE:

Now we set the pipeline to be used:

In [None]:
text_clf = Pipeline([
                    ('vect', CountVectorizer()),
                     #('decision_tree', DecisionTreeClassifier()),
                    ('smote', SMOTE(sampling_strategy=sampling_strategy, random_state=1, k_neighbors=2)),
                    ('extra_trees', ExtraTreesClassifier()),
                    #('random_forest', RandomForestClassifier())
                    ])

Training the model:

In [None]:
text_clf.fit(X_train, y_train)

Print the results for the particular split of test data:

In [None]:
clf_predict = text_clf.predict(X_test)
print(classification_report(y_test, clf_predict, zero_division=1))

              precision    recall  f1-score   support

           2       0.61      0.87      0.71        23
           3       0.00      0.00      0.00         1
           4       0.50      0.42      0.45        12
           6       0.00      1.00      0.00         0
           9       0.61      0.52      0.56        21
          11       0.50      0.14      0.22         7
          13       0.64      0.79      0.71        57
          14       0.66      0.58      0.61        33
          17       1.00      0.00      0.00         2
          99       0.56      0.47      0.51        47

    accuracy                           0.61       203
   macro avg       0.51      0.48      0.38       203
weighted avg       0.61      0.61      0.59       203



## Cross validation

Now we calculate the cross validation:

In [None]:
class_counts = target.value_counts()
max_class_count = max(class_counts.values)
sampling_strategy_cross_val = {class_counts.index[i]: int(max_class_count * 0.15) + class_counts.values[i]
                     for i in range(len(y_train.value_counts().index)) if class_counts.values[i] < max_class_count}
cross_val_clf = Pipeline([
                    ('vect', CountVectorizer()),
                    ('smote', SMOTE(sampling_strategy=sampling_strategy_cross_val, random_state=1, k_neighbors=2)),
                    ('extra_trees', ExtraTreesClassifier()),
                    ])

In [None]:
scores = cross_val_score(cross_val_clf, df.clean_events_sequence.sample(frac=1, random_state=1), target.sample(frac=1, random_state=1),
                        cv=4, scoring='accuracy',n_jobs = -1)
scores.mean()

0.6409043854696029

Create a custom scoring f1 function with zero_division parameter for cross validation to avoid nan values:

In [None]:
# Create a custom scoring function with zero_division parameter
def custom_f1_score(y_true, y_pred):
    return f1_score(y_true, y_pred, average='weighted', zero_division=0)

# Wrap the custom scoring function using make_scorer
f1_scorer = make_scorer(custom_f1_score)

In [None]:
scores = cross_val_score(cross_val_clf, df.clean_events_sequence.sample(frac=1, random_state=1), target.sample(frac=1, random_state=1),
                        cv=4, scoring=f1_scorer,n_jobs = -1)
print(scores)
print(scores.mean())

[0.61823676 0.61774323 0.65055584 0.57616849]
0.615676079863001


F1 is calculated as:
$$ F1 Score= 2×\frac{Precision×Recall}{Precision+Recall}
​
$$
There are some minority classes with no correct predictions ($recall=0$) resulting in a null value for the whole f1 score when using a non-custom f1 scorer

## GridsearchCV
Now we use gridsearchCV to find the optimal parameters

In [None]:
class_counts = target.value_counts()
max_class_count = max(class_counts.values)
sampling_strategy_grid = {class_counts.index[i]: int(max_class_count * 0.15) + class_counts.values[i]
                     for i in range(len(y_train.value_counts().index)) if class_counts.values[i] < max_class_count}
sampling_strategy_grid

{99: 222,
 14: 196,
 2: 166,
 9: 164,
 4: 125,
 11: 73,
 17: 57,
 6: 53,
 3: 52,
 16: 51,
 7: 51}

In [None]:
grid_clf = Pipeline([
                    ('vect', CountVectorizer()),
                    ('smote', SMOTE(sampling_strategy=sampling_strategy_grid, random_state=1, k_neighbors=2)),
                    ('extra_trees', ExtraTreesClassifier()),
                    ])

In [None]:
# Define the parameter grid for GridSearchCV 15%
param_grid = {
    'vect__max_features': [500, 1000],       # Example parameter for CountVectorizer
    'vect__ngram_range': [(1, 1), (1, 2), (1,3)],   # Unigrams, bigrams, trigrams
    'extra_trees__n_estimators': [100, 200, 300, 400],        # Number of trees in ExtraTrees
    'extra_trees__max_depth': [None, 10]        # Depth of each tree
}

# Cross-validation strategy set here to replicate results
cv = StratifiedKFold(n_splits=4, shuffle=True, random_state=42)

# Define GridSearchCV with the pipeline and parameter grid
grid_search = GridSearchCV(grid_clf, param_grid, cv=cv, scoring=f1_scorer, n_jobs=-1)

# Fit GridSearchCV to the data
grid_search.fit(df.clean_events_sequence, target)

# Output the best parameters and the best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score F1:", grid_search.best_score_)
print("Accuracy:", str(np.mean(cross_val_score(grid_search.best_estimator_, df.clean_events_sequence, target, cv=cv, scoring='accuracy'))))

Best Parameters: {'extra_trees__max_depth': None, 'extra_trees__n_estimators': 300, 'vect__max_features': 1000, 'vect__ngram_range': (1, 1)}
Best Score F1: 0.6406886229518796
Accuracy: 0.6548246439550787


In [15]:
! pip install hmmlearn umap-learn umap-learn[plot]

Collecting datashader (from umap-learn[plot])
  Downloading datashader-0.16.3-py2.py3-none-any.whl.metadata (12 kB)
Collecting pyct (from datashader->umap-learn[plot])
  Downloading pyct-0.5.0-py2.py3-none-any.whl.metadata (7.4 kB)
Downloading datashader-0.16.3-py2.py3-none-any.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyct-0.5.0-py2.py3-none-any.whl (15 kB)
Installing collected packages: pyct, datashader
Successfully installed datashader-0.16.3 pyct-0.5.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.ensemble import VotingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from gensim.models import Word2Vec
from hmmlearn import hmm
import umap
import umap.plot
import matplotlib.pyplot as plt
import ast
from tqdm import tqdm

In [4]:
df = pd.read_csv('sncb_data_challenge.csv', delimiter=';')
df.sample(5)

Unnamed: 0.1,Unnamed: 0,incident_id,vehicles_sequence,events_sequence,seconds_to_incident_sequence,approx_lat,approx_lon,train_kph_sequence,dj_ac_state_sequence,dj_dc_state_sequence,incident_type
674,674,4464669,"[506, 506, 506, 506, 506, 506, 506, 506, 506, ...","[4016, 2744, 4148, 2708, 4026, 4020, 4168, 414...","[-11715, -10710, -10710, -10708, -10708, -1070...",50.731644,4.502946,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Fal...",13
861,861,4602871,"[549, 549, 549, 549, 549, 549, 549, 549, 549, ...","[4120, 3636, 3658, 2956, 2956, 2956, 2956, 295...","[-14343, -14331, -14331, -14009, -13894, -1389...",50.45217,4.318044,"[0.0, 0.0, 0.0, 13.7, 32.1, 34.5, 49.3, 50.7, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",6
495,495,4456317,"[1087, 1087, 1087, 1087, 1087, 1087, 1087, 108...","[2956, 2956, 2956, 2956, 2956, 2956, 4068, 363...","[-14373, -14364, -14363, -14359, -14348, -1434...",50.556395,3.923098,"[104.4, 104.2, 103.9, 102.7, 81.3, 77.3, 0.0, ...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",99
216,216,4443009,"[1020, 1020, 1020, 1020, 1020, 1020, 1020, 102...","[2956, 2956, 2956, 2956, 2956, 2956, 2956, 295...","[-14375, -14346, -14335, -14323, -14303, -1429...",50.168094,5.871434,"[48.6, 93.2, 91.8, 94.0, 93.8, 94.4, 96.2, 94....","[True, True, True, True, True, True, True, Tru...","[False, False, False, False, False, False, Fal...",2
26,26,4434195,"[511, 511, 511, 511, 511, 511, 511, 511, 511, ...","[4068, 3636, 3658, 2682, 4066, 3636, 3658, 406...","[-14231, -14214, -14214, -14113, -14071, -1405...",50.711379,4.401537,"[0.046875, 0.0, 0.0, 34.984375, 0.015625, 0.0,...","[False, False, False, False, False, False, Fal...","[True, True, True, True, True, True, True, Tru...",13


In [68]:
events_list = []
events_pre_incident = []
events_post_incident = []

for i, (events, seconds_to_incident_sequence) in tqdm(enumerate(zip(df["events_sequence"], df["seconds_to_incident_sequence"])), total=len(df)):
    events = ast.literal_eval(events)
    seconds_to_incident_sequence = ast.literal_eval(seconds_to_incident_sequence)

    pre_incidents = []
    post_incidents = []
    event_seq = []

    for event, time_to_incident in zip(events, seconds_to_incident_sequence):
        event_seq.append(str(event))
        if time_to_incident <= 0:
            pre_incidents.append(str(event))
        else:
            post_incidents.append(str(event))

    # Append the pre and post incident lists to the main lists
    events_pre_incident.append(pre_incidents)
    events_post_incident.append(post_incidents)
    events_list.append(event_seq)

100%|██████████| 1011/1011 [00:04<00:00, 233.25it/s]


In [94]:
word2vec = Word2Vec(sentences=events_pre_incident, vector_size=8, window=100, min_count=3, workers=4)
embeddings = []
labels = []
labels_org = df["incident_type"]

for events, label in tqdm(zip(events_pre_incident, labels_org)):
  embedding = np.zeros(8)
  for event in events:
    if event in word2vec.wv:
      embedding += word2vec.wv[event]
      embeddings.append(embedding)
      labels.append(label)

In [None]:
len(embeddings)

In [None]:
umap_model = umap.UMAP(n_components=2, min_dist=0.1, n_neighbors=200, metric='cosine')
X_umap = umap_model.fit_transform(embeddings)
umap.plot.points(umap_model, labels=labels)

In [None]:
class SCNB:
    def __init__(self):
        self.classifiers = {
            'LogisticRegression': LogisticRegression(),
            'DecisionTree': DecisionTreeClassifier(),
            'RandomForest': RandomForestClassifier(),
            'GaussianNB': GaussianNB(),
            'KNN': KNeighborsClassifier(),
            'SVM': SVC(probability=True),
            'XGBoost': XGBClassifier()
        }
        self.models = {}
        self.embedding_methods = {}
        self.ensemble_model = None
        self.hmm_model = None

    def train_classifiers(self, X, y):
        """Entrena cada clasificador popular junto con XGBoost."""
        for name, clf in self.classifiers.items():
            clf.fit(X, y)
            self.models[name] = clf
        print("All classifiers trained successfully.")

    def generate_embeddings(self, words_list):
        """Genera embeddings para una lista de palabras usando distintos encoders."""
        encoders = {
            'CountVectorizer': CountVectorizer(),
            'OneHotEncoder': OneHotEncoder(sparse=False),
            'TfidfVectorizer': TfidfVectorizer()
        }

        for name, encoder in encoders.items():
            self.embedding_methods[name] = encoder.fit_transform(words_list).toarray()

        word2vec_model = Word2Vec(sentences=[words_list], vector_size=100, window=5, min_count=1, workers=4)
        self.embedding_methods['Word2Vec'] = [word2vec_model.wv[word] for word in words_list if word in word2vec_model.wv]
        print("Embeddings generated successfully.")

    def evaluate_models(self, X, y):
        """Evalúa cada modelo entrenado usando StratifiedKFold y genera una tabla con métricas."""
        results = []
        skf = StratifiedKFold(n_splits=5)

        for name, model in self.models.items():
            accuracies, recalls, precisions = [], [], []
            for train_index, test_index in skf.split(X, y):
                X_train, X_test = X[train_index], X[test_index]
                y_train, y_test = y[train_index], y[test_index]

                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                accuracies.append(accuracy_score(y_test, y_pred))
                recalls.append(recall_score(y_test, y_pred, average='weighted'))
                precisions.append(precision_score(y_test, y_pred, average='weighted'))

            results.append({
                'Model': name,
                'Accuracy Mean': np.mean(accuracies),
                'Accuracy Std': np.std(accuracies),
                'Accuracy Median': np.median(accuracies),
                'Recall Mean': np.mean(recalls),
                'Recall Std': np.std(recalls),
                'Recall Median': np.median(recalls),
                'Precision Mean': np.mean(precisions),
                'Precision Std': np.std(precisions),
                'Precision Median': np.median(precisions),
            })

        results_df = pd.DataFrame(results)
        print("Evaluation complete.")
        return results_df

    def train_ensemble_model(self, weights=None):
        """Crea y entrena un modelo de ensamblaje usando los mejores modelos entrenados."""
        estimators = [(name, model) for name, model in self.models.items()]
        self.ensemble_model = VotingClassifier(estimators=estimators, voting='soft', weights=weights)
        print("Ensemble model created successfully.")

    def train_hidden_markov_model(self, X, n_components=2):
        """Entrena un modelo Hidden Markov."""
        self.hmm_model = hmm.GaussianHMM(n_components=n_components)
        self.hmm_model.fit(X)
        print("Hidden Markov Model trained successfully.")

    def analyze_hmm(self):
        """Analiza el modelo Hidden Markov mostrando matrices de transición y demás estadísticas."""
        if self.hmm_model:
            print("Transition matrix:", self.hmm_model.transmat_)
            print("Means:", self.hmm_model.means_)
            print("Covars:", self.hmm_model.covars_)
        else:
            print("HMM model not trained yet.")

    def dimensionality_reduction_with_umap(self, X, y, min_dist_values=[0.1, 0.5], n_neighbors_values=[5, 10]):
        """Aplica reducción de dimensionalidad usando UMAP y muestra scatter plots."""
        for min_dist in min_dist_values:
            for n_neighbors in n_neighbors_values:
                umap_model = umap.UMAP(min_dist=min_dist, n_neighbors=n_neighbors)
                X_umap = umap_model.fit_transform(X)

                scatter_matrix(pd.DataFrame(X_umap), alpha=0.2, figsize=(10, 10), diagonal='kde', c=y)
                plt.title(f"UMAP Clustering (min_dist={min_dist}, n_neighbors={n_neighbors})")
                plt.show()

        print("Dimensionality reduction with UMAP completed.")

    def clustering_with_algorithms(self, X, y):
        """Realiza clustering usando algoritmos populares y muestra scatter plots."""
        clusterers = {
            'KMeans': KMeans(n_clusters=len(np.unique(y))),
            'Agglomerative': AgglomerativeClustering(n_clusters=len(np.unique(y))),
            'DBSCAN': DBSCAN()
        }

        for name, clusterer in clusterers.items():
            clusters = clusterer.fit_predict(X)
            plt.figure(figsize=(8, 6))
            scatter_matrix(pd.DataFrame(X), alpha=0.2, figsize=(10, 10), diagonal='kde', c=clusters)
            plt.title(f"Clustering with {name}")
            plt.show()

        print("Clustering with various algorithms completed.")


In [None]:
# Ejemplo de uso
# Supongamos que tienes un DataFrame `df` con variables `X` y `y` definidas.

# Crear instancia de SCNB
scnb = SCNB()

# Entrenar clasificadores
X = df.drop(columns=['target'])
y = df['target']
scnb.train_classifiers(X, y)

# Generar embeddings para una lista de palabras
words_list = ["gato", "perro", "pez", "pájaro"]
scnb.generate_embeddings(words_list)

# Evaluar modelos
results_df = scnb.evaluate_models(X, y)
print(results_df)

# Entrenar modelo de ensamblaje
scnb.train_ensemble_model()

# Entrenar y analizar un modelo HMM
scnb.train_hidden_markov_model(X)
scnb.analyze_hmm()

# Red
