In [27]:
import argparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
import numpy as np
import xgboost as xgb
from sklearn.svm import SVC

In [45]:
def merge_and_sort_csv(file1, file2, output_file):
    # Load the CSV files into pandas DataFrames
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    #df3 = pd.read_csv(file3)

    # Combine the data from both DataFrames
    combined_df = pd.concat([df1, df2], ignore_index=True)

    # Sort the combined data by the 'action' column (ascending order)
    sorted_df = combined_df.sort_values(by='action', ascending=True)

    # Save the sorted data to a new CSV file
    sorted_df.to_csv(output_file, index=False)

    print(f"Data merged and sorted successfully. Saved to {output_file}")

# Example usage
file1 = 'datasets_try/processed_features_ml_cl.csv'  # Path to your first CSV file
file2 = 'datasets_try/processed_features_all_cl.csv'  # Path to your second CSV file
#file3 = 'datasets_try/processed_features_5k.csv'
# file4 = 'datasets_try/processed_features_el.csv' 
output_file = 'datasets_try/collision_free.csv'  # Output CSV file

merge_and_sort_csv(file1, file2, output_file)

##add more datapoints 


Data merged and sorted successfully. Saved to datasets_try/collision_free.csv


In [46]:
# Load the dataset
collision_free_data = pd.read_csv('datasets_try/collision_free.csv')

# Check the distribution of the 'action' column
action_distribution = collision_free_data['action'].value_counts()
print(action_distribution)

1.0    2470
4.0    1567
3.0    1424
2.0     411
0.0     317
Name: action, dtype: int64


UPSAMPLE MINORITY using dataset fromrandomly generated cenarios

In [47]:

# Step 1: Load the dataset from the file
file_path = 'datasets_try/processed_features_5k.csv'
file_path2 = 'datasets_try/processed_features_el_cl.csv'
df = pd.read_csv(file_path)

# Step 2: Sort the dataset by the action label (last column)
# Assuming the action label is the last column
df_sorted = df.sort_values(by='action', ascending=True)

# Step 3: Extract data points with action label 0 and 2
df_label_0 = df_sorted[df_sorted[df.columns[-1]] == 0]
df_label_2 = df_sorted[df_sorted[df.columns[-1]] == 2]

# Step 4: Save the filtered data to separate CSV files
file_label_0 = 'datasets_try/label_0_data.csv'
file_label_2 = 'datasets_try/label_2_data.csv'

df_label_0.to_csv(file_label_0, index=False)
df_label_2.to_csv(file_label_2, index=False)

# Step 5: Extract 200 data points from each new CSV and merge them into another CSV
# Load the previously saved files
df_label_0_loaded = pd.read_csv(file_label_0)
df_label_2_loaded = pd.read_csv(file_label_2)

# Randomly select 200 data points from each
df_label_0_sampled = df_label_0_loaded.sample(n=189, random_state=42)
df_label_2_sampled = df_label_2_loaded.sample(n=200, random_state=42)

# Step 6: Merge the two sampled dataframes
merged_df = pd.concat([df_label_0_sampled, df_label_2_sampled])

# Step 7: Load the existing merged dataset if it exists or create a new one
input_file_path = 'datasets_try/collision_free.csv'

try:
    collision_free_data = pd.read_csv(input_file_path)
    # Append the new merged data to the existing one
    collision_free_data_upsampled = pd.concat([collision_free_data, merged_df])
except FileNotFoundError:
    # If the file does not exist, create a new one
    collision_free_data = merged_df

# Step 8: Save the merged data back into the CSV
collision_free_data.to_csv('datasets_try/collision_free_upsampled.csv', index=False)

print(f"Successfully processed and saved the data. Merged dataset saved at: {merged_file_path}")

action_distribution = collision_free_data_upsampled['action'].value_counts()
print(action_distribution)

Successfully processed and saved the data. Merged dataset saved at: datasets_try/collision_free.csv
1.0    2470
4.0    1567
3.0    1424
2.0     611
0.0     506
Name: action, dtype: int64


Normal


In [52]:
# Create a binary target variable
collision_free_data_upsampled['binary_target'] = collision_free_data_upsampled['action'].apply(lambda x: 1 if x in [1,3, 4] else 0)

# Split the data into features and binary target
X_binary = collision_free_data_upsampled.drop(columns=['action', 'binary_target'])
y_binary = collision_free_data_upsampled['binary_target']

# Split the dataset into train, test, and eval sets
X_train_binary, X_temp_binary, y_train_binary, y_temp_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
X_test_binary, X_eval_binary, y_test_binary, y_eval_binary = train_test_split(X_temp_binary, y_temp_binary, test_size=0.5, random_state=42)

# Train the binary classifier
binary_rf_model = RandomForestClassifier(random_state=42)
binary_rf_model.fit(X_train_binary, y_train_binary)

# Evaluate the binary classifier
y_pred_binary_test = binary_rf_model.predict(X_test_binary)
accuracy_binary_test = accuracy_score(y_test_binary, y_pred_binary_test)
print(f"Binary Classifier Accuracy on Test Data: {accuracy_binary_test * 100:.2f}%")
# Calculate precision, recall, and F1-score
precision_test = precision_score(y_test_binary, y_pred_binary_test)
recall_test = recall_score(y_test_binary, y_pred_binary_test)
f1_test = f1_score(y_test_binary, y_pred_binary_test)

# Print metrics
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_binary_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_binary_test))

# Save the binary classifier
joblib.dump(binary_rf_model, 'models_try/binary_rf_model_collision_free_upsampled.pkl')
#print(data[5:10])

Binary Classifier Accuracy on Test Data: 93.31%
Precision on Test Data: 94.28%
Recall on Test Data: 97.77%
F1-Score on Test Data: 95.99%

Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.73      0.80       119
           1       0.94      0.98      0.96       539

    accuracy                           0.93       658
   macro avg       0.91      0.85      0.88       658
weighted avg       0.93      0.93      0.93       658


Confusion Matrix:
[[ 87  32]
 [ 12 527]]


['models_try/binary_rf_model_collision_free_upsampled.pkl']

downsampling

In [58]:
collision_free_data_upsampled['binary_target'] = collision_free_data_upsampled['action'].apply(lambda x: 1 if x in [1, 3,4] else 0)

# Separate majority and minority classes
majority_class = collision_free_data_upsampled[collision_free_data_upsampled['binary_target'] == 1]
minority_class = collision_free_data_upsampled[collision_free_data_upsampled['binary_target'] == 0]
idle_class = collision_free_data_upsampled[collision_free_data_upsampled['action'] == 1]
non_ideal_class = collision_free_data_upsampled[collision_free_data_upsampled['action'] != 1]

# Downsample majority class
idle_downsampled = resample(idle_class, 
                                replace=True,    # Sample without replacement
                                n_samples=1500,  # Match minority class size
                                random_state=42)
collision_free_data_up_downsampled = pd.concat([idle_downsampled, non_ideal_class])

print(collision_free_data_up_downsampled['action'].value_counts())

4.0    1567
1.0    1500
3.0    1424
2.0     611
0.0     506
Name: action, dtype: int64


In [59]:
# Create a binary target variable
collision_free_data_up_downsampled['binary_target'] = collision_free_data_up_downsampled['action'].apply(lambda x: 1 if x in [1,3, 4] else 0)

# Split the data into features and binary target
X_binary = collision_free_data_up_downsampled.drop(columns=['action', 'binary_target'])
y_binary = collision_free_data_up_downsampled['binary_target']

# Split the dataset into train, test, and eval sets
X_train_binary, X_temp_binary, y_train_binary, y_temp_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
X_test_binary, X_eval_binary, y_test_binary, y_eval_binary = train_test_split(X_temp_binary, y_temp_binary, test_size=0.5, random_state=42)

# Train the binary classifier
binary_rf_model = RandomForestClassifier(random_state=42)
binary_rf_model.fit(X_train_binary, y_train_binary)

# Evaluate the binary classifier
y_pred_binary_test = binary_rf_model.predict(X_test_binary)
accuracy_binary_test = accuracy_score(y_test_binary, y_pred_binary_test)
print(f"Binary Classifier Accuracy on Test Data: {accuracy_binary_test * 100:.2f}%")
# Calculate precision, recall, and F1-score
precision_test = precision_score(y_test_binary, y_pred_binary_test)
recall_test = recall_score(y_test_binary, y_pred_binary_test)
f1_test = f1_score(y_test_binary, y_pred_binary_test)

# Print metrics
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_binary_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_binary_test))

# Save the binary classifier
joblib.dump(binary_rf_model, 'models_try/binary_rf_model_collision_free_up_down.pkl')
#print(data[5:10])

Binary Classifier Accuracy on Test Data: 93.40%
Precision on Test Data: 93.99%
Recall on Test Data: 97.99%
F1-Score on Test Data: 95.95%

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.75      0.82       114
           1       0.94      0.98      0.96       447

    accuracy                           0.93       561
   macro avg       0.92      0.87      0.89       561
weighted avg       0.93      0.93      0.93       561


Confusion Matrix:
[[ 86  28]
 [  9 438]]


['models_try/binary_rf_model_collision_free_up_down.pkl']

xgb

In [83]:

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Train the model
xgb_model.fit(X_train_binary, y_train_binary)

# Make predictions on the test set
y_pred_test = xgb_model.predict(X_test_binary)

# Evaluate the model
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_test = precision_score(y_test_binary, y_pred_test)
recall_test = recall_score(y_test_binary, y_pred_test)
f1_test = f1_score(y_test_binary, y_pred_test)

print(f"XGBoost Accuracy on Test Data: {accuracy_test * 100:.2f}%")
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_test))

# Save the trained XGBoost model
joblib.dump(xgb_model, 'models_try/xgb_model.pkl')

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy on Test Data: 87.86%
Precision on Test Data: 91.21%
Recall on Test Data: 95.25%
F1-Score on Test Data: 93.18%

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.38      0.44        90
           1       0.91      0.95      0.93       610

    accuracy                           0.88       700
   macro avg       0.73      0.67      0.69       700
weighted avg       0.86      0.88      0.87       700


Confusion Matrix:
[[ 34  56]
 [ 29 581]]


['models_try/xgb_model.pkl']

svm

In [54]:
# Initialize the SVM classifier
svm_model = SVC(kernel='rbf', random_state=42)  # Using radial basis function kernel

# Train the model
svm_model.fit(X_train_binary, y_train_binary)

# Make predictions on the test set
y_pred_test = svm_model.predict(X_test_binary)

# Evaluate the model
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_test = precision_score(y_test_binary, y_pred_test)
recall_test = recall_score(y_test_binary, y_pred_test)
f1_test = f1_score(y_test_binary, y_pred_test)

print(f"svm Accuracy on Test Data: {accuracy_test * 100:.2f}%")
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_test))

# Save the trained XGBoost model
joblib.dump(xgb_model, 'models_try/xgb_model.pkl')

svm Accuracy on Test Data: 63.16%
Precision on Test Data: 58.75%
Recall on Test Data: 95.92%
F1-Score on Test Data: 72.87%

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.28      0.43        92
           1       0.59      0.96      0.73        98

    accuracy                           0.63       190
   macro avg       0.73      0.62      0.58       190
weighted avg       0.72      0.63      0.58       190


Confusion Matrix:
[[26 66]
 [ 4 94]]


['models_try/xgb_model.pkl']

class  weights

In [14]:
##using inverse frequency method

class_weights = {
    1: 1,     # Majority class 1
    4: 1,     # Majority class 4
    0: 3,     # Minority class 0
    2: 2.5,   # Minority class 2
    3: 2      # Minority class 3
}


class_weights_binary = {
    0: 17,     # Minority class 
    1: 1,     # Majority class 
}

In [60]:
# Create binary target variable
collision_free_data_upsampled['binary_target'] = collision_free_data_upsampled['action'].apply(lambda x: 1 if x in [1,3, 4] else 0)

# Split the data into features and target
X_binary = collision_free_data_upsampled.drop(columns=['action', 'binary_target'])
y_binary = collision_free_data_upsampled['binary_target']

# Split the dataset into train, test, and eval sets
X_train_binary, X_temp_binary, y_train_binary, y_temp_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
X_test_binary, X_eval_binary, y_test_binary, y_eval_binary = train_test_split(X_temp_binary, y_temp_binary, test_size=0.5, random_state=42)

# Train the binary classifier with class weights
binary_rf_model = RandomForestClassifier(class_weight=class_weights_binary, random_state=42)
binary_rf_model.fit(X_train_binary, y_train_binary)

# Evaluate the binary classifier
y_pred_binary_test = binary_rf_model.predict(X_test_binary)
accuracy_binary_test = accuracy_score(y_test_binary, y_pred_binary_test)
print(f"Binary Classifier Accuracy on Test Data: {accuracy_binary_test * 100:.2f}%")
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_binary_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_binary_test))

# Save the binary classifier
joblib.dump(binary_rf_model, 'models_try/binary_rf_model_weights.pkl')

Binary Classifier Accuracy on Test Data: 93.47%
Precision on Test Data: 93.99%
Recall on Test Data: 97.99%
F1-Score on Test Data: 95.95%

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.71      0.80       119
           1       0.94      0.98      0.96       539

    accuracy                           0.93       658
   macro avg       0.92      0.85      0.88       658
weighted avg       0.93      0.93      0.93       658


Confusion Matrix:
[[ 85  34]
 [  9 530]]


['models_try/binary_rf_model_weights.pkl']

minor action classifier

In [62]:
# Filter the data for classes 0, 2,
minor_data = collision_free_data_upsampled[collision_free_data_upsampled['binary_target'] == 0]
#print(filtered_data[0:5])

# Split the dataset into features and target
X = minor_data.drop(columns=['action', 'binary_target'])
y = minor_data['action']  # Multi-class target

# Split into train, test, and eval sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_eval, y_test, y_eval = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a single Random Forest classifier
rf_model_minor = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_minor.fit(X_train, y_train)

# Make predictions
y_pred_test = rf_model_minor.predict(X_test)
y_pred_eval = rf_model_minor.predict(X_eval)

# Evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Evaluation Accuracy:", accuracy_score(y_eval, y_pred_eval))
print("\nClassification Report on Test Data:\n", classification_report(y_test, y_pred_test))

# Save the binary classifier
joblib.dump(rf_model_minor, 'models_try/minor_rf_model_upsampled.pkl')


Test Accuracy: 0.9821428571428571
Evaluation Accuracy: 0.9910714285714286

Classification Report on Test Data:
               precision    recall  f1-score   support

         0.0       0.96      1.00      0.98        55
         2.0       1.00      0.96      0.98        57

    accuracy                           0.98       112
   macro avg       0.98      0.98      0.98       112
weighted avg       0.98      0.98      0.98       112



['models_try/minor_rf_model_upsampled.pkl']

major action classifier

In [64]:
# Filter the data for classes 1 3and 4
major_data_down_upsampled = collision_free_data_up_downsampled[collision_free_data_up_downsampled['binary_target'] == 1]
#print(filtered_data[0:5])

# Split the dataset into features and target
X = major_data_down_upsampled.drop(columns=['action', 'binary_target'])
y = major_data_down_upsampled['action']  # Multi-class target

# Split into train, test, and eval sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_eval, y_test, y_eval = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a single Random Forest classifier
rf_model_major = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_major.fit(X_train, y_train)

# Make predictions
y_pred_test = rf_model_major.predict(X_test)
y_pred_eval = rf_model_major.predict(X_eval)

# Evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Evaluation Accuracy:", accuracy_score(y_eval, y_pred_eval))
print("\nClassification Report on Test Data:\n", classification_report(y_test, y_pred_test))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_test))

# Save the binary classifier
joblib.dump(rf_model_major, 'models_try/major_rf_model_down_upsampled.pkl')


Test Accuracy: 0.7104677060133631
Evaluation Accuracy: 0.76

Classification Report on Test Data:
               precision    recall  f1-score   support

         1.0       0.65      0.63      0.64       156
         3.0       0.76      0.70      0.73       147
         4.0       0.73      0.81      0.77       146

    accuracy                           0.71       449
   macro avg       0.71      0.71      0.71       449
weighted avg       0.71      0.71      0.71       449


Confusion Matrix:
[[ 98  23  35]
 [ 35 103   9]
 [ 18  10 118]]


['models_try/major_rf_model_down_upsampled.pkl']