In [29]:
import argparse
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
import numpy as np
import xgboost as xgb
from sklearn.svm import SVC

In [130]:
def merge_and_sort_csv(file1, file2, output_file):
    # Load the CSV files into pandas DataFrames
    df1 = pd.read_csv(file1)
    df2 = pd.read_csv(file2)
    #df3 = pd.read_csv(file3)

    # Combine the data from both DataFrames
    combined_df = pd.concat([df1, df2], ignore_index=True)

    # Sort the combined data by the 'action' column (ascending order)
    sorted_df = combined_df.sort_values(by='action', ascending=True)

    # Save the sorted data to a new CSV file
    sorted_df.to_csv(output_file, index=False)

    print(f"Data merged and sorted successfully. Saved to {output_file}")

# Example usage
file1 = 'datasets_try/processed_features_ml_cl.csv'  # Path to your first CSV file
file2 = 'datasets_try/processed_features_el.csv'  # Path to your second CSV file
#file3 = 'datasets_try/processed_features_5k.csv'
# file4 = 'datasets_try/processed_features_el.csv' 
output_file = 'datasets_try/merged_sorted_data.csv'  # Output CSV file

merge_and_sort_csv(file1, file2, output_file)

##add more datapoints 


Data merged and sorted successfully. Saved to datasets_try/merged_sorted_data.csv


In [131]:
# Load the dataset
data = pd.read_csv('datasets_try/merged_sorted_data.csv')

# Check the distribution of the 'action' column
action_distribution = data['action'].value_counts()
print(action_distribution)

1.0    2388
4.0    1528
3.0    1342
2.0     244
0.0     167
Name: action, dtype: int64


UPSAMPLE MINORITY

Normal


In [126]:
# Create a binary target variable
data['binary_target'] = data['action'].apply(lambda x: 1 if x in [1,3, 4] else 0)

# Split the data into features and binary target
X_binary = data.drop(columns=['action', 'binary_target'])
y_binary = data['binary_target']

# Split the dataset into train, test, and eval sets
X_train_binary, X_temp_binary, y_train_binary, y_temp_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
X_test_binary, X_eval_binary, y_test_binary, y_eval_binary = train_test_split(X_temp_binary, y_temp_binary, test_size=0.5, random_state=42)

# Train the binary classifier
binary_rf_model = RandomForestClassifier(random_state=42)
binary_rf_model.fit(X_train_binary, y_train_binary)

# Evaluate the binary classifier
y_pred_binary_test = binary_rf_model.predict(X_test_binary)
accuracy_binary_test = accuracy_score(y_test_binary, y_pred_binary_test)
print(f"Binary Classifier Accuracy on Test Data: {accuracy_binary_test * 100:.2f}%")
# Calculate precision, recall, and F1-score
precision_test = precision_score(y_test_binary, y_pred_binary_test)
recall_test = recall_score(y_test_binary, y_pred_binary_test)
f1_test = f1_score(y_test_binary, y_pred_binary_test)

# Print metrics
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_binary_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_binary_test))

# Save the binary classifier
joblib.dump(binary_rf_model, 'models_try/binary_rf_model.pkl')
#print(data[5:10])

Binary Classifier Accuracy on Test Data: 92.72%
Precision on Test Data: 93.83%
Recall on Test Data: 98.70%
F1-Score on Test Data: 96.20%

Classification Report:
              precision    recall  f1-score   support

           0       0.29      0.07      0.12        27
           1       0.94      0.99      0.96       385

    accuracy                           0.93       412
   macro avg       0.61      0.53      0.54       412
weighted avg       0.90      0.93      0.91       412


Confusion Matrix:
[[  2  25]
 [  5 380]]


['models_try/binary_rf_model.pkl']

downsampling

In [124]:
data['binary_target'] = data['action'].apply(lambda x: 1 if x in [1, 3,4] else 0)

# Separate majority and minority classes
majority_class = data[data['binary_target'] == 1]
minority_class = data[data['binary_target'] == 0]

# Downsample majority class
majority_downsampled = resample(majority_class, 
                                replace=True,    # Sample without replacement
                                n_samples=3000,  # Match minority class size
                                random_state=42)
balanced_data = pd.concat([majority_downsampled, minority_class])

print(balanced_data['action'].value_counts())

2.0    182
0.0    136
1.0    135
4.0     99
3.0     84
Name: action, dtype: int64


In [125]:
# Create a binary target variable
balanced_data['binary_target'] = balanced_data['action'].apply(lambda x: 1 if x in [1,3, 4] else 0)

# Split the data into features and binary target
X_binary = balanced_data.drop(columns=['action', 'binary_target'])
y_binary = balanced_data['binary_target']

# Split the dataset into train, test, and eval sets
X_train_binary, X_temp_binary, y_train_binary, y_temp_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
X_test_binary, X_eval_binary, y_test_binary, y_eval_binary = train_test_split(X_temp_binary, y_temp_binary, test_size=0.5, random_state=42)

# Train the binary classifier
binary_rf_model = RandomForestClassifier(random_state=42)
binary_rf_model.fit(X_train_binary, y_train_binary)

# Evaluate the binary classifier
y_pred_binary_test = binary_rf_model.predict(X_test_binary)
accuracy_binary_test = accuracy_score(y_test_binary, y_pred_binary_test)
print(f"Binary Classifier Accuracy on Test Data: {accuracy_binary_test * 100:.2f}%")
# Calculate precision, recall, and F1-score
precision_test = precision_score(y_test_binary, y_pred_binary_test)
recall_test = recall_score(y_test_binary, y_pred_binary_test)
f1_test = f1_score(y_test_binary, y_pred_binary_test)

# Print metrics
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_binary_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_binary_test))

# Save the binary classifier
joblib.dump(binary_rf_model, 'models_try/binary_rf_model_down.pkl')
#print(data[5:10])

Binary Classifier Accuracy on Test Data: 79.69%
Precision on Test Data: 81.25%
Recall on Test Data: 78.79%
F1-Score on Test Data: 80.00%

Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.81      0.79        31
           1       0.81      0.79      0.80        33

    accuracy                           0.80        64
   macro avg       0.80      0.80      0.80        64
weighted avg       0.80      0.80      0.80        64


Confusion Matrix:
[[25  6]
 [ 7 26]]


['models_try/binary_rf_model_down.pkl']

xgb

In [83]:

# Initialize the XGBoost model
xgb_model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Train the model
xgb_model.fit(X_train_binary, y_train_binary)

# Make predictions on the test set
y_pred_test = xgb_model.predict(X_test_binary)

# Evaluate the model
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_test = precision_score(y_test_binary, y_pred_test)
recall_test = recall_score(y_test_binary, y_pred_test)
f1_test = f1_score(y_test_binary, y_pred_test)

print(f"XGBoost Accuracy on Test Data: {accuracy_test * 100:.2f}%")
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_test))

# Save the trained XGBoost model
joblib.dump(xgb_model, 'models_try/xgb_model.pkl')

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy on Test Data: 87.86%
Precision on Test Data: 91.21%
Recall on Test Data: 95.25%
F1-Score on Test Data: 93.18%

Classification Report:
              precision    recall  f1-score   support

           0       0.54      0.38      0.44        90
           1       0.91      0.95      0.93       610

    accuracy                           0.88       700
   macro avg       0.73      0.67      0.69       700
weighted avg       0.86      0.88      0.87       700


Confusion Matrix:
[[ 34  56]
 [ 29 581]]


['models_try/xgb_model.pkl']

svm

In [54]:
# Initialize the SVM classifier
svm_model = SVC(kernel='rbf', random_state=42)  # Using radial basis function kernel

# Train the model
svm_model.fit(X_train_binary, y_train_binary)

# Make predictions on the test set
y_pred_test = svm_model.predict(X_test_binary)

# Evaluate the model
accuracy_test = accuracy_score(y_test_binary, y_pred_test)
precision_test = precision_score(y_test_binary, y_pred_test)
recall_test = recall_score(y_test_binary, y_pred_test)
f1_test = f1_score(y_test_binary, y_pred_test)

print(f"svm Accuracy on Test Data: {accuracy_test * 100:.2f}%")
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_test))

# Save the trained XGBoost model
joblib.dump(xgb_model, 'models_try/xgb_model.pkl')

svm Accuracy on Test Data: 63.16%
Precision on Test Data: 58.75%
Recall on Test Data: 95.92%
F1-Score on Test Data: 72.87%

Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.28      0.43        92
           1       0.59      0.96      0.73        98

    accuracy                           0.63       190
   macro avg       0.73      0.62      0.58       190
weighted avg       0.72      0.63      0.58       190


Confusion Matrix:
[[26 66]
 [ 4 94]]


['models_try/xgb_model.pkl']

class  weights

In [80]:
##using inverse frequency method

class_weights = {
    1: 1,     # Majority class 1
    4: 1,     # Majority class 4
    0: 3,     # Minority class 0
    2: 2.5,   # Minority class 2
    3: 2      # Minority class 3
}


class_weights_binary = {
    0: 17,     # Minority class 
    1: 1,     # Majority class 
}

In [87]:
# Create binary target variable
data['binary_target'] = data['action'].apply(lambda x: 1 if x in [1, 4] else 0)

# Split the data into features and target
X_binary = data.drop(columns=['action', 'binary_target'])
y_binary = data['binary_target']

# Split the dataset into train, test, and eval sets
X_train_binary, X_temp_binary, y_train_binary, y_temp_binary = train_test_split(X_binary, y_binary, test_size=0.2, random_state=42)
X_test_binary, X_eval_binary, y_test_binary, y_eval_binary = train_test_split(X_temp_binary, y_temp_binary, test_size=0.5, random_state=42)

# Train the binary classifier with class weights
binary_rf_model = RandomForestClassifier(class_weight=class_weights_binary, random_state=42)
binary_rf_model.fit(X_train_binary, y_train_binary)

# Evaluate the binary classifier
y_pred_binary_test = binary_rf_model.predict(X_test_binary)
accuracy_binary_test = accuracy_score(y_test_binary, y_pred_binary_test)
print(f"Binary Classifier Accuracy on Test Data: {accuracy_binary_test * 100:.2f}%")
print(f"Precision on Test Data: {precision_test * 100:.2f}%")
print(f"Recall on Test Data: {recall_test * 100:.2f}%")
print(f"F1-Score on Test Data: {f1_test * 100:.2f}%")

# Generate a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test_binary, y_pred_binary_test))

# Generate a confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test_binary, y_pred_binary_test))

# Save the binary classifier
joblib.dump(binary_rf_model, 'models_try/binary_rf_model_weights.pkl')

Binary Classifier Accuracy on Test Data: 99.50%
Precision on Test Data: 99.64%
Recall on Test Data: 99.69%
F1-Score on Test Data: 99.67%

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2519
           1       1.00      1.00      1.00      5823

    accuracy                           0.99      8342
   macro avg       0.99      0.99      0.99      8342
weighted avg       0.99      0.99      0.99      8342


Confusion Matrix:
[[2490   29]
 [  13 5810]]


['models_try/binary_rf_model_weights.pkl']

minor action classifier

In [88]:
# Filter the data for classes 0, 2, and 3
filtered_data = data[data['binary_target'] == 0]
print(filtered_data[0:5])

# Split the dataset into features and target
X = filtered_data.drop(columns=['action', 'binary_target'])
y = filtered_data['action']  # Multi-class target

# Split into train, test, and eval sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_eval, y_test, y_eval = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a single Random Forest classifier
rf_model_minor = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_minor.fit(X_train, y_train)

# Make predictions
y_pred_test = rf_model_minor.predict(X_test)
y_pred_eval = rf_model_minor.predict(X_eval)

# Evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Evaluation Accuracy:", accuracy_score(y_eval, y_pred_eval))
print("\nClassification Report on Test Data:\n", classification_report(y_test, y_pred_test))

# Save the binary classifier
joblib.dump(rf_model_minor, 'models_try/minor_rf_model.pkl')


   vehicles_in_ego_lane  vehicles_in_left_lane  vehicles_in_right_lane  \
0                   2.0                    3.0                     0.0   
1                   2.0                    1.0                     3.0   
2                   1.0                    3.0                     0.0   
3                   2.0                    2.0                     4.0   
4                   2.0                    2.0                     4.0   

   closest_in_ego_lane_dist  closest_left_lane_dist  closest_right_lane_dist  \
0                  17.83900                 8.96505                  0.00000   
1                  12.48320                34.53645                 11.20628   
2                  11.01953                 5.71289                  0.00000   
3                   9.11692                 2.27161                  8.42142   
4                  18.23709                18.24152                  5.58881   

   relative_velocity_ego_lane  relative_velocity_left_lane  \
0           

['models_try/minor_rf_model.pkl']

major action classifier

In [89]:
# Filter the data for classes 1 and 4
filtered_data = data[data['binary_target'] == 1]
print(filtered_data[0:5])

# Split the dataset into features and target
X = filtered_data.drop(columns=['action', 'binary_target'])
y = filtered_data['action']  # Multi-class target

# Split into train, test, and eval sets
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_eval, y_test, y_eval = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Train a single Random Forest classifier
rf_model_major = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_major.fit(X_train, y_train)

# Make predictions
y_pred_test = rf_model_major.predict(X_test)
y_pred_eval = rf_model_major.predict(X_eval)

# Evaluate performance
print("Test Accuracy:", accuracy_score(y_test, y_pred_test))
print("Evaluation Accuracy:", accuracy_score(y_eval, y_pred_eval))
print("\nClassification Report on Test Data:\n", classification_report(y_test, y_pred_test))

# Save the binary classifier
joblib.dump(rf_model_major, 'models_try/major_rf_model.pkl')


      vehicles_in_ego_lane  vehicles_in_left_lane  vehicles_in_right_lane  \
2674                   2.0                    2.0                     2.0   
2675                   4.0                    2.0                     1.0   
2676                   2.0                    3.0                     2.0   
2677                   1.0                    3.0                     3.0   
2678                   2.0                    2.0                     2.0   

      closest_in_ego_lane_dist  closest_left_lane_dist  \
2674                  16.73204                10.99064   
2675                  15.06501                21.62792   
2676                   7.07892                14.97510   
2677                  22.94460                 7.63052   
2678                  36.50484                 6.56768   

      closest_right_lane_dist  relative_velocity_ego_lane  \
2674                 14.66744                    1.900778   
2675                 38.01758                    0.597625   
2676 

['models_try/major_rf_model.pkl']