In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

In [2]:
# Load dataset
df = pd.read_csv(r"C:/Users/twool/Downloads/intersection_analysis_2D.csv") 

# Print dataset head for debugging
print("Dataset preview:")
print(df.head())

# Convert is_point columns from TRUE/FALSE to 1/0
df.replace({"TRUE": 1, "FALSE": 0, True: 1, False: 0}, inplace=True)

# Undersample label 0, keep all label 1
target_col = "error"
label_0 = df[df[target_col] == 0]
label_1 = df[df[target_col] == 1]

# Randomly sample 10,000 from label 0
label_0_sampled = label_0.sample(n=10000, random_state=42)
df_balanced = pd.concat([label_0_sampled, label_1], ignore_index=True)

# Shuffle the balanced dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Identify features and target
ignore_cols = ["geometry"]
features = df_balanced.drop(columns=ignore_cols + [target_col])
target = df_balanced[target_col]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Print dataset statistics
print("\nTraining set:")
print(f"Total data points: {len(y_train)}")
print(y_train.value_counts(normalize=True) * 100)

print("\nTest set:")
print(f"Total data points: {len(y_test)}")
print(y_test.value_counts(normalize=True) * 100)

# Standardize features for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Dataset preview:
                          geometry  num_of_lines  avg_angle_of_intersection  \
0    POINT (30.1132479 81.9040127)             3                   1.569418   
1  POINT (30.11393889 81.90421187)             3                   0.000869   
2  POINT (30.13443475 81.92446576)             3                   0.000869   
3  POINT (30.12866542 81.91527886)             3                   0.001507   
4  POINT (30.10870486 81.90333917)             4                   2.092366   

   num_of_involved_line_intersections  \
0                                  20   
1                                  16   
2                                  16   
3                                  20   
4                                   7   

   vicinity_of_involved_line_intersections  min_distance_to_endpoint  \
0                                 0.006751                 30.167739   
1                                 0.005855                 30.168430   
2                                 0.025495   

In [3]:
# Train models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=20),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM (Linear Kernel)": SVC(kernel='linear', random_state=42),
    "SVM (RBF Kernel)": SVC(kernel='rbf', random_state=42),
    "SVM (Polynomial Kernel)": SVC(kernel='poly', random_state=42),
    "SVM (Sigmoid Kernel)": SVC(kernel='sigmoid', random_state=42)
}

for name, model in models.items():
    print(f"\n{name} Performance:")
    
    if "SVM" in name:
        model.fit(X_train_scaled, y_train)
        y_train_pred = model.predict(X_train_scaled)
        y_test_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
    
    for dataset, y_true, y_pred in [("Train", y_train, y_train_pred), ("Test", y_test, y_test_pred)]:
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        percent_errors_identified = (tp / (tp + fn)) * 100 if (tp + fn) > 0 else 0
        
        print(f"\n{name} - {dataset} set:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"False Positives: {fp}")
        print(f"False Negatives: {fn}")
        print(f"Percentage of Errors Correctly Identified: {percent_errors_identified:.2f}%")
        print(classification_report(y_true, y_pred))
    
    # Cross-validation
    scores = cross_val_score(model, X_train_scaled if "SVM" in name else X_train, y_train, cv=5, scoring='accuracy')
    print(f"Cross-validation accuracy scores: {scores}")
    print(f"Mean cross-validation accuracy: {scores.mean():.4f}")





Decision Tree Performance:

Decision Tree - Train set:
Accuracy: 1.0000
F1-score: 1.0000
False Positives: 0
False Negatives: 0
Percentage of Errors Correctly Identified: 100.00%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8006
           1       1.00      1.00      1.00       328

    accuracy                           1.00      8334
   macro avg       1.00      1.00      1.00      8334
weighted avg       1.00      1.00      1.00      8334


Decision Tree - Test set:
Accuracy: 0.9765
F1-score: 0.7263
False Positives: 24
False Negatives: 25
Percentage of Errors Correctly Identified: 72.22%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1994
           1       0.73      0.72      0.73        90

    accuracy                           0.98      2084
   macro avg       0.86      0.86      0.86      2084
weighted avg       0.98      0.98      0.98      2084

Cross-validation

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE  # For augmenting the minority class

# Load dataset
df = pd.read_csv(r"C:/Users/twool/Downloads/intersection_analysis_2D.csv") 

# Print dataset head for debugging
print("Dataset preview:")
print(df.head())

# Convert is_point columns from TRUE/FALSE to 1/0
df.replace({"TRUE": 1, "FALSE": 0, True: 1, False: 0}, inplace=True)

# Undersample label 0, keep all label 1
target_col = "error"
label_0 = df[df[target_col] == 0]
label_1 = df[df[target_col] == 1]

# Randomly sample 10,000 from label 0
label_0_sampled = label_0.sample(n=10000, random_state=42)
df_balanced = pd.concat([label_0_sampled, label_1], ignore_index=True)

# Shuffle the balanced dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Identify features and target
ignore_cols = ["geometry"]
features = df_balanced.drop(columns=ignore_cols + [target_col])
target = df_balanced[target_col]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Print dataset statistics after applying SMOTE
print("\nTraining set after SMOTE:")
print(f"Total data points: {len(y_train_res)}")
print(y_train_res.value_counts(normalize=True) * 100)

print("\nTest set (original):")
print(f"Total data points: {len(y_test)}")
print(y_test.value_counts(normalize=True) * 100)

# Standardize features for SVM and Decision Tree
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Train Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train_res)
y_train_rf_pred = rf_model.predict(X_train_scaled)
y_test_rf_pred = rf_model.predict(X_test_scaled)

# Evaluate Random Forest
print("\nRandom Forest Performance:")
for dataset, y_true, y_pred in [("Train", y_train_res, y_train_rf_pred), ("Test", y_test, y_test_rf_pred)]:
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    percent_errors_identified = (tp / (tp + fn)) * 100 if (tp + fn) > 0 else 0

    print(f"\nRandom Forest - {dataset} set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"Percentage of Errors Correctly Identified: {percent_errors_identified:.2f}%")
    print(classification_report(y_true, y_pred))

# Train Decision Tree
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train_res)
y_train_dt_pred = dt_model.predict(X_train_scaled)
y_test_dt_pred = dt_model.predict(X_test_scaled)

# Evaluate Decision Tree
print("\nDecision Tree Performance:")
for dataset, y_true, y_pred in [("Train", y_train_res, y_train_dt_pred), ("Test", y_test, y_test_dt_pred)]:
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    percent_errors_identified = (tp / (tp + fn)) * 100 if (tp + fn) > 0 else 0

    print(f"\nDecision Tree - {dataset} set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"Percentage of Errors Correctly Identified: {percent_errors_identified:.2f}%")
    print(classification_report(y_true, y_pred))




Dataset preview:
                          geometry  num_of_lines  avg_angle_of_intersection  \
0    POINT (30.1132479 81.9040127)             3                   1.569418   
1  POINT (30.11393889 81.90421187)             3                   0.000869   
2  POINT (30.13443475 81.92446576)             3                   0.000869   
3  POINT (30.12866542 81.91527886)             3                   0.001507   
4  POINT (30.10870486 81.90333917)             4                   2.092366   

   num_of_involved_line_intersections  \
0                                  20   
1                                  16   
2                                  16   
3                                  20   
4                                   7   

   vicinity_of_involved_line_intersections  min_distance_to_endpoint  \
0                                 0.006751                 30.167739   
1                                 0.005855                 30.168430   
2                                 0.025495   

In [4]:
# Load dataset
df = pd.read_csv(r"C:/Users/twool/Downloads/intersection_analysis_2D.csv") 

# Print dataset head for debugging
print("Dataset preview:")
print(df.head())

# Convert is_point columns from TRUE/FALSE to 1/0
df.replace({"TRUE": 1, "FALSE": 0, True: 1, False: 0}, inplace=True)

# Undersample label 0, keep all label 1
target_col = "error"
label_0 = df[df[target_col] == 0]
label_1 = df[df[target_col] == 1]

# Randomly sample 10,000 from label 0
label_0_sampled = label_0.sample(n=10000, random_state=42)
df_balanced = pd.concat([label_0_sampled, label_1], ignore_index=True)

# Shuffle the balanced dataset
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Identify features and target
ignore_cols = ["geometry"]
features = df_balanced.drop(columns=ignore_cols + [target_col])
target = df_balanced[target_col]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Print dataset statistics
print("\nTraining set:")
print(f"Total data points: {len(y_train)}")
print(y_train.value_counts(normalize=True) * 100)

print("\nTest set:")
print(f"Total data points: {len(y_test)}")
print(y_test.value_counts(normalize=True) * 100)

# Standardize features for SVM
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Dataset preview:
                          geometry  num_of_lines  avg_angle_of_intersection  \
0    POINT (30.1132479 81.9040127)             3                   1.569418   
1  POINT (30.11393889 81.90421187)             3                   0.000869   
2  POINT (30.13443475 81.92446576)             3                   0.000869   
3  POINT (30.12866542 81.91527886)             3                   0.001507   
4  POINT (30.10870486 81.90333917)             4                   2.092366   

   num_of_involved_line_intersections  \
0                                  20   
1                                  16   
2                                  16   
3                                  20   
4                                   7   

   vicinity_of_involved_line_intersections  min_distance_to_endpoint  \
0                                 0.006751                 30.167739   
1                                 0.005855                 30.168430   
2                                 0.025495   

In [6]:
# Core packages
import numpy as np
import pandas as pd

# Model evaluation
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score

# Classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

# Boosting libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Define models
models = {
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "SVM (Polynomial Kernel)": SVC(kernel='poly', degree=3),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=100, random_state=42),
    "Gaussian Naive Bayes": GaussianNB()
}

# Train and evaluate models
for name, model in models.items():
    print(f"\n{name} Performance:")
    
    # For models requiring scaling (SVM, XGBoost, GradientBoosting, LightGBM)
    if name in ["SVM (Polynomial Kernel)", "XGBoost", "Gradient Boosting", "LightGBM"]:
        model.fit(X_train_scaled, y_train)
        y_train_pred = model.predict(X_train_scaled)
        y_test_pred = model.predict(X_test_scaled)
    else:
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
    
    for dataset, y_true, y_pred in [("Train", y_train, y_train_pred), ("Test", y_test, y_test_pred)]:
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        percent_errors_identified = (tp / (tp + fn)) * 100 if (tp + fn) > 0 else 0
        
        print(f"\n{name} - {dataset} set:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"False Positives: {fp}")
        print(f"False Negatives: {fn}")
        print(f"Percentage of Errors Correctly Identified: {percent_errors_identified:.2f}%")
        print(classification_report(y_true, y_pred))
        
    # Cross-validation
    cv_scores = cross_val_score(model, X_train_scaled if name in ["SVM (Polynomial Kernel)", "XGBoost", "Gradient Boosting", "LightGBM"] else X_train, y_train, cv=5, scoring='accuracy')
    print(f"\n{name} - Cross-validation Accuracy (5-fold): {cv_scores.mean():.4f} Â± {cv_scores.std():.4f}")


KNN Performance:

KNN - Train set:
Accuracy: 0.9855
F1-score: 0.8112
False Positives: 53
False Negatives: 68
Percentage of Errors Correctly Identified: 79.27%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      8006
           1       0.83      0.79      0.81       328

    accuracy                           0.99      8334
   macro avg       0.91      0.89      0.90      8334
weighted avg       0.99      0.99      0.99      8334


KNN - Test set:
Accuracy: 0.9789
F1-score: 0.7500
False Positives: 20
False Negatives: 24
Percentage of Errors Correctly Identified: 73.33%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1994
           1       0.77      0.73      0.75        90

    accuracy                           0.98      2084
   macro avg       0.88      0.86      0.87      2084
weighted avg       0.98      0.98      0.98      2084


KNN - Cross-validation Accuracy (5-fold): 0.

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier, StackingClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


# Standardize features for SVM and Logistic Regression
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define base models
models = {
    "Logistic Regression": LogisticRegression(random_state=42),
    "SVM (Linear)": SVC(kernel='linear', random_state=42),
    "SVM (RBF)": SVC(kernel='rbf', random_state=42),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(random_state=42),
}

# Ensemble Methods
ensemble_models = {
    "Voting Classifier": VotingClassifier(estimators=[('lr', models["Logistic Regression"]),
                                                      ('rf', models["Random Forest"]),
                                                      ('svm', models["SVM (Linear)"])], voting='hard'),
    "Stacking Classifier": StackingClassifier(estimators=[('lr', models["Logistic Regression"]),
                                                          ('rf', models["Random Forest"]),
                                                          ('svm', models["SVM (Linear)"])],
                                              final_estimator=LogisticRegression()),
    "AdaBoost with Logistic Regression": AdaBoostClassifier(models["Logistic Regression"], random_state=42),
    "Bagging with Logistic Regression": BaggingClassifier(models["Logistic Regression"], random_state=42),
}

# Function to compute metrics
def compute_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    percent_errors_identified = (tp / (tp + fn)) * 100 if (tp + fn) > 0 else 0
    return accuracy, f1, fp, fn, percent_errors_identified

# Train and evaluate models
for name, model in {**models, **ensemble_models}.items():
    print(f"\n{name} Performance:")

    # Cross-validation scores (using 5-fold CV)
    cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='accuracy')
    print(f"Cross-validation accuracy (mean): {cv_scores.mean():.4f}")
    
    if isinstance(model, LogisticRegression) or isinstance(model, KNeighborsClassifier) or isinstance(model, SVC):
        # Base models
        model.fit(X_train_scaled, y_train)
        y_train_pred = model.predict(X_train_scaled)
        y_test_pred = model.predict(X_test_scaled)
    else:
        # Ensemble methods
        model.fit(X_train_scaled, y_train)
        y_train_pred = model.predict(X_train_scaled)
        y_test_pred = model.predict(X_test_scaled)
    
    # For train and test metrics
    for dataset, y_true, y_pred in [("Train", y_train, y_train_pred), ("Test", y_test, y_test_pred)]:
        accuracy, f1, fp, fn, percent_errors_identified = compute_metrics(y_true, y_pred)
        
        print(f"\n{name} - {dataset} set:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"False Positives: {fp}")
        print(f"False Negatives: {fn}")
        print(f"Percentage of Errors Correctly Identified: {percent_errors_identified:.2f}%")
        print(classification_report(y_true, y_pred))



Logistic Regression Performance:
Cross-validation accuracy (mean): 0.9638

Logistic Regression - Train set:
Accuracy: 0.9634
F1-score: 0.2615
False Positives: 31
False Negatives: 274
Percentage of Errors Correctly Identified: 16.46%
              precision    recall  f1-score   support

           0       0.97      1.00      0.98      8006
           1       0.64      0.16      0.26       328

    accuracy                           0.96      8334
   macro avg       0.80      0.58      0.62      8334
weighted avg       0.95      0.96      0.95      8334


Logistic Regression - Test set:
Accuracy: 0.9602
F1-score: 0.2906
False Positives: 10
False Negatives: 73
Percentage of Errors Correctly Identified: 18.89%
              precision    recall  f1-score   support

           0       0.96      0.99      0.98      1994
           1       0.63      0.19      0.29        90

    accuracy                           0.96      2084
   macro avg       0.80      0.59      0.64      2084
weighted a

In [18]:
from sklearn.neural_network import MLPClassifier

# Define MLP model
mlp = MLPClassifier(hidden_layer_sizes=(200,100,50,100,200), max_iter=1000, random_state=42)

# Train the model
mlp.fit(X_train_scaled, y_train)

# Predict on train and test data
y_train_pred_mlp = mlp.predict(X_train_scaled)
y_test_pred_mlp = mlp.predict(X_test_scaled)

# Function to compute metrics
def compute_metrics(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    percent_errors_identified = (tp / (tp + fn)) * 100 if (tp + fn) > 0 else 0
    return accuracy, f1, fp, fn, percent_errors_identified

# Print MLP Performance
print("\nMLP Classifier Performance:")

for dataset, y_true, y_pred in [("Train", y_train, y_train_pred_mlp), ("Test", y_test, y_test_pred_mlp)]:
    accuracy, f1, fp, fn, percent_errors_identified = compute_metrics(y_true, y_pred)
    
    print(f"\nMLP - {dataset} set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"F1-score: {f1:.4f}")
    print(f"False Positives: {fp}")
    print(f"False Negatives: {fn}")
    print(f"Percentage of Errors Correctly Identified: {percent_errors_identified:.2f}%")
    print(classification_report(y_true, y_pred))



MLP Classifier Performance:

MLP - Train set:
Accuracy: 0.9939
F1-score: 0.9202
False Positives: 17
False Negatives: 34
Percentage of Errors Correctly Identified: 89.63%
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8006
           1       0.95      0.90      0.92       328

    accuracy                           0.99      8334
   macro avg       0.97      0.95      0.96      8334
weighted avg       0.99      0.99      0.99      8334


MLP - Test set:
Accuracy: 0.9746
F1-score: 0.6936
False Positives: 23
False Negatives: 30
Percentage of Errors Correctly Identified: 66.67%
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1994
           1       0.72      0.67      0.69        90

    accuracy                           0.97      2084
   macro avg       0.85      0.83      0.84      2084
weighted avg       0.97      0.97      0.97      2084



In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier

# Load dataset
df = pd.read_csv(r"C:/Users/twool/Downloads/intersection_analysis_2D.csv") 

print("Dataset preview:")
print(df.head())

# Convert TRUE/FALSE to 1/0
df.replace({"TRUE": 1, "FALSE": 0, True: 1, False: 0}, inplace=True)

# Undersample label 0, keep all label 1
target_col = "error"
label_0 = df[df[target_col] == 0]
label_1 = df[df[target_col] == 1]
label_0_sampled = label_0.sample(n=10000, random_state=42)
df_balanced = pd.concat([label_0_sampled, label_1], ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split features and target
ignore_cols = ["geometry"]
features = df_balanced.drop(columns=ignore_cols + [target_col])
target = df_balanced[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# SMOTE to balance training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nTraining set after SMOTE:")
print(f"Total data points: {len(y_train_res)}")
print(y_train_res.value_counts(normalize=True) * 100)

print("\nTest set (original):")
print(f"Total data points: {len(y_test)}")
print(y_test.value_counts(normalize=True) * 100)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Define models
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "MLP (Neural Network)": MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "LightGBM": LGBMClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42)
}

# Train and evaluate
for name, model in models.items():
    print(f"\n{name} Performance:")
    
    # Use scaled features for all models here since SMOTE + scaling is applied globally
    model.fit(X_train_scaled, y_train_res)
    y_train_pred = model.predict(X_train_scaled)
    y_test_pred = model.predict(X_test_scaled)

    for dataset, y_true, y_pred in [("Train", y_train_res, y_train_pred), ("Test", y_test, y_test_pred)]:
        accuracy = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        percent_errors_identified = (tp / (tp + fn)) * 100 if (tp + fn) > 0 else 0

        print(f"\n{name} - {dataset} set:")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"F1-score: {f1:.4f}")
        print(f"False Positives: {fp}")
        print(f"False Negatives: {fn}")
        print(f"Percentage of Errors Correctly Identified: {percent_errors_identified:.2f}%")
        print(classification_report(y_true, y_pred))


Dataset preview:
                          geometry  num_of_lines  avg_angle_of_intersection  \
0    POINT (30.1132479 81.9040127)             3                   1.569418   
1  POINT (30.11393889 81.90421187)             3                   0.000869   
2  POINT (30.13443475 81.92446576)             3                   0.000869   
3  POINT (30.12866542 81.91527886)             3                   0.001507   
4  POINT (30.10870486 81.90333917)             4                   2.092366   

   num_of_involved_line_intersections  \
0                                  20   
1                                  16   
2                                  16   
3                                  20   
4                                   7   

   vicinity_of_involved_line_intersections  min_distance_to_endpoint  \
0                                 0.006751                 30.167739   
1                                 0.005855                 30.168430   
2                                 0.025495   




MLP (Neural Network) - Train set:
Accuracy: 0.9824
F1-score: 0.9825
False Positives: 210
False Negatives: 72
Percentage of Errors Correctly Identified: 99.10%
              precision    recall  f1-score   support

           0       0.99      0.97      0.98      8006
           1       0.97      0.99      0.98      8006

    accuracy                           0.98     16012
   macro avg       0.98      0.98      0.98     16012
weighted avg       0.98      0.98      0.98     16012


MLP (Neural Network) - Test set:
Accuracy: 0.9525
F1-score: 0.5992
False Positives: 83
False Negatives: 16
Percentage of Errors Correctly Identified: 82.22%
              precision    recall  f1-score   support

           0       0.99      0.96      0.97      1994
           1       0.47      0.82      0.60        90

    accuracy                           0.95      2084
   macro avg       0.73      0.89      0.79      2084
weighted avg       0.97      0.95      0.96      2084


KNN Performance:

KNN - Tra

In [2]:
pip install torch

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

# Load dataset
df = pd.read_csv(r"C:/Users/twool/Downloads/intersection_analysis_2D.csv") 

print("Dataset preview:")
print(df.head())

# Convert TRUE/FALSE to 1/0
df.replace({"TRUE": 1, "FALSE": 0, True: 1, False: 0}, inplace=True)

# Undersample label 0, keep all label 1
target_col = "error"
label_0 = df[df[target_col] == 0]
label_1 = df[df[target_col] == 1]
label_0_sampled = label_0.sample(n=10000, random_state=42)
df_balanced = pd.concat([label_0_sampled, label_1], ignore_index=True)
df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Split features and target
ignore_cols = ["geometry"]
features = df_balanced.drop(columns=ignore_cols + [target_col])
target = df_balanced[target_col]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# SMOTE to balance training set
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nTraining set after SMOTE:")
print(f"Total data points: {len(y_train_res)}")
print(y_train_res.value_counts(normalize=True) * 100)

print("\nTest set (original):")
print(f"Total data points: {len(y_test)}")
print(y_test.value_counts(normalize=True) * 100)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# Use the same data loading process as requested
train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train_res))
train_dataset = train_dataset.batch(64).shuffle(buffer_size=1000)

test_dataset = tf.data.Dataset.from_tensor_slices((X_test_scaled, y_test))
test_dataset = test_dataset.batch(64)

# Initialize models
models_dict = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42),
    'CatBoost': CatBoostClassifier(learning_rate=0.05, depth=7, iterations=500, random_state=42, verbose=0)
}

# Define Keras-based models
def create_mlp_model(input_dim):
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_dim=input_dim),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(optimizer=optimizers.Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_cnn_model(input_dim):
    model = models.Sequential([
        layers.Reshape((input_dim, 1), input_shape=(input_dim,)),
        layers.Conv1D(64, 3, activation='relu'),
        layers.Conv1D(32, 3, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(optimizer=optimizers.Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_rnn_model(input_dim):
    model = models.Sequential([
        layers.Reshape((input_dim, 1), input_shape=(input_dim,)),
        layers.LSTM(64, return_sequences=False),
        layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(optimizer=optimizers.Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_gru_model(input_dim):
    model = models.Sequential([
        layers.Reshape((input_dim, 1), input_shape=(input_dim,)),
        layers.GRU(64, return_sequences=False),
        layers.Dense(1, activation='sigmoid')  # Binary classification
    ])
    model.compile(optimizer=optimizers.Adam(lr=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Function to evaluate models
def evaluate_model(model, X_train, y_train, X_test, y_test, model_type="sklearn"):
    if model_type == "sklearn":
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
    elif model_type == "keras":
        # Evaluate Keras models on train and test data
        y_train_pred = (model.predict(X_train) > 0.5).astype(int)
        y_test_pred = (model.predict(X_test) > 0.5).astype(int)

    # Print classification reports
    print(f"\n{model.__class__.__name__} Classification Report (Train):")
    print(classification_report(y_train, y_train_pred))
    
    print(f"\n{model.__class__.__name__} Classification Report (Test):")
    print(classification_report(y_test, y_test_pred))
    
    print(f"\n{model.__class__.__name__} Confusion Matrix (Test):")
    print(confusion_matrix(y_test, y_test_pred))

# Train and evaluate each model
for name, model in models_dict.items():
    print(f"\n{name} Performance:")
    evaluate_model(model, X_train_res, y_train_res, X_test, y_test, model_type="sklearn")

# Train and evaluate NN models
def train_keras_model(model, train_dataset, test_dataset, epochs=10):
    model.fit(train_dataset, epochs=epochs, validation_data=test_dataset)
    evaluate_model(model, X_train_res, y_train_res, X_test, y_test, model_type="keras")

# Instantiate and train Keras NN models
keras_models = [
    create_mlp_model(X_train_res.shape[1]),
    create_cnn_model(X_train_res.shape[1]),
    create_rnn_model(X_train_res.shape[1]),
    create_gru_model(X_train_res.shape[1])
]

for keras_model in keras_models:
    print(f"\n{keras_model.__class__.__name__} Performance:")
    train_keras_model(keras_model, train_dataset, test_dataset)


Dataset preview:
                          geometry  num_of_lines  avg_angle_of_intersection  \
0    POINT (30.1132479 81.9040127)             3                   1.569418   
1  POINT (30.11393889 81.90421187)             3                   0.000869   
2  POINT (30.13443475 81.92446576)             3                   0.000869   
3  POINT (30.12866542 81.91527886)             3                   0.001507   
4  POINT (30.10870486 81.90333917)             4                   2.092366   

   num_of_involved_line_intersections  \
0                                  20   
1                                  16   
2                                  16   
3                                  20   
4                                   7   

   vicinity_of_involved_line_intersections  min_distance_to_endpoint  \
0                                 0.006751                 30.167739   
1                                 0.005855                 30.168430   
2                                 0.025495   




Sequential Performance:
Epoch 1/10

  1/251 [..............................] - ETA: 3:18 - loss: 0.9319 - accuracy: 0.1719
Epoch 2/10

  1/251 [..............................] - ETA: 5s - loss: 0.2580 - accuracy: 0.9375
 55/251 [=====>........................] - ETA: 0s - loss: 0.2981 - accuracy: 0.8801
Epoch 3/10

  1/251 [..............................] - ETA: 4s - loss: 0.3969 - accuracy: 0.7656
 55/251 [=====>........................] - ETA: 0s - loss: 0.2471 - accuracy: 0.9020
Epoch 4/10

  1/251 [..............................] - ETA: 4s - loss: 0.2520 - accuracy: 0.8906
 55/251 [=====>........................] - ETA: 0s - loss: 0.2029 - accuracy: 0.9270
Epoch 5/10

  1/251 [..............................] - ETA: 4s - loss: 0.3038 - accuracy: 0.8750
 53/251 [=====>........................] - ETA: 0s - loss: 0.2091 - accuracy: 0.9195
Epoch 6/10

  1/251 [..............................] - ETA: 4s - loss: 0.2850 - accuracy: 0.8750
 54/251 [=====>........................] - ETA: 0s 

In [6]:
# ========================
# Imports
# ========================
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, recall_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

# ========================
# Load and Prepare Dataset
# ========================
df = pd.read_csv(r"C:/Users/twool/Downloads/intersection_analysis_2D.csv") 
print("Dataset preview:")
print(df.head())

df.replace({"TRUE": 1, "FALSE": 0, True: 1, False: 0}, inplace=True)

# Balance dataset by undersampling class 0
target_col = "error"
label_0 = df[df[target_col] == 0].sample(n=10000, random_state=42)
label_1 = df[df[target_col] == 1]
df_balanced = pd.concat([label_0, label_1]).sample(frac=1, random_state=42).reset_index(drop=True)

# ========================
# Feature/Target Split
# ========================
ignore_cols = ["geometry"]
features = df_balanced.drop(columns=ignore_cols + [target_col])
target = df_balanced[target_col]

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42, stratify=target)

# ========================
# Apply SMOTE
# ========================
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nTraining set after SMOTE:")
print(y_train_res.value_counts(normalize=True) * 100)

print("\nTest set:")
print(y_test.value_counts(normalize=True) * 100)

# ========================
# Standardize Features
# ========================
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train_scaled, y_train_res)).batch(64).shuffle(1000)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test_scaled, y_test)).batch(64)

# ========================
# Scikit-learn Models
# ========================
models_dict = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42),
    'LightGBM': lgb.LGBMClassifier(random_state=42),
    'XGBoost': xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'CatBoost': CatBoostClassifier(learning_rate=0.05, depth=7, iterations=500, random_state=42, verbose=0)
}

# ========================
# TensorFlow Models
# ========================
def create_mlp_model(input_dim):
    model = models.Sequential([
        layers.Dense(128, activation='relu', input_dim=input_dim),
        layers.Dense(64, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=optimizers.Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    model._name = "MLP_Model"
    return model

def create_cnn_model(input_dim):
    model = models.Sequential([
        layers.Reshape((input_dim, 1), input_shape=(input_dim,)),
        layers.Conv1D(64, 3, activation='relu'),
        layers.Conv1D(32, 3, activation='relu'),
        layers.GlobalMaxPooling1D(),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=optimizers.Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    model._name = "CNN_Model"
    return model

def create_rnn_model(input_dim):
    model = models.Sequential([
        layers.Reshape((input_dim, 1), input_shape=(input_dim,)),
        layers.LSTM(64),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=optimizers.Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    model._name = "RNN_Model"
    return model

def create_gru_model(input_dim):
    model = models.Sequential([
        layers.Reshape((input_dim, 1), input_shape=(input_dim,)),
        layers.GRU(64),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=optimizers.Adam(0.001), loss='binary_crossentropy', metrics=['accuracy'])
    model._name = "GRU_Model"
    return model


# ========================
# Evaluation Function
# ========================
def evaluate_model(model, X_train, y_train, X_test, y_test, model_type="sklearn", name=None):
    if model_type == "sklearn":
        model.fit(X_train, y_train)
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        model_name = name or model.__class__.__name__
    else:  # keras
        y_train_pred = (model.predict(X_train) > 0.5).astype(int).flatten()
        y_test_pred = (model.predict(X_test) > 0.5).astype(int).flatten()
        model_name = name or model.name

    def print_metrics(y_true, y_pred, dataset_name):
        acc = accuracy_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        cm = confusion_matrix(y_true, y_pred)
        tp = cm[1, 1]
        fn = cm[1, 0]
        percent_errors_identified = (tp / (tp + fn)) * 100 if (tp + fn) > 0 else 0

        print(f"\nðŸ“Š {model_name} - {dataset_name} Metrics")
        print(f"Accuracy: {acc:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"Percent Errors Identified: {percent_errors_identified:.2f}%")
        print("Classification Report:")
        print(classification_report(y_true, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(y_true, y_pred))

    print_metrics(y_train, y_train_pred, "Train")
    print_metrics(y_test, y_test_pred, "Test")
    
# ========================
# Train & Evaluate Scikit-learn Models
# ========================
for name, model in models_dict.items():
    print(f"\n{name} Performance:")
    evaluate_model(model, X_train_res, y_train_res, X_test, y_test)

# ========================
# Train & Evaluate TensorFlow Models
# ========================
def train_keras_model(model, train_dataset, test_dataset, input_data):
    model.fit(train_dataset, epochs=10, validation_data=test_dataset, verbose=0)
    evaluate_model(model, input_data['X_train'], input_data['y_train'], input_data['X_test'], input_data['y_test'], model_type="keras")

keras_models = [
    create_mlp_model(X_train_res.shape[1]),
    create_cnn_model(X_train_res.shape[1]),
    create_rnn_model(X_train_res.shape[1]),
    create_gru_model(X_train_res.shape[1])
]

keras_input_data = {
    'X_train': X_train_scaled,
    'y_train': y_train_res,
    'X_test': X_test_scaled,
    'y_test': y_test
}

for model in keras_models:
    print(f"\n{model.name} Performance:")
    train_keras_model(model, train_dataset, test_dataset, keras_input_data)


Dataset preview:
                          geometry  num_of_lines  avg_angle_of_intersection  \
0    POINT (30.1132479 81.9040127)             3                   1.569418   
1  POINT (30.11393889 81.90421187)             3                   0.000869   
2  POINT (30.13443475 81.92446576)             3                   0.000869   
3  POINT (30.12866542 81.91527886)             3                   0.001507   
4  POINT (30.10870486 81.90333917)             4                   2.092366   

   num_of_involved_line_intersections  \
0                                  20   
1                                  16   
2                                  16   
3                                  20   
4                                   7   

   vicinity_of_involved_line_intersections  min_distance_to_endpoint  \
0                                 0.006751                 30.167739   
1                                 0.005855                 30.168430   
2                                 0.025495   

In [8]:
from imblearn.over_sampling import ADASYN, SMOTE
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Different data augmentation techniques for tabular data
def apply_data_augmentation(X_train, y_train, method="smote"):
    if method == "smote":
        smote = SMOTE(random_state=42)
        X_res, y_res = smote.fit_resample(X_train, y_train)
    elif method == "adasyn":
        adasyn = ADASYN(random_state=42)
        X_res, y_res = adasyn.fit_resample(X_train, y_train)
    elif method == "random_undersampling":
        X_res, y_res = resample(X_train, y_train, replace=False, n_samples=len(y_train[y_train == 1]), random_state=42)
    elif method == "random_oversampling":
        X_res, y_res = resample(X_train, y_train, replace=True, n_samples=len(y_train[y_train == 0]), random_state=42)
    elif method == "gaussian_noise":
        noise = np.random.normal(0, 0.1, X_train.shape)
        X_res = X_train + noise
        y_res = y_train  # No need to alter the labels
    elif method == "feature_permutation":
        permuted_X = X_train.copy()
        for col in range(X_train.shape[1]):
            permuted_X[:, col] = np.random.permutation(permuted_X[:, col])
        X_res = permuted_X
        y_res = y_train
    else:
        raise ValueError(f"Unknown augmentation method: {method}")
    
    return X_res, y_res

# Wrapper function for experimenting with different augmentation methods
def experiment_with_augmentation(X_train, y_train, X_test, y_test, augmentation_methods):
    for method in augmentation_methods:
        print(f"\nTrying augmentation method: {method}")
        X_train_res, y_train_res = apply_data_augmentation(X_train, y_train, method)

        # Standardize the augmented data
        scaler = StandardScaler()
        X_train_res_scaled = scaler.fit_transform(X_train_res)
        X_test_scaled = scaler.transform(X_test)

        # Train and evaluate models with augmented data
        for name, model in models_dict.items():
            print(f"\n{name} Performance with {method}:")
            evaluate_model(model, X_train_res_scaled, y_train_res, X_test_scaled, y_test)

augmentation_methods = ["smote", "adasyn", "random_undersampling", "random_oversampling", "gaussian_noise"]

# Run the experiment with all augmentation methods
experiment_with_augmentation(X_train, y_train, X_test, y_test, augmentation_methods)



Trying augmentation method: smote

Random Forest Performance with smote:

ðŸ“Š RandomForestClassifier - Train Metrics
Accuracy: 1.0000
F1 Score: 1.0000
Percent Errors Identified: 100.00%
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8000
           1       1.00      1.00      1.00      8000

    accuracy                           1.00     16000
   macro avg       1.00      1.00      1.00     16000
weighted avg       1.00      1.00      1.00     16000

Confusion Matrix:
[[8000    0]
 [   0 8000]]

ðŸ“Š RandomForestClassifier - Test Metrics
Accuracy: 0.9885
F1 Score: 0.8605
Percent Errors Identified: 88.10%
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2000
           1       0.84      0.88      0.86        84

    accuracy                           0.99      2084
   macro avg       0.92      0.94      0.93      2084
weighted a