In [103]:
import json
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Normalizer
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

In [104]:
# Load the raw accident records; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="accident.csv")

In [105]:
# Impute missing values with each column's most frequent value.
# df.mode() can return several rows when there are ties; .iloc[0] takes the
# first mode of every column.
# Rebinding instead of `inplace=True` keeps the step explicit and chainable.
column_modes = df.mode().iloc[0]
df = df.fillna(column_modes)

In [106]:
# Replace the categorical columns below with integer codes.
# One LabelEncoder is refitted per column, so after the loop `encoder` only
# holds the class mapping of the last column in `label_cols`.
label_cols = ["Gender", "Helmet_Used", "Seatbelt_Used"]
encoder = LabelEncoder()
for col in label_cols:
    encoder.fit(df[col])
    df[col] = encoder.transform(df[col])

In [107]:
# Separate the target column ("Survived") from the feature matrix.
y = df["Survived"]
X = df.drop(columns=["Survived"])

In [108]:
# Hold out 15% of the rows for testing; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)

In [109]:
# Hyper-parameter grids for GridSearchCV, keyed by model name.
# Keys MUST match the keys of the `models` dict exactly: the tuning loop uses
# `if name in param_grids` and silently skips tuning for any model whose name
# is not found here.
param_grids = {
    # Was keyed "SVM", which never matched models["Support Vector Machine"],
    # so the SVC was never actually tuned.
    "Support Vector Machine": {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear']
    },
    "Gradient Boosting": {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5]
    },
    # NOTE(review): there is no "XGBoost" entry in `models`, so this grid is
    # currently unused; it only takes effect once such a model is added.
    "XGBoost": {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5]
    }
}

In [110]:
# Candidate classifiers, keyed by the display name used in every results file.
# The tree-based estimators are randomised (feature/sample sub-sampling), so
# they get a fixed random_state to make reported metrics reproducible; the
# value matches the seed used for train_test_split.
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "Support Vector Machine": SVC(),
    "Naive Bayes": GaussianNB()
}

In [111]:
# Preprocessing strategies to compare, keyed by display name.
# Normalizer rescales each sample (row) to unit L1/L2 norm, whereas
# MinMaxScaler and StandardScaler rescale each feature (column).
scalers = dict(
    [
        ("L1 Normalization", Normalizer(norm="l1")),
        ("L2 Normalization", Normalizer(norm="l2")),
        ("Min-Max Scaling", MinMaxScaler()),
        ("Standard Scaling", StandardScaler()),
    ]
)

In [112]:
# Compare every baseline model under each scaling strategy.
# Each scaler is fitted on the training split only and then applied to the
# test split, so no test-set statistics enter the preprocessing.
# Result layout: results_preprocessing[scaler name][model name] -> metrics.
results_preprocessing = {}

for scale_name, scaler in scalers.items():
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    results_preprocessing[scale_name] = {}

    for name, model in models.items():
        # The shared estimator instance is refitted for every scaler.
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)

        results_preprocessing[scale_name][name] = {
            "Accuracy": round(accuracy_score(y_test, y_pred), 3),
            "Precision": round(precision_score(y_test, y_pred, average='weighted'), 3),
            "Recall": round(recall_score(y_test, y_pred, average='weighted'), 3),
            "F1 Score": round(f1_score(y_test, y_pred, average='weighted'), 3)
        }

# open(..., "w") does not create missing parent directories, so make sure the
# output folder exists instead of failing after all models have been trained.
os.makedirs("./json", exist_ok=True)
with open("./json/results_preprocessing.json", "w") as f:
    json.dump(results_preprocessing, f, indent=4)

In [113]:
# Baseline: fit every model with its configured (untuned) parameters on the
# unscaled training split and score it on the held-out test split.
results_no_tuning = {}
# Filled in by the tuning cell; created here so it exists beforehand.
best_hyperparameters = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results_no_tuning[name] = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 3),
        "Precision": round(precision_score(y_test, y_pred, average='weighted'), 3),
        "Recall": round(recall_score(y_test, y_pred, average='weighted'), 3),
        "F1 Score": round(f1_score(y_test, y_pred, average='weighted'), 3)
    }

# open(..., "w") does not create missing parent directories.
os.makedirs("./json", exist_ok=True)
with open("./json/results_no_tuning.json", "w") as f:
    json.dump(results_no_tuning, f, indent=4)

In [114]:
# Tune the models that have an entry in `param_grids` with a 10-fold grid
# search; all other models are fitted as configured. Every model is then
# scored on the held-out test split and its 10-fold CV accuracy is recorded.

# Create the output folder up front so a missing directory is detected before
# the expensive grid search rather than after it.
os.makedirs("./json", exist_ok=True)

results_tuned = {}
# Reset here so this cell is idempotent and does not depend on another cell
# having created the dict.
best_hyperparameters = {}

for name, model in models.items():
    if name in param_grids:
        grid = GridSearchCV(model, param_grids[name], cv=10, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train, y_train)
        # best_estimator_ is already refitted on the whole training split.
        best_model = grid.best_estimator_
        best_params = grid.best_params_
        # best_score_ is the mean 10-fold accuracy of the winning parameters,
        # i.e. the quantity a second cross_val_score run would recompute.
        avg_accuracy = grid.best_score_
    else:
        best_model = model
        best_model.fit(X_train, y_train)
        best_params = best_model.get_params()
        # cross_val_score works on clones, so the fitted best_model is untouched.
        scores = cross_val_score(best_model, X_train, y_train, cv=10, scoring='accuracy')
        avg_accuracy = scores.mean()

    y_pred = best_model.predict(X_test)

    results_tuned[name] = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 3),
        "Precision": round(precision_score(y_test, y_pred, average='weighted'), 3),
        "Recall": round(recall_score(y_test, y_pred, average='weighted'), 3),
        "F1 Score": round(f1_score(y_test, y_pred, average='weighted'), 3),
        "Cross-Validation Accuracy (10-Fold)": round(avg_accuracy, 3)
    }
    best_hyperparameters[name] = dict(best_params)

with open("./json/results_tuned.json", "w") as f:
    json.dump(results_tuned, f, indent=4)

with open("./json/best_hyperparameters.json", "w") as f:
    # default=str is a safety net: get_params() may contain values that are
    # not JSON-serialisable; they are written as their string form.
    json.dump(best_hyperparameters, f, indent=4, default=str)

In [115]:
# Standardise the full feature matrix and project it onto the principal
# components that together explain 95% of the variance.
# NOTE(review): both transformers are fitted on ALL rows (train and test), so
# X_pca is suitable for exploration but not for held-out evaluation.
scaler = StandardScaler()
pca = PCA(n_components=0.95)

X_scaled = scaler.fit_transform(X)
X_pca = pca.fit_transform(X_scaled)

In [116]:
# Evaluate every model on PCA-reduced features using the same held-out split
# as the other experiments.
# Previously the models were fitted and scored on the same full dataset (with
# scaler and PCA also fitted on all rows), which reports training accuracy and
# is not comparable with the other result files.
results_pca = {}

# Fit the scaler and the PCA on the training split only, then apply them to
# the test split, so no test-set information leaks into the projection.
pca_scaler = StandardScaler()
X_train_std = pca_scaler.fit_transform(X_train)
X_test_std = pca_scaler.transform(X_test)

# n_components=0.95 keeps as many components as needed to explain 95% of the
# training variance.
pca_reducer = PCA(n_components=0.95)
X_train_pca = pca_reducer.fit_transform(X_train_std)
X_test_pca = pca_reducer.transform(X_test_std)

for model_name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred = model.predict(X_test_pca)

    # NOTE(review): this experiment uses 'macro' averaging while the other
    # result files use 'weighted' - confirm which one is intended.
    results_pca[model_name] = {
        "Accuracy": round(accuracy_score(y_test, y_pred), 3),
        "Precision": round(precision_score(y_test, y_pred, average='macro'), 3),
        "Recall": round(recall_score(y_test, y_pred, average='macro'), 3),
        "F1 Score": round(f1_score(y_test, y_pred, average='macro'), 3)
    }

# open(..., "w") does not create missing parent directories.
os.makedirs("./json", exist_ok=True)
with open("./json/results_pca.json", "w") as json_file:
    json.dump(results_pca, json_file, indent=4)