In [23]:
import json
import os

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

In [24]:
# Cross-validation function
def perform_cross_validation(model, X, y, k_folds=5, stratified=True):
    """Estimate classification metrics for ``model`` with k-fold cross-validation.

    A fresh copy of ``model`` is fitted on every fold, so the estimator passed
    in is left exactly as the caller supplied it.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``.
    X : pandas.DataFrame or numpy.ndarray
        Feature matrix; rows are selected positionally.
    y : pandas.Series or numpy.ndarray
        Labels aligned with the rows of ``X``.
    k_folds : int, default 5
        Number of folds.
    stratified : bool, default True
        Use ``StratifiedKFold`` when true, plain ``KFold`` otherwise. Both
        shuffle with ``random_state=42``.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``, each
        holding the mean of that metric over the folds. Precision, recall and
        F1 use ``average='weighted'``.
    """
    if stratified:
        cv = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)
    else:
        cv = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    def _take_rows(data, idx):
        # pandas objects need positional .iloc; NumPy arrays index directly.
        return data.iloc[idx] if hasattr(data, 'iloc') else data[idx]

    scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = _take_rows(X, train_idx), _take_rows(X, val_idx)
        y_train, y_val = _take_rows(y, train_idx), _take_rows(y, val_idx)

        # Fit a copy so the caller's estimator is not silently left fitted on
        # the last fold only. safe=False falls back to a deep copy for objects
        # that are not scikit-learn estimators.
        fold_model = clone(model, safe=False)
        fold_model.fit(X_train, y_train)
        predictions = fold_model.predict(X_val)

        scores['accuracy'].append(accuracy_score(y_val, predictions))
        scores['precision'].append(precision_score(y_val, predictions, average='weighted'))
        scores['recall'].append(recall_score(y_val, predictions, average='weighted'))
        scores['f1'].append(f1_score(y_val, predictions, average='weighted'))

    return {metric: sum(values) / len(values) for metric, values in scores.items()}

In [25]:
# Hyperparameter tuning
def tune_hyperparameters(model, param_grid, X, y, search_type='grid', k_folds=5, stratified=True):
    """Search ``param_grid`` for the setting of ``model`` with the best CV accuracy.

    ``search_type='grid'`` runs an exhaustive ``GridSearchCV``; any other value
    runs a ``RandomizedSearchCV`` seeded with 42. Folds come from
    ``StratifiedKFold`` when ``stratified`` is true, otherwise ``KFold``; both
    shuffle with ``random_state=42``.

    Returns a ``(best_estimator, best_params)`` tuple taken from the fitted
    search object.
    """
    splitter_cls = StratifiedKFold if stratified else KFold
    folds = splitter_cls(n_splits=k_folds, shuffle=True, random_state=42)

    # Options common to both search strategies.
    shared_options = dict(cv=folds, scoring='accuracy', n_jobs=-1)

    if search_type != 'grid':
        searcher = RandomizedSearchCV(model, param_grid, random_state=42, **shared_options)
    else:
        searcher = GridSearchCV(model, param_grid, **shared_options)

    searcher.fit(X, y)
    return searcher.best_estimator_, searcher.best_params_

In [33]:
def _weighted_scores(y_true, y_pred):
    """Return accuracy plus class-weighted precision, recall and F1 as a dict."""
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred, average='weighted'),
        'recall': recall_score(y_true, y_pred, average='weighted'),
        'f1': f1_score(y_true, y_pred, average='weighted')
    }


def _model_registry(seed=42):
    """Map each supported model name to ``(unfitted estimator, parameter grid)``.

    Estimators with a stochastic fit are seeded so repeated runs give the
    same numbers, matching the fixed seed used by the splitters.
    """
    return {
        'logistic_regression': (LogisticRegression(), {'C': [0.1, 1, 10]}),
        'decision_tree': (DecisionTreeClassifier(random_state=seed), {'max_depth': [5, 10, 20]}),
        'random_forest': (RandomForestClassifier(random_state=seed),
                          {'n_estimators': [50, 100, 200], 'max_depth': [10, 20]}),
        'naive_bayes': (GaussianNB(), {}),
        'svm': (SVC(), {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}),
        'knn': (KNeighborsClassifier(), {'n_neighbors': [3, 5, 7]}),
        'adaboost': (AdaBoostClassifier(random_state=seed), {'n_estimators': [50, 100]}),
        'gradient_boost': (GradientBoostingClassifier(random_state=seed), {'n_estimators': [50, 100]}),
        'xgboost': (XGBClassifier(random_state=seed), {'n_estimators': [50, 100], 'max_depth': [5, 10]}),
        'ann': (MLPClassifier(random_state=seed),
                {'hidden_layer_sizes': [(50,), (100,)], 'activation': ['relu', 'tanh']})
    }


def train_model(file_path, model_name):
    """Tune, cross-validate and test one classifier on one feature CSV.

    The CSV is split 80/20 (stratified on the label, ``random_state=42``).
    Hyperparameters are grid-searched on the training part, the winning
    estimator is cross-validated on that same training part, then refitted on
    all of it and scored on the held-out test part.

    Note: tuning and cross-validation use the same data and the same default
    fold settings, so ``cv_scores`` are computed on the folds that selected the
    hyperparameters; ``test_scores`` is the untouched estimate.

    NOTE(review): features are passed to the estimators unscaled even though
    ``StandardScaler`` and ``Pipeline`` are imported; confirm whether scaling
    was intended for the distance- and gradient-based models.

    Parameters
    ----------
    file_path : str or path-like
        CSV file with feature columns and a label column named ``'target'``.
    model_name : str
        One of the keys returned by ``_model_registry()``.

    Returns
    -------
    dict
        ``'best_params'`` (selected hyperparameters), ``'cv_scores'`` (mean
        cross-validation metrics) and ``'test_scores'`` (metrics on the test
        split).

    Raises
    ------
    KeyError
        If ``model_name`` is not supported or the CSV has no ``'target'`` column.
    """
    models = _model_registry()
    # Fail fast, before any file I/O, with a message listing the valid names.
    if model_name not in models:
        raise KeyError(f"Unknown model_name {model_name!r}; expected one of {sorted(models)}")
    model, param_grid = models[model_name]

    # Load the dataset
    data = pd.read_csv(file_path)
    if 'target' not in data.columns:
        raise KeyError(f"Column 'target' not found in {file_path!r}")
    X = data.drop('target', axis=1)
    y = data['target']

    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Perform hyperparameter tuning
    best_model, best_params = tune_hyperparameters(model, param_grid, X_train, y_train, search_type='grid')

    # Perform cross-validation
    scores = perform_cross_validation(best_model, X_train, y_train)

    # Refit on the whole training split so the evaluated model does not depend
    # on whatever state cross-validation left the estimator in.
    best_model.fit(X_train, y_train)
    predictions = best_model.predict(X_test)
    final_scores = _weighted_scores(y_test, predictions)

    return {
        'best_params': best_params,
        'cv_scores': scores,
        'test_scores': final_scores
    }


In [None]:
# Feature CSVs to evaluate, one per line.
feature_files = [
    "W100_O25_Features.csv",
    "W100_O50_Features.csv",
    "W200_O25_Features.csv",
    "W200_O50_Features.csv",
    "W300_O25_Features.csv",
    "W300_O50_Features.csv",
    "W400_O25_Features.csv",
    "W400_O50_Features.csv",
    "W500_O25_Features.csv",
    "W500_O50_Features.csv",
]

# Names of the classifiers handed to train_model, in evaluation order.
models = [
    'logistic_regression',
    'decision_tree',
    'random_forest',
    'naive_bayes',
    'svm',
    'knn',
    'adaboost',
    'gradient_boost',
    'xgboost',
    'ann',
]

# results[file][model] holds whatever train_model returns for that pair.
results = {}
for file in feature_files:
    # Register the per-file dict first so partial results survive an interruption.
    per_model_results = results[file] = {}
    for model in models:
        print(f"Processing {file} with {model}...")
        per_model_results[model] = train_model(file, model)


In [35]:
# Output folder: <project_path>/data/Model_Results, created if it is missing.
project_path = '../'
results_dir = os.path.join(project_path, 'data/Model_Results')
os.makedirs(results_dir, exist_ok=True)

# Results container; starts out empty.
final_results = dict()

In [None]:
# Save overall results
# `final_results` is created empty and nothing visible in this notebook fills
# it, while the training loop stores its metrics in `results`. Fall back to
# `results` so the file is not written as an empty `{}`.
results_to_save = final_results if final_results else results

# NOTE(review): the file name mentions W100_O50 although the loop covers every
# feature file; confirm the intended name before relying on it downstream.
overall_results_path = os.path.join(results_dir, "overall_results_W100_O50.json")
with open(overall_results_path, 'w') as f:
    json.dump(results_to_save, f, indent=2)