# Import

In [35]:
import random
import numpy as np
import pandas as pd
import time
import optuna
from sklearn.preprocessing import LabelEncoder,RobustScaler,StandardScaler
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score,roc_curve,f1_score,classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from pyxlsb import open_workbook as open_xlsb
from imblearn.under_sampling import RepeatedEditedNearestNeighbours
from imblearn.over_sampling import SMOTE

# function for reading format file .xlsb

In [36]:
def read_xlsb(file_path, sheet_name):
    df_list = []
    with open_xlsb(file_path) as wb:
        with wb.get_sheet(sheet_name) as sheet:
            for row in sheet.rows():
                df_list.append([item.v for item in row])

    df = pd.DataFrame(df_list[1:], columns=df_list[0])
    df.dropna(how='all', inplace=True)  # Remove rows with all missing valuesx
    return df

# download data

In [37]:
train_data_path = "./data/Training.xlsb"
train_sheet_name = 'Training'
train_data = read_xlsb(train_data_path, train_sheet_name)

test_data_path = "./data/Test.xlsb"
test_sheet_name = 'Test'
test_data = read_xlsb(test_data_path, test_sheet_name)

# create train test dataframe
train_data_copy = train_data.copy()
test_data_copy = test_data.copy()
X = train_data_copy.drop(["MARKER","ID"],axis = 1)
X_test = test_data_copy.drop(["MARKER","ID"],axis = 1)
y = train_data_copy['MARKER']
y_test = test_data_copy['MARKER']

# making copies
X_copy = X.copy()
y_copy= y.copy()

# Coding of categorical features

In [38]:
def encode_categorical_features(df):
    encoded_df = df.copy()

    categorical_columns = df.select_dtypes(include=['object']).columns

    for column in categorical_columns:
        unique_values = df[column].nunique()
        if column == 'P':
            encoded_values = {
                '0 Zero': 0,
                '1 One': 1,
                '2 Two': 2,
                '3 Three': 3,
                'More than 3': 4
            }
            encoded_df[column] = df[column].map(encoded_values)
        elif unique_values <= 2:
            encoder = LabelEncoder()
            encoded_df[column] = encoder.fit_transform(df[column])
        else:
            counter = df[column].value_counts().to_dict()
            encoded_df[column] = df[column].map(counter)

    return encoded_df

# Применение кодирования к обучающим данным и сохранение их в CSV файл
train_data_copy[X.columns] = encode_categorical_features(train_data_copy[X.columns])
train_data_copy.to_csv('./data_proc/encoded_train_data.csv')

# Применение кодирования к тестовым данным и сохранение их в CSV файл
test_data_copy[X_test.columns] = encode_categorical_features(test_data_copy[X_test.columns])
test_data_copy.to_csv('./data_proc/encoded_test_data.csv')


# Removing outliers

In [39]:
def remove_outliers(train_data, num_columns_threshold=5, ignore_columns=[], remove_class_1_outliers=True):
    print("len train_data input", len(train_data))
    columns_to_analyze = [col for col in train_data.columns if col not in ignore_columns]
    df = train_data[columns_to_analyze]

    # We count Q1 and Q3 for each column
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)

    # Calculating the interquartile range (IQR)
    iqr = q3 - q1

    # Determining the boundaries of the interval for removing outliers
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Create a mask to define rows with outliers in each column
    outliers_mask = ((df < lower_bound) | (df > upper_bound))

    # Count the number of columns with outliers in each row
    num_outliers = outliers_mask.sum(axis=1)

    # Create a mask to identify rows with outliers in 5 or more columns
    mask = (num_outliers >= num_columns_threshold)

    # Removing rows with outliers from train_data
    if remove_class_1_outliers:
        mask_to_remove = mask
    else:
        mask_to_remove = mask & (train_data['MARKER'] != 1)
    train_data_cleaned = train_data[~mask_to_remove]

    print("len train_data_cleaned", len(train_data_cleaned))
    # Count the number of deleted rows for each class
    num_deleted_class_0 = len(train_data[train_data['MARKER'] == 0]) - len(train_data_cleaned[train_data_cleaned['MARKER'] == 0])
    num_deleted_class_1 = len(train_data[train_data['MARKER'] == 1]) - len(train_data_cleaned[train_data_cleaned['MARKER'] == 1])
    print("Deleted rows from class 0:", num_deleted_class_0)
    print("Deleted rows from class 1:", num_deleted_class_1)

    return train_data_cleaned

# An example of using the function to remove rows from the class MARKER = 1
train_data_copy = remove_outliers(train_data_copy, num_columns_threshold=5, ignore_columns=['ID', 'MARKER'], remove_class_1_outliers=True)
#print(train_data_copy)
train_data_copy.to_csv('./data_proc/remove_outliers_train_data.csv')


len train_data input 89734
len train_data_cleaned 87350
Deleted rows from class 0: 2366
Deleted rows from class 1: 18


# scaling with choice of method

In [40]:
def scale_dataframe(df, ignore_columns=[], scaler_type='robust'):
    columns_to_scale = [col for col in df.columns if col not in ignore_columns]

    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'robust':
        scaler = RobustScaler()
    else:
        raise ValueError("Invalid scaler_type. Use 'standard' or 'robust'.")

    scaled_data = scaler.fit_transform(df[columns_to_scale])

    # Обновляем значения в столбцах, которые нужно масштабировать
    df.loc[:, columns_to_scale] = scaled_data
    return df

# Пример использования с разными способами масштабирования
train_data_copy = scale_dataframe(train_data_copy,ignore_columns=['ID', 'MARKER'],scaler_type= "standard")
#print(train_data_copy)
train_data_copy.to_csv('./data_proc/prepar_train_data.csv')

test_data_copy = scale_dataframe(test_data_copy,ignore_columns=['ID', 'MARKER'],scaler_type= "standard")
#print(test_data_copy)
test_data_copy.to_csv('./data_proc/prepar_test_data.csv')


# general feature selection

In [41]:
# Function for removing features with low diversity of values
def remove_low_variance_features(df, threshold=0.95, ignore_columns=[]):
    num_rows = len(df)
    columns_to_check = [col for col in df.columns if col not in ignore_columns]
    low_variance_columns = [col for col in columns_to_check if (df[col].value_counts() / num_rows).max() > threshold]
    df_filtered = df.drop(columns=low_variance_columns)
    return df_filtered

# Function for removing features with high correlation with each other
def remove_high_correlation_features(df, y_df, threshold=0.8, ignore_columns=['ID', 'MARKER']):
    corr_matrix = df.corr()
    to_drop = []

    for column in corr_matrix.columns:
        if column in ignore_columns:
            continue

        correlated_columns = corr_matrix.index[corr_matrix[column] > threshold].tolist()  # Finds a list of columns that are highly correlated with the current column
        if correlated_columns:
            max_corr_with_target = max(correlated_columns, key=lambda col: abs(df[column].corr(y_df)))
            # Finds the column with the maximum absolute correlation with the target feature y_df

            if column != max_corr_with_target and max_corr_with_target not in ignore_columns:
                to_drop.append(column)

    df_filtered = df.drop(columns=to_drop)
    return df_filtered

# Feature selection using the KBEST method
def select_KBest(X, y, k=17, ignore_columns=['ID', 'MARKER']):
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()]

    X_new_df = pd.DataFrame(X_new, columns=selected_features, index=X.index)

    for col in ignore_columns:
        X_new_df[col] = X[col]
    return X_new_df, selected_features



train_data_copy = remove_low_variance_features(train_data_copy,threshold=0.85, ignore_columns=['ID', 'MARKER'])
print("size train_data_copy after remove low variance:",train_data_copy.shape)

train_data_copy = remove_high_correlation_features(train_data_copy,y_copy,threshold=0.75,ignore_columns=['ID', 'MARKER'])
print("size train_data_copy after remove high correlation_features:",train_data_copy.shape)

train_data_copy,important_columns = select_KBest(train_data_copy, train_data_copy["MARKER"], k=17)
print("size train_data_copy after KBest:",train_data_copy.shape)


size train_data_copy after remove low variance: (87350, 23)
size train_data_copy after remove high correlation_features: (87350, 21)
size train_data_copy after KBest: (87350, 17)


  f = msb / msw


# Undersampling (RepeatedEditedNearestNeighbours)

In [42]:
def apply_renn_undersampling(dataframe, target_column_name):
    # Divide the data into features (X) and target variable (y)
    X = dataframe.drop(columns=[target_column_name])
    y = dataframe[target_column_name]

    # Initialize RENN
    renn = RepeatedEditedNearestNeighbours(sampling_strategy='auto', n_neighbors=3, max_iter=100,n_jobs = 2)

    # Use RENN for undersampling
    X_resampled, y_resampled = renn.fit_resample(X, y)

    # Create a new DataFrame with reduced selection
    resampled_dataframe = pd.DataFrame(data=X_resampled, columns=X.columns)
    resampled_dataframe[target_column_name] = y_resampled

    return resampled_dataframe

# Apply the function to DataFrame
print(train_data_copy.shape)
train_data_copy = apply_renn_undersampling(train_data_copy, 'MARKER')
print(train_data_copy.shape)


(87350, 17)
(86011, 17)


# Ovesampling with SMOTE

In [43]:
def apply_smote_oversampling(dataframe, target_column_name, random_state=42):
    # Divide the data into features (X) and target variable (y)
    X = dataframe.drop(columns=[target_column_name])
    y = dataframe[target_column_name]

    # Initialize SMOTE
    smote = SMOTE(sampling_strategy='auto', random_state=random_state,n_jobs = 2)

    # Apply SMOTE for oversampling
    X_resampled, y_resampled = smote.fit_resample(X, y)

    # Create a new DataFrame with a larger sample
    resampled_dataframe = pd.DataFrame(data=X_resampled, columns=X.columns)
    resampled_dataframe[target_column_name] = y_resampled

    return resampled_dataframe

# Apply the function to DataFrame
print(train_data_copy.shape)
train_data_copy = apply_smote_oversampling(train_data_copy, 'MARKER')
print(train_data_copy.shape)


(86011, 17)
(171374, 17)




# Logistic regression 3

In [44]:
def build_logistic_regression_model(df, target_column_name):
    # Select features and target variable
    X = df.drop(columns=['ID', target_column_name])
    y = df[target_column_name]

    # Divide the data into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define parameters to iterate over
    param_grid = {
        'penalty': ['l1', 'l2'],  # Enumerate L1 and L2 regularization
        'C': np.logspace(-4, 4, 30)  # Iterate through the regularization parameter C
    }

    # Create a logistic regression model
    logreg = LogisticRegression(solver='liblinear', class_weight='balanced')

    # Create a GridSearchCV object to iterate through parameters
    grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='roc_auc')

    # Perform a grid search on the training set
    grid_search.fit(X_train, y_train)

    # Get the best parameters and F1-score metric value
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print("Best Parameters:", best_params)
    print("Best F1 Score:", best_score)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Train the model on selected most important features
    coefficients = best_model.coef_[0]
    feature_names = X.columns
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Select the 10 most important features
    top_features = coefficients_df.iloc[np.abs(coefficients_df['Coefficient']).nlargest(10).index]
    selected_features = top_features['Feature'].tolist()
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Training the model using selected features
    best_model.fit(X_train_selected, y_train)

    # Predictions for the test set
    y_pred = best_model.predict(X_test_selected)

    # Calculate the F1-score metric on the test set
    test_f1_score = f1_score(y_test, y_pred)
    print("Test F1 Score:", test_f1_score)

    return best_model, top_features


def evaluate_with_threshold(model, test_data, target_column_name, selected_features, threshold=0.5):
    # Extract features from test data
    test_X = test_data[selected_features]

    # Get probabilities for class 1
    test_probabilities = model.predict_proba(test_X)[:, 1]

    # Apply a probability threshold to get predictions (0 or 1)
    test_predictions = (test_probabilities > threshold).astype(int)

    # Create a DataFrame with predicted values
    predictions_df = pd.DataFrame({'Prediction': test_predictions})

    # Write the DataFrame to a CSV file
    predictions_df.to_csv("predictions.csv", index=False)

    target_data = test_data[target_column_name]

    # Calculate the F1-score metric on test data
    test_f1 = f1_score(target_data, test_predictions)

    # Calculate the ROC AUC metric on test data
    test_roc_auc = roc_auc_score(target_data, test_probabilities)

    return test_predictions, test_f1, test_roc_auc


final_model, coefficients = build_logistic_regression_model(train_data_copy, target_column_name="MARKER")
print("Selected Features:", coefficients)

# Get a list of selected features
selected_features = coefficients['Feature'].tolist()

# Use different probability thresholds and see how the metrics change
threshold = 0.965
test_predictions, test_f1_score, test_roc_auc = evaluate_with_threshold(final_model, test_data_copy, target_column_name="MARKER", selected_features=selected_features, threshold=threshold)
print("Test Predictions:", test_predictions)
print("Test F1 Score (Threshold={}):".format(threshold), test_f1_score)
print("Test ROC AUC Score (Threshold={}):".format(threshold), test_roc_auc)


Best Parameters: {'C': 0.38566204211634725, 'penalty': 'l2'}
Best F1 Score: 0.8967226035112631
Test F1 Score: 0.8160787247511078
Selected Features:    Feature  Coefficient
4        H     1.074952
7        L    -0.824463
1        D     0.820600
9        N     0.718309
10       O    -0.585401
11       R    -0.470017
13       W    -0.326202
6        J    -0.262980
8        M    -0.217708
14       X    -0.193170
Test Predictions: [1 1 0 ... 0 0 0]
Test F1 Score (Threshold=0.965): 0.11180124223602483
Test ROC AUC Score (Threshold=0.965): 0.8744854375745416


# Random forest 2

In [45]:
def objective(trial, X, y):
    n_estimators = trial.suggest_int('n_estimators', 5, 35, step=5)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_samples_split = trial.suggest_float('min_samples_split', 0.05, 1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.05, 0.5)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight='balanced',
        random_state=42
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc')
    average_score = scores.mean()

    return average_score

def build_and_predict_random_forest(train_data, test_data, target_column_name, selected_features):
    # Выделите признаки и целевую переменную
    X_train = train_data[selected_features]
    y_train = train_data[target_column_name]
    X_test = test_data[selected_features]
    y_test = test_data[target_column_name]

    # Подбор гиперпараметров
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=40)

    best_params = study.best_params
    best_n_estimators = best_params['n_estimators']
    best_max_depth = best_params['max_depth']
    best_min_samples_split = best_params['min_samples_split']
    best_min_samples_leaf = best_params['min_samples_leaf']

    # Создайте и обучите модель случайного леса с наилучшими гиперпараметрами
    model = RandomForestClassifier(
        n_estimators=best_n_estimators,
        max_depth=best_max_depth,
        min_samples_split=best_min_samples_split,
        min_samples_leaf=best_min_samples_leaf,
        class_weight='balanced',
        random_state=42
    )
    model.fit(X_train, y_train)

    # Прогнозирование на тестовых данных
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Вычисление ROC AUC
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    # Вывод ROC AUC
    print("ROC AUC:", roc_auc)

    return model

# Построение и оценка модели случайного леса
model = build_and_predict_random_forest(train_data_copy, test_data_copy, "MARKER", selected_features)
print("Predictions on Test Data:",model.predict(test_data_copy[selected_features]))

[I 2023-10-05 16:20:21,336] A new study created in memory with name: no-name-76af1eb5-4b9a-4526-a520-7b136eb8ad1f
[I 2023-10-05 16:20:23,271] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 30, 'max_depth': 13, 'min_samples_split': 0.8526759234382455, 'min_samples_leaf': 0.1716245722466027}. Best is trial 0 with value: 0.5.
[I 2023-10-05 16:20:24,912] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 25, 'max_depth': 14, 'min_samples_split': 0.9207459982077155, 'min_samples_leaf': 0.15675295717410476}. Best is trial 0 with value: 0.5.
[I 2023-10-05 16:20:26,065] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 15, 'max_depth': 10, 'min_samples_split': 0.8922468677033729, 'min_samples_leaf': 0.44773565542626337}. Best is trial 0 with value: 0.5.
[I 2023-10-05 16:20:27,003] Trial 3 finished with value: 0.913149798865026 and parameters: {'n_estimators': 5, 'max_depth': 3, 'min_samples_split': 0.6146401881206447, 'min_samples_leaf': 0.2760

ROC AUC: 0.8261607887409068
Predictions on Test Data: [1. 1. 1. ... 0. 1. 0.]


# Adaboosting

In [46]:
# Function for optimizing hyperparameters
def optimize_adaboost_hyperparameters(trial, X_train, y_train):
    n_estimators = trial.suggest_int('n_estimators', 1, 15, step=2)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 1.0, log=True)

    estimator = DecisionTreeClassifier(class_weight='balanced')
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, estimator=estimator, random_state=42)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')
    return np.mean(scores)

# Function for training and prediction with optimal model
def train_and_predict_with_optimal_model(X_train, y_train, X_test, y_test, selected_features, best_params):
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    optimal_model = AdaBoostClassifier(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], random_state=42)
    optimal_model.fit(X_train_selected, y_train)

    y_pred_prob = optimal_model.predict_proba(X_test_selected)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    optimal_threshold_index = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_threshold_index]

    y_pred = (y_pred_prob > optimal_threshold).astype(int)

    return y_pred

def run_ada_boosting():
    start = time.time()

    # Loading data
    X_train = train_data_copy[selected_features]
    y_train = train_data_copy['MARKER']

    X_test = test_data_copy[selected_features]
    y_test = test_data_copy['MARKER']

    # Create a Study object for optimization
    study = optuna.create_study(direction='maximize')

    # Run hyperparameter optimization
    study.optimize(lambda trial: optimize_adaboost_hyperparameters(trial, X_train, y_train), n_trials=30, n_jobs=2)

    # Getting the best hyperparameters
    best_params = study.best_params

    # Train a model with optimal hyperparameters and make predictions on a test data set
    y_pred = train_and_predict_with_optimal_model(X_train, y_train, X_test, y_test, selected_features, best_params)
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    f1 = f1_score(y_test, y_pred)

    # Output the results of optimization and model evaluation
    print("Optimal Hyperparameters:", best_params)
    print("Predictions on Test Data:", y_pred)
    np.savetxt('./data_proc/y_pred_boosting.csv', y_pred)
    print("Test ROC AUC:", roc_auc)
    print("Test F1 Score:", f1)

    finish = time.time()
    print("время выполнения", finish - start)

run_ada_boosting()

[I 2023-10-05 16:25:06,505] A new study created in memory with name: no-name-4c611631-1343-4df0-a50b-3c8af50b7cb3
[I 2023-10-05 16:25:12,585] Trial 0 finished with value: 0.9921925233954292 and parameters: {'n_estimators': 7, 'learning_rate': 0.08864122543263231}. Best is trial 0 with value: 0.9921925233954292.
[I 2023-10-05 16:25:12,756] Trial 1 finished with value: 0.9921925233954292 and parameters: {'n_estimators': 9, 'learning_rate': 0.0034119576850596264}. Best is trial 0 with value: 0.9921925233954292.
[I 2023-10-05 16:25:18,686] Trial 2 finished with value: 0.9921925233954292 and parameters: {'n_estimators': 5, 'learning_rate': 0.05055191219812622}. Best is trial 0 with value: 0.9921925233954292.
[I 2023-10-05 16:25:18,734] Trial 3 finished with value: 0.9921925233954292 and parameters: {'n_estimators': 9, 'learning_rate': 0.0020948769099855703}. Best is trial 0 with value: 0.9921925233954292.
[I 2023-10-05 16:25:24,642] Trial 4 finished with value: 0.9921925233954292 and parame

Optimal Hyperparameters: {'n_estimators': 7, 'learning_rate': 0.08864122543263231}
Predictions on Test Data: [1 1 1 ... 0 0 0]
Test ROC AUC: 0.8261607887409068
Test F1 Score: 0.022508038585209
время выполнения 78.83369278907776


# Universal algorithm 2

In [47]:
# Hyperparameter Optimization
def optimize_hyperparameters(X, y, model_type):
    def objective(trial):
        if model_type == 'logistic_regression':
            param_grid = {
                'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
                'C': trial.suggest_loguniform('C', 1e-6, 1e4)
            }
            model = LogisticRegression(solver='liblinear', class_weight='balanced', **param_grid)
        elif model_type == 'random_forest':
            param_grid = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 350, step=50),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'min_samples_split': trial.suggest_float('min_samples_split', 0.05, 1.0),
                'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.05, 0.5)
            }
            model = RandomForestClassifier(
                n_estimators=param_grid['n_estimators'],
                max_depth=param_grid['max_depth'],
                min_samples_split=param_grid['min_samples_split'],
                min_samples_leaf=param_grid['min_samples_leaf'],
                class_weight='balanced',
                random_state=42
            )
        elif model_type == 'adaboost':
            n_estimators = trial.suggest_int('n_estimators', 5, 75, step=5)
            learning_rate = trial.suggest_float('learning_rate', 0.001, 1.0, log=True)

            estimator = DecisionTreeClassifier(class_weight='balanced')
            model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, estimator=estimator, random_state=42)
        # Add other models and hyperparameters as needed

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc' )
        return np.mean(scores)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20,n_jobs= 4)
    best_params = study.best_params
    return best_params

# Model training
def train_model(X_train, y_train, model_type, hyperparameters=None):
    if model_type == 'logistic_regression':
        if hyperparameters is None:
            model = LogisticRegression(solver='liblinear', class_weight='balanced')
        else:
            model = LogisticRegression(solver='liblinear', class_weight='balanced', **hyperparameters)
    elif model_type == 'random_forest':
        model = RandomForestClassifier(
            #class_weight='balanced',
            random_state=42
        )
    elif model_type == 'adaboost':
        if hyperparameters is None:
            model = AdaBoostClassifier(random_state=42)
        else:
            model = AdaBoostClassifier(n_estimators=hyperparameters['n_estimators'], learning_rate=hyperparameters['learning_rate'], random_state=42)
    #Add other models and options as needed

    model.fit(X_train, y_train)
    return model

# Model evaluation
def evaluate_model(model, X_test, y_test):
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    optimal_threshold_index = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_threshold_index]
    y_pred = (y_pred_prob > optimal_threshold).astype(int)

    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    return y_pred, f1, roc_auc

def fitness_score(model_type, X_train, y_train, selected_features):
    model = train_model(X_train[selected_features], y_train, model_type)
    _, _, roc_auc_score = evaluate_model(model, X_train[selected_features], y_train)
    return roc_auc_score

def mutate(selected_features, mutation_rate):
    mutated_features = selected_features.copy()
    for i in range(len(mutated_features)):
        if random.random() < mutation_rate:
            mutated_features[i] = random.randint(0, 1)  # 1 or 0 with equal probability
    return mutated_features

# Genetic algorithm
def genetic_algorithm(X_train, y_train, model_type, num_gener, popul_size, mutation_rate):
    num_features = X_train.shape[1]
    column_names = X_train.columns

    population = [random.choices([0, 1], k=num_features) for _ in range(popul_size)]

    for generation in range(num_gener):
        scores = []
        for selected_features in population:
            selected_column_names = [column_names[i] for i, select in enumerate(selected_features) if select == 1]
            score = fitness_score(model_type, X_train, y_train, selected_column_names)
            scores.append((selected_features, score))

        scores.sort(key=lambda x: x[1], reverse=True)
        elite = scores[:num_features // 2]

        new_population = [selected_features for selected_features, _ in elite]
        while len(new_population) < num_features:
            parent1, parent2 = elite[0][0], elite[1][0]  # Use the top 2 parents
            child = [parent1[i] if random.random() < 0.5 else parent2[i] for i in range(num_features)]
            child = mutate(child, mutation_rate)
            new_population.append(child)

        population = new_population

    best_features = max(scores, key=lambda x: x[1])[0]
    return best_features

def select_features_random_forest(X_train, y_train, num_features):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    feature_importances = model.feature_importances_
    feature_indices = np.argsort(feature_importances)[::-1]  # Sort in descending order

    selected_features = X_train.columns[feature_indices[:num_features]]
    return selected_features


def select_features_adaboost(X_train, y_train, num_features):
    model = AdaBoostClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    feature_weights = model.feature_importances_
    feature_indices = np.argsort(feature_weights)[::-1]  # Sort in descending order

    selected_features = X_train.columns[feature_indices[:num_features]]
    return selected_features


# Run the experiment
def run_experiment(train_data, test_data, target_column_name, model_type, use_gen_alg=False, num_gener=10, popul_size=20, num_features=False, mutat_rate=0.1, hyp_optimize=True):
    X_train = train_data.drop(columns=['ID', target_column_name])
    y_train = train_data[target_column_name]

    if (use_gen_alg or num_features) is False:
        best_features = list(X_train.columns)  # Use all features
    else:
        if use_gen_alg and not num_features:
            print("genetic algorithm selected")
            best_feature_indices = genetic_algorithm(X_train, y_train, model_type, num_gener, popul_size, mutat_rate)
            best_features = [X_train.columns[i] for i, select in enumerate(best_feature_indices) if select == 1]
        else:
            print("built-in algorithm selected")
            if model_type == 'logistic_regression' and num_features:
                # Use top N features based on coefficients if not optimizing hyperparameters
                model = train_model(X_train, y_train, model_type)
                coef_abs = np.abs(model.coef_[0])
                best_features_idx = np.argsort(coef_abs)[-num_features:]
                best_features = list(X_train.columns[best_features_idx])

            elif model_type == 'random_forest' and num_features:
                best_features = select_features_random_forest(X_train, y_train, num_features)

            elif model_type == 'adaboost' and num_features:
                best_features = select_features_adaboost(X_train, y_train, num_features)

    print("Selected Features:", best_features)

    X_train_selected = X_train[best_features]
    X_test_selected = test_data.drop(columns=['ID', target_column_name])[best_features]

    if hyp_optimize:
        best_hyperparameters = optimize_hyperparameters(X_train_selected, y_train, model_type)
        print("Best Hyperparameters:", best_hyperparameters)
    else:
        best_hyperparameters = None

    model = train_model(X_train_selected, y_train, model_type, best_hyperparameters)

    y_pred, f1, roc_auc = evaluate_model(model, X_test_selected, test_data[target_column_name])

    y_pred = pd.DataFrame(y_pred)
    y_pred.to_csv('./data_proc/y_pred_universal.csv')

    print("Test Predictions:", y_pred)
    print("Test F1 Score:", f1)
    print("Test ROC AUC Score:", roc_auc)

print("column to input:",train_data_copy.columns)
run_experiment(train_data_copy, test_data_copy, 'MARKER', "random_forest", use_gen_alg=False, num_gener=5, popul_size=4,num_features= 10, mutat_rate=0.999, hyp_optimize=True)

column to input: Index(['ID', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'R', 'S',
       'W', 'X', 'MARKER'],
      dtype='object')
built-in algorithm selected


[I 2023-10-05 16:26:51,990] A new study created in memory with name: no-name-1d3eae99-a520-408f-b030-cefbdb8082fe


Selected Features: Index(['H', 'O', 'G', 'S', 'W', 'I', 'N', 'D', 'E', 'L'], dtype='object')


[I 2023-10-05 16:26:57,576] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 50, 'max_depth': 3, 'min_samples_split': 0.974464911598113, 'min_samples_leaf': 0.14677535810609324}. Best is trial 1 with value: 0.5.
[I 2023-10-05 16:27:07,647] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 150, 'max_depth': 8, 'min_samples_split': 0.35997048219839556, 'min_samples_leaf': 0.4093697785973021}. Best is trial 1 with value: 0.5.
[I 2023-10-05 16:27:12,394] Trial 0 finished with value: 0.5 and parameters: {'n_estimators': 200, 'max_depth': 4, 'min_samples_split': 0.4167393256833172, 'min_samples_leaf': 0.47966086094336136}. Best is trial 1 with value: 0.5.
[I 2023-10-05 16:27:25,220] Trial 3 finished with value: 0.5 and parameters: {'n_estimators': 350, 'max_depth': 13, 'min_samples_split': 0.7397989432731723, 'min_samples_leaf': 0.29826042876490905}. Best is trial 1 with value: 0.5.
[I 2023-10-05 16:27:30,263] Trial 4 finished with value: 0.5 and parameters: 

Best Hyperparameters: {'n_estimators': 50, 'max_depth': 10, 'min_samples_split': 0.07540658892595858, 'min_samples_leaf': 0.05108125338542559}
Test Predictions:        0
0      1
1      1
2      1
3      1
4      0
...   ..
38400  0
38401  1
38402  1
38403  1
38404  0

[38405 rows x 1 columns]
Test F1 Score: 0.014648220759044166
Test ROC AUC Score: 0.727317182419608
