In [1]:
import random
import numpy as np
import pandas as pd
import time
import optuna
from sklearn.preprocessing import LabelEncoder,RobustScaler,StandardScaler
from sklearn.feature_selection import SelectKBest,f_classif
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score,StratifiedKFold
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score,roc_curve,f1_score,classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from main import read_xlsb

In [2]:

# download data
train_data_path = "./data/Training.xlsb"
train_sheet_name = 'Training'  # Укажите имя листа, который хотите прочитать
train_data = read_xlsb(train_data_path, train_sheet_name)

test_data_path = "./data/Test.xlsb"
test_sheet_name = 'Test'  # Укажите имя листа, который хотите прочитать
test_data = read_xlsb(test_data_path, test_sheet_name)

# создание train test dataframe
train_data_copy = train_data.copy()
test_data_copy = test_data.copy()
X = train_data_copy.drop(["MARKER","ID"],axis = 1)
X_test = test_data_copy.drop(["MARKER","ID"],axis = 1)
y = train_data_copy['MARKER']
y_test = test_data_copy['MARKER']

#создание копий
X_copy = X.copy()
y_copy= y.copy()

In [3]:
# удаление дублирующих строк
train_data_copy = train_data_copy.drop_duplicates()
print(train_data_copy)


            ID         A         B       C       D         E         F     G  \
0          1.0  0.198778  0.099389    0.00  799.90  1.777556  0.888778  13.0   
1          2.0  0.043000  0.021264   49.97  173.03  0.384511  0.190143  13.0   
2          3.0  0.067073  0.067073    0.00  329.90  0.599818  0.599818  13.0   
3          4.0  0.052700  0.052700    0.00  235.65  0.471300  0.471300  13.0   
4          5.0  0.141880  0.141880    0.00  634.45  1.268900  1.268900  13.0   
...        ...       ...       ...     ...     ...       ...       ...   ...   
89729  89730.0  0.027941  0.014844   25.00  474.00  0.557647  0.296250  20.0   
89730  89731.0  0.120017  0.068581  179.96  720.03  1.200050  0.685743  10.0   
89731  89732.0  0.153033  0.122427   29.01  550.99  0.459158  0.367327   3.0   
89732  89733.0  0.107575  0.043030   53.82  215.18  0.537950  0.215180   5.0   
89733  89734.0  0.095750  0.095750   57.48  172.42  0.287367  0.287367   3.0   

          H      I  ...       P        

In [4]:
# кодирование категориальных признаков
def encode_categorical_features(df):
    encoded_df = df.copy()

    # Шаг 1: Выбор категориальных признаков
    categorical_columns = df.select_dtypes(include=['object']).columns

    # Шаг 2: Закодировать категориальные признаки
    for column in categorical_columns:
        unique_values = df[column].nunique()
        if column == 'P':
            # Отдельный метод для столбца 'P'
            encoded_values = {
                '0 Zero': 0,
                '1 One': 1,
                '2 Two': 2,
                '3 Three': 3,
                'More than 3': 4
            }
            encoded_df[column] = df[column].map(encoded_values)
        elif unique_values <= 2:
            # One-hot encoding для признаков с <= 2 уникальными значениями
            encoder = LabelEncoder()
            encoded_df[column] = encoder.fit_transform(df[column])
        else:
            # Count encoding для признаков с > 2 уникальными значениями
            counter = df[column].value_counts().to_dict()
            encoded_df[column] = df[column].map(counter)

    return encoded_df


train_data_copy[X.columns] = encode_categorical_features(train_data_copy[X.columns])
print(train_data_copy)
train_data_copy.to_csv('./data_proc/encoded_train_data.csv')

test_data_copy[X_test.columns] = encode_categorical_features(test_data_copy[X_test.columns])
print(test_data_copy)
test_data_copy.to_csv('./data_proc/encoded_test_data.csv')

            ID         A         B       C       D         E         F     G  \
0          1.0  0.198778  0.099389    0.00  799.90  1.777556  0.888778  13.0   
1          2.0  0.043000  0.021264   49.97  173.03  0.384511  0.190143  13.0   
2          3.0  0.067073  0.067073    0.00  329.90  0.599818  0.599818  13.0   
3          4.0  0.052700  0.052700    0.00  235.65  0.471300  0.471300  13.0   
4          5.0  0.141880  0.141880    0.00  634.45  1.268900  1.268900  13.0   
...        ...       ...       ...     ...     ...       ...       ...   ...   
89729  89730.0  0.027941  0.014844   25.00  474.00  0.557647  0.296250  20.0   
89730  89731.0  0.120017  0.068581  179.96  720.03  1.200050  0.685743  10.0   
89731  89732.0  0.153033  0.122427   29.01  550.99  0.459158  0.367327   3.0   
89732  89733.0  0.107575  0.043030   53.82  215.18  0.537950  0.215180   5.0   
89733  89734.0  0.095750  0.095750   57.48  172.42  0.287367  0.287367   3.0   

          H  I  ...  P      Q  R      S

In [5]:
def remove_outliers(train_data, num_columns_threshold=5, ignore_columns=[], remove_class_1_outliers=True):
    print("len train_data input", len(train_data))
    # Выбираем только столбцы, которые не находятся в списке ignore_columns
    columns_to_analyze = [col for col in train_data.columns if col not in ignore_columns]
    df = train_data[columns_to_analyze]

    # Подсчитываем Q1 и Q3 для каждого столбца
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)

    # Рассчитываем межквартильный размах (IQR)
    iqr = q3 - q1

    # Определяем границы интервала для удаления выбросов
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    # Создаем маску для определения строк с выбросами в каждом столбце
    outliers_mask = ((df < lower_bound) | (df > upper_bound))

    # Подсчитываем количество столбцов с выбросами в каждой строке
    num_outliers = outliers_mask.sum(axis=1)

    # Создаем маску для определения строк с выбросами в 5 и более столбцах
    mask = (num_outliers >= num_columns_threshold)

    # Удаляем строки с выбросами из train_data
    if remove_class_1_outliers:
        mask_to_remove = mask
    else:
        mask_to_remove = mask & (train_data['MARKER'] != 1)
    train_data_cleaned = train_data[~mask_to_remove]

    print("len train_data_cleaned", len(train_data_cleaned))
    # Подсчитываем количество удаленных строк для каждого класса
    num_deleted_class_0 = len(train_data[train_data['MARKER'] == 0]) - len(train_data_cleaned[train_data_cleaned['MARKER'] == 0])
    num_deleted_class_1 = len(train_data[train_data['MARKER'] == 1]) - len(train_data_cleaned[train_data_cleaned['MARKER'] == 1])
    print("Deleted rows from class 0:", num_deleted_class_0)
    print("Deleted rows from class 1:", num_deleted_class_1)

    return train_data_cleaned

# Пример использования функции с удалением строк из класса MARKER = 1
train_data_copy = remove_outliers(train_data_copy, num_columns_threshold=5, ignore_columns=['ID', 'MARKER'], remove_class_1_outliers=True)
print(train_data_copy)
train_data_copy.to_csv('./data_proc/remove_outliers_train_data.csv')


len train_data input 89734
len train_data_cleaned 87350
Deleted rows from class 0: 2366
Deleted rows from class 1: 18
            ID         A         B       C       D         E         F     G  \
0          1.0  0.198778  0.099389    0.00  799.90  1.777556  0.888778  13.0   
1          2.0  0.043000  0.021264   49.97  173.03  0.384511  0.190143  13.0   
2          3.0  0.067073  0.067073    0.00  329.90  0.599818  0.599818  13.0   
3          4.0  0.052700  0.052700    0.00  235.65  0.471300  0.471300  13.0   
4          5.0  0.141880  0.141880    0.00  634.45  1.268900  1.268900  13.0   
...        ...       ...       ...     ...     ...       ...       ...   ...   
89729  89730.0  0.027941  0.014844   25.00  474.00  0.557647  0.296250  20.0   
89730  89731.0  0.120017  0.068581  179.96  720.03  1.200050  0.685743  10.0   
89731  89732.0  0.153033  0.122427   29.01  550.99  0.459158  0.367327   3.0   
89732  89733.0  0.107575  0.043030   53.82  215.18  0.537950  0.215180   5.0   
89

In [6]:
# масштабирование с выбором способа
def scale_dataframe(df, ignore_columns=[], scaler_type='robust'):
    columns_to_scale = [col for col in df.columns if col not in ignore_columns]

    if scaler_type == 'standard':
        scaler = StandardScaler()
    elif scaler_type == 'robust':
        scaler = RobustScaler()
    else:
        raise ValueError("Invalid scaler_type. Use 'standard' or 'robust'.")

    scaled_data = scaler.fit_transform(df[columns_to_scale])

    # Обновляем значения в столбцах, которые нужно масштабировать
    df.loc[:, columns_to_scale] = scaled_data
    return df

# Пример использования с разными способами масштабирования
train_data_copy = scale_dataframe(train_data_copy,ignore_columns=['ID', 'MARKER'],scaler_type= "standard")
print(train_data_copy)
train_data_copy.to_csv('./data_proc/prepar_train_data.csv')

test_data_copy = scale_dataframe(test_data_copy,ignore_columns=['ID', 'MARKER'],scaler_type= "standard")
print(test_data_copy)
test_data_copy.to_csv('./data_proc/prepar_test_data.csv')


            ID         A         B         C         D         E         F  \
0          1.0  0.260126 -0.096338 -0.851553  0.650218  1.024437  0.448462   
1          2.0 -0.829576 -0.893296 -0.330364 -0.843031 -0.719023 -0.837314   
2          3.0 -0.661182 -0.425997 -0.851553 -0.469356 -0.449556 -0.083343   
3          4.0 -0.761722 -0.572614 -0.851553 -0.693866 -0.610402 -0.319869   
4          5.0 -0.137887  0.337116 -0.851553  0.256104  0.387831  1.148043   
...        ...       ...       ...       ...       ...       ...       ...   
89729  89730.0 -0.934916 -0.958787 -0.590802 -0.126099 -0.502335 -0.642033   
89730  89731.0 -0.290827 -0.410611  1.025437  0.459962  0.301662  0.074794   
89731  89732.0 -0.059867  0.138671 -0.548978  0.057296 -0.625598 -0.511223   
89732  89733.0 -0.377859 -0.671258 -0.290208 -0.742627 -0.526987 -0.791235   
89733  89734.0 -0.460577 -0.133459 -0.252035 -0.844484 -0.840603 -0.658382   

              G         H         I  ...         P         Q   

In [7]:
# общий отбор признаков
# Функция для удаления признаков с малым разнообразием значений
def remove_low_variance_features(df, threshold=0.95, ignore_columns=[]):
    num_rows = len(df)
    columns_to_check = [col for col in df.columns if col not in ignore_columns]
    low_variance_columns = [col for col in columns_to_check if (df[col].value_counts() / num_rows).max() > threshold]
    df_filtered = df.drop(columns=low_variance_columns)
    return df_filtered

# Функция для удаления признаков с высокой корреляцией между собой
def remove_high_correlation_features(df, y_df, threshold=0.8, ignore_columns=['ID', 'MARKER']):
    corr_matrix = df.corr()  # Вычисляет матрицу корреляции для всех столбцов в df
    to_drop = []  # Создает пустой список, в который будут добавляться столбцы для удаления

    for column in corr_matrix.columns:  # Проходит по каждому столбцу матрицы корреляции
        if column in ignore_columns:
            continue  # Пропускает столбцы, указанные в ignore_columns

        correlated_columns = corr_matrix.index[corr_matrix[column] > threshold].tolist()  # Находит список столбцов, которые сильно коррелируют с текущим столбцом
        if correlated_columns:  # Если список сильно коррелирующих столбцов не пустой
            max_corr_with_target = max(correlated_columns, key=lambda col: abs(df[column].corr(y_df)))
            # Находит столбец с максимальной абсолютной корреляцией с целевым признаком y_df

            if column != max_corr_with_target and max_corr_with_target not in ignore_columns:  # Добавляем проверку, чтобы игнорировать столбцы из ignore_columns
                to_drop.append(column)  # Добавляет текущий столбец в список для удаления

    df_filtered = df.drop(columns=to_drop)  # Удаляет столбцы из списка to_drop из исходного dataframe df
    return df_filtered  # Возвращает новый dataframe с удаленными столбцами


#Univariate feature selection

def select_features_roc_auc(X, y, k=17, ignore_columns=['ID', 'MARKER']):
    selector = SelectKBest(score_func=f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    selected_features = X.columns[selector.get_support()]

    X_new_df = pd.DataFrame(X_new, columns=selected_features, index=X.index)

    for col in ignore_columns:
        X_new_df[col] = X[col]
    return X_new_df, selected_features



train_data_copy = remove_low_variance_features(train_data_copy,threshold=0.85, ignore_columns=['ID', 'MARKER'])
print(train_data_copy.columns,len(train_data_copy.columns))

train_data_copy = remove_high_correlation_features(train_data_copy,y_copy,threshold=0.75,ignore_columns=['ID', 'MARKER'])
print(train_data_copy.columns,len(train_data_copy.columns))

train_data_copy,important_columns = select_features_roc_auc(train_data_copy, train_data_copy["MARKER"], k=17)
print(train_data_copy.columns,len(train_data_copy.columns))



Index(['ID', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
       'N', 'O', 'P', 'R', 'S', 'T', 'W', 'X', 'MARKER'],
      dtype='object') 23
Index(['ID', 'A', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
       'P', 'R', 'S', 'T', 'W', 'X', 'MARKER'],
      dtype='object') 21
Index(['ID', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'R', 'S',
       'W', 'X', 'MARKER'],
      dtype='object') 17


  f = msb / msw


In [8]:
# Oversampling

def perform_random_oversampling(df, target_column_name):
    # Выделите признаки и целевую переменную
    """
    X = df.drop(columns=[target_column_name])
    y = df[target_column_name]
    """
    # Разделим данные по значениям целевой переменной
    df_majority = df[df[target_column_name] == 0]
    df_minority = df[df[target_column_name] == 1]

    # Применим Random Over-sampling к классу с меньшим количеством записей
    df_minority_upsampled = resample(df_minority, replace=True, n_samples=len(df_majority), random_state=42)

    # Объединим балансированные классы
    balanced_df = pd.concat([df_majority, df_minority_upsampled])

    return balanced_df

"""
# Примените функцию для создания новой таблицы с балансированными классами
train_data_copy = perform_random_oversampling(train_data_copy, target_column_name="MARKER")
print(train_data_copy)
train_data_copy.to_csv("random_oversampling.csv")
"""

'\n# Примените функцию для создания новой таблицы с балансированными классами\ntrain_data_copy = perform_random_oversampling(train_data_copy, target_column_name="MARKER")\nprint(train_data_copy)\ntrain_data_copy.to_csv("random_oversampling.csv")\n'

In [9]:
# логистическая регрессия 3
def build_logistic_regression_model(df, target_column_name):
    # Выделите признаки и целевую переменную
    X = df.drop(columns=['ID', target_column_name])
    y = df[target_column_name]

    # Разделите данные на обучающий и тестовый наборы
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Определите параметры для перебора
    param_grid = {
        'penalty': ['l1', 'l2'],  # Перебор L1 и L2 регуляризации
        'C': np.logspace(-4, 4, 30)  # Перебор параметра регуляризации C
    }

    # Создайте модель логистической регрессии
    logreg = LogisticRegression(solver='liblinear', class_weight='balanced')

    # Создайте объект GridSearchCV для перебора параметров
    #grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='f1')
    grid_search = GridSearchCV(logreg, param_grid, cv=5, scoring='roc_auc')

    # Выполните поиск по сетке на обучающем наборе
    grid_search.fit(X_train, y_train)

    # Получите наилучшие параметры и значение метрики F1-score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    print("Best Parameters:", best_params)
    print("Best F1 Score:", best_score)

    # Получите наилучшую модель
    best_model = grid_search.best_estimator_

    # Обучите модель на выбранных наиболее важных признаках
    coefficients = best_model.coef_[0]
    feature_names = X.columns
    coefficients_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})

    # Отберите 10 наиболее важных признаков
    top_features = coefficients_df.iloc[np.abs(coefficients_df['Coefficient']).nlargest(10).index]
    selected_features = top_features['Feature'].tolist()
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Обучение модели на выбранных признаках
    best_model.fit(X_train_selected, y_train)

    # Предсказания для тестового набора
    y_pred = best_model.predict(X_test_selected)

    # Вычислите метрику F1-score на тестовом наборе
    test_f1_score = f1_score(y_test, y_pred)
    print("Test F1 Score:", test_f1_score)

    return best_model, top_features


def evaluate_with_threshold(model, test_data, target_column_name, selected_features, threshold=0.5):
    # Выделите признаки из test данных
    test_X = test_data[selected_features]

    # Получите вероятности для класса 1
    test_probabilities = model.predict_proba(test_X)[:, 1]

    # Примените порог вероятности для получения предсказаний (0 или 1)
    test_predictions = (test_probabilities > threshold).astype(int)

    # Создайте DataFrame с предсказанными значениями
    predictions_df = pd.DataFrame({'Prediction': test_predictions})

    # Запишите DataFrame в CSV-файл
    predictions_df.to_csv("predictions.csv", index=False)

    target_data = test_data[target_column_name]

    # Вычислим метрику F1-score на test данных
    test_f1 = f1_score(target_data, test_predictions)

    # Вычислим метрику ROC AUC на test данных
    test_roc_auc = roc_auc_score(target_data, test_probabilities)

    return test_predictions, test_f1, test_roc_auc


final_model, coefficients = build_logistic_regression_model(train_data_copy, target_column_name="MARKER")
print("Selected Features:", coefficients)

# Получите список выбранных признаков
selected_features = coefficients['Feature'].tolist()

# Используйте разные значения порога вероятности и смотрите, как изменяются метрики
threshold = 0.965  # Пример значения порога
test_predictions, test_f1_score, test_roc_auc = evaluate_with_threshold(final_model, test_data_copy, target_column_name="MARKER", selected_features=selected_features, threshold=threshold)
print("Test Predictions:", test_predictions)
print("Test F1 Score (Threshold={}):".format(threshold), test_f1_score)
print("Test ROC AUC Score (Threshold={}):".format(threshold), test_roc_auc)


Best Parameters: {'C': 0.004520353656360241, 'penalty': 'l1'}
Best F1 Score: 0.8734017484448844
Test F1 Score: 0.03255069370330843
Selected Features:    Feature  Coefficient
4        H     0.914290
1        D     0.669932
7        L    -0.603961
9        N     0.525908
11       R    -0.418870
10       O    -0.416445
13       W    -0.270730
14       X    -0.200644
8        M    -0.178622
5        I    -0.160928
Test Predictions: [0 0 0 ... 0 0 0]
Test F1 Score (Threshold=0.965): 0.0855614973262032
Test ROC AUC Score (Threshold=0.965): 0.8742808120714367


In [10]:
# универсальный алгоритм
"""
def optimize_hyperparameters(X, y, model_type):
    def objective(trial):
        if model_type == 'logistic_regression':
            param_grid = {
                'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
                'C': trial.suggest_loguniform('C', 1e-6, 1e4)
            }
        # Другие модели и гиперпараметры
        # ...

        model = LogisticRegression(solver='liblinear', class_weight='balanced', **param_grid)
        scores = cross_val_score(model, X, y, cv=5, scoring='roc_auc', n_jobs=2)
        return np.mean(scores)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=10)
    best_params = study.best_params
    return best_params

def train_model(X_train, y_train, model_type, hyperparameters=None):
    if model_type == 'logistic_regression':
        if hyperparameters is None:
            model = LogisticRegression(solver='liblinear', class_weight='balanced')
        else:
            model = LogisticRegression(solver='liblinear', class_weight='balanced', **hyperparameters)
    # Другие модели и параметры
    # ...

    model.fit(X_train, y_train)
    return model

def evaluate_model(model, X_test, y_test):
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    optimal_threshold_index = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_threshold_index]
    y_pred = (y_pred_prob > optimal_threshold).astype(int)

    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    return y_pred, f1, roc_auc


def fitness_score(model_type, X_train, y_train, selected_features):
    model = train_model(X_train[selected_features], y_train, model_type)
    _, _, roc_auc_score = evaluate_model(model, X_train[selected_features], y_train)
    return roc_auc_score

def mutate(selected_features, mutation_rate):
    mutated_features = selected_features.copy()
    for i in range(len(mutated_features)):
        if random.random() < mutation_rate:
            mutated_features[i] = random.randint(0, 1)  # 1 or 0 with equal probability
    return mutated_features


def genetic_algorithm(X_train, y_train, model_type,num_gener,popul_size,mutation_rate):
    num_features = X_train.shape[1]
    column_names = X_train.columns

    population = [random.choices([0, 1], k=num_features) for _ in range(popul_size)]

    for generation in range(num_gener):
        scores = []
        for selected_features in population:
            selected_column_names = [column_names[i] for i, select in enumerate(selected_features) if select == 1]
            score = fitness_score(model_type, X_train, y_train, selected_column_names)
            scores.append((selected_features, score))

        scores.sort(key=lambda x: x[1], reverse=True)
        elite = scores[:num_features // 2]

        new_population = [selected_features for selected_features, _ in elite]
        while len(new_population) < num_features:
            parent1, parent2 = elite[0][0], elite[1][0]  # Use the top 2 parents
            child = [parent1[i] if random.random() < 0.5 else parent2[i] for i in range(num_features)]
            child = mutate(child, mutation_rate)
            new_population.append(child)

        population = new_population

    best_features = max(scores, key=lambda x: x[1])[0]
    return best_features


def run_experiment(train_data, test_data, target_column_name, model_type, use_gen_alg=False, num_gener=10, popul_size=20,select_feature = False,num_features= False, mutat_rate=0.1, hyp_optimize=True):
    X_train = train_data.drop(columns=['ID', target_column_name])
    y_train = train_data[target_column_name]

    if select_feature is False:
        best_features = list(X_train.columns)  # Use all features
    else:
        if use_gen_alg and not num_features:
            print("генетический")
            best_feature_indices = genetic_algorithm(X_train, y_train, model_type, num_gener, popul_size, mutat_rate)
            best_features = [X_train.columns[i] for i, select in enumerate(best_feature_indices) if select == 1]
        else:
            if model_type == 'logistic_regression' and num_features:
                print("встроенный")
                # Use top N features based on coefficients if not optimizing hyperparameters
                model = train_model(X_train, y_train, model_type)
                coef_abs = np.abs(model.coef_[0])
                best_features_idx = np.argsort(coef_abs)[-num_features:]
                best_features = list(X_train.columns[best_features_idx])

    print("Selected Features:", best_features)

    X_train_selected = X_train[best_features]
    X_test_selected = test_data.drop(columns=['ID', target_column_name])[best_features]

    if hyp_optimize:
        best_hyperparameters = optimize_hyperparameters(X_train_selected, y_train, model_type)
        print("Best Hyperparameters:", best_hyperparameters)
    else:
        best_hyperparameters = None

    model = train_model(X_train_selected, y_train, model_type, best_hyperparameters)

    y_pred, f1, roc_auc = evaluate_model(model, X_test_selected, test_data[target_column_name])
    print("Test Predictions:", y_pred)
    print("Test F1 Score:", f1)
    print("Test ROC AUC Score:", roc_auc)
"""

# Использование функции с генетическим алгоритмом и оптимизацией гиперпараметров
#run_experiment(train_data_copy, test_data_copy, 'MARKER', 'logistic_regression',use_gen_alg = True, num_gener=2, popul_size=5, select_feature = True,num_features=False, mutat_rate=0.999, hyp_optimize=True)



'\ndef optimize_hyperparameters(X, y, model_type):\n    def objective(trial):\n        if model_type == \'logistic_regression\':\n            param_grid = {\n                \'penalty\': trial.suggest_categorical(\'penalty\', [\'l1\', \'l2\']),\n                \'C\': trial.suggest_loguniform(\'C\', 1e-6, 1e4)\n            }\n        # Другие модели и гиперпараметры\n        # ...\n\n        model = LogisticRegression(solver=\'liblinear\', class_weight=\'balanced\', **param_grid)\n        scores = cross_val_score(model, X, y, cv=5, scoring=\'roc_auc\', n_jobs=2)\n        return np.mean(scores)\n\n    study = optuna.create_study(direction=\'maximize\')\n    study.optimize(objective, n_trials=10)\n    best_params = study.best_params\n    return best_params\n\ndef train_model(X_train, y_train, model_type, hyperparameters=None):\n    if model_type == \'logistic_regression\':\n        if hyperparameters is None:\n            model = LogisticRegression(solver=\'liblinear\', class_weight=\'ba

In [24]:
# универсальный алгоритм 2
# Оптимизация гиперпараметров
def optimize_hyperparameters(X, y, model_type):
    def objective(trial):
        if model_type == 'logistic_regression':
            param_grid = {
                'penalty': trial.suggest_categorical('penalty', ['l1', 'l2']),
                'C': trial.suggest_loguniform('C', 1e-6, 1e4)
            }
            model = LogisticRegression(solver='liblinear', class_weight='balanced', **param_grid)
        elif model_type == 'random_forest':
            param_grid = {
                'n_estimators': trial.suggest_int('n_estimators', 50, 350, step=50),
                'max_depth': trial.suggest_int('max_depth', 3, 15),
                'min_samples_split': trial.suggest_float('min_samples_split', 0.05, 1.0),
                'min_samples_leaf': trial.suggest_float('min_samples_leaf', 0.05, 0.5)
            }
            model = RandomForestClassifier(
                n_estimators=param_grid['n_estimators'],
                max_depth=param_grid['max_depth'],
                min_samples_split=param_grid['min_samples_split'],
                min_samples_leaf=param_grid['min_samples_leaf'],
                class_weight='balanced',
                random_state=42
            )
        elif model_type == 'adaboost':
            n_estimators = trial.suggest_int('n_estimators', 5, 75, step=5)
            learning_rate = trial.suggest_float('learning_rate', 0.001, 1.0, log=True)

            estimator = DecisionTreeClassifier(class_weight='balanced')
            model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, estimator=estimator, random_state=42)
        # Добавьте другие модели и гиперпараметры по мере необходимости

        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc' )
        #average_score = scores.mean()
        #return average_score
        return np.mean(scores)

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20,n_jobs= 4)
    best_params = study.best_params
    return best_params

# Обучение модели
def train_model(X_train, y_train, model_type, hyperparameters=None):
    if model_type == 'logistic_regression':
        if hyperparameters is None:
            model = LogisticRegression(solver='liblinear', class_weight='balanced')
        else:
            model = LogisticRegression(solver='liblinear', class_weight='balanced', **hyperparameters)
    elif model_type == 'random_forest':
        model = RandomForestClassifier(
            #class_weight='balanced',
            random_state=42
        )
    elif model_type == 'adaboost':
        if hyperparameters is None:
            model = AdaBoostClassifier(random_state=42)
        else:
            model = AdaBoostClassifier(n_estimators=hyperparameters['n_estimators'], learning_rate=hyperparameters['learning_rate'], random_state=42)
    # Добавьте другие модели и параметры по мере необходимости

    model.fit(X_train, y_train)
    return model

# Оценка модели
def evaluate_model(model, X_test, y_test):
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    optimal_threshold_index = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_threshold_index]
    y_pred = (y_pred_prob > optimal_threshold).astype(int)

    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    return y_pred, f1, roc_auc

def fitness_score(model_type, X_train, y_train, selected_features):
    model = train_model(X_train[selected_features], y_train, model_type)
    _, _, roc_auc_score = evaluate_model(model, X_train[selected_features], y_train)
    return roc_auc_score

def mutate(selected_features, mutation_rate):
    mutated_features = selected_features.copy()
    for i in range(len(mutated_features)):
        if random.random() < mutation_rate:
            mutated_features[i] = random.randint(0, 1)  # 1 or 0 with equal probability
    return mutated_features

# Генетический алгоритм
def genetic_algorithm(X_train, y_train, model_type, num_gener, popul_size, mutation_rate):
    num_features = X_train.shape[1]
    column_names = X_train.columns

    population = [random.choices([0, 1], k=num_features) for _ in range(popul_size)]

    for generation in range(num_gener):
        scores = []
        for selected_features in population:
            selected_column_names = [column_names[i] for i, select in enumerate(selected_features) if select == 1]
            score = fitness_score(model_type, X_train, y_train, selected_column_names)
            scores.append((selected_features, score))

        scores.sort(key=lambda x: x[1], reverse=True)
        elite = scores[:num_features // 2]

        new_population = [selected_features for selected_features, _ in elite]
        while len(new_population) < num_features:
            parent1, parent2 = elite[0][0], elite[1][0]  # Use the top 2 parents
            child = [parent1[i] if random.random() < 0.5 else parent2[i] for i in range(num_features)]
            child = mutate(child, mutation_rate)
            new_population.append(child)

        population = new_population

    best_features = max(scores, key=lambda x: x[1])[0]
    return best_features

# Запуск эксперимента
def run_experiment(train_data, test_data, target_column_name, model_type, use_gen_alg=False, num_gener=10, popul_size=20, num_features=False, mutat_rate=0.1, hyp_optimize=True):
    X_train = train_data.drop(columns=['ID', target_column_name])
    y_train = train_data[target_column_name]

    if (use_gen_alg or num_features) is False:
        best_features = list(X_train.columns)  # Use all features
    else:
        if use_gen_alg and not num_features:
            print("выбран генетический алгоритм")
            best_feature_indices = genetic_algorithm(X_train, y_train, model_type, num_gener, popul_size, mutat_rate)
            best_features = [X_train.columns[i] for i, select in enumerate(best_feature_indices) if select == 1]
        else:
            if model_type == 'logistic_regression' and num_features:
                print("выбран встроенный алгоритм")
                # Use top N features based on coefficients if not optimizing hyperparameters
                model = train_model(X_train, y_train, model_type)
                coef_abs = np.abs(model.coef_[0])
                best_features_idx = np.argsort(coef_abs)[-num_features:]
                best_features = list(X_train.columns[best_features_idx])

    print("Selected Features:", best_features)

    X_train_selected = X_train[best_features]
    X_test_selected = test_data.drop(columns=['ID', target_column_name])[best_features]

    if hyp_optimize:
        best_hyperparameters = optimize_hyperparameters(X_train_selected, y_train, model_type)
        print("Best Hyperparameters:", best_hyperparameters)
    else:
        best_hyperparameters = None

    model = train_model(X_train_selected, y_train, model_type, best_hyperparameters)

    y_pred, f1, roc_auc = evaluate_model(model, X_test_selected, test_data[target_column_name])
    print("Test Predictions:", y_pred)
    print("Test F1 Score:", f1)
    print("Test ROC AUC Score:", roc_auc)

print("column to input:",train_data_copy.columns)
run_experiment(train_data_copy, test_data_copy, 'MARKER', "random_forest", use_gen_alg=True, num_gener=5, popul_size=4,num_features=False, mutat_rate=0.999, hyp_optimize=True)

column to input: Index(['ID', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'R', 'S',
       'W', 'X', 'MARKER'],
      dtype='object')
выбран генетический алгоритм


[I 2023-09-10 16:13:20,614] A new study created in memory with name: no-name-6a83ae1c-86d1-4811-8da0-64c1f762bd23


Selected Features: ['D', 'E', 'G', 'J', 'M', 'R', 'S', 'X']


[I 2023-09-10 16:13:24,558] Trial 3 finished with value: 0.7854037999367399 and parameters: {'n_estimators': 50, 'max_depth': 11, 'min_samples_split': 0.4236322360935034, 'min_samples_leaf': 0.18970406173676185}. Best is trial 3 with value: 0.7854037999367399.
[I 2023-09-10 16:13:25,266] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 100, 'max_depth': 4, 'min_samples_split': 0.2232286922779207, 'min_samples_leaf': 0.4028569987779269}. Best is trial 3 with value: 0.7854037999367399.
[I 2023-09-10 16:13:27,382] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 150, 'max_depth': 15, 'min_samples_split': 0.7711827494084058, 'min_samples_leaf': 0.4231589218229385}. Best is trial 3 with value: 0.7854037999367399.
[I 2023-09-10 16:13:29,050] Trial 4 finished with value: 0.5 and parameters: {'n_estimators': 100, 'max_depth': 9, 'min_samples_split': 0.8792614691553902, 'min_samples_leaf': 0.2928866003140352}. Best is trial 3 with value: 0.7854037999367399.
[I 

Best Hyperparameters: {'n_estimators': 250, 'max_depth': 15, 'min_samples_split': 0.067184037986259, 'min_samples_leaf': 0.06637117554716927}
Test Predictions: [0 0 0 ... 0 0 0]
Test F1 Score: 0.03796133567662566
Test ROC AUC Score: 0.7114088474840015


In [12]:
# случайный лес 2
def objective(trial, X, y):
    n_estimators = trial.suggest_int('n_estimators', 5, 35, step=5)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    min_samples_split = trial.suggest_float('min_samples_split', 0.05, 1.0)
    min_samples_leaf = trial.suggest_float('min_samples_leaf', 0.05, 0.5)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        class_weight='balanced',
        random_state=42
    )

    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X, y, cv=skf, scoring='roc_auc')
    average_score = scores.mean()

    return average_score

def build_and_predict_random_forest(train_data, test_data, target_column_name, selected_features):
    # Выделите признаки и целевую переменную
    X_train = train_data[selected_features]
    y_train = train_data[target_column_name]
    X_test = test_data[selected_features]
    y_test = test_data[target_column_name]

    # Подбор гиперпараметров
    study = optuna.create_study(direction='maximize')
    study.optimize(lambda trial: objective(trial, X_train, y_train), n_trials=40)

    best_params = study.best_params
    best_n_estimators = best_params['n_estimators']
    best_max_depth = best_params['max_depth']
    best_min_samples_split = best_params['min_samples_split']
    best_min_samples_leaf = best_params['min_samples_leaf']

    # Создайте и обучите модель случайного леса с наилучшими гиперпараметрами
    model = RandomForestClassifier(
        n_estimators=best_n_estimators,
        max_depth=best_max_depth,
        min_samples_split=best_min_samples_split,
        min_samples_leaf=best_min_samples_leaf,
        class_weight='balanced',
        random_state=42
    )
    model.fit(X_train, y_train)

    # Прогнозирование на тестовых данных
    y_pred_prob = model.predict_proba(X_test)[:, 1]

    # Вычисление ROC AUC
    roc_auc = roc_auc_score(y_test, y_pred_prob)

    # Вывод ROC AUC
    print("ROC AUC:", roc_auc)

    return model



In [13]:
# Построение и оценка модели случайного леса
model = build_and_predict_random_forest(train_data_copy, test_data_copy, "MARKER", selected_features)

[I 2023-09-10 14:36:55,270] A new study created in memory with name: no-name-23312021-2c38-48db-bb70-40571f1e54cb
[I 2023-09-10 14:36:56,611] Trial 0 finished with value: 0.7353902731883617 and parameters: {'n_estimators': 30, 'max_depth': 15, 'min_samples_split': 0.37361667081640426, 'min_samples_leaf': 0.24498546241411717}. Best is trial 0 with value: 0.7353902731883617.
[I 2023-09-10 14:36:57,304] Trial 1 finished with value: 0.5 and parameters: {'n_estimators': 25, 'max_depth': 13, 'min_samples_split': 0.6663437875547392, 'min_samples_leaf': 0.48864903452924224}. Best is trial 0 with value: 0.7353902731883617.
[I 2023-09-10 14:36:58,098] Trial 2 finished with value: 0.5 and parameters: {'n_estimators': 30, 'max_depth': 3, 'min_samples_split': 0.6970674923114422, 'min_samples_leaf': 0.1580940941403429}. Best is trial 0 with value: 0.7353902731883617.
[I 2023-09-10 14:36:58,616] Trial 3 finished with value: 0.7288766089057195 and parameters: {'n_estimators': 10, 'max_depth': 12, 'min

ROC AUC: 0.8662944201149555


In [14]:
print(train_data_copy.columns,len(train_data_copy.columns))
print(test_data_copy,len(train_data_copy.columns))

Index(['ID', 'C', 'D', 'E', 'G', 'H', 'I', 'J', 'L', 'M', 'N', 'O', 'R', 'S',
       'W', 'X', 'MARKER'],
      dtype='object') 17
            ID         A         B         C         D         E         F  \
0          1.0 -0.537304 -0.258903 -0.528232 -0.623304 -0.267175  0.131267   
1          2.0  0.387210 -0.007078 -0.809791  1.204640  1.082230  0.507430   
2          3.0 -0.785508 -0.614411 -0.809791 -0.331448 -0.555972 -0.291988   
3          4.0 -0.844809 -0.699348 -0.624201 -0.822137 -0.715833 -0.526278   
4          5.0  0.439524  0.585306 -0.809791  1.516391  1.158596  1.392120   
...        ...       ...       ...       ...       ...       ...       ...   
38400  38401.0  2.708937  1.199797 -0.383947  0.668000  0.766830  0.047912   
38401  38402.0 -0.622316 -0.380667 -0.375326 -0.302310 -0.280853  0.111220   
38402  38403.0 -0.467874 -0.159457 -0.265241 -0.832063 -0.788632 -0.632970   
38403  38404.0 -0.826518 -0.673149  3.188120  1.574578 -0.179213  0.260182   
38404  3840

In [15]:
# Adaboosting
# Функция для оптимизации гиперпараметров
def optimize_adaboost_hyperparameters(trial, X_train, y_train):
    n_estimators = trial.suggest_int('n_estimators', 1, 15, step=2)
    learning_rate = trial.suggest_float('learning_rate', 0.001, 1.0, log=True)

    estimator = DecisionTreeClassifier(class_weight='balanced')
    model = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, estimator=estimator, random_state=42)
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')
    return np.mean(scores)

# Функция для обучения и предсказания с оптимальной моделью
def train_and_predict_with_optimal_model(X_train, y_train, X_test, y_test, selected_features, best_params):
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    optimal_model = AdaBoostClassifier(n_estimators=best_params['n_estimators'], learning_rate=best_params['learning_rate'], random_state=42)
    optimal_model.fit(X_train_selected, y_train)

    y_pred_prob = optimal_model.predict_proba(X_test_selected)[:, 1]

    fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
    optimal_threshold_index = np.argmax(tpr - fpr)
    optimal_threshold = thresholds[optimal_threshold_index]

    y_pred = (y_pred_prob > optimal_threshold).astype(int)

    return y_pred

def run_ada_boosting():
    start = time.time()

    # Загрузка данных
    X_train = train_data_copy[selected_features]
    y_train = train_data_copy['MARKER']

    X_test = test_data_copy[selected_features]
    y_test = test_data_copy['MARKER']

    # Создание Study объекта для оптимизации
    study = optuna.create_study(direction='maximize')

    # Запуск оптимизации гиперпараметров
    study.optimize(lambda trial: optimize_adaboost_hyperparameters(trial, X_train, y_train), n_trials=30, n_jobs=2)

    # Получение лучших гиперпараметров
    best_params = study.best_params

    # Обучение модели с оптимальными гиперпараметрами и прогнозирование на тестовом наборе данных
    y_pred = train_and_predict_with_optimal_model(X_train, y_train, X_test, y_test, selected_features, best_params)
    y_pred_prob = model.predict_proba(X_test)[:, 1]
    # Вычисление метрик
    roc_auc = roc_auc_score(y_test, y_pred_prob)
    f1 = f1_score(y_test, y_pred)

    # Вывод результатов оптимизации и оценки модели
    print("Optimal Hyperparameters:", best_params)
    print("Predictions on Test Data:", y_pred)
    np.savetxt('./data_proc/y_pred_boosting.csv', y_pred)
    print("Test ROC AUC:", roc_auc)
    print("Test F1 Score:", f1)

    finish = time.time()
    print("время выполнения", finish - start)

run_ada_boosting()

[I 2023-09-10 14:37:49,758] A new study created in memory with name: no-name-5e7cb630-af35-43d6-a0e7-2ee2be7aef7b
[I 2023-09-10 14:37:56,741] Trial 0 finished with value: 0.6497533779659511 and parameters: {'n_estimators': 11, 'learning_rate': 0.6866948883879619}. Best is trial 0 with value: 0.6497533779659511.
[I 2023-09-10 14:37:56,951] Trial 1 finished with value: 0.5365166598689757 and parameters: {'n_estimators': 9, 'learning_rate': 0.011134805834852796}. Best is trial 0 with value: 0.6497533779659511.
[I 2023-09-10 14:38:02,171] Trial 2 finished with value: 0.5492117878222452 and parameters: {'n_estimators': 7, 'learning_rate': 0.10516411972844596}. Best is trial 0 with value: 0.6497533779659511.
[I 2023-09-10 14:38:02,979] Trial 4 finished with value: 0.5213869730715073 and parameters: {'n_estimators': 1, 'learning_rate': 0.03229221516519538}. Best is trial 0 with value: 0.6497533779659511.
[I 2023-09-10 14:38:03,509] Trial 3 finished with value: 0.6393337950433081 and parameter

Optimal Hyperparameters: {'n_estimators': 15, 'learning_rate': 0.1500292474525289}
Predictions on Test Data: [1 1 0 ... 0 0 0]
Test ROC AUC: 0.8662944201149555
Test F1 Score: 0.03922361504245855
время выполнения 126.00276303291321
