In [2]:
import os
import sys
import glob
import argparse
from collections import Counter, defaultdict
from typing import Iterator, Tuple, Union, List
from multiprocessing import Pool, cpu_count
from collections import Counter
import pandas as pd
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import (
    StratifiedKFold,
    cross_val_score,
    train_test_split,
    RandomizedSearchCV,
)
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

from xgboost import XGBClassifier
import optuna

  from .autonotebook import tqdm as notebook_tqdm


In [3]:

AA_ALPHABET = list("ACDEFGHIKLMNPQRSTVWY")


def extract_kmers(seq, k=3, alphabet=AA_ALPHABET):
    if not isinstance(seq, str):
        return []
    seq = seq.strip()
    n = len(seq)
    if n < k:
        return []
    out = []
    for i in range(n - k + 1):
        kmer = seq[i:i + k]
        if all(c in alphabet for c in kmer):
            out.append(kmer)
    return out



In [4]:
def convert_to_top_seq_format(top_df, dataset_name, default_prob=-999.0):
    df = top_df.copy().reset_index(drop=True)

    n = len(df)
    df.insert(0, "ID", [f"{dataset_name}_seq_top_{i+1}" for i in range(n)])
    df.insert(1, "dataset", dataset_name)
    df.insert(2, "label_positive_probability", float(default_prob))

    cols = [
        "ID",
        "dataset",
        "label_positive_probability",
        "junction_aa",
        "v_call",
        "j_call",
    ]
    # keep importance_score (and any other extra cols) at the end
    cols = cols + [c for c in df.columns if c not in cols]
    return df[cols]
def to_submission_format(test_pred_df, dataset_name="test_dataset_1"):
    df = test_pred_df.copy()

    df["ID"] = df["repertoire_id"]
    df["dataset"] = dataset_name
    df["label_positive_probability"] = df["prediction"]

    df["junction_aa"] = -999.0
    df["v_call"] = -999.0
    df["j_call"] = -999.0

    cols = [
        "ID",
        "dataset",
        "label_positive_probability",
        "junction_aa",
        "v_call",
        "j_call",
    ]
    return df[cols]

In [5]:
def load_data_generator(data_dir: str, metadata_filename='metadata.csv') -> Iterator[
    Union[Tuple[str, pd.DataFrame, bool], Tuple[str, pd.DataFrame]]]:
    """
    A generator to load immune repertoire data.

    This function operates in two modes:
    1.  If metadata is found, it yields data based on the metadata file.
    2.  If metadata is NOT found, it uses glob to find and yield all '.tsv'
        files in the directory.

    Args:
        data_dir (str): The path to the directory containing the data.

    Yields:
        An iterator of tuples. The format depends on the mode:
        - With metadata: (repertoire_id, pd.DataFrame, label_positive)
        - Without metadata: (filename, pd.DataFrame)
    """
    metadata_path = os.path.join(data_dir, metadata_filename)

    if os.path.exists(metadata_path):
        metadata_df = pd.read_csv(metadata_path)
        for row in metadata_df.itertuples(index=False):
            file_path = os.path.join(data_dir, row.filename)
            try:
                repertoire_df = pd.read_csv(file_path, sep='\t')
                yield row.repertoire_id, repertoire_df, row.label_positive
            except FileNotFoundError:
                print(f"Warning: File '{row.filename}' listed in metadata not found. Skipping.")
                continue
    else:
        search_pattern = os.path.join(data_dir, '*.tsv')
        tsv_files = glob.glob(search_pattern)
        for file_path in sorted(tsv_files):
            try:
                filename = os.path.basename(file_path)
                repertoire_df = pd.read_csv(file_path, sep='\t')
                yield filename, repertoire_df
            except Exception as e:
                print(f"Warning: Could not read file '{file_path}'. Error: {e}. Skipping.")
                continue


def load_full_dataset(data_dir: str) -> pd.DataFrame:
    """
    Loads all TSV files from a directory and concatenates them into a single DataFrame.

    This function handles two scenarios:
    1. If metadata.csv exists, it loads data based on the metadata and adds
       'repertoire_id' and 'label_positive' columns.
    2. If metadata.csv does not exist, it loads all .tsv files and adds
       a 'filename' column as an identifier.

    Args:
        data_dir (str): The path to the data directory.

    Returns:
        pd.DataFrame: A single, concatenated DataFrame containing all the data.
    """
    metadata_path = os.path.join(data_dir, 'metadata.csv')
    df_list = []
    data_loader = load_data_generator(data_dir=data_dir)

    if os.path.exists(metadata_path):
        metadata_df = pd.read_csv(metadata_path)
        total_files = len(metadata_df)
        for rep_id, data_df, label in tqdm(data_loader, total=total_files, desc="Loading files"):
            data_df['ID'] = rep_id
            data_df['label_positive'] = label
            df_list.append(data_df)
    else:
        search_pattern = os.path.join(data_dir, '*.tsv')
        total_files = len(glob.glob(search_pattern))
        for filename, data_df in tqdm(data_loader, total=total_files, desc="Loading files"):
            data_df['ID'] = os.path.basename(filename).replace(".tsv", "")
            df_list.append(data_df)

    if not df_list:
        print("Warning: No data files were loaded.")
        return pd.DataFrame()

    full_dataset_df = pd.concat(df_list, ignore_index=True)
    return full_dataset_df

def mismatched_neighbors(kmer, alphabet=AA_ALPHABET, max_mismatches=1, include_self=True):
    k = len(kmer)
    if max_mismatches == 0:
        return [kmer] if include_self else []
    neighbors = set()
    if include_self:
        neighbors.add(kmer)
    for pos in range(k):
        for aa in alphabet:
            if aa == kmer[pos]:
                continue
            new_kmer = kmer[:pos] + aa + kmer[pos + 1 :]
            neighbors.add(new_kmer)
    return list(neighbors)


def mismatch_smooth_counts(counts, k=3, alphabet=AA_ALPHABET):
    out = Counter()
    for kmer, c in counts.items():
        if len(kmer) != k:
            continue
        for nb in mismatched_neighbors(kmer, alphabet=alphabet, max_mismatches=1, include_self=True):
            out[nb] += c
    return out


In [6]:
AA_ALPHABET = list("ACDEFGHIKLMNPQRSTVWY")


def extract_gapped_trimers(seq, patterns=((1,0), (0,1),
(2,0), (0,2), (1,1),
(3,0), (0,3), (2,1), (1,2)), alphabet=AA_ALPHABET):
    if not isinstance(seq, str):
        return []
    seq = seq.strip()
    n = len(seq)
    out = []
    for gap1, gap2 in patterns:
        window = 3 + gap1 + gap2
        if n < window:
            continue
        for i in range(n - window + 1):
            a = seq[i]
            b = seq[i + 1 + gap1]
            c = seq[i + 2 + gap1 + gap2]
            if a in alphabet and b in alphabet and c in alphabet:
                key = f"{a}{b}{c}|g{gap1}{gap2}"
                out.append(key)
    return out


def encode_repertoire(
    seqs,
    k=3,
    use_gaps=False,
    use_mismatch=False,
    gap_patterns=((1,0), (0,1),
(2,0), (0,2), (1,1),
(3,0), (0,3), (2,1), (1,2)),
    alphabet=AA_ALPHABET,
):
    exact_counts = Counter()
    gap_counts = Counter()

    for s in seqs:
        if not isinstance(s, str):
            continue
        exact_counts.update(extract_kmers(s, k=k, alphabet=alphabet))
        if use_gaps and k == 3:
            gap_counts.update(
                extract_gapped_trimers(
                    s,
                    patterns=gap_patterns,
                    alphabet=alphabet,
                )
            )

    if use_mismatch:
        mm_counts = mismatch_smooth_counts(exact_counts, k=k, alphabet=alphabet)
    else:
        mm_counts = Counter()

    features = {}
    for kmer, c in exact_counts.items():
        features[f"exact_{kmer}"] = c
    for kmer, c in gap_counts.items():
        features[f"gap_{kmer}"] = c
    for kmer, c in mm_counts.items():
        features[f"mm1_{kmer}"] = c

    return features

def load_and_encode_repertoires_advanced(
    data_dir,
    k=3,
    use_gaps=False,
    use_mismatch=False,
    metadata_filename="metadata.csv",
    n_jobs=None,
):
    metadata_path = os.path.join(data_dir, metadata_filename)
    use_metadata = os.path.exists(metadata_path)

    tasks = []

    if use_metadata:
        metadata_df = pd.read_csv(metadata_path)
        for row in metadata_df.itertuples(index=False):
            path = os.path.join(data_dir, row.filename)
            tasks.append((row.repertoire_id, path, row.label_positive, k, use_gaps, use_mismatch))
    else:
        pattern = os.path.join(data_dir, "*.tsv")
        files = sorted(glob.glob(pattern))
        for path in files:
            rep_id = os.path.basename(path).replace(".tsv", "")
            tasks.append((rep_id, path, None, k, use_gaps, use_mismatch))

    total = len(tasks)
    if total == 0:
        return pd.DataFrame(), pd.DataFrame()

    if n_jobs is None or n_jobs < 1:
        n_jobs = cpu_count()

    feature_records = []
    meta_records = []

    with Pool(processes=n_jobs) as pool:
        for feats, meta in tqdm(
            pool.imap(_encode_one_repertoire, tasks),
            total=total,
            desc=f"Encoding k={k} advanced",
        ):
            feature_records.append(feats)
            meta_records.append(meta)

    X = pd.DataFrame(feature_records).fillna(0).set_index("ID")
    meta_df = pd.DataFrame(meta_records)
    return X, meta_df

def _encode_one_repertoire(args):
    rep_id, path, label, k, use_gaps, use_mismatch = args
    df = pd.read_csv(path, sep="\t")
    seqs = df["junction_aa"].dropna().astype(str).tolist()
    feats = encode_repertoire(
        seqs,
        k=k,
        use_gaps=use_gaps,
        use_mismatch=use_mismatch,
    )
    feats["ID"] = rep_id

    meta = {"ID": rep_id}
    if label is not None:
        meta["label_positive"] = label

    return feats, meta




In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

def fit_xgb_no_leakage_with_importance(
    X,
    y,
    n_top_features=200,
    random_state=123,
    n_iter=100,
    n_jobs=-1,
):
    y_arr = np.asarray(y).astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y_arr,
        test_size=0.2,
        stratify=y_arr,
        random_state=random_state,
    )

    model_full, cv_scores_full, importance_full = fit_xgb(
        X_train,
        y_train,
        random_state=random_state,
        n_iter=n_iter,
        n_jobs=n_jobs,
    )

    top_features = importance_full["feature"].head(n_top_features).tolist()

    X_train_top = X_train.loc[:, top_features]
    X_test_top = X_test.loc[:, top_features]

    model_top, cv_scores_top, importance_top = fit_xgb(
        X_train_top,
        y_train,
        random_state=random_state,
        n_iter=n_iter,
        n_jobs=n_jobs,
    )

    y_test_pred = model_top.predict_proba(X_test_top)[:, 1]
    test_auc = roc_auc_score(y_test, y_test_pred)

    return {
        "initial_model": model_full,
        "initial_cv_scores": cv_scores_full,
        "initial_importance": importance_full,
        "selected_features": top_features,
        "final_model": model_top,
        "final_cv_scores": cv_scores_top,
        "final_importance": importance_top,
        "test_auc": test_auc,
    }

In [8]:
def fit_xgb(X, y, random_state=123, n_iter=150, n_jobs=-1):
    y_arr = np.asarray(y).astype(int)

    cv = StratifiedKFold(
        n_splits=5,
        shuffle=True,
        random_state=random_state,
    )

    max_depth_values = np.arange(3, 8)
    n_estimators_values = np.linspace(200, 1000, 9, dtype=int)
    learning_rate_values = np.logspace(-3, -0.7, 8)
    subsample_values = np.linspace(0.6, 1.0, 5)
    colsample_bytree_values = np.linspace(0.6, 1.0, 5)
    min_child_weight_values = [3, 5, 7]
    gamma_values = [0.0, 0.1, 0.2, 0.5]
    reg_lambda_values = np.logspace(-1, 2, 6)
    reg_alpha_values = [0.0, 1e-3, 1e-2, 1e-1, 1.0, 10.0]

    fixed_params = dict(
        objective="binary:logistic",
        eval_metric="auc",
        tree_method="hist",
        random_state=random_state,
        n_jobs=-1,
        device="cpu",
    )

    def objective(trial):
        params = {
            "max_depth": trial.suggest_categorical("max_depth", list(max_depth_values)),
            "n_estimators": trial.suggest_categorical("n_estimators", list(n_estimators_values)),
            "learning_rate": trial.suggest_categorical("learning_rate", list(learning_rate_values)),
            "subsample": trial.suggest_categorical("subsample", list(subsample_values)),
            "colsample_bytree": trial.suggest_categorical("colsample_bytree", list(colsample_bytree_values)),
            "min_child_weight": trial.suggest_categorical("min_child_weight", min_child_weight_values),
            "gamma": trial.suggest_categorical("gamma", gamma_values),
            "reg_lambda": trial.suggest_categorical("reg_lambda", list(reg_lambda_values)),
            "reg_alpha": trial.suggest_categorical("reg_alpha", reg_alpha_values),
        }
        model = XGBClassifier(**fixed_params, **params)
        scores = cross_val_score(
            model,
            X.values,
            y_arr,
            cv=cv,
            scoring="roc_auc",
            n_jobs=n_jobs,
        )
        return scores.mean()

    sampler = optuna.samplers.TPESampler(seed=random_state)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_iter, n_jobs=7,    show_progress_bar=True)

    best_params = study.best_params
    best_model = XGBClassifier(**fixed_params, **best_params)
    best_model.fit(X.values, y_arr)

    scores = cross_val_score(
        best_model,
        X.values,
        y_arr,
        cv=cv,
        scoring="roc_auc",
        n_jobs=n_jobs,
    )

    importance = pd.DataFrame(
        {"feature": X.columns, "importance": best_model.feature_importances_}
    ).sort_values("importance", ascending=False)

    return best_model, scores, importance

In [9]:
def basic_kmer_feature_filter(X, min_nonzero_repertoires=5, min_total_count=10):
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    nonzero_counts = (X != 0).sum(axis=0)
    total_counts = X.sum(axis=0)

    mask = (nonzero_counts >= min_nonzero_repertoires) & (total_counts >= min_total_count)
    X_filtered = X.loc[:, mask]
    selected_features = X_filtered.columns.tolist()
    return X_filtered, selected_features


def apply_feature_filter(X, selected_features):
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    for f in selected_features:
        if f not in X.columns:
            X[f] = 0

    return X[selected_features]

In [10]:
import pandas as pd
import numpy as np

def normalize_kmer_rows_by_category(X, col_categories=None):
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)

    if col_categories is None:
        cats = []
        for c in X.columns:
            if "_" in c:
                cats.append(c.split("_", 1)[0])
            else:
                cats.append("other")
        col_categories = pd.Index(cats)
    else:
        col_categories = pd.Index(col_categories)

    group_sums = X.groupby(col_categories, axis=1).transform("sum")
    X_norm = X.div(group_sums.replace(0, np.nan))
    return X_norm.fillna(0.0)

In [11]:
from collections import Counter
from multiprocessing import Pool, cpu_count
from functools import partial
from tqdm import tqdm
import pandas as pd


def _score_one_sequence(
    s,
    k,
    alphabet,
    gap_patterns,
    use_gaps,
    use_mismatch,
    importance_map,
):
    s = str(s).strip()
    if len(s) < k:
        return 0.0

    exact_kmers = extract_kmers(s, k=k, alphabet=alphabet)
    exact_counts = Counter(exact_kmers)

    score = 0.0
    for kmer, cnt in exact_counts.items():
        feat_name = f"exact_{kmer}"
        imp = importance_map.get(feat_name)
        if imp is not None:
            score += imp * cnt

    if use_gaps and k == 3:
        gapped = extract_gapped_trimers(s, patterns=gap_patterns, alphabet=alphabet)
        gap_counts = Counter(gapped)
        for gk, cnt in gap_counts.items():
            feat_name = f"gap_{gk}"
            imp = importance_map.get(feat_name)
            if imp is not None:
                score += imp * cnt

    if use_mismatch:
        mm_counts = mismatch_smooth_counts(exact_counts, k=k, alphabet=alphabet)
        for mk, cnt in mm_counts.items():
            feat_name = f"mm1_{mk}"
            imp = importance_map.get(feat_name)
            if imp is not None:
                score += imp * cnt

    return score


def score_sequences_by_kmer_importance(
    sequences_df: pd.DataFrame,
    importance_df: pd.DataFrame,
    sequence_col: str = "junction_aa",
    alphabet=AA_ALPHABET,
    gap_patterns=((1,0), (0,1),
(2,0), (0,2), (1,1),
(3,0), (0,3), (2,1), (1,2)),
    use_gaps: bool = True,
    use_mismatch: bool = True,
    n_jobs: int = None,
) -> pd.DataFrame:
    if sequence_col not in sequences_df.columns:
        raise KeyError(sequence_col)

    feat_names = importance_df["feature"].astype(str).tolist()
    exact_feats = [f for f in feat_names if f.startswith("exact_")]
    if not exact_feats:
        raise ValueError("No 'exact_' features found; cannot infer k")

    k_values = {len(f.split("exact_", 1)[1]) for f in exact_feats}
    if len(k_values) != 1:
        raise ValueError("Inconsistent k across 'exact_' features")
    k = k_values.pop()

    importance_map = dict(zip(importance_df["feature"], importance_df["importance"]))

    if use_gaps:
        has_gap_feats = any(f.startswith("gap_") for f in feat_names)
        use_gaps = has_gap_feats and (k == 3)

    if use_mismatch:
        has_mm_feats = any(f.startswith("mm1_") for f in feat_names)
        use_mismatch = has_mm_feats

    seq_series = sequences_df[sequence_col].fillna("").astype(str)
    n = len(seq_series)

    if n_jobs is None or n_jobs < 1:
        n_jobs = cpu_count()
    else:
        n_jobs = min(n_jobs, cpu_count())

    if n_jobs == 1:
        scores = [
            _score_one_sequence(
                s,
                k,
                alphabet,
                gap_patterns,
                use_gaps,
                use_mismatch,
                importance_map,
            )
            for s in tqdm(seq_series, desc="scoring sequences", total=n)
        ]
    else:
        worker = partial(
            _score_one_sequence,
            k=k,
            alphabet=alphabet,
            gap_patterns=gap_patterns,
            use_gaps=use_gaps,
            use_mismatch=use_mismatch,
            importance_map=importance_map,
        )
        chunksize = max(1, n // (n_jobs * 8))

        with Pool(processes=n_jobs) as pool:
            scores = list(
                tqdm(
                    pool.imap(worker, seq_series, chunksize=chunksize),
                    total=n,
                    desc="scoring sequences",
                )
            )

    out = sequences_df.copy()
    out["importance_score"] = scores
    return out

In [12]:
def get_dataset_pairs(train_dir: str, test_dir: str) -> List[Tuple[str, List[str]]]:
    """Returns list of (train_path, [test_paths]) tuples for dataset pairs."""
    test_groups = defaultdict(list)
    for test_name in sorted(os.listdir(test_dir)):
        if test_name.startswith("test_dataset_"):
            base_id = test_name.replace("test_dataset_", "").split("_")[0]
            test_groups[base_id].append(os.path.join(test_dir, test_name))

    pairs = []
    for train_name in sorted(os.listdir(train_dir)):
        if train_name.startswith("train_dataset_"):
            train_id = train_name.replace("train_dataset_", "")
            train_path = os.path.join(train_dir, train_name)
            pairs.append((train_path, test_groups.get(train_id, [])))

    return pairs

In [13]:
from pathlib import Path

def load_and_encode_vj(folder_path: str, feature_colums=('v_call', 'j_call')):
    base_dir = Path(folder_path)
    dataset_name = base_dir.name

    dir_entries = list(base_dir.iterdir())
    tsv_list = [entry for entry in dir_entries if entry.suffix == '.tsv']
    non_tsv_names = [entry.name for entry in dir_entries if entry.suffix != '.tsv']
    print(f'Loading {len(tsv_list)} .tsv files from {dataset_name} (remaining: {non_tsv_names}).')

    meta_df = None
    meta_file = base_dir / 'metadata.csv'
    if meta_file.exists():
        meta_df = pd.read_csv(meta_file)
        meta_df.set_index('filename', inplace=True)

    records = []
    for tsv_file in tqdm(tsv_list, desc='Loading TSV files'):
        try:
            tab = pd.read_csv(tsv_file, sep='\t')
        except Exception as exc:
            print(f"Error loading {tsv_file.name}: {exc}")
            continue

        row_dict = {
            'ID': tsv_file.stem,
            'dataset': dataset_name,
        }

        if meta_df is not None and tsv_file.name in meta_df.index:
            row_dict['label_positive'] = int(meta_df.at[tsv_file.name, 'label_positive'])

        total_rows = len(tab)
        for feature in feature_colums:
            if feature not in tab.columns or total_rows == 0:
                continue
            freq_series = tab[feature].value_counts() / total_rows
            row_dict.update(freq_series.to_dict())

        records.append(row_dict)

    return pd.DataFrame(records).fillna(0)

In [None]:
import os
import pandas as pd

log_file = "training_auc_log_03E.tsv"

pairs = [
    ('train_datasets/train_datasets/train_dataset_1',
     ['test_datasets/test_datasets/test_dataset_1']),
    ('train_datasets/train_datasets/train_dataset_2',
     ['test_datasets/test_datasets/test_dataset_2']),
    ('train_datasets/train_datasets/train_dataset_3',
     ['test_datasets/test_datasets/test_dataset_3']),
    ('train_datasets/train_datasets/train_dataset_4',
     ['test_datasets/test_datasets/test_dataset_4']),
    ('train_datasets/train_datasets/train_dataset_5',
     ['test_datasets/test_datasets/test_dataset_5']),
    ('train_datasets/train_datasets/train_dataset_6',
     ['test_datasets/test_datasets/test_dataset_6']),
    ('train_datasets/train_datasets/train_dataset_7',
     ['test_datasets/test_datasets/test_dataset_7_1',
      'test_datasets/test_datasets/test_dataset_7_2']),
    ('train_datasets/train_datasets/train_dataset_8',
     ['test_datasets/test_datasets/test_dataset_8_1',
      'test_datasets/test_datasets/test_dataset_8_2',
      'test_datasets/test_datasets/test_dataset_8_3']),
]

k = 3
use_gaps = True
use_mismatch = True
n_jobs = 40
subsample_n = None
n_iter = 50

submission_out_dir = "results_03E_kmer"
important_out_dir = "results_03E_kmer"

os.makedirs(submission_out_dir, exist_ok=True)
os.makedirs(important_out_dir, exist_ok=True)

for train_dir, test_dirs in pairs:
    train_name = os.path.basename(train_dir)
    imp_out_path = os.path.join(important_out_dir, f"{train_name}_important_sequences.tsv")

    submission_paths = [
        os.path.join(submission_out_dir, f"{os.path.basename(td)}_submission.tsv")
        for td in test_dirs
    ]

    if os.path.exists(imp_out_path) and all(os.path.exists(p) for p in submission_paths):
        print(f"Skipping {train_name}: all outputs already exist.")
        continue

    vj_train = load_and_encode_vj(train_dir)
    vj_train = vj_train.set_index("ID")
    vj_train = vj_train.iloc[:, 3:]

    X_train, meta_train = load_and_encode_repertoires_advanced(
        data_dir=train_dir,
        k=k,
        use_gaps=use_gaps,
        use_mismatch=use_mismatch,
        n_jobs=n_jobs,
    )

    X_train_norm = normalize_kmer_rows_by_category(X_train)
    X_train_merged = X_train_norm.join(vj_train)

    if subsample_n is not None:
        X_train_merged = X_train_merged.iloc[:subsample_n]
        y_train = meta_train["label_positive"].iloc[:subsample_n]
    else:
        y_train = meta_train["label_positive"]

    best_model, scores, importance = fit_xgb(
        X=X_train_merged,
        y=y_train,
        random_state=123,
        n_iter=n_iter,
        n_jobs=-1,
    )

    mean_auc = scores.mean()
    print(f"{train_name} CV AUC mean:", mean_auc)

    with open(log_file, "a") as f:
        f.write(f"{train_name}\t{mean_auc}\n")

    for test_dir in test_dirs:
        test_name = os.path.basename(test_dir)
        out_path = os.path.join(submission_out_dir, f"{test_name}_submission.tsv")

        if os.path.exists(out_path):
            print(f"Skipping {test_name}: submission already exists.")
            continue

        vj_test = load_and_encode_vj(test_dir)
        vj_test = vj_test.set_index("ID")
        vj_test = vj_test.iloc[:, 3:]
        vj_test = vj_test.reindex(columns=vj_train.columns, fill_value=0)

        X_test = load_and_encode_repertoires_advanced(
            data_dir=test_dir,
            k=k,
            use_gaps=use_gaps,
            use_mismatch=use_mismatch,
            n_jobs=n_jobs,
        )

        X_test_kmer = X_test[0].reindex(columns=X_train_norm.columns, fill_value=0)
        X_test_norm = normalize_kmer_rows_by_category(X_test_kmer)

        X_test_merged = X_test_norm.join(vj_test)

        if subsample_n is not None:
            X_test_merged = X_test_merged.iloc[:subsample_n]

        test_pred_proba = best_model.predict_proba(X_test_merged)[:, 1]

        test_pred_df = pd.DataFrame(
            {
                "repertoire_id": X_test_merged.index,
                "prediction": test_pred_proba,
            }
        )

        submission_df = to_submission_format(test_pred_df,dataset_name=test_name)
        submission_df.to_csv(out_path, sep="\t", index=False)
        print(f"wrote {out_path}")

    if not os.path.exists(imp_out_path):
        full_df = load_full_dataset(train_dir)
        unique_seqs = full_df[["junction_aa", "v_call", "j_call"]].drop_duplicates()

        all_sequences_scored = score_sequences_by_kmer_importance(
            sequences_df=unique_seqs,
            importance_df=importance,
            sequence_col="junction_aa",
            use_gaps=use_gaps,
            use_mismatch=use_mismatch,
            n_jobs=40,
        )

        top_sequences_df = all_sequences_scored.nlargest(50000, "importance_score")
        top_formatted = convert_to_top_seq_format(
            top_sequences_df,
            dataset_name=train_name,
        ).iloc[:, 0:6]

        top_formatted.to_csv(imp_out_path, sep="\t", index=False)
        print(f"wrote {imp_out_path}")

Skipping train_dataset_1: all outputs already exist.
Skipping train_dataset_2: all outputs already exist.
Skipping train_dataset_3: all outputs already exist.
Loading 400 .tsv files from train_dataset_4 (remaining: ['metadata.csv']).


Loading TSV files: 100%|██████████| 400/400 [00:05<00:00, 67.04it/s]
Encoding k=3 advanced: 100%|██████████| 400/400 [00:27<00:00, 14.53it/s]
[I 2025-12-02 18:53:52,020] A new study created in memory with name: no-name-3a3312c4-1c61-4eda-84ac-f6306dceb22d
Best trial: 2. Best value: 0.645625:   2%|▏         | 1/50 [07:28<6:06:26, 448.70s/it]

[I 2025-12-02 19:01:20,709] Trial 2 finished with value: 0.6456250000000001 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.0}. Best is trial 2 with value: 0.6456250000000001.


Best trial: 6. Best value: 0.996125:   4%|▍         | 2/50 [08:10<2:47:26, 209.30s/it]

[I 2025-12-02 19:02:02,420] Trial 6 finished with value: 0.9961249999999999 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.0}. Best is trial 6 with value: 0.9961249999999999.


Best trial: 6. Best value: 0.996125:   6%|▌         | 3/50 [09:02<1:47:43, 137.53s/it]

[I 2025-12-02 19:02:54,552] Trial 3 finished with value: 0.5908749999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.001}. Best is trial 6 with value: 0.9961249999999999.


Best trial: 6. Best value: 0.996125:   8%|▊         | 4/50 [14:00<2:34:03, 200.94s/it]

[I 2025-12-02 19:07:52,697] Trial 0 finished with value: 0.96425 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 6 with value: 0.9961249999999999.


Best trial: 6. Best value: 0.996125:  10%|█         | 5/50 [14:05<1:37:41, 130.25s/it]

[I 2025-12-02 19:07:57,620] Trial 1 finished with value: 0.8863749999999999 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 10.0}. Best is trial 6 with value: 0.9961249999999999.


Best trial: 6. Best value: 0.996125:  12%|█▏        | 6/50 [22:57<3:15:33, 266.67s/it]

[I 2025-12-02 19:16:49,086] Trial 11 finished with value: 0.7302500000000001 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.0}. Best is trial 6 with value: 0.9961249999999999.


Best trial: 6. Best value: 0.996125:  14%|█▍        | 7/50 [32:31<4:23:06, 367.13s/it]

[I 2025-12-02 19:26:23,054] Trial 5 finished with value: 0.943875 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.0}. Best is trial 6 with value: 0.9961249999999999.


Best trial: 6. Best value: 0.996125:  16%|█▌        | 8/50 [33:48<3:12:26, 274.93s/it]

[I 2025-12-02 19:27:40,565] Trial 7 finished with value: 0.9252499999999999 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.0}. Best is trial 6 with value: 0.9961249999999999.


Best trial: 9. Best value: 0.99925:  18%|█▊        | 9/50 [38:49<3:13:31, 283.22s/it] 

[I 2025-12-02 19:32:42,000] Trial 9 finished with value: 0.99925 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  20%|██        | 10/50 [41:29<2:43:26, 245.16s/it]

[I 2025-12-02 19:35:21,925] Trial 12 finished with value: 0.6441250000000001 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.1, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 1.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  22%|██▏       | 11/50 [47:19<3:00:11, 277.21s/it]

[I 2025-12-02 19:41:11,832] Trial 14 finished with value: 0.561375 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  24%|██▍       | 12/50 [47:54<2:08:47, 203.37s/it]

[I 2025-12-02 19:41:46,300] Trial 8 finished with value: 0.688625 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.001}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  26%|██▌       | 13/50 [48:01<1:28:48, 144.00s/it]

[I 2025-12-02 19:41:53,683] Trial 4 finished with value: 0.58425 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.1, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.01}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  28%|██▊       | 14/50 [56:01<2:27:11, 245.31s/it]

[I 2025-12-02 19:49:53,111] Trial 10 finished with value: 0.882125 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 5, 'gamma': 0.1, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 1.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  30%|███       | 15/50 [56:13<1:42:11, 175.19s/it]

[I 2025-12-02 19:50:05,788] Trial 15 finished with value: 0.724 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  32%|███▏      | 16/50 [1:36:50<8:05:03, 855.98s/it]

[I 2025-12-02 20:30:42,765] Trial 16 finished with value: 0.655375 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  34%|███▍      | 17/50 [1:38:48<5:48:39, 633.91s/it]

[I 2025-12-02 20:32:40,227] Trial 18 finished with value: 0.9948750000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  36%|███▌      | 18/50 [1:38:49<3:56:44, 443.89s/it]

[I 2025-12-02 20:32:41,746] Trial 13 finished with value: 0.999 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  38%|███▊      | 19/50 [1:39:45<2:49:06, 327.29s/it]

[I 2025-12-02 20:33:37,430] Trial 21 finished with value: 0.998 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  40%|████      | 20/50 [1:40:59<2:05:37, 251.24s/it]

[I 2025-12-02 20:34:51,429] Trial 17 finished with value: 0.999125 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  42%|████▏     | 21/50 [1:41:13<1:27:02, 180.10s/it]

[I 2025-12-02 20:35:05,647] Trial 19 finished with value: 0.999125 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  44%|████▍     | 22/50 [1:49:48<2:10:52, 280.45s/it]

[I 2025-12-02 20:43:40,117] Trial 20 finished with value: 0.999125 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  46%|████▌     | 23/50 [2:17:00<5:08:42, 686.01s/it]

[I 2025-12-02 21:10:52,059] Trial 23 finished with value: 0.989875 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  48%|████▊     | 24/50 [2:18:05<3:36:34, 499.78s/it]

[I 2025-12-02 21:11:57,407] Trial 26 finished with value: 0.989875 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  50%|█████     | 25/50 [2:29:25<3:50:46, 553.85s/it]

[I 2025-12-02 21:23:17,401] Trial 22 finished with value: 0.97575 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  52%|█████▏    | 26/50 [2:30:20<2:41:40, 404.19s/it]

[I 2025-12-02 21:24:12,445] Trial 28 finished with value: 0.96775 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  54%|█████▍    | 27/50 [2:46:57<3:43:05, 581.96s/it]

[I 2025-12-02 21:40:49,176] Trial 25 finished with value: 0.9846250000000001 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  56%|█████▌    | 28/50 [2:52:55<3:08:44, 514.76s/it]

[I 2025-12-02 21:46:47,129] Trial 27 finished with value: 0.9641249999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  58%|█████▊    | 29/50 [3:01:10<2:58:09, 509.02s/it]

[I 2025-12-02 21:55:02,759] Trial 24 finished with value: 0.9844999999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  60%|██████    | 30/50 [3:16:15<3:29:15, 627.79s/it]

[I 2025-12-02 22:10:07,666] Trial 30 finished with value: 0.999125 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  62%|██████▏   | 31/50 [3:26:02<3:14:56, 615.63s/it]

[I 2025-12-02 22:19:54,921] Trial 36 finished with value: 0.82025 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 10.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  64%|██████▍   | 32/50 [3:26:53<2:13:51, 446.22s/it]

[I 2025-12-02 22:20:45,843] Trial 31 finished with value: 0.999125 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  66%|██████▌   | 33/50 [3:27:41<1:32:30, 326.52s/it]

[I 2025-12-02 22:21:33,099] Trial 32 finished with value: 0.999125 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  68%|██████▊   | 34/50 [3:30:06<1:12:33, 272.12s/it]

[I 2025-12-02 22:23:58,284] Trial 29 finished with value: 0.9506249999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  70%|███████   | 35/50 [3:33:29<1:02:51, 251.45s/it]

[I 2025-12-02 22:27:21,511] Trial 35 finished with value: 0.99925 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  72%|███████▏  | 36/50 [3:46:38<1:36:18, 412.78s/it]

[I 2025-12-02 22:40:30,714] Trial 33 finished with value: 0.71675 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  74%|███████▍  | 37/50 [3:52:53<1:26:57, 401.36s/it]

[I 2025-12-02 22:46:45,422] Trial 34 finished with value: 0.71675 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  76%|███████▌  | 38/50 [3:58:44<1:17:13, 386.14s/it]

[I 2025-12-02 22:52:36,038] Trial 41 finished with value: 0.7703749999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 10.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  78%|███████▊  | 39/50 [4:00:34<55:38, 303.51s/it]  

[I 2025-12-02 22:54:26,743] Trial 43 finished with value: 0.9855 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  80%|████████  | 40/50 [4:06:20<52:43, 316.33s/it]

[I 2025-12-02 23:00:12,996] Trial 44 finished with value: 0.9858749999999998 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.001}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  82%|████████▏ | 41/50 [4:11:26<46:58, 313.16s/it]

[I 2025-12-02 23:05:18,772] Trial 40 finished with value: 0.71375 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  84%|████████▍ | 42/50 [4:12:25<31:33, 236.70s/it]

[I 2025-12-02 23:06:17,068] Trial 42 finished with value: 0.9984999999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  86%|████████▌ | 43/50 [4:17:14<29:27, 252.56s/it]

[I 2025-12-02 23:11:06,618] Trial 37 finished with value: 0.99925 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  88%|████████▊ | 44/50 [4:20:52<24:13, 242.21s/it]

[I 2025-12-02 23:14:44,671] Trial 39 finished with value: 0.721625 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  90%|█████████ | 45/50 [4:22:27<16:30, 198.00s/it]

[I 2025-12-02 23:16:19,529] Trial 38 finished with value: 0.721625 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  92%|█████████▏| 46/50 [4:23:21<10:19, 154.94s/it]

[I 2025-12-02 23:17:13,969] Trial 45 finished with value: 0.99175 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.001}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  94%|█████████▍| 47/50 [4:30:06<11:29, 229.68s/it]

[I 2025-12-02 23:23:58,071] Trial 46 finished with value: 0.99825 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 1.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  96%|█████████▌| 48/50 [4:33:30<07:24, 222.15s/it]

[I 2025-12-02 23:27:22,633] Trial 47 finished with value: 0.99925 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 1.0}. Best is trial 9 with value: 0.99925.


Best trial: 9. Best value: 0.99925:  98%|█████████▊| 49/50 [4:38:17<04:01, 241.61s/it]

[I 2025-12-02 23:32:09,677] Trial 49 finished with value: 0.99925 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 1.0}. Best is trial 9 with value: 0.99925.


Best trial: 48. Best value: 0.999375: 100%|██████████| 50/50 [4:44:58<00:00, 341.98s/it]


[I 2025-12-02 23:38:50,949] Trial 48 finished with value: 0.999375 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 1.0}. Best is trial 48 with value: 0.999375.
train_dataset_4 CV AUC mean: 0.999375
Loading 400 .tsv files from test_dataset_4 (remaining: []).


Loading TSV files: 100%|██████████| 400/400 [00:05<00:00, 67.93it/s]
Encoding k=3 advanced: 100%|██████████| 400/400 [00:28<00:00, 14.09it/s]


wrote results_03E_kmer/test_dataset_4_submission.tsv


Loading files: 100%|██████████| 400/400 [00:04<00:00, 80.72it/s]
scoring sequences: 100%|██████████| 9699715/9699715 [03:01<00:00, 53509.82it/s] 


wrote results_03E_kmer/train_dataset_4_important_sequences.tsv
Loading 400 .tsv files from train_dataset_5 (remaining: ['metadata.csv']).


Loading TSV files: 100%|██████████| 400/400 [00:05<00:00, 68.71it/s]
Encoding k=3 advanced: 100%|██████████| 400/400 [00:28<00:00, 13.99it/s]
[I 2025-12-03 00:12:05,411] A new study created in memory with name: no-name-f3d74b89-e41a-41a3-9462-a1726e04c7de
Best trial: 3. Best value: 0.9835:   2%|▏         | 1/50 [09:01<7:21:51, 541.06s/it]

[I 2025-12-03 00:21:06,461] Trial 3 finished with value: 0.9834999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.001}. Best is trial 3 with value: 0.9834999999999999.


Best trial: 3. Best value: 0.9835:   4%|▍         | 2/50 [11:26<4:06:31, 308.15s/it]

[I 2025-12-03 00:23:31,571] Trial 2 finished with value: 0.9715 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 3 with value: 0.9834999999999999.


Best trial: 3. Best value: 0.9835:   6%|▌         | 3/50 [12:17<2:29:36, 191.00s/it]

[I 2025-12-03 00:24:23,135] Trial 1 finished with value: 0.9556249999999998 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.0}. Best is trial 3 with value: 0.9834999999999999.


Best trial: 3. Best value: 0.9835:   8%|▊         | 4/50 [16:34<2:46:15, 216.87s/it]

[I 2025-12-03 00:28:39,688] Trial 5 finished with value: 0.9675 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 1.0}. Best is trial 3 with value: 0.9834999999999999.


Best trial: 3. Best value: 0.9835:  10%|█         | 5/50 [25:57<4:16:24, 341.88s/it]

[I 2025-12-03 00:38:03,233] Trial 7 finished with value: 0.9559999999999998 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(100.0), 'reg_alpha': 1.0}. Best is trial 3 with value: 0.9834999999999999.


Best trial: 3. Best value: 0.9835:  12%|█▏        | 6/50 [26:09<2:48:19, 229.53s/it]

[I 2025-12-03 00:38:14,656] Trial 6 finished with value: 0.9801249999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.001}. Best is trial 3 with value: 0.9834999999999999.


Best trial: 3. Best value: 0.9835:  14%|█▍        | 7/50 [27:40<2:12:00, 184.20s/it]

[I 2025-12-03 00:39:45,526] Trial 9 finished with value: 0.962125 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 1.0}. Best is trial 3 with value: 0.9834999999999999.


Best trial: 3. Best value: 0.9835:  16%|█▌        | 8/50 [34:21<2:57:22, 253.39s/it]

[I 2025-12-03 00:46:27,082] Trial 12 finished with value: 0.9734999999999999 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.1}. Best is trial 3 with value: 0.9834999999999999.


Best trial: 4. Best value: 0.995625:  18%|█▊        | 9/50 [39:06<2:59:49, 263.15s/it]

[I 2025-12-03 00:51:11,665] Trial 4 finished with value: 0.9956250000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.1}. Best is trial 4 with value: 0.9956250000000001.


Best trial: 4. Best value: 0.995625:  20%|██        | 10/50 [43:13<2:52:07, 258.18s/it]

[I 2025-12-03 00:55:18,714] Trial 14 finished with value: 0.9828749999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 1.0}. Best is trial 4 with value: 0.9956250000000001.


Best trial: 4. Best value: 0.995625:  22%|██▏       | 11/50 [47:43<2:50:13, 261.89s/it]

[I 2025-12-03 00:59:49,014] Trial 8 finished with value: 0.960125 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.001}. Best is trial 4 with value: 0.9956250000000001.


Best trial: 4. Best value: 0.995625:  24%|██▍       | 12/50 [54:32<3:14:16, 306.74s/it]

[I 2025-12-03 01:06:38,333] Trial 11 finished with value: 0.9730000000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(100.0), 'reg_alpha': 1.0}. Best is trial 4 with value: 0.9956250000000001.


Best trial: 4. Best value: 0.995625:  26%|██▌       | 13/50 [1:03:27<3:51:40, 375.69s/it]

[I 2025-12-03 01:15:32,670] Trial 13 finished with value: 0.951875 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 4 with value: 0.9956250000000001.


Best trial: 17. Best value: 0.9965:  28%|██▊       | 14/50 [1:06:07<3:06:22, 310.63s/it] 

[I 2025-12-03 01:18:12,983] Trial 17 finished with value: 0.9964999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  30%|███       | 15/50 [1:09:02<2:37:23, 269.81s/it]

[I 2025-12-03 01:21:08,216] Trial 15 finished with value: 0.967625 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.1}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  32%|███▏      | 16/50 [1:11:19<2:10:15, 229.88s/it]

[I 2025-12-03 01:23:25,347] Trial 10 finished with value: 0.9872500000000001 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.001}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  34%|███▍      | 17/50 [1:11:51<1:33:37, 170.21s/it]

[I 2025-12-03 01:23:56,815] Trial 0 finished with value: 0.966625 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 10.0}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  36%|███▌      | 18/50 [1:14:03<1:24:42, 158.82s/it]

[I 2025-12-03 01:26:09,123] Trial 18 finished with value: 0.994375 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  38%|███▊      | 19/50 [1:18:36<1:39:41, 192.96s/it]

[I 2025-12-03 01:30:41,606] Trial 19 finished with value: 0.9964999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  40%|████      | 20/50 [1:21:57<1:37:47, 195.59s/it]

[I 2025-12-03 01:34:03,305] Trial 16 finished with value: 0.99375 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.1}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  42%|████▏     | 21/50 [1:32:23<2:36:55, 324.69s/it]

[I 2025-12-03 01:44:28,983] Trial 26 finished with value: 0.991 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  44%|████▍     | 22/50 [1:42:23<3:10:00, 407.15s/it]

[I 2025-12-03 01:54:28,450] Trial 21 finished with value: 0.9956250000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  46%|████▌     | 23/50 [1:43:00<2:13:20, 296.33s/it]

[I 2025-12-03 01:55:06,298] Trial 22 finished with value: 0.9964999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  48%|████▊     | 24/50 [1:44:13<1:39:18, 229.19s/it]

[I 2025-12-03 01:56:18,876] Trial 24 finished with value: 0.9964999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 17. Best value: 0.9965:  50%|█████     | 25/50 [1:45:28<1:16:13, 182.94s/it]

[I 2025-12-03 01:57:33,917] Trial 23 finished with value: 0.9964999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 17 with value: 0.9964999999999999.


Best trial: 20. Best value: 0.996625:  52%|█████▏    | 26/50 [1:46:35<59:12, 148.01s/it]  

[I 2025-12-03 01:58:40,434] Trial 20 finished with value: 0.9966250000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  54%|█████▍    | 27/50 [1:47:39<47:05, 122.83s/it]

[I 2025-12-03 01:59:44,513] Trial 27 finished with value: 0.9964999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  56%|█████▌    | 28/50 [1:50:48<52:24, 142.92s/it]

[I 2025-12-03 02:02:54,292] Trial 32 finished with value: 0.992625 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  58%|█████▊    | 29/50 [1:52:59<48:42, 139.14s/it]

[I 2025-12-03 02:05:04,639] Trial 25 finished with value: 0.9958750000000001 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  60%|██████    | 30/50 [1:57:20<58:37, 175.90s/it]

[I 2025-12-03 02:09:26,298] Trial 29 finished with value: 0.9964999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  62%|██████▏   | 31/50 [1:59:34<51:39, 163.13s/it]

[I 2025-12-03 02:11:39,642] Trial 30 finished with value: 0.9964999999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  64%|██████▍   | 32/50 [1:59:35<34:20, 114.46s/it]

[I 2025-12-03 02:11:40,527] Trial 28 finished with value: 0.9963750000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  66%|██████▌   | 33/50 [2:02:49<39:14, 138.50s/it]

[I 2025-12-03 02:14:55,131] Trial 31 finished with value: 0.994125 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  68%|██████▊   | 34/50 [2:12:58<1:14:33, 279.62s/it]

[I 2025-12-03 02:25:04,003] Trial 33 finished with value: 0.990625 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  70%|███████   | 35/50 [2:21:56<1:29:14, 357.00s/it]

[I 2025-12-03 02:34:01,535] Trial 35 finished with value: 0.9955 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  72%|███████▏  | 36/50 [2:37:53<2:05:18, 537.04s/it]

[I 2025-12-03 02:49:58,712] Trial 34 finished with value: 0.9960000000000001 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  74%|███████▍  | 37/50 [2:39:39<1:28:21, 407.81s/it]

[I 2025-12-03 02:51:44,989] Trial 37 finished with value: 0.9963750000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  76%|███████▌  | 38/50 [2:43:17<1:10:11, 350.98s/it]

[I 2025-12-03 02:55:23,343] Trial 40 finished with value: 0.98575 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.1, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  78%|███████▊  | 39/50 [2:46:28<55:31, 302.83s/it]  

[I 2025-12-03 02:58:33,832] Trial 39 finished with value: 0.9953750000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.1, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  80%|████████  | 40/50 [2:48:10<40:26, 242.61s/it]

[I 2025-12-03 03:00:15,925] Trial 42 finished with value: 0.9595 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.1, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  82%|████████▏ | 41/50 [2:48:17<25:47, 172.00s/it]

[I 2025-12-03 03:00:23,163] Trial 41 finished with value: 0.98575 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.1, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  84%|████████▍ | 42/50 [2:49:53<19:52, 149.09s/it]

[I 2025-12-03 03:01:58,814] Trial 36 finished with value: 0.99225 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  86%|████████▌ | 43/50 [2:51:10<14:52, 127.52s/it]

[I 2025-12-03 03:03:15,989] Trial 43 finished with value: 0.959625 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.1, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  88%|████████▊ | 44/50 [2:54:37<15:08, 151.38s/it]

[I 2025-12-03 03:06:43,053] Trial 44 finished with value: 0.968375 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.1, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  90%|█████████ | 45/50 [2:55:44<10:29, 126.00s/it]

[I 2025-12-03 03:07:49,825] Trial 38 finished with value: 0.982625 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 3, 'gamma': 0.1, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  92%|█████████▏| 46/50 [2:58:35<09:17, 139.49s/it]

[I 2025-12-03 03:10:40,806] Trial 45 finished with value: 0.9662499999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 10.0}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  94%|█████████▍| 47/50 [3:00:39<06:44, 134.99s/it]

[I 2025-12-03 03:12:45,303] Trial 47 finished with value: 0.977375 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  96%|█████████▌| 48/50 [3:00:54<03:17, 98.79s/it] 

[I 2025-12-03 03:12:59,607] Trial 49 finished with value: 0.9737500000000001 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625:  98%|█████████▊| 49/50 [3:00:59<01:10, 70.71s/it]

[I 2025-12-03 03:13:04,788] Trial 46 finished with value: 0.9793749999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.


Best trial: 20. Best value: 0.996625: 100%|██████████| 50/50 [3:01:05<00:00, 217.32s/it]


[I 2025-12-03 03:13:11,223] Trial 48 finished with value: 0.9737500000000001 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 20 with value: 0.9966250000000001.
train_dataset_5 CV AUC mean: 0.9966250000000001
Loading 400 .tsv files from test_dataset_5 (remaining: ['.ipynb_checkpoints']).


Loading TSV files: 100%|██████████| 400/400 [00:05<00:00, 68.00it/s]
Encoding k=3 advanced: 100%|██████████| 400/400 [00:28<00:00, 13.98it/s]


wrote results_03E_kmer/test_dataset_5_submission.tsv


Loading files: 100%|██████████| 400/400 [00:04<00:00, 85.72it/s]
scoring sequences: 100%|██████████| 9700400/9700400 [03:22<00:00, 47886.99it/s] 


wrote results_03E_kmer/train_dataset_5_important_sequences.tsv
Loading 400 .tsv files from train_dataset_6 (remaining: ['.ipynb_checkpoints', 'metadata.csv']).


Loading TSV files: 100%|██████████| 400/400 [00:05<00:00, 71.96it/s]
Encoding k=3 advanced: 100%|██████████| 400/400 [00:27<00:00, 14.62it/s]
[I 2025-12-03 03:43:07,116] A new study created in memory with name: no-name-d54c9c84-9077-4abe-8b2f-8cc6cb644792
Best trial: 0. Best value: 0.82025:   2%|▏         | 1/50 [06:36<5:23:35, 396.23s/it]

[I 2025-12-03 03:49:43,342] Trial 0 finished with value: 0.82025 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.5, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.001}. Best is trial 0 with value: 0.82025.


Best trial: 3. Best value: 0.84675:   4%|▍         | 2/50 [09:06<3:21:24, 251.77s/it]

[I 2025-12-03 03:52:13,989] Trial 3 finished with value: 0.8467500000000001 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 10.0}. Best is trial 3 with value: 0.8467500000000001.


Best trial: 4. Best value: 0.860375:   6%|▌         | 3/50 [23:45<7:01:34, 538.18s/it]

[I 2025-12-03 04:06:52,990] Trial 4 finished with value: 0.8603750000000001 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 4 with value: 0.8603750000000001.


Best trial: 4. Best value: 0.860375:   8%|▊         | 4/50 [26:12<4:54:10, 383.70s/it]

[I 2025-12-03 04:09:19,857] Trial 8 finished with value: 0.8118749999999999 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.01}. Best is trial 4 with value: 0.8603750000000001.


Best trial: 4. Best value: 0.860375:  10%|█         | 5/50 [27:11<3:19:57, 266.61s/it]

[I 2025-12-03 04:10:18,854] Trial 1 finished with value: 0.856625 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 4 with value: 0.8603750000000001.


Best trial: 4. Best value: 0.860375:  12%|█▏        | 6/50 [27:35<2:15:04, 184.18s/it]

[I 2025-12-03 04:10:43,042] Trial 7 finished with value: 0.8483750000000001 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 1.0}. Best is trial 4 with value: 0.8603750000000001.


Best trial: 4. Best value: 0.860375:  14%|█▍        | 7/50 [35:03<3:13:41, 270.26s/it]

[I 2025-12-03 04:18:10,516] Trial 2 finished with value: 0.842875 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.0}. Best is trial 4 with value: 0.8603750000000001.


Best trial: 4. Best value: 0.860375:  16%|█▌        | 8/50 [44:30<4:15:24, 364.88s/it]

[I 2025-12-03 04:27:37,986] Trial 6 finished with value: 0.8568749999999999 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.1, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 1.0}. Best is trial 4 with value: 0.8603750000000001.


Best trial: 4. Best value: 0.860375:  18%|█▊        | 9/50 [44:50<2:55:34, 256.93s/it]

[I 2025-12-03 04:27:57,567] Trial 9 finished with value: 0.8598750000000001 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 1.0}. Best is trial 4 with value: 0.8603750000000001.


Best trial: 10. Best value: 0.866125:  20%|██        | 10/50 [44:55<1:59:22, 179.05s/it]

[I 2025-12-03 04:28:02,232] Trial 10 finished with value: 0.8661249999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  22%|██▏       | 11/50 [45:58<1:33:17, 143.53s/it]

[I 2025-12-03 04:29:05,231] Trial 5 finished with value: 0.860125 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  24%|██▍       | 12/50 [1:02:25<4:13:32, 400.32s/it]

[I 2025-12-03 04:45:32,874] Trial 11 finished with value: 0.86175 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.001}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  26%|██▌       | 13/50 [1:02:57<2:58:04, 288.77s/it]

[I 2025-12-03 04:46:04,956] Trial 14 finished with value: 0.83725 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(100.0), 'reg_alpha': 1.0}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  28%|██▊       | 14/50 [1:05:01<2:23:24, 239.01s/it]

[I 2025-12-03 04:48:08,982] Trial 17 finished with value: 0.8661249999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  30%|███       | 15/50 [1:08:24<2:12:56, 227.89s/it]

[I 2025-12-03 04:51:31,102] Trial 15 finished with value: 0.8004999999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.001), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 5, 'gamma': 0.5, 'reg_lambda': np.float64(100.0), 'reg_alpha': 10.0}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  32%|███▏      | 16/50 [1:12:13<2:09:20, 228.25s/it]

[I 2025-12-03 04:55:20,188] Trial 16 finished with value: 0.8553749999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.1}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  34%|███▍      | 17/50 [1:13:14<1:37:52, 177.95s/it]

[I 2025-12-03 04:56:21,143] Trial 12 finished with value: 0.8522500000000001 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.0}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  36%|███▌      | 18/50 [1:20:04<2:12:06, 247.71s/it]

[I 2025-12-03 05:03:11,276] Trial 19 finished with value: 0.866 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.1}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 10. Best value: 0.866125:  38%|███▊      | 19/50 [1:21:28<1:42:33, 198.51s/it]

[I 2025-12-03 05:04:35,167] Trial 21 finished with value: 0.8631249999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.1}. Best is trial 10 with value: 0.8661249999999999.


Best trial: 22. Best value: 0.87425:  40%|████      | 20/50 [1:24:49<1:39:38, 199.29s/it] 

[I 2025-12-03 05:07:56,266] Trial 22 finished with value: 0.87425 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  42%|████▏     | 21/50 [1:25:23<1:12:25, 149.84s/it]

[I 2025-12-03 05:08:30,838] Trial 23 finished with value: 0.8677499999999998 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  44%|████▍     | 22/50 [1:25:38<51:02, 109.39s/it]  

[I 2025-12-03 05:08:45,888] Trial 24 finished with value: 0.869125 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  46%|████▌     | 23/50 [1:26:31<41:36, 92.46s/it] 

[I 2025-12-03 05:09:38,867] Trial 13 finished with value: 0.8418750000000002 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.1, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 1.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  48%|████▊     | 24/50 [1:31:36<1:07:42, 156.24s/it]

[I 2025-12-03 05:14:43,889] Trial 26 finished with value: 0.8696249999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  50%|█████     | 25/50 [1:32:55<55:24, 132.98s/it]  

[I 2025-12-03 05:16:02,597] Trial 28 finished with value: 0.8737499999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  52%|█████▏    | 26/50 [1:33:47<43:25, 108.56s/it]

[I 2025-12-03 05:16:54,168] Trial 29 finished with value: 0.8737499999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  54%|█████▍    | 27/50 [1:36:55<50:51, 132.66s/it]

[I 2025-12-03 05:20:03,067] Trial 18 finished with value: 0.86875 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.1}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  56%|█████▌    | 28/50 [1:37:01<34:38, 94.46s/it] 

[I 2025-12-03 05:20:08,403] Trial 30 finished with value: 0.8696249999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  58%|█████▊    | 29/50 [1:37:37<26:59, 77.12s/it]

[I 2025-12-03 05:20:45,066] Trial 27 finished with value: 0.867625 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  60%|██████    | 30/50 [1:37:52<19:25, 58.28s/it]

[I 2025-12-03 05:20:59,386] Trial 31 finished with value: 0.858375 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  62%|██████▏   | 31/50 [1:37:58<13:27, 42.52s/it]

[I 2025-12-03 05:21:05,138] Trial 20 finished with value: 0.8459999999999999 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  64%|██████▍   | 32/50 [1:38:03<09:23, 31.33s/it]

[I 2025-12-03 05:21:10,350] Trial 32 finished with value: 0.86675 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  66%|██████▌   | 33/50 [1:42:07<26:59, 95.29s/it]

[I 2025-12-03 05:25:14,861] Trial 36 finished with value: 0.811875 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.001}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  68%|██████▊   | 34/50 [1:42:10<17:59, 67.47s/it]

[I 2025-12-03 05:25:17,444] Trial 25 finished with value: 0.86575 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  70%|███████   | 35/50 [1:43:19<16:59, 67.95s/it]

[I 2025-12-03 05:26:26,502] Trial 37 finished with value: 0.791625 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.001}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  72%|███████▏  | 36/50 [1:43:26<11:35, 49.67s/it]

[I 2025-12-03 05:26:33,537] Trial 38 finished with value: 0.804875 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.001}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  74%|███████▍  | 37/50 [1:44:14<10:38, 49.14s/it]

[I 2025-12-03 05:27:21,429] Trial 35 finished with value: 0.8467499999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.001}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  76%|███████▌  | 38/50 [1:49:21<25:17, 126.49s/it]

[I 2025-12-03 05:32:28,416] Trial 40 finished with value: 0.8331249999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  78%|███████▊  | 39/50 [1:49:40<17:16, 94.21s/it] 

[I 2025-12-03 05:32:47,284] Trial 33 finished with value: 0.864625 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  80%|████████  | 40/50 [1:50:13<12:38, 75.82s/it]

[I 2025-12-03 05:33:20,207] Trial 39 finished with value: 0.852125 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  82%|████████▏ | 41/50 [1:50:55<09:53, 65.90s/it]

[I 2025-12-03 05:34:02,971] Trial 41 finished with value: 0.8345 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(0.1), 'reg_alpha': 0.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  84%|████████▍ | 42/50 [1:51:36<07:47, 58.45s/it]

[I 2025-12-03 05:34:44,036] Trial 34 finished with value: 0.8484999999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  86%|████████▌ | 43/50 [1:53:13<08:09, 69.89s/it]

[I 2025-12-03 05:36:20,613] Trial 43 finished with value: 0.8425 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  88%|████████▊ | 44/50 [1:53:22<05:10, 51.76s/it]

[I 2025-12-03 05:36:30,068] Trial 42 finished with value: 0.8425 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  90%|█████████ | 45/50 [1:58:07<10:07, 121.53s/it]

[I 2025-12-03 05:41:14,398] Trial 44 finished with value: 0.8355 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  92%|█████████▏| 46/50 [2:01:40<09:55, 148.95s/it]

[I 2025-12-03 05:44:47,311] Trial 47 finished with value: 0.8647499999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 10.0}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  94%|█████████▍| 47/50 [2:04:08<07:26, 148.80s/it]

[I 2025-12-03 05:47:15,761] Trial 49 finished with value: 0.8627499999999999 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  96%|█████████▌| 48/50 [2:11:01<07:36, 228.01s/it]

[I 2025-12-03 05:54:08,590] Trial 45 finished with value: 0.8623750000000001 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425:  98%|█████████▊| 49/50 [2:11:10<02:42, 162.21s/it]

[I 2025-12-03 05:54:17,284] Trial 46 finished with value: 0.8623750000000001 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 22 with value: 0.87425.


Best trial: 22. Best value: 0.87425: 100%|██████████| 50/50 [2:13:24<00:00, 160.09s/it]


[I 2025-12-03 05:56:31,585] Trial 48 finished with value: 0.865625 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 22 with value: 0.87425.
train_dataset_6 CV AUC mean: 0.87425
Loading 400 .tsv files from test_dataset_6 (remaining: []).


Loading TSV files: 100%|██████████| 400/400 [00:06<00:00, 63.56it/s]
Encoding k=3 advanced: 100%|██████████| 400/400 [00:27<00:00, 14.45it/s]


wrote results_03E_kmer/test_dataset_6_submission.tsv


Loading files: 100%|██████████| 400/400 [00:04<00:00, 89.53it/s]
scoring sequences: 100%|██████████| 7717545/7717545 [02:33<00:00, 50262.60it/s] 


wrote results_03E_kmer/train_dataset_6_important_sequences.tsv
Loading 302 .tsv files from train_dataset_7 (remaining: ['.ipynb_checkpoints', 'metadata.csv']).


Loading TSV files: 100%|██████████| 302/302 [01:03<00:00,  4.76it/s]
Encoding k=3 advanced: 100%|██████████| 302/302 [03:57<00:00,  1.27it/s]
[I 2025-12-03 06:15:10,126] A new study created in memory with name: no-name-905eb3a9-331c-461a-bc18-05c0d9111c5a
Best trial: 2. Best value: 0.616024:   2%|▏         | 1/50 [05:42<4:39:22, 342.10s/it]

[I 2025-12-03 06:20:52,222] Trial 2 finished with value: 0.6160235294117646 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.1, 'reg_lambda': np.float64(0.1), 'reg_alpha': 1.0}. Best is trial 2 with value: 0.6160235294117646.


Best trial: 2. Best value: 0.616024:   4%|▍         | 2/50 [06:14<2:08:09, 160.19s/it]

[I 2025-12-03 06:21:25,072] Trial 1 finished with value: 0.5567921568627451 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.001}. Best is trial 2 with value: 0.6160235294117646.


Best trial: 2. Best value: 0.616024:   6%|▌         | 3/50 [07:53<1:43:26, 132.06s/it]

[I 2025-12-03 06:23:03,652] Trial 3 finished with value: 0.5972313725490196 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.001}. Best is trial 2 with value: 0.6160235294117646.


Best trial: 2. Best value: 0.616024:   8%|▊         | 4/50 [13:09<2:37:01, 204.81s/it]

[I 2025-12-03 06:28:19,994] Trial 7 finished with value: 0.5987764705882352 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.01}. Best is trial 2 with value: 0.6160235294117646.


Best trial: 2. Best value: 0.616024:  10%|█         | 5/50 [13:46<1:48:07, 144.17s/it]

[I 2025-12-03 06:28:56,646] Trial 9 finished with value: 0.5639372549019608 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(100.0), 'reg_alpha': 10.0}. Best is trial 2 with value: 0.6160235294117646.


Best trial: 4. Best value: 0.656776:  12%|█▏        | 6/50 [14:31<1:20:54, 110.33s/it]

[I 2025-12-03 06:29:41,286] Trial 4 finished with value: 0.6567764705882353 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.01}. Best is trial 4 with value: 0.6567764705882353.


Best trial: 4. Best value: 0.656776:  14%|█▍        | 7/50 [14:36<54:21, 75.86s/it]   

[I 2025-12-03 06:29:46,174] Trial 6 finished with value: 0.6430588235294117 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.0}. Best is trial 4 with value: 0.6567764705882353.


Best trial: 4. Best value: 0.656776:  16%|█▌        | 8/50 [14:59<41:19, 59.02s/it]

[I 2025-12-03 06:30:09,144] Trial 0 finished with value: 0.6205882352941178 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 1.0}. Best is trial 4 with value: 0.6567764705882353.


Best trial: 4. Best value: 0.656776:  18%|█▊        | 9/50 [19:30<1:25:40, 125.38s/it]

[I 2025-12-03 06:34:40,425] Trial 11 finished with value: 0.5982745098039215 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.19952623149688797), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(0.1), 'reg_alpha': 1.0}. Best is trial 4 with value: 0.6567764705882353.


Best trial: 4. Best value: 0.656776:  20%|██        | 10/50 [19:38<59:30, 89.26s/it]  

[I 2025-12-03 06:34:48,814] Trial 12 finished with value: 0.5 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 3, 'gamma': 0.5, 'reg_lambda': np.float64(100.0), 'reg_alpha': 10.0}. Best is trial 4 with value: 0.6567764705882353.


Best trial: 10. Best value: 0.662839:  22%|██▏       | 11/50 [21:01<56:43, 87.27s/it]

[I 2025-12-03 06:36:11,586] Trial 10 finished with value: 0.6628392156862745 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.0}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  24%|██▍       | 12/50 [21:07<39:35, 62.51s/it]

[I 2025-12-03 06:36:17,460] Trial 8 finished with value: 0.6411529411764705 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.0}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  26%|██▌       | 13/50 [22:00<36:47, 59.65s/it]

[I 2025-12-03 06:37:10,527] Trial 5 finished with value: 0.5882509803921568 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(1000), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.9), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(100.0), 'reg_alpha': 1.0}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  28%|██▊       | 14/50 [28:04<1:30:54, 151.52s/it]

[I 2025-12-03 06:43:14,341] Trial 17 finished with value: 0.6361176470588236 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  30%|███       | 15/50 [31:14<1:35:13, 163.23s/it]

[I 2025-12-03 06:46:24,692] Trial 13 finished with value: 0.6121411764705882 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(500), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  32%|███▏      | 16/50 [31:52<1:11:07, 125.50s/it]

[I 2025-12-03 06:47:02,586] Trial 19 finished with value: 0.6288078431372549 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.1}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  34%|███▍      | 17/50 [32:24<53:30, 97.28s/it]   

[I 2025-12-03 06:47:34,231] Trial 18 finished with value: 0.6435450980392157 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  36%|███▌      | 18/50 [33:38<48:15, 90.48s/it]

[I 2025-12-03 06:48:48,905] Trial 15 finished with value: 0.6009882352941176 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(400), 'learning_rate': np.float64(0.002130941015366798), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 3, 'gamma': 0.2, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.001}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  38%|███▊      | 19/50 [36:48<1:02:13, 120.43s/it]

[I 2025-12-03 06:51:59,082] Trial 14 finished with value: 0.5635686274509805 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(1.0), 'colsample_bytree': np.float64(0.8), 'min_child_weight': 7, 'gamma': 0.2, 'reg_lambda': np.float64(100.0), 'reg_alpha': 0.0}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  40%|████      | 20/50 [43:03<1:38:20, 196.67s/it]

[I 2025-12-03 06:58:13,440] Trial 16 finished with value: 0.6434745098039215 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.2, 'reg_lambda': np.float64(1.5848931924611134), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  42%|████▏     | 21/50 [45:37<1:28:55, 183.97s/it]

[I 2025-12-03 07:00:47,799] Trial 23 finished with value: 0.5948392156862745 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.0}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  44%|████▍     | 22/50 [47:00<1:11:38, 153.51s/it]

[I 2025-12-03 07:02:10,275] Trial 25 finished with value: 0.6097019607843137 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(300), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.5, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.0}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  46%|████▌     | 23/50 [51:47<1:27:10, 193.72s/it]

[I 2025-12-03 07:06:57,773] Trial 22 finished with value: 0.6240549019607843 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  48%|████▊     | 24/50 [53:09<1:09:19, 160.00s/it]

[I 2025-12-03 07:08:19,122] Trial 24 finished with value: 0.6246745098039216 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.0}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  50%|█████     | 25/50 [54:29<56:39, 136.00s/it]  

[I 2025-12-03 07:09:39,122] Trial 27 finished with value: 0.6471450980392157 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  52%|█████▏    | 26/50 [55:17<43:55, 109.81s/it]

[I 2025-12-03 07:10:27,833] Trial 28 finished with value: 0.6471450980392157 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  54%|█████▍    | 27/50 [56:01<34:27, 89.88s/it] 

[I 2025-12-03 07:11:11,230] Trial 20 finished with value: 0.6121568627450981 and parameters: {'max_depth': np.int64(4), 'n_estimators': np.int64(700), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 5, 'gamma': 0.5, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.1}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  56%|█████▌    | 28/50 [56:46<28:06, 76.65s/it]

[I 2025-12-03 07:11:56,982] Trial 21 finished with value: 0.6340313725490196 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  58%|█████▊    | 29/50 [1:01:08<46:15, 132.17s/it]

[I 2025-12-03 07:16:18,701] Trial 26 finished with value: 0.6246980392156863 and parameters: {'max_depth': np.int64(7), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.004540909610972475), 'subsample': np.float64(0.8), 'colsample_bytree': np.float64(0.7), 'min_child_weight': 7, 'gamma': 0.5, 'reg_lambda': np.float64(25.118864315095795), 'reg_alpha': 0.0}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  60%|██████    | 30/50 [1:01:28<32:48, 98.43s/it] 

[I 2025-12-03 07:16:38,412] Trial 29 finished with value: 0.6471450980392157 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  62%|██████▏   | 31/50 [1:02:03<25:09, 79.46s/it]

[I 2025-12-03 07:17:13,603] Trial 30 finished with value: 0.6471450980392157 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 10. Best value: 0.662839:  64%|██████▍   | 32/50 [1:04:11<28:13, 94.06s/it]

[I 2025-12-03 07:19:21,724] Trial 31 finished with value: 0.6471450980392157 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(600), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 10 with value: 0.6628392156862745.


Best trial: 34. Best value: 0.679812:  66%|██████▌   | 33/50 [1:06:08<28:34, 100.87s/it]

[I 2025-12-03 07:21:18,474] Trial 34 finished with value: 0.6798117647058823 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.01}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  68%|██████▊   | 34/50 [1:08:23<29:40, 111.30s/it]

[I 2025-12-03 07:23:34,121] Trial 32 finished with value: 0.6455764705882353 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  70%|███████   | 35/50 [1:09:13<23:09, 92.65s/it] 

[I 2025-12-03 07:24:23,267] Trial 33 finished with value: 0.6455764705882353 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(900), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(1.0), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(6.30957344480193), 'reg_alpha': 0.01}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  72%|███████▏  | 36/50 [1:09:55<18:06, 77.61s/it]

[I 2025-12-03 07:25:05,784] Trial 35 finished with value: 0.6567686274509804 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.01}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  74%|███████▍  | 37/50 [1:10:20<13:24, 61.89s/it]

[I 2025-12-03 07:25:30,988] Trial 37 finished with value: 0.6398823529411765 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.9), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.01}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  76%|███████▌  | 38/50 [1:12:31<16:29, 82.47s/it]

[I 2025-12-03 07:27:41,476] Trial 38 finished with value: 0.6567686274509804 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.043939705607607904), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.01}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  78%|███████▊  | 39/50 [1:14:05<15:46, 86.08s/it]

[I 2025-12-03 07:29:15,967] Trial 36 finished with value: 0.6184078431372549 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  80%|████████  | 40/50 [1:16:20<16:46, 100.68s/it]

[I 2025-12-03 07:31:30,692] Trial 39 finished with value: 0.6057882352941176 and parameters: {'max_depth': np.int64(6), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  82%|████████▏ | 41/50 [1:17:02<12:27, 83.06s/it] 

[I 2025-12-03 07:32:12,661] Trial 45 finished with value: 0.6082039215686275 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 1.0}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  84%|████████▍ | 42/50 [1:18:40<11:40, 87.59s/it]

[I 2025-12-03 07:33:50,831] Trial 40 finished with value: 0.6057882352941176 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 10.0}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  86%|████████▌ | 43/50 [1:19:23<08:39, 74.18s/it]

[I 2025-12-03 07:34:33,707] Trial 46 finished with value: 0.6371058823529412 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(200), 'learning_rate': np.float64(0.09363292088239417), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 5, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 1.0}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  88%|████████▊ | 44/50 [1:19:30<05:23, 53.85s/it]

[I 2025-12-03 07:34:40,111] Trial 41 finished with value: 0.6589019607843137 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.6), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.001}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  90%|█████████ | 45/50 [1:20:35<04:46, 57.39s/it]

[I 2025-12-03 07:35:45,789] Trial 42 finished with value: 0.6414823529411764 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.001}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  92%|█████████▏| 46/50 [1:21:04<03:15, 48.91s/it]

[I 2025-12-03 07:36:14,920] Trial 43 finished with value: 0.6414823529411764 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.001}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  94%|█████████▍| 47/50 [1:22:31<03:00, 60.32s/it]

[I 2025-12-03 07:37:41,854] Trial 44 finished with value: 0.6414823529411764 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.020619860095022202), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.001}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  96%|█████████▌| 48/50 [1:28:38<05:04, 152.20s/it]

[I 2025-12-03 07:43:48,435] Trial 47 finished with value: 0.6418274509803921 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.001}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812:  98%|█████████▊| 49/50 [1:28:50<01:50, 110.13s/it]

[I 2025-12-03 07:44:00,391] Trial 48 finished with value: 0.6422117647058825 and parameters: {'max_depth': np.int64(3), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.009676410537094535), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.0, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.01}. Best is trial 34 with value: 0.6798117647058823.


Best trial: 34. Best value: 0.679812: 100%|██████████| 50/50 [1:31:05<00:00, 109.32s/it]


[I 2025-12-03 07:46:16,029] Trial 49 finished with value: 0.6312313725490197 and parameters: {'max_depth': np.int64(5), 'n_estimators': np.int64(800), 'learning_rate': np.float64(0.001), 'subsample': np.float64(0.7), 'colsample_bytree': np.float64(0.6), 'min_child_weight': 7, 'gamma': 0.1, 'reg_lambda': np.float64(0.3981071705534972), 'reg_alpha': 0.001}. Best is trial 34 with value: 0.6798117647058823.
train_dataset_7 CV AUC mean: 0.6798117647058823
Loading 76 .tsv files from test_dataset_7_1 (remaining: []).


Loading TSV files: 100%|██████████| 76/76 [00:15<00:00,  4.75it/s]
Encoding k=3 advanced: 100%|██████████| 76/76 [01:17<00:00,  1.02s/it] 


wrote results_03E_kmer/test_dataset_7_1_submission.tsv
Loading 100 .tsv files from test_dataset_7_2 (remaining: []).


Loading TSV files: 100%|██████████| 100/100 [00:12<00:00,  7.97it/s]
Encoding k=3 advanced: 100%|██████████| 100/100 [01:03<00:00,  1.58it/s]


wrote results_03E_kmer/test_dataset_7_2_submission.tsv


Loading files: 100%|██████████| 302/302 [00:51<00:00,  5.83it/s]
scoring sequences:   0%|          | 283660/67190004 [03:01<6:43:16, 2765.11it/s] 

In [15]:
def concatenate_output_files(out_dir: str) -> None:

    predictions_pattern = os.path.join(out_dir, '*_submission.tsv')
    sequences_pattern = os.path.join(out_dir, '*_important_sequences.tsv')

    predictions_files = sorted(glob.glob(predictions_pattern))
    sequences_files = sorted(glob.glob(sequences_pattern))

    df_list = []

    for pred_file in predictions_files:
        try:
            df = pd.read_csv(pred_file, sep='\t')
            df_list.append(df)
        except Exception as e:
            print(f"Warning: Could not read predictions file '{pred_file}'. Error: {e}. Skipping.")
            continue

    for seq_file in sequences_files:
        try:
            df = pd.read_csv(seq_file, sep='\t')
            df_list.append(df)
        except Exception as e:
            print(f"Warning: Could not read sequences file '{seq_file}'. Error: {e}. Skipping.")
            continue

    if not df_list:
        print("Warning: No output files were found to concatenate.")
        concatenated_df = pd.DataFrame(
            columns=['ID', 'dataset', 'label_positive_probability', 'junction_aa', 'v_call', 'j_call'])
    else:
        concatenated_df = pd.concat(df_list, ignore_index=True)
    submissions_file = os.path.join(out_dir, 'submissions.csv')
    concatenated_df.to_csv(submissions_file, index=False)
    print(f"Concatenated output written to `{submissions_file}`.")

In [16]:
concatenate_output_files('results_03E_kmer/')


Concatenated output written to `results_03E_kmer/submissions.csv`.


In [18]:
concatenate_output_files('test/')


Concatenated output written to `test/submissions.csv`.
