In [None]:
!pip install scikit-learn
!pip install pandas
!pip install numpy

!pip install wandb
!pip install xgboost
!pip install catboost
!pip install DESReg



In [None]:
import gc
import os
import re
from math import ceil
from typing import List, Optional, Union

import numpy as np
import pandas as pd
import torch
import wandb
from catboost import CatBoostRegressor
from catboost import CatBoostClassifier, Pool
from desReg.des.DESRegression import DESRegression
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.ensemble import StackingRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from xgboost import XGBRegressor, XGBRFRegressor, XGBClassifier


In [None]:
# Name of the column to predict and of the column used as the row index.
target = "CORRUCYSTIC_DENSITY"
ID_COL = "LOCAL_IDENTIFIER"

# Read the training file first, then the test file, both indexed by ID_COL.
train, test = (
    pd.read_csv(csv_name, index_col=ID_COL)
    for csv_name in ("MiNDAT_c.csv", "MiNDAT_T.csv")
)

In [None]:
def sanitize_columns(df):
    """Return a copy of ``df`` whose column labels are cleaned, unique strings.

    Every label is converted with ``str`` and each ``[``, ``]``, ``<`` or ``>``
    in it is replaced by ``_``. When cleaning makes two labels identical, the
    later one gets a ``__<k>`` suffix (``k`` = 1, 2, ...). The suffix counter
    keeps advancing until the candidate is not already taken, so a generated
    label can never duplicate another column (e.g. one literally named
    ``a___1``).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new column labels, in the original order.
    """
    df = df.copy()
    new_cols = []
    used = set()        # every label already assigned in the output
    next_suffix = {}    # cleaned base label -> last suffix number handed out
    for c in df.columns:
        base = re.sub(r'[\[\]<>]', '_', str(c))  # replace [, ], < and >
        name = base
        while name in used:
            # Collision: try the next suffix until the label is genuinely free.
            next_suffix[base] = next_suffix.get(base, 0) + 1
            name = f"{base}__{next_suffix[base]}"
        used.add(name)
        new_cols.append(name)
    df.columns = new_cols
    return df

In [None]:
# Columns removed from the training frame: the target plus excluded features.
drop_cols = [target,'Z~x0<k','A>.','>?64:','U"r','TSWm',"w-u:jN'qI",'PZ8','jNhEum',
                     'fPqsI','&%)LTaWRb','r2Ng','v0rt3X','b1oRb13','maT_r', "F'3Ku", 'MINDSPIKE_VERSION']

drop_cols_test = ['fPqsI','&%)LTaWRb','r2Ng','v0rt3X','b1oRb13','maT_r', "F'3Ku", 'MINDSPIKE_VERSION']

X = train.drop(columns=[c for c in drop_cols if c in train.columns])
y = train[target]
Z = test.drop(columns=[c for c in drop_cols_test if c in test.columns])

# drop_cols_test is shorter than drop_cols, so Z could keep columns that were
# removed from X. Restrict the test frame to exactly the training features
# (same labels, same order) so the imputer and the models see matching inputs.
# A training feature absent from the test file becomes an all-NaN column here
# and is later filled by the imputer.
Z = Z.reindex(columns=X.columns)

# Sanitize after aligning so both frames end up with identical column labels.
X = sanitize_columns(X)
Z = sanitize_columns(Z)

# NOTE(review): the imputer is fit on all training rows before the split below,
# so the validation rows contribute to the medians.
impute = SimpleImputer(strategy='median')
X_imp = pd.DataFrame(impute.fit_transform(X), columns=X.columns, index=X.index)


# 80/20 shuffled train/validation split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(X_imp, y, test_size=0.2, random_state=42,shuffle=True)


In [None]:
# Apply the already-fitted imputer to the test features, keeping labels and index.
Z_imp = pd.DataFrame(
    data=impute.transform(Z),
    index=Z.index,
    columns=Z.columns,
)

In [None]:

# Two quantile regressors that differ only in the quantile level:
# model_bot uses alpha=0.1, model_top uses alpha=0.9 (built in that order).
model_bot, model_top = (
    CatBoostRegressor(
        loss_function=f"Quantile:alpha={alpha}",
        iterations=2000,
        depth=6,
        learning_rate=0.035,
        verbose=0,
        task_type="CPU",
        thread_count=1,
    )
    for alpha in ("0.1", "0.9")
)

# Settings for the RMSE model; unlike the quantile models it is configured for GPU.
CB_PARAMS = {
    "task_type": "GPU",
    "devices": [0],
    "loss_function": "RMSE",
    "iterations": 8000,
    "depth": 8,
    "learning_rate": 0.0876,
    "l2_leaf_reg": 6.0,
    "random_state": 42,
    "verbose": False,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.5,
    "thread_count": 1,
}

mid = CatBoostRegressor(**CB_PARAMS)



In [None]:
def tail_rmse_report(y_true, y_pred, n_deciles: int = 10):
    """Report RMSE per quantile bin of the true target and for its tails.

    Rows with a missing value are dropped, the rest are ranked by ``y_true``
    (ties broken by order of appearance) and cut into ``n_deciles``
    equal-frequency bins. RMSE is computed for each bin and for groups of bins
    taken from the bottom, the top and the middle of the ranking.

    Parameters
    ----------
    y_true, y_pred
        True and predicted values; they are combined into one DataFrame.
    n_deciles : int, default 10
        Number of bins requested from ``pd.qcut``.

    Returns
    -------
    (dict, pandas.DataFrame)
        ``rmse_summary`` with keys ``rmse_bottom_10p``, ``rmse_bottom_20p``,
        ``rmse_top_10p``, ``rmse_top_20p`` and ``rmse_mid_20_80p`` (NaN when a
        group is empty), and ``rmse_by_decile`` with one row per bin and the
        columns ``true_y_decile``, ``n``, ``rmse``, ``y_min``, ``y_max``.

    Raises
    ------
    ValueError
        If no rows remain after dropping missing values.
    """
    paired = pd.DataFrame({"y_true": y_true, "y_pred": y_pred})
    s = paired.dropna().astype(float).copy()
    if s.empty:
        raise ValueError("Empty data after alignment / dropna.")

    # Per-row squared error and the bin each row falls into.
    s["err2"] = (s["y_pred"] - s["y_true"]) ** 2
    s["decile"] = pd.qcut(
        s["y_true"].rank(method="first"),
        q=n_deciles,
        labels=False,
        duplicates="drop",
    )

    def _root_mean(values):
        return float(np.sqrt(np.mean(values)))

    # One row per bin: size, RMSE and the range of true values it covers.
    per_bin = s.groupby("decile", observed=True).agg(
        n=("err2", "size"),
        rmse=("err2", _root_mean),
        y_min=("y_true", "min"),
        y_max=("y_true", "max"),
    )
    rmse_by_decile = (
        per_bin.rename_axis("true_y_decile")
        .reset_index()
        .sort_values("true_y_decile", ignore_index=True)
    )

    bins = rmse_by_decile["true_y_decile"].tolist()
    n_bins = len(bins)

    def _width(share):
        # Number of bins standing in for `share` of the data; never fewer than one.
        return max(1, int(round(share * n_bins)))

    w10 = _width(0.10)
    w20 = _width(0.20)

    groups = {
        "rmse_bottom_10p": bins[:w10],
        "rmse_bottom_20p": bins[:w20],
        "rmse_top_10p": bins[-w10:],
        "rmse_top_20p": bins[-w20:],
        "rmse_mid_20_80p": bins[w20:n_bins - w20] if n_bins > 2 * w20 else [],
    }

    def _pooled_rmse(chosen_bins):
        # RMSE over every row whose bin is in `chosen_bins`; NaN if none match.
        squared = s.loc[s["decile"].isin(chosen_bins), "err2"]
        if len(squared) == 0:
            return float("nan")
        return float(np.sqrt(squared.mean()))

    rmse_summary = {label: _pooled_rmse(chosen) for label, chosen in groups.items()}

    return rmse_summary, rmse_by_decile

In [None]:

class CatBoostSklearnAdapter(RegressorMixin, BaseEstimator):
    """scikit-learn style wrapper around a CatBoost regressor.

    ``RegressorMixin`` is listed before ``BaseEstimator`` (the order
    scikit-learn recommends) so the wrapper is recognised as a regressor and
    gains ``score``. ``BaseEstimator`` supplies ``get_params`` / ``set_params``
    based on the ``__init__`` signature.

    Parameters
    ----------
    model : CatBoostRegressor
        The CatBoost model that is trained by ``fit`` and queried by ``predict``.
    """

    def __init__(
        self,
        model: CatBoostRegressor,

    ):
        self.model = model

    def fit(self, X, y):
        """Train the wrapped model on ``(X, y)`` and return this adapter.

        The data is wrapped in a CatBoost ``Pool`` before fitting. Returning
        ``self`` (rather than the inner model) follows the scikit-learn ``fit``
        contract, so ``est = est.fit(X, y)`` keeps the adapter.
        """
        pool = Pool(X, y)
        self.model.fit(pool)
        return self

    def predict(self, X: pd.DataFrame):
        """Return the wrapped model's predictions for ``X``."""
        return self.model.predict(X)



In [None]:
# Regressors handed to the dynamic ensemble: upper-quantile, lower-quantile
# and RMSE CatBoost models, in that order.
pool_reg = [model_top, model_bot, mid]

# Configure the DESReg ensemble, scoring competence with mean squared error.
DES = DESRegression(
    regressors_list=pool_reg,
    n_estimators_bag=3,
    DSEL_perc=0.9,
    XTRAIN_full=False,
    competence_region="output_profiles",
    competence_level=mean_squared_error,
    n_jobs=1,
)

# Train on the training part of the split.
DES.fit(X_train, y_train)

In [None]:
# Ensemble predictions for the held-out validation rows.
y_pred = DES.predict(X_valid)

CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="vzo."": Cannot convert 'vzo."' to float

In [None]:
# Predict on the imputed test matrix (Z_imp) rather than the raw frame Z:
# the models were trained on imputer output, and Z may still hold missing or
# non-numeric values that CatBoost cannot convert to float.
# NOTE(review): these three estimators are only passed to DESRegression in this
# notebook; confirm that DES.fit leaves them fitted before relying on them here.
middle = mid.predict(Z_imp)
top = model_top.predict(Z_imp)
bot = model_bot.predict(Z_imp)

# Final prediction: unweighted mean of the RMSE model and the two quantile models.
target_test = (middle+top+bot)/3


In [None]:
# Store the blended predictions on the test frame under the target column,
# then write only that column (together with the frame's index) to disk.
test[target] = target_test

A = test.loc[:, target]

A.to_csv("sub_2.csv")
