In [None]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

import xgboost as xgb

**Загрузка и обработка данных**

In [11]:
# Read the train and test tables from the parquet files in the working directory.
train_df, test_df = (
    pd.read_parquet(path) for path in ("train_data.pqt", "test_data.pqt")
)

In [12]:
# Columns that are treated as categorical features throughout the notebook.
cat_cols = [
    "channel_code",
    "city",
    "city_type",
    "okved",
    "segment",
    "start_cluster",
    "index_city_code",
    "ogrn_month",
    "ogrn_year",
]

In [13]:
# Cast the categorical feature columns of both tables to the pandas category dtype.
for frame in (train_df, test_df):
    frame[cat_cols] = frame[cat_cols].astype("category")

train_df

Unnamed: 0,id,date,balance_amt_avg,balance_amt_max,balance_amt_min,balance_amt_day_avg,channel_code,city,city_type,index_city_code,...,cnt_cred_g_oper_3m,cnt_days_cred_g_oper_3m,sum_deb_h_oper_3m,cnt_deb_h_oper_3m,cnt_days_deb_h_oper_3m,sum_cred_h_oper_3m,cnt_cred_h_oper_3m,cnt_days_cred_h_oper_3m,start_cluster,end_cluster
0,0,month_1,0.744845,0.705492,1.287207,0.748101,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.951166,0.568681,0.897565,0.553624,0.774354,0.936506,0.295984,0.967947,"{α, γ}",{other}
1,0,month_2,1.049605,0.831916,2.458609,1.053805,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.948812,0.499716,0.785029,0.551904,0.696576,0.990157,0.298873,0.945969,"{α, γ}",{other}
2,0,month_3,0.692653,0.740253,0.430042,0.695747,channel_code_5,city_23,city_type_0,index_city_code_39,...,0.946458,0.442244,0.877050,0.551044,0.663243,0.810065,0.294829,0.956958,"{α, γ}",{other}
3,1,month_1,-0.081586,-0.091860,-0.114040,-0.080890,channel_code_2,city_14,city_type_0,,...,0.945281,0.407762,0.369318,0.567093,0.785465,-0.184002,0.253523,0.462452,{other},{other}
4,1,month_2,-0.094962,-0.100504,-0.119302,-0.094307,channel_code_2,city_14,city_type_0,,...,0.946066,0.430750,0.067275,0.559928,0.696576,-0.183854,0.255545,0.495419,{other},{other}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599995,199998,month_2,-0.156775,-0.204960,-0.125987,-0.156311,channel_code_9,city_14,city_type_0,,...,,,-0.165588,,,-0.201123,,,{},{}
599996,199998,month_3,-0.156776,-0.204960,-0.125995,-0.156312,channel_code_9,city_14,city_type_0,,...,0.944497,0.384773,-0.165588,0.546889,0.407687,-0.201123,0.250924,0.374540,{α},{}
599997,199999,month_1,-0.156712,-0.204913,-0.125831,-0.156248,channel_code_14,city_1876,city_type_0,index_city_code_195,...,,,-0.165588,,,-0.201123,,,{},{}
599998,199999,month_2,-0.156712,-0.204913,-0.125831,-0.156248,channel_code_14,city_1876,city_type_0,index_city_code_195,...,,,-0.165588,,,-0.201123,,,{},{}


In [14]:
def extend_test(df):
    """Return a sorted copy of ``df`` in which ids that start at ``month_5`` gain a ``month_4`` row.

    The frame is sorted by ``id`` and ``date``. For every id whose first row is
    ``month_5`` (i.e. it has no ``month_4`` row), that ``month_5`` row is
    duplicated with its ``date`` set to ``'month_4'``. The input frame is not
    modified.

    Args:
        df: frame with at least the columns ``id`` and ``date``.

    Returns:
        A new frame, sorted by ``id`` and ``date`` with a fresh 0..n-1 index.
    """
    ddf = df.sort_values(by=['id', 'date'], ascending=True).reset_index(drop=True)

    # After sorting, a row opens a new id when its id differs from the row
    # above it; the very first row compares against NaN and therefore counts.
    starts_new_id = ddf['id'] != ddf['id'].shift(1)
    mask = (ddf['date'] == 'month_5') & starts_new_id

    # The mask is positional for the *sorted* frame, so it must be applied to
    # ``ddf``; applying it to the unsorted input would pick unrelated rows.
    rows_to_insert = ddf[mask].copy()
    rows_to_insert['date'] = 'month_4'

    ddf = pd.concat([ddf, rows_to_insert], ignore_index=True)
    ddf = ddf.sort_values(by=['id', 'date'], ascending=True).reset_index(drop=True)

    return ddf

def _assert_extend(df):
  for i in range(1, len(df)):
        if df.loc[i, 'date'] == 'month_5':
          assert(df.loc[i - 1, 'date'] == 'month_4')

# Add the missing month_4 rows to the test table, verify the month_4 -> month_5
# ordering, and print the resulting shape.
test_df = extend_test(test_df)
_assert_extend(test_df)
print(test_df.shape)

(300000, 92)


**Удаление загрязненных признаков**

In [15]:
# A column counts as "mostly missing" when more than half of the training rows are NaN.
threshold = len(train_df) / 2

# Every column that is neither categorical, the date, nor the target.
num_cols = train_df.columns.difference(cat_cols + ['date', 'end_cluster'])

nan_counts = train_df[num_cols].isna().sum()
col_with_half_nan = nan_counts.index[nan_counts > threshold].tolist()

In [16]:
def prepare(df, col_with_half_nan, num_cols):
    """Return a cleaned copy of ``df`` with no missing feature values.

    Steps: drop the columns listed in ``col_with_half_nan``, replace missing
    values in the module-level ``cat_cols`` columns with the string ``"NaN"``
    (keeping them categorical), and replace missing values in all remaining
    columns except ``date`` and ``end_cluster`` with ``-1000``.

    Args:
        df: input frame; it is not modified.
        col_with_half_nan: names of the columns to drop.
        num_cols: accepted for call compatibility; the value passed in is not
            used, the numeric column set is recomputed after the drop.

    Returns:
        The cleaned frame.
    """
    cleaned = df.drop(columns=col_with_half_nan)

    # Go through object dtype before filling, presumably because "NaN" is not
    # one of the existing categories; the columns are made categorical again after.
    as_objects = cleaned[cat_cols].astype('object')
    cleaned[cat_cols] = as_objects.fillna("NaN").astype("category")

    remaining = cleaned.columns.difference(cat_cols + ['date', 'end_cluster'])
    cleaned.loc[:, remaining] = cleaned[remaining].fillna(-1000)

    return cleaned

def fill_start_cluster(df):
    """Return a copy of ``df`` whose missing ``start_cluster`` values are forward-filled.

    Each missing value takes the last non-missing ``start_cluster`` above it
    in row order, so the frame should already be sorted as intended by the
    caller. The input frame is not modified.

    Args:
        df: frame with a ``start_cluster`` column.

    Returns:
        The copy with ``start_cluster`` forward-filled.
    """
    ddf = df.copy()

    # ``Series.ffill()`` replaces the deprecated ``fillna(method='ffill')`` spelling.
    ddf['start_cluster'] = ddf['start_cluster'].ffill()

    return ddf

In [None]:
# Clean the training table using the mostly-missing columns found on the training data.
train_df = prepare(train_df, col_with_half_nan, num_cols)

# Forward-fill start_cluster in the test table first, then apply the same cleaning.
test_df = fill_start_cluster(test_df)
test_df = prepare(test_df, col_with_half_nan, num_cols)

In [18]:
# Neither prepared table may contain a missing value at this point.
assert not test_df.isna().any().any()
assert not train_df.isna().any().any()

In [None]:
# The 17 cluster labels, listed in ascending code-point order.
# weighted_roc_auc passes this list as `labels` to roc_auc_score, and the
# per-class weights are looked up in this same order.
labels = [
    "{other}",
    "{}",
    "{α, β}",
    "{α, γ}",
    "{α, δ}",
    "{α, ε, η}",
    "{α, ε, θ}",
    "{α, ε, ψ}",
    "{α, ε}",
    "{α, η}",
    "{α, θ}",
    "{α, λ}",
    "{α, μ}",
    "{α, π}",
    "{α, ψ}",
    "{α}",
    "{λ}",
]

**Модели**

In [19]:
from catboost import CatBoostClassifier

# Per-cluster weights read from an Excel sheet and indexed by its "cluster" column.
cluster_weights = pd.read_excel('cluster_weights.xlsx').set_index("cluster")
# Mapping from cluster label to the value in the "unnorm_weight" column.
weights_dict = cluster_weights["unnorm_weight"].to_dict()

def weighted_roc_auc(y_true, y_pred):
    """Compute the weighted one-vs-rest ROC AUC over the classes in ``labels``.

    Per-class weights come from the module-level ``weights_dict`` and are
    normalised to sum to one before being applied.

    Args:
        y_true: true class labels.
        y_pred: predicted class probabilities, one column per entry of ``labels``.

    Returns:
        A tuple ``(weighted_score, per_class_scores)``.
    """
    raw_weights = np.array([weights_dict[name] for name in labels])
    per_class = roc_auc_score(
        y_true, y_pred, labels=labels, multi_class="ovr", average=None
    )
    normalised = raw_weights / raw_weights.sum()
    return sum(normalised * per_class), per_class

def get_class_weights():
    """Return the unnormalised weight of every class, in the order of ``labels``.

    The values are read from the module-level ``weights_dict`` and returned
    as-is; no normalisation is applied.

    Returns:
        A numpy array with one weight per entry of ``labels``.
    """
    return np.array([weights_dict[label] for label in labels])


class ITPurpleModel:
    """Wrapper around a GPU ``CatBoostClassifier`` that reports weighted ROC AUC."""

    def __init__(self, cat_cols, lr=0.05, iterations=100, depth=6, set_w=True):
        """Build the underlying classifier.

        Args:
            cat_cols: names of the categorical feature columns.
            lr: learning rate.
            iterations: number of boosting iterations.
            depth: tree depth.
            set_w: when true (the default) the per-class weights from
                ``get_class_weights()`` are passed to the classifier; when
                false the classifier is built without class weights.
        """
        self.model = CatBoostClassifier(
            cat_features=cat_cols,
            iterations=iterations,
            depth=depth,
            learning_rate=lr,
            random_seed=42,
            # NOTE(review): fit() below passes no eval_set, so these
            # overfitting-detector settings presumably never trigger — confirm.
            od_type="Iter",
            od_wait=100,
            task_type="GPU",
            devices="0:1",
            class_weights=list(get_class_weights()) if set_w else None,
        )

    def fit(self, x_train, y_train, x_val=None, y_val=None):
        """Fit on the training data and print the weighted ROC AUC.

        ``x_val`` / ``y_val`` are only used to print a validation score after
        training; they are not passed to the classifier.
        """
        self.model.fit(x_train, y_train, metric_period=100)

        print("fitting done!")
        print(
            "Your roc_auc on train: ", self.weighted_roc_auc(x_train, y_train)
        )
        if x_val is not None:
            print("Your roc_auc on val: ", self.weighted_roc_auc(x_val, y_val))

    def predict_proba(self, x):
        """Return the class probabilities predicted by the underlying model."""
        return self.model.predict_proba(x)

    def output_feature_importances(self):
        """Return the feature importances sorted from most to least important.

        Returns:
            A DataFrame with the columns ``Feature``, ``Importance`` and
            ``Top``, where ``Top`` is the 1-based rank after sorting.
        """
        ranked = pd.DataFrame(
            {
                "Feature": self.model.feature_names_,
                "Importance": self.model.feature_importances_,
            }
        ).sort_values(by="Importance", ascending=False)
        # The rank is assigned after sorting so that Top == 1 is the most
        # important feature.
        ranked["Top"] = list(range(1, len(ranked) + 1))
        return ranked

    def visualize_importances(self, top=15):
        """Draw a horizontal bar chart of the ``top`` most important features."""
        top_features = self.output_feature_importances().head(top)

        feature_names = top_features["Feature"].tolist()
        importance_values = top_features["Importance"].astype(float).tolist()

        plt.figure(figsize=(10, 6))
        plt.barh(feature_names, importance_values, color='skyblue')
        plt.xlabel('Importance')
        plt.ylabel('Feature')
        # The title follows the requested count instead of a fixed "15".
        plt.title(f'Top {top} Most Important Features')
        plt.gca().invert_yaxis()
        plt.show()

    def weighted_roc_auc(self, x, y):
        """Return the module-level ``weighted_roc_auc`` of this model's predictions on ``x``."""
        return weighted_roc_auc(y, self.predict_proba(x))

In [35]:
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Module-level encoder used by ITPurpleXGB.fit, which re-fits it on every call.
label_encoder = LabelEncoder()

class ITPurpleXGB:
    """Wrapper around an ``XGBClassifier`` that reports weighted ROC AUC."""

    def __init__(self, cat_cols, lr=0.05, iterations=100, depth=6, set_w=True):
        """Build the underlying classifier.

        Args:
            cat_cols: names of the columns to cast to the category dtype
                before fitting and predicting.
            lr: learning rate.
            iterations: number of boosting rounds (``n_estimators``).
            depth: maximum tree depth.
            set_w: accepted for signature parity with ``ITPurpleModel``; it is
                not used by this class.
        """
        self.cat_cols = cat_cols
        self.model = XGBClassifier(
            # NOTE(review): the target has more than two classes; the sklearn
            # wrapper presumably switches to a multi-class objective itself — confirm.
            objective="binary:logistic",
            tree_method="hist",
            device="gpu",
            n_estimators=iterations,
            learning_rate=lr,  # learning rate requested by the caller
            max_depth=depth,  # maximum tree depth requested by the caller
            enable_categorical=True,
            verbosity=1,
        )

    def _with_categories(self, x):
        """Return a copy of ``x`` whose ``cat_cols`` columns have the category dtype."""
        return x.astype({col: 'category' for col in self.cat_cols})

    def fit(self, x_train, y_train, x_val=None, y_val=None):
        """Fit on the training data and print the weighted ROC AUC.

        The string labels are encoded with the module-level ``label_encoder``.
        ``x_train`` itself is left untouched; the dtype cast happens on a copy.
        ``x_val`` / ``y_val`` are only used to print a validation score.
        """
        features = self._with_categories(x_train)

        y_train_encoded = label_encoder.fit_transform(y_train)

        self.model.fit(features, y_train_encoded)

        print("fitting done!")
        print(
            "Your roc_auc on train: ", self.weighted_roc_auc(x_train, y_train)
        )
        if x_val is not None:
            print("Your roc_auc on val: ", self.weighted_roc_auc(x_val, y_val))

    def predict_proba(self, x):
        """Return class probabilities for ``x`` without modifying the caller's frame."""
        return self.model.predict_proba(self._with_categories(x))

    def weighted_roc_auc(self, x, y):
        """Return the module-level ``weighted_roc_auc`` of this model's predictions on ``x``."""
        return weighted_roc_auc(y, self.predict_proba(x))

**Предсказания: подготавливаем данные для композиции моделей**

In [36]:
# Split the data by month

def merge321(dfm3, dfm2, dfm1):
    """Append the columns of two earlier months to the latest month's frame.

    For every column of ``dfm2`` except ``end_cluster`` and ``id``, the column
    from ``dfm1`` is appended with the suffix ``-2`` and the column from
    ``dfm2`` with the suffix ``-1``. Rows are matched by index, so the three
    frames are expected to share the same index.

    Args:
        dfm3: latest month; its columns come first in the result.
        dfm2: previous month.
        dfm1: month before the previous one.

    Returns:
        A new frame; none of the inputs is modified.
    """
    columns_to_merge = [col for col in dfm2.columns if col != 'end_cluster' and col != 'id']
    merged_columns = []
    for col in columns_to_merge:
        # rename() already returns a new Series, so no explicit copy is needed.
        merged_columns.append(dfm1[col].rename(col + '-2'))
        merged_columns.append(dfm2[col].rename(col + '-1'))
    return pd.concat([dfm3] + merged_columns, axis=1)

def merge21(dfm2, dfm1):
    """Append the previous month's columns to the current month's frame.

    Every column of ``dfm1`` except ``end_cluster`` and ``id`` is appended
    with the suffix ``-1``. Rows are matched by index, so both frames are
    expected to share the same index.

    Args:
        dfm2: current month; its columns come first in the result.
        dfm1: previous month.

    Returns:
        A new frame; neither input is modified.
    """
    columns_to_merge = [col for col in dfm1.columns if col != 'end_cluster' and col != 'id']
    # rename() already returns a new Series, so no explicit copy is needed.
    merged_columns = [dfm1[col].rename(col + '-1') for col in columns_to_merge]
    return pd.concat([dfm2] + merged_columns, axis=1)


def load_train(train_df):
    """Split the training table into per-month frames with lagged features.

    Returns:
        ``(dfm1, dfm2, dfm3)`` for ``month_1``, ``month_2`` and ``month_3``.
        ``dfm2`` additionally carries the ``month_1`` columns (suffix ``-1``);
        ``dfm3`` carries the ``month_1`` (``-2``) and ``month_2`` (``-1``)
        columns. Both also get 17 placeholder columns per lag, named
        ``"<k>-1"`` / ``"<k>-2"`` and filled with the first numeric column of
        ``month_1``.
    """
    def month_frame(month):
        # Rows of one month, without the date column, re-indexed from zero.
        return train_df[train_df['date'] == month].drop(columns=['date']).reset_index(drop=True)

    dfm1 = month_frame('month_1')
    dfm2 = month_frame('month_2')
    dfm3 = month_frame('month_3')

    # merge321 must see the un-merged month_2 frame, so it runs before merge21.
    dfm3 = merge321(dfm3, dfm2, dfm1)
    dfm2 = merge21(dfm2, dfm1)

    placeholder_col = num_cols[0]
    for k in range(17):
        dfm2[str(k) + '-1'] = dfm1[placeholder_col].copy()
    for k in range(17):
        dfm3[str(k) + '-2'] = dfm1[placeholder_col].copy()
        dfm3[str(k) + '-1'] = dfm1[placeholder_col].copy()

    return (dfm1, dfm2, dfm3)

def load_test(test_df):
    """Split the test table into per-month frames with lagged features.

    Returns:
        ``(tfm1, tfm2, tfm3)`` for ``month_4``, ``month_5`` and ``month_6``.
        ``tfm2`` additionally carries the ``month_4`` columns (suffix ``-1``);
        ``tfm3`` carries the ``month_4`` (``-2``) and ``month_5`` (``-1``)
        columns. Both also get 17 placeholder columns per lag, named
        ``"<k>-1"`` / ``"<k>-2"`` and filled with the first numeric column of
        ``month_4``.
    """
    def month_frame(month):
        # Rows of one month, without the date column, re-indexed from zero.
        return test_df[test_df['date'] == month].drop(columns=['date']).reset_index(drop=True)

    tfm1 = month_frame('month_4')
    tfm2 = month_frame('month_5')
    tfm3 = month_frame('month_6')

    # merge321 must see the un-merged month_5 frame, so it runs before merge21.
    tfm3 = merge321(tfm3, tfm2, tfm1)
    tfm2 = merge21(tfm2, tfm1)

    placeholder_col = num_cols[0]
    for k in range(17):
        tfm2[str(k) + '-1'] = tfm1[placeholder_col].copy()
    for k in range(17):
        tfm3[str(k) + '-2'] = tfm1[placeholder_col].copy()
        tfm3[str(k) + '-1'] = tfm1[placeholder_col].copy()

    return (tfm1, tfm2, tfm3)

def load_cat_cols(cat_cols):
    """Return the categorical column lists for the one-, two- and three-month frames.

    Args:
        cat_cols: base categorical column names.

    Returns:
        A tuple of three lists: ``cat_cols`` itself (the same list object),
        that list plus every name suffixed with ``-1``, and the second list
        plus every name suffixed with ``-2``.
    """
    with_lag1 = cat_cols + [name + '-1' for name in cat_cols]
    with_lag2 = with_lag1 + [name + '-2' for name in cat_cols]
    return (cat_cols, with_lag1, with_lag2)

In [37]:
def get_mpred(df, cat_cols, iterations=1000, depth=4, lr=0.05, m='cat'):
    """Fit a model on ``df`` and return its predictions on that same data.

    Args:
        df: frame holding the features and the ``end_cluster`` target.
        cat_cols: categorical feature names passed to the model.
        iterations: number of boosting iterations.
        depth: tree depth.
        lr: learning rate.
        m: ``'cat'`` for ``ITPurpleModel`` or ``'xgb'`` for ``ITPurpleXGB``.

    Returns:
        A tuple ``(probabilities, model)`` where ``probabilities`` are the
        fitted model's predictions on the training features.

    Raises:
        ValueError: if ``m`` is neither ``'cat'`` nor ``'xgb'``.
    """
    # Validate the model kind up front; previously an unknown value left
    # ``model`` unbound and failed later with an UnboundLocalError.
    if m == 'cat':
        model_cls = ITPurpleModel
    elif m == 'xgb':
        model_cls = ITPurpleXGB
    else:
        raise ValueError(f"unknown model kind {m!r}; expected 'cat' or 'xgb'")

    X = df.drop(["end_cluster"], axis=1)
    y = df["end_cluster"]

    model = model_cls(cat_cols, lr=lr, iterations=iterations, depth=depth)
    model.fit(X, y)
    return model.predict_proba(X), model

In [38]:
import warnings
# Silence pandas' PerformanceWarning from here on.
warnings.filterwarnings("ignore", category=pd.errors.PerformanceWarning)

# Build the per-month train and test frames and the matching categorical column lists.
dfm1, dfm2, dfm3 = load_train(train_df)
tfm1, tfm2, tfm3 = load_test(test_df)
cat_cols1, cat_cols2, cat_cols3 = load_cat_cols(cat_cols)

**Обучение**

In [39]:
def pipe(m, iters):
    """Train the three chained models, feeding each one the earlier predictions.

    The model for ``dfm1`` is trained first; its 17 probability columns are
    written into ``dfm2`` as ``"<k>-1"``. The model for ``dfm2`` is trained
    next; the first and second models' probabilities are written into
    ``dfm3`` as ``"<k>-2"`` and ``"<k>-1"``. The module-level ``dfm2`` and
    ``dfm3`` frames are modified in place.

    Args:
        m: model kind forwarded to ``get_mpred`` (``'cat'`` selects depth 6
            and learning rate 0.003, anything else depth 5 and 0.0005).
        iters: three iteration counts, one per phase.

    Returns:
        The three fitted models ``(m1, m2, m3)``.
    """
    use_cat = m == 'cat'
    depth = 6 if use_cat else 5
    lr = 0.003 if use_cat else 0.0005
    n_classes = 17

    print("Phase 1")
    mpred1, m1 = get_mpred(dfm1, cat_cols=cat_cols1, iterations=iters[0], depth=depth, m=m, lr=lr)

    for k in range(n_classes):
        dfm2[str(k) + '-1'] = mpred1[:, k]

    print("Phase 2")
    mpred2, m2 = get_mpred(dfm2, cat_cols=cat_cols2, iterations=iters[1], depth=depth, m=m, lr=lr)

    for k in range(n_classes):
        dfm3[str(k) + '-2'] = mpred1[:, k]
        dfm3[str(k) + '-1'] = mpred2[:, k]

    print("Phase 3")
    # The third phase's predictions are not needed; only the model is kept.
    _, m3 = get_mpred(dfm3, cat_cols=cat_cols3, iterations=iters[2], depth=depth, m=m, lr=lr)

    return m1, m2, m3

In [None]:
# Train the three chained CatBoost models with 6000 / 12000 / 24000 iterations.
m1, m2, m3 = pipe(m='cat', iters=[6000, 12000, 24000])

**Предсказание end_cluster для month_4, 5**

In [None]:
# Predictions of the first model on the month_4 frame.
tpred1 = m1.predict_proba(tfm1)

# Write them into the month_5 frame as the "<k>-1" columns, then predict month_5.
for i in range(17):
    tfm2[str(i) + '-1'] = tpred1[:, i]
tpred2 = m2.predict_proba(tfm2)

# The month_6 frame receives both earlier prediction sets as "<k>-2" and "<k>-1".
for i in range(17):
    tfm3[str(i) + '-2'] = tpred1[:, i]
    tfm3[str(i) + '-1'] = tpred2[:, i]

**Заполняем start_cluster там, где он пропущен**

In [None]:
# Categorical features for the start_cluster model: a copy of the three-month
# list without start_cluster, which is the target here.
pcat_cols3 = cat_cols3[:]
pcat_cols3.remove('start_cluster')

pmodel = ITPurpleModel(pcat_cols3, iterations=2000, depth=4, lr=0.01)
# NOTE(review): this only sets a Python attribute on the already-constructed
# classifier; the class weights given to the constructor inside ITPurpleModel
# are presumably still what training uses — confirm.
pmodel.model.class_weights=None

# Training features: the month_3 training frame without end_cluster ...
data = dfm3.drop('end_cluster', axis=1)

# ... and the month_6 test frame without the column to be predicted.
pdata = tfm3.drop('start_cluster', axis=1)

X = data.drop('start_cluster', axis=1)

y = data['start_cluster']

pmodel.fit(X, y)

probas = pmodel.predict_proba(pdata)

probas

In [None]:
probas_array = np.array(probas)

# Index of the most probable class for every test row.
max_indices = np.argmax(probas_array, axis=1)

# Decode with the class order of the fitted start_cluster model itself rather
# than the end_cluster `labels` list, so the mapping stays correct even if the
# two label sets or their orders differ.
start_classes = list(pmodel.model.classes_)
tfm3['start_cluster'] = [start_classes[idx] for idx in max_indices]

**Предсказание end_cluster для month_6**

In [None]:
# Predictions of the third model on the month_6 frame; these are what the submission uses.
tpred3 = m3.predict_proba(tfm3)

**Отправка решения**

In [None]:
sample_submission_df = pd.read_csv('sample_submission.csv')
test_pred_proba = pd.DataFrame(tpred3)
# NOTE(review): ids are generated as a consecutive range starting at 200000
# instead of being taken from tfm3 — assumes the rows of tpred3 follow exactly
# that id order; confirm.
test_pred_proba.insert(0, 'id', range(200000, 200000 + len(test_pred_proba)))
# Rename the columns positionally to the header of the sample submission file.
test_pred_proba.columns = sample_submission_df.columns

print(test_pred_proba.head())

# NOTE(review): the prediction frame is assigned to the *sorted* column names;
# presumably this pairs columns by position so that the k-th probability
# column lands under the k-th sorted class name — verify against the sample
# file's column order.
sorted_classes = sorted(test_pred_proba.columns.to_list())
sample_submission_df[sorted_classes] = test_pred_proba
sample_submission_df.to_csv("andrew.csv", index=False)

sample_submission_df.head()