In [1]:
import os 
import random
import numpy as np 

import torch
import tensorflow as tf 
DEFAULT_RANDOM_SEED = 5


def set_all_seeds(seed=DEFAULT_RANDOM_SEED):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random`` module, NumPy's legacy global generator,
    PyTorch (CPU and the current CUDA device) and TensorFlow, and switches
    cuDNN to its deterministic, non-benchmarking mode.

    Args:
        seed: Integer seed handed to every library.
    """
    # Python-level generators.
    # NOTE(review): PYTHONHASHSEED is presumably only honoured by interpreters
    # started after this point (e.g. worker subprocesses) - confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and disable its auto-tuner.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # TensorFlow's global generator.
    tf.random.set_seed(seed)
    

# Seed all libraries once, before any data loading or model training happens.
set_all_seeds(seed=DEFAULT_RANDOM_SEED)

In [2]:
from utils import load_and_join_data, load_data, preprocess_dataframe
from utils import Paths

paths = Paths()

import pandas as pd
import pyarrow.parquet as pq

# load_and_join_data returns an indexable collection; its first two items are
# used as the training frame and the test frame respectively.
df_list = load_and_join_data(paths=paths)
df_train, df_test = df_list[0], df_list[1]

In [3]:
import pandas as pd

# Assumes df_train and df_test are already loaded with the required columns.

# Column combinations to compute per-group target statistics for.
combinations = [
    ('client_id', 'teamid'),
    #('teamid', 'hypothesisid')
]

# NOTE(review): the statistics are computed on the full training frame and
# merged back onto the same rows, so every training row's success_rate
# includes its own target value.
i = 0
for i, combo in enumerate(combinations, start=1):
    keys = list(combo)
    total_col = f'total_records{i}'
    count_col = f'success_count{i}'
    rate_col = f'success_rate{i}'

    # Per group: number of rows and number of rows with target == 1.
    grouped = df_train.groupby(keys).agg(
        **{total_col: ('target', 'size'),
           count_col: ('target', 'sum')}).reset_index()

    # Share of positive targets inside the group.
    grouped[rate_col] = grouped[count_col] / grouped[total_col]

    # Attach the statistics to both frames.
    df_train = df_train.merge(grouped, on=keys, how='left')
    df_test = df_test.merge(grouped[[*combo, total_col, rate_col]], on=keys, how='left')

    # Mark test groups that never occur in train with the -1 sentinel. Only the
    # freshly merged columns are filled: filling the whole frame would also turn
    # pre-existing NaNs in other test columns into -1, while the same NaNs in
    # train are later filled with 0.
    df_test[[total_col, rate_col]] = df_test[[total_col, rate_col]].fillna(-1)

# The raw positive counts are not used as features; drop one per combination.
df_train = df_train.drop([f'success_count{k}' for k in range(1, i + 1)], axis=1)
# df_test keeps success_rate1 / total_records1: the inference cell uses their
# -1 sentinel to route rows between the two models.


In [4]:
# Snapshot of the training frame that still contains success_rate1 and
# total_records1; these columns are removed from df_train in the next cell.
df_train_extra = df_train.copy()

In [5]:
# df_train is the feature set without the group target statistics.
df_train = df_train.drop(columns=['success_rate1', 'total_records1'])

In [6]:
import pandas as pd

# Assumes df_train and df_test are already loaded and contain a 'client_id' column.

# Stack train and test so the per-client row count covers both splits.
combined_df = pd.concat([df_train, df_test])

# Number of rows per client_id in the stacked data.
total_records_by_client = combined_df.groupby('client_id').size()

# Two-column lookup table: client_id -> total_records.
stats_df = total_records_by_client.rename('total_records').reset_index()

# Attach the count to all three working frames.
df_train = df_train.merge(stats_df, on='client_id', how='left')
df_train_extra = df_train_extra.merge(stats_df, on='client_id', how='left')
df_test = df_test.merge(stats_df, on='client_id', how='left')

# Replace every remaining missing value with 0.
df_train = df_train.fillna(0)
df_test = df_test.fillna(0)
df_train_extra = df_train_extra.fillna(0)


In [7]:
# Run the project's preprocess_dataframe helper (imported from utils) on each
# working frame. NOTE(review): its transformations are not visible in this
# notebook - check utils before relying on specific output columns or index.
df_train = preprocess_dataframe(df_train)
df_test = preprocess_dataframe(df_test)
df_train_extra = preprocess_dataframe(df_train_extra)

In [8]:
from sklearn.model_selection import StratifiedKFold
import copy
from collections import Counter
from sklearn.metrics import f1_score
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from catboost.utils import select_threshold
from sklearn.metrics import f1_score
from catboost.utils import get_roc_curve, select_threshold, get_fpr_curve, get_fnr_curve
from sklearn.metrics import roc_curve, f1_score

class CvModel:
    """Cross-validated ensemble built from copies of a single classifier.

    ``fit`` trains one deep copy of the template classifier per stratified
    fold, picks a per-fold probability threshold from the validation ROC curve
    and records the validation F1. ``predict`` lets every fold model vote with
    its own threshold and combines the votes weighted by the fold F1 scores.
    """

    def __init__(self, clf):
        """Store the unfitted template classifier.

        Args:
            clf: Estimator that is deep-copied for every fold. ``fit`` calls it
                with ``eval_set``, ``early_stopping_rounds`` and ``verbose``,
                and it must provide ``predict_proba``.
        """
        self.models = []      # fitted fold models
        self.clf = clf        # template, never fitted itself
        self.scores = []      # validation F1 per fold, used as voting weights
        self.thresholds = []  # per-fold probability cut-off

    def fit(self, X, y, cv=5, desired_fpr=0.01, early_stopping_rounds=350):
        """Train one model per stratified fold and choose its threshold.

        Any previously fitted models, scores and thresholds are discarded.

        Args:
            X: Feature table supporting ``.iloc`` row selection.
            y: Binary target supporting ``.iloc`` row selection.
            cv: Number of stratified folds (no shuffling).
            desired_fpr: Upper bound on the validation false-positive rate used
                to select each fold's threshold.
            early_stopping_rounds: Forwarded to the classifier's ``fit``.

        Note:
            The same validation fold drives early stopping, threshold selection
            and the reported F1, so the printed scores are optimistic.
        """
        self.models = []
        self.scores = []
        self.thresholds = []

        skf = StratifiedKFold(n_splits=cv)
        for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
            now_model = copy.deepcopy(self.clf)
            X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
            y_train, y_val = y.iloc[train_idx], y.iloc[test_idx]

            now_model.fit(
                X_train, y_train,
                eval_set=(X_val, y_val),
                early_stopping_rounds=early_stopping_rounds,
                verbose=False,
            )
            probas = now_model.predict_proba(X_val)[:, 1]
            fpr, _, thresholds = roc_curve(y_val, probas)

            # Last ROC point whose FPR still respects the requested bound.
            threshold = thresholds[np.flatnonzero(fpr <= desired_fpr)[-1]]
            self.thresholds.append(threshold)

            preds = (probas >= threshold).astype(int)
            fold_f1 = f1_score(y_val, preds)
            print(f'Fold {i}/{cv} F1 = {fold_f1}, Threshold = {threshold}')

            self.scores.append(fold_f1)
            self.models.append(now_model)

        print('Mean F1 = {}'.format(np.mean(self.scores)))

    def predict(self, X, treshold = 0.5):
        """Predict binary labels by F1-weighted voting of the fold models.

        Args:
            X: Feature table accepted by the fold models' ``predict_proba``.
            treshold: Minimum weighted share of positive votes required to
                output 1 (parameter name kept as-is for existing callers).

        Returns:
            Integer array of 0/1 predictions, one per row of ``X``.

        Raises:
            RuntimeError: If ``fit`` has not produced any model yet.
        """
        if not self.models:
            raise RuntimeError("There is no fitted model")

        # Positive-class probabilities, one row per fold model.
        preds = np.array([model.predict_proba(X)[:, 1] for model in self.models])

        # Normalise the fold F1 scores into voting weights. When every fold
        # scored 0 there is nothing to normalise by, so weigh all models equally
        # instead of dividing by zero.
        total_score = sum(self.scores)
        if total_score > 0:
            weights = [score / total_score for score in self.scores]
        else:
            weights = [1.0 / len(self.models)] * len(self.models)

        # Each model votes with its own threshold; votes are summed by weight.
        weighted_votes = np.zeros_like(preds[0])
        for weight, cut_off, model_preds in zip(weights, self.thresholds, preds):
            weighted_votes += weight * (model_preds >= cut_off).astype(int)

        return (weighted_votes >= treshold).astype(int)

from sklearn.preprocessing import StandardScaler, MinMaxScaler
    

def _split_feature_names(frame, drop_cols, target_cols):
    """Return (categorical, model, numeric) feature name lists for ``frame``."""
    categorical = frame.select_dtypes(include=['object', 'category']).columns.tolist()
    if 'id' in categorical:
        categorical.remove('id')
    usable = [col for col in frame.columns
              if not (col in target_cols or col in drop_cols)]
    numeric = [col for col in usable if col not in categorical]
    return categorical, usable, numeric


features2drop = ['id']  # columns excluded from the model inputs
targets = ['target']    # target columns

# Model 1: training frame without the group target statistics.
cat_features, filtered_features, num_features = _split_feature_names(
    df_train, features2drop, targets)

scaler1 = MinMaxScaler()
scaler2 = MinMaxScaler()
X1 = df_train[filtered_features].drop(columns=targets, errors='ignore')
X1[num_features] = scaler1.fit_transform(X1[num_features])
y1 = df_train['target']


# Model 2: training frame that keeps the group target statistics.
cat_features_extra, filtered_features_extra, num_features_extra = _split_feature_names(
    df_train_extra, features2drop, targets)

X2 = df_train_extra[filtered_features_extra].drop(columns=targets, errors='ignore')
X2[num_features_extra] = scaler2.fit_transform(X2[num_features_extra])
y2 = df_train_extra['target']

In [9]:
# CatBoost settings for model 1 (features without group target statistics).
params_cat = dict(
    n_estimators=10000,
    depth=6,
    use_best_model=True,
    cat_features=cat_features,
    text_features=[],
    random_state=DEFAULT_RANDOM_SEED,
    loss_function='Logloss',
    auto_class_weights='Balanced',
    thread_count=8,
    task_type="GPU",
)

# Fit one copy of the classifier per stratified fold on X1 / y1.
clf = CatBoostClassifier(**params_cat)
cv_model1 = CvModel(clf)
cv_model1.fit(X1, y1)

Fold 0/5 F1 = 0.10485603781693167, Threshold = 0.8365001964469112
Fold 1/5 F1 = 0.11622901420576841, Threshold = 0.8347843403522709
Fold 2/5 F1 = 0.12140575079872204, Threshold = 0.8352877135881329
Fold 3/5 F1 = 0.12550349798600804, Threshold = 0.831192620215794
Fold 4/5 F1 = 0.11305460750853243, Threshold = 0.833091003507249
Mean F1 = 0.11620978166319251


In [10]:
# CatBoost settings for model 2, which is trained on X2 (the frame that keeps
# the group target statistics).
params_cat2 = {
    "n_estimators": 8000,
    "depth": 6,
    "use_best_model": True,
    # Use the categorical list derived from df_train_extra, the frame X2 is
    # built from, rather than the one derived from df_train.
    "cat_features": cat_features_extra,
    "text_features": [],
    "l2_leaf_reg": 1,
    "bagging_temperature": 1.,
    "random_state": DEFAULT_RANDOM_SEED,
    "loss_function": 'Logloss',
    'eval_metric': 'F1',
    'auto_class_weights': 'Balanced',
    'thread_count' : 8,
    'task_type':"GPU"

}
# Fit one copy of the classifier per stratified fold on X2 / y2.
clf = CatBoostClassifier(**params_cat2)
cv_model2 = CvModel(clf)
cv_model2.fit(X2, y2)

Fold 0/5 F1 = 0.707959393850228, Threshold = 1.934702828033008e-06
Fold 1/5 F1 = 0.9610545236668664, Threshold = 0.44136886657614627
Fold 2/5 F1 = 0.705158264947245, Threshold = 1.435597894311408e-06
Fold 3/5 F1 = 0.7161559059803629, Threshold = 6.123447300018816e-05
Fold 4/5 F1 = 0.7048483960744104, Threshold = 5.8362287510138426e-05
Mean F1 = 0.7590352969038225


In [11]:
from sklearn.metrics import confusion_matrix, classification_report

# In-sample check: X1 is the same data cv_model1 was fitted on, so these
# numbers are not an estimate of out-of-sample quality.
y_pred = cv_model1.predict(X1, treshold=0.4)
conf_matrix = confusion_matrix(y_true=y1, y_pred=y_pred)
report = classification_report(y_true=y1, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.99      0.99      0.99   1009063
           1       0.14      0.14      0.14     12031

    accuracy                           0.98   1021094
   macro avg       0.56      0.56      0.56   1021094
weighted avg       0.98      0.98      0.98   1021094



In [12]:
from sklearn.metrics import confusion_matrix, classification_report

# In-sample check: X2 is the same data cv_model2 was fitted on, so these
# numbers are not an estimate of out-of-sample quality.
y_pred = cv_model2.predict(X2, treshold=0.95)
conf_matrix = confusion_matrix(y_true=y2, y_pred=y_pred)
report = classification_report(y_true=y2, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1009063
           1       0.98      1.00      0.99     12031

    accuracy                           1.00   1021094
   macro avg       0.99      1.00      0.99   1021094
weighted avg       1.00      1.00      1.00   1021094



In [13]:
# Recompute the feature name lists from df_train for the inference step.
features2drop = ['id']  # columns excluded from the model inputs
targets = ['target']    # target columns

# Object / category columns are treated as categorical; 'id' is never a feature.
cat_features = [
    col
    for col in df_train.select_dtypes(include=['object', 'category']).columns.tolist()
    if col != 'id'
]
filtered_features = [
    col for col in df_train.columns
    if not (col in targets or col in features2drop)
]
num_features = [col for col in filtered_features if col not in cat_features]

In [14]:

# Test rows whose group never occurs in train carry the -1 sentinel in both
# statistic columns. They go to model 1 (trained without those columns); all
# other rows go to model 2.
mask = (df_test['success_rate1'] == -1) & (df_test['total_records1'] == -1)
df_test_1 = df_test[mask].drop(columns=['success_rate1', 'total_records1'])
df_test_2 = df_test[~mask]

# Predictions for the first group (model 1).
if not df_test_1.empty:
    data = df_test_1.drop(columns=['target', 'id'], errors='ignore')
    # Select exactly the training columns, in training order, so the model
    # never receives extra or reordered features.
    data = data[X1.columns].copy()
    data[num_features] = scaler1.transform(data[num_features])
    predictions_1 = cv_model1.predict(data, 0.4)
    df_test.loc[mask, 'target'] = predictions_1

# Predictions for the second group (model 2).
if not df_test_2.empty:
    data = df_test_2.drop(columns=['target', 'id'], errors='ignore')
    # Same alignment against the matrix model 2 was trained on.
    data = data[X2.columns].copy()
    data[num_features_extra] = scaler2.transform(data[num_features_extra])
    predictions_2 = cv_model2.predict(data, 0.95)
    df_test.loc[~mask, 'target'] = predictions_2

In [15]:
# Move the first index level into a regular column, then export the
# id / target pairs as the submission file.
df_test = df_test.reset_index(level=0)
df_test.loc[:, ['id', 'target']].to_csv('../subs/overfit.csv', index=False)
print('CSV file saved!')

CSV file saved!


In [16]:
# Distribution of the predicted labels written to the submission.
df_test['target'].value_counts()

target
0.0    337028
1.0      3336
Name: count, dtype: int64

In [17]:
# Importance scores of the fold-0 model of cv_model1, paired with X1's columns.
feature_names = X1.columns
feature_importance = cv_model1.models[0].feature_importances_

# Build the feature / importance table, ranked from most to least important.
df_feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
df_feature_importance

Unnamed: 0,Feature,Importance
27,timestamp_year,18.71288
26,total_records,12.268836
22,position,6.934413
21,role,5.214798
24,employed_days,4.845013
11,egrul_reg_months_ago,4.804881
10,egrul_region,3.476452
1,days_since_last_take,3.291593
9,egrul_business_type,2.879293
29,avg_days_between_takes,2.366537


In [18]:
# Dump a random sample of 1000 training rows to a CSV file in the working
# directory. NOTE(review): looks like a leftover inspection aid - confirm it
# is still needed.
lol = df_train.sample(1000)
lol.to_csv("lol.csv")

In [19]:
# Importance scores of the fold-1 model of cv_model1, paired with X1's columns.
feature_names = X1.columns
feature_importance = cv_model1.models[1].feature_importances_

# Build the feature / importance table, ranked from most to least important.
df_feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
df_feature_importance

Unnamed: 0,Feature,Importance
27,timestamp_year,20.437083
26,total_records,12.68966
22,position,6.155222
24,employed_days,5.054988
11,egrul_reg_months_ago,5.04167
21,role,4.973976
10,egrul_region,3.285403
1,days_since_last_take,3.17795
9,egrul_business_type,2.754464
29,avg_days_between_takes,2.537377


In [20]:
# Importance scores of the fold-2 model of cv_model1, paired with X1's columns.
feature_names = X1.columns
feature_importance = cv_model1.models[2].feature_importances_

# Build the feature / importance table, ranked from most to least important.
df_feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
df_feature_importance

Unnamed: 0,Feature,Importance
27,timestamp_year,19.726302
26,total_records,12.658251
22,position,6.629855
11,egrul_reg_months_ago,4.988026
21,role,4.942132
24,employed_days,4.304321
1,days_since_last_take,3.488846
10,egrul_region,3.18954
9,egrul_business_type,2.826968
29,avg_days_between_takes,2.390579


In [21]:
# Importance scores of the fold-3 model of cv_model1, paired with X1's columns.
feature_names = X1.columns
feature_importance = cv_model1.models[3].feature_importances_

# Build the feature / importance table, ranked from most to least important.
df_feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
df_feature_importance

Unnamed: 0,Feature,Importance
27,timestamp_year,21.461962
26,total_records,13.255615
22,position,5.889444
11,egrul_reg_months_ago,5.332424
21,role,5.033197
24,employed_days,4.298942
1,days_since_last_take,3.289184
10,egrul_region,3.16726
9,egrul_business_type,2.539708
29,avg_days_between_takes,2.212651


In [22]:
# Importance scores of the fold-4 model of cv_model1, paired with X1's columns.
feature_names = X1.columns
feature_importance = cv_model1.models[4].feature_importances_

# Build the feature / importance table, ranked from most to least important.
df_feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Importance': feature_importance})
    .sort_values(by='Importance', ascending=False)
)
df_feature_importance

Unnamed: 0,Feature,Importance
27,timestamp_year,21.577028
26,total_records,12.908438
22,position,5.882824
11,egrul_reg_months_ago,5.519952
21,role,4.935011
24,employed_days,4.4483
1,days_since_last_take,3.194585
10,egrul_region,2.791918
9,egrul_business_type,2.645055
29,avg_days_between_takes,2.395068
