## VER 2

In [21]:
import os
import pandas as pd

import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.utils import resample

from functools import reduce
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
import catboost



In [2]:
# Основная директория
#base_path = '/Users/migdalia/goinfre/description/filtered_data'
base_path = '/Users/shogun/Desktop/ds_projects/masterskaya2/filtered_data'

# Файлы
file1 = 'apparel-messages.csv'
file2 = 'apparel-purchases.csv'
file3 = 'apparel-target_binary.csv'
file4 = 'full_campaign_daily_event_channel.csv'
file5 = 'full_campaign_daily_event.csv'

# Полные пути
path1 = os.path.join(base_path, file1)
path2 = os.path.join(base_path, file2)
path3 = os.path.join(base_path, file3)
path4 = os.path.join(base_path, file4)
path5 = os.path.join(base_path, file5)

In [58]:
# Загрузка с проверкой

import pandas as pd

if os.path.exists(path1):
    messages = pd.read_csv(path1)
    print('[OK1] Загружен apparel-messages.csv')
else:
    print(f'[ОШИБКА1] {file1} не найден')

    
if os.path.exists(path2):
    purchases = pd.read_csv(path2)
    print(f'[OK2] Загружен {file2}')
else:
    print(f'[ОШИБКА2] {file2} не найден')

if os.path.exists(path3):
    target = pd.read_csv(path3)
    print(f'[OK3] Загружен {file3}')
else:
    print(f'[ОШИБКА3] {file3} не найден')

    
if os.path.exists(path4):
    agg_event = pd.read_csv(path4)
    print(f'[OK4] Загружен {file4}')
else:
    print(f'[ОШИБКА4] {file4} не найден')
    
if os.path.exists(path5):
    agg_event_channel = pd.read_csv(path5)
    print(f'[OK5] Загружен {file5}')
else:
    print(f'[ОШИБКА5] {file5} не найден')

[OK1] Загружен apparel-messages.csv
[OK2] Загружен apparel-purchases.csv
[OK3] Загружен apparel-target_binary.csv
[OK4] Загружен full_campaign_daily_event_channel.csv
[OK5] Загружен full_campaign_daily_event.csv


### Создание фичей

In [59]:

df = purchases.merge(messages, on=["client_id", "message_id"], how="left", suffixes=('_purchase', '_message'))


In [60]:
df = df.merge(target, on="client_id", how="left")


In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862248 entries, 0 to 862247
Data columns (total 12 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   client_id         862248 non-null  int64  
 1   quantity          862248 non-null  int64  
 2   price             862248 non-null  float64
 3   category_ids      862248 non-null  object 
 4   date_purchase     862248 non-null  object 
 5   message_id        862248 non-null  object 
 6   bulk_campaign_id  805820 non-null  float64
 7   event             805820 non-null  object 
 8   channel           805820 non-null  object 
 9   date_message      805820 non-null  object 
 10  created_at        805820 non-null  object 
 11  target            862248 non-null  int64  
dtypes: float64(2), int64(3), object(7)
memory usage: 78.9+ MB


In [62]:
df['date'] = pd.to_datetime(df['date_message'])


In [63]:
agg_event['date'] = pd.to_datetime(agg_event['date'])
agg_event_channel['date'] = pd.to_datetime(agg_event_channel['date'])

df = df.merge(agg_event, on=["bulk_campaign_id", "date"], how="left", suffixes=('', '_agg'))
df = df.merge(agg_event_channel, on=["bulk_campaign_id", "date"], how="left", suffixes=('', '_channel'))


In [64]:
import ast
df['category_ids'] = df['category_ids'].apply(ast.literal_eval)


In [65]:
df = df.dropna(subset=['message_id'])


In [66]:
df.info()
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 862248 entries, 0 to 862247
Data columns (total 69 columns):
 #   Column                           Non-Null Count   Dtype         
---  ------                           --------------   -----         
 0   client_id                        862248 non-null  int64         
 1   quantity                         862248 non-null  int64         
 2   price                            862248 non-null  float64       
 3   category_ids                     862248 non-null  object        
 4   date_purchase                    862248 non-null  object        
 5   message_id                       862248 non-null  object        
 6   bulk_campaign_id                 805820 non-null  float64       
 7   event                            805820 non-null  object        
 8   channel                          805820 non-null  object        
 9   date_message                     805820 non-null  object        
 10  created_at                       805820 non-

client_id                  0
quantity                   0
price                      0
category_ids               0
date_purchase              0
                       ...  
nunique_unsubscribe    56492
count_hbq_spam         56492
nunique_hbq_spam       56492
count_close            56492
nunique_close          56492
Length: 69, dtype: int64

In [67]:

# Разворачиваем в DataFrame
cat_levels = pd.DataFrame(df['category_ids'].to_list(), index=df.index)

# Переименуем и добавим
df['cat_lvl1'] = cat_levels[0]
df['cat_lvl2'] = cat_levels[1]
df['cat_lvl3'] = cat_levels[2]



In [68]:
df = df.drop(columns=['category_ids'])

In [79]:
date_columns = ['date_purchase', 'date_message', 'created_at', 'date']
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')


In [80]:

X = df.drop(columns=['target'])
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [71]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


In [82]:
ros = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = ros.fit_resample(X_train, y_train)




In [81]:

print("До оверсэмплинга:", np.bincount(y_train))
print("После оверсэмплинга:", np.bincount(y_train_resampled))


До оверсэмплинга: [663145  26653]
После оверсэмплинга: [663145 663145]


In [77]:

print("До оверсэмплинга:", np.bincount(y_train))
print("После оверсэмплинга:", np.bincount(y_train_resampled))


До оверсэмплинга: [663145  26653]
После оверсэмплинга: [663145 663145]


In [42]:
#df.info()

In [83]:
categorical_features = [
    'message_id',
    'bulk_campaign_id',   # хотя float64, но по смыслу ID — категориальный
    'event',
    'channel',
    'cat_lvl1',
    'cat_lvl2',
    'cat_lvl3',
    #'client_id'         
]
for col in categorical_features:
    X_train_resampled[col] = X_train_resampled[col].astype(str)
    X_test[col] = X_test[col].astype(str)


In [84]:
datetime_cols = df.select_dtypes(include=['datetime64[ns]']).columns
print(df[datetime_cols].isnull().sum())


date_purchase        0
date_message     56428
created_at       56428
date             56428
dtype: int64


In [85]:
for col in ['date_purchase', 'date_message', 'created_at', 'date']:
    # Создаём признаки месяца и дня
    X_train_resampled[f'{col}_month'] = X_train_resampled[col].dt.month.fillna(0).astype(int)
    X_train_resampled[f'{col}_day'] = X_train_resampled[col].dt.day.fillna(0).astype(int)

    X_test[f'{col}_month'] = X_test[col].dt.month.fillna(0).astype(int)
    X_test[f'{col}_day'] = X_test[col].dt.day.fillna(0).astype(int)

# Удаляем исходные колонки с датами
X_train_resampled = X_train_resampled.drop(columns=['date_purchase', 'date_message', 'created_at', 'date'])
X_test = X_test.drop(columns=['date_purchase', 'date_message', 'created_at', 'date'])


In [86]:


cat_model = catboost.CatBoostClassifier(
    iterations=30,
    learning_rate=0.05,
    depth=6,
    verbose=100,
    random_seed=42,
    #eval_metric='F1',
    eval_metric='AUC',
    task_type='CPU'
)

cat_model.fit(
    X_train_resampled, y_train_resampled,
    cat_features=categorical_features,
    eval_set=(X_test, y_test),
    early_stopping_rounds=50
)

0:	test: 0.9985513	best: 0.9985513 (0)	total: 688ms	remaining: 20s
29:	test: 0.9999914	best: 0.9999915 (28)	total: 21.4s	remaining: 0us

bestTest = 0.9999915321
bestIteration = 28

Shrink model to first 29 iterations.


<catboost.core.CatBoostClassifier at 0x2f0b3ff10>

In [49]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 172450 entries, 652099 to 39805
Data columns (total 74 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   client_id                        172450 non-null  int64  
 1   quantity                         172450 non-null  int64  
 2   price                            172450 non-null  float64
 3   message_id                       172450 non-null  object 
 4   bulk_campaign_id                 172450 non-null  object 
 5   event                            172450 non-null  object 
 6   channel                          172450 non-null  object 
 7   count_click_email                161123 non-null  float64
 8   count_click_mobile_push          161123 non-null  float64
 9   count_open_email                 161123 non-null  float64
 10  count_open_mobile_push           161123 non-null  float64
 11  count_purchase_email             161123 non-null  float64
 12  cou

In [87]:
y_proba = cat_model.predict_proba(X_test)[:, 1]

roc_auc = roc_auc_score(y_test, y_proba)
print(f"ROC AUC: {roc_auc:.4f}")

ROC AUC: 1.0000


In [88]:
# Оставить только числовые столбцы
df_numeric = df.select_dtypes(include=['number'])

# Посчитать корреляцию по числовым столбцам
corr_matrix = df_numeric.corr()

print(corr_matrix)


                     client_id  quantity     price  bulk_campaign_id  \
client_id             1.000000 -0.001427  0.004296          0.277426   
quantity             -0.001427  1.000000 -0.003825         -0.016515   
price                 0.004296 -0.003825  1.000000         -0.079562   
bulk_campaign_id      0.277426 -0.016515 -0.079562          1.000000   
target               -0.020941 -0.002335 -0.035889          0.032843   
...                        ...       ...       ...               ...   
nunique_unsubscribe  -0.088932  0.008566  0.017276         -0.048594   
count_hbq_spam       -0.007870  0.006938 -0.016415          0.028867   
nunique_hbq_spam     -0.007866  0.006938 -0.016420          0.028874   
count_close                NaN       NaN       NaN               NaN   
nunique_close              NaN       NaN       NaN               NaN   

                       target  count_click_email  count_click_mobile_push  \
client_id           -0.020941          -0.161056          

In [96]:


# Считаем корреляционную матрицу (только числовые)
# Оставить только числовые столбцы
df_numeric = df_reduced.select_dtypes(include=['number'])


corr_matrix = df_numeric.corr().abs()

# Создаём маску, чтобы смотреть только верхний треугольник матрицы (без диагонали)
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Задаём порог сильной корреляции, например 0.9
threshold = 0.85

# Находим имена колонок, которые коррелируют больше threshold с другими
to_drop = [column for column in upper.columns if any(upper[column] > threshold)]

print("Признаки для удаления из-за высокой корреляции:", to_drop)


Признаки для удаления из-за высокой корреляции: ['nunique_click_email', 'nunique_click_mobile_push', 'nunique_open_email', 'nunique_purchase_mobile_push', 'nunique_soft_bounce_email', 'nunique_hard_bounce_mobile_push', 'nunique_send_mobile_push', 'nunique_hard_bounce_email', 'nunique_send_email', 'nunique_soft_bounce_mobile_push', 'nunique_click', 'nunique_hard_bounce', 'nunique_purchase', 'nunique_send']


In [97]:
# Удаляем из датафрейма
# df_reduced = df.drop(columns=to_drop)
df_reduced = df.drop(columns=list(to_drop)).copy()


In [134]:

X2 = df_reduced.drop(columns=['target', 'client_id', 'message_id', 'bulk_campaign_id'])
y2 = df_reduced['target']

X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [119]:
#df_reduced.info()

In [135]:
categorical_features2 = [
    #'message_id',
    #'bulk_campaign_id',   # хотя float64, но по смыслу ID — категориальный
    'event',
    'channel',
    'cat_lvl1',
    'cat_lvl2',
    'cat_lvl3',
    #'client_id'         
]
for col in categorical_features2:
    X_train2[col] = X_train2[col].astype(str)
    X_test2[col] = X_test2[col].astype(str)


In [136]:
datetime_cols = X_train2.select_dtypes(include=['datetime']).columns.tolist()
print(datetime_cols)


['date_purchase', 'date_message', 'created_at', 'date']


In [137]:
for col in ['date_purchase', 'date_message', 'created_at', 'date']:
    # Признаки: месяц и день
    X_train2[f'{col}_month'] = X_train2[col].dt.month.fillna(0).astype(int)
    X_train2[f'{col}_day'] = X_train2[col].dt.day.fillna(0).astype(int)

    X_test2[f'{col}_month'] = X_test2[col].dt.month.fillna(0).astype(int)
    X_test2[f'{col}_day'] = X_test2[col].dt.day.fillna(0).astype(int)

# Удаляем исходные datetime-колонки
X_train2 = X_train2.drop(columns=['date_purchase', 'date_message', 'created_at', 'date'])
X_test2 = X_test2.drop(columns=['date_purchase', 'date_message', 'created_at', 'date'])


In [138]:
for col in ['date_purchase', 'date_message', 'created_at', 'date']:
    if col in X_train2.columns:
        X_train2[f'{col}_month'] = X_train2[col].dt.month.fillna(0).astype(int)
        X_train2[f'{col}_day'] = X_train2[col].dt.day.fillna(0).astype(int)
        X_train2 = X_train2.drop(columns=[col])

    if col in X_test2.columns:
        X_test2[f'{col}_month'] = X_test2[col].dt.month.fillna(0).astype(int)
        X_test2[f'{col}_day'] = X_test2[col].dt.day.fillna(0).astype(int)
        X_test2 = X_test2.drop(columns=[col])


In [139]:
missing = [col for col in categorical_features2 if col not in X_train2.columns]
if missing:
    print("Нет в X_train2:", missing)


In [129]:
X_train2[categorical_features2].dtypes


message_id          object
bulk_campaign_id    object
event               object
channel             object
cat_lvl1            object
cat_lvl2            object
cat_lvl3            object
dtype: object

In [133]:
X_train_final.columns

Index(['quantity', 'price', 'message_id', 'bulk_campaign_id', 'event',
       'channel', 'count_click_email', 'count_click_mobile_push',
       'count_open_email', 'count_open_mobile_push', 'count_purchase_email',
       'count_purchase_mobile_push', 'count_soft_bounce_email',
       'count_subscribe_email', 'count_unsubscribe_email',
       'nunique_open_mobile_push', 'nunique_purchase_email',
       'nunique_subscribe_email', 'nunique_unsubscribe_email',
       'count_hard_bounce_mobile_push', 'count_send_mobile_push',
       'count_hard_bounce_email', 'count_hbq_spam_email', 'count_send_email',
       'nunique_hbq_spam_email', 'count_soft_bounce_mobile_push',
       'count_complain_email', 'nunique_complain_email',
       'count_close_mobile_push', 'nunique_close_mobile_push', 'count_click',
       'count_complain', 'count_hard_bounce', 'count_open', 'count_purchase',
       'count_send', 'count_soft_bounce', 'count_subscribe',
       'count_unsubscribe', 'nunique_complain', 'nuniqu

In [140]:
# Разбиваем X_train2 на обучающую и валидационную части
X_train_final, X_valid, y_train_final, y_valid = train_test_split(
    X_train2, y_train2,
    test_size=0.2,
    stratify=y_train2,
    random_state=42
)


In [143]:
cat_model2 = catboost.CatBoostClassifier(
    iterations=700,
    learning_rate=0.05,
    depth=6,
    verbose=100,
    random_seed=42,
    eval_metric='AUC',
    task_type='CPU',
    early_stopping_rounds=150
)

cat_model2.fit(
    X_train_final, y_train_final,
    cat_features=categorical_features2,
    eval_set=(X_valid, y_valid),
    use_best_model=True
)


0:	test: 0.5633029	best: 0.5633029 (0)	total: 511ms	remaining: 5m 57s
100:	test: 0.7650980	best: 0.7650980 (100)	total: 36.3s	remaining: 3m 35s
200:	test: 0.8209275	best: 0.8209275 (200)	total: 1m 16s	remaining: 3m 9s
300:	test: 0.8487355	best: 0.8487355 (300)	total: 1m 58s	remaining: 2m 37s
400:	test: 0.8649843	best: 0.8649843 (400)	total: 2m 46s	remaining: 2m 4s
500:	test: 0.8778992	best: 0.8778992 (500)	total: 3m 52s	remaining: 1m 32s
600:	test: 0.8861545	best: 0.8861545 (600)	total: 4m 44s	remaining: 46.8s
699:	test: 0.8943426	best: 0.8943426 (699)	total: 5m 38s	remaining: 0us

bestTest = 0.8943425702
bestIteration = 699



<catboost.core.CatBoostClassifier at 0x4ba43d460>

In [None]:
X_train_final

In [144]:

y_pred_proba = cat_model2.predict_proba(X_test2)[:, 1]
roc_auc_score(y_test2, y_pred_proba)


0.8902456281769814