# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.preprocessing import MinMaxScaler
import torch.optim as optim
from tqdm.notebook import tqdm
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import SMOTE
from collections import Counter

# Read the data and classes balance

In [2]:
train_session = pd.read_csv('train_sessions.csv')
train_session.head()

Unnamed: 0,session_id,site1,time1,site2,time2,site3,time3,site4,time4,site5,...,time6,site7,time7,site8,time8,site9,time9,site10,time10,target
0,1,718,2014-02-20 10:02:45,,,,,,,,...,,,,,,,,,,0
1,2,890,2014-02-22 11:19:50,941.0,2014-02-22 11:19:50,3847.0,2014-02-22 11:19:51,941.0,2014-02-22 11:19:51,942.0,...,2014-02-22 11:19:51,3847.0,2014-02-22 11:19:52,3846.0,2014-02-22 11:19:52,1516.0,2014-02-22 11:20:15,1518.0,2014-02-22 11:20:16,0
2,3,14769,2013-12-16 16:40:17,39.0,2013-12-16 16:40:18,14768.0,2013-12-16 16:40:19,14769.0,2013-12-16 16:40:19,37.0,...,2013-12-16 16:40:19,14768.0,2013-12-16 16:40:20,14768.0,2013-12-16 16:40:21,14768.0,2013-12-16 16:40:22,14768.0,2013-12-16 16:40:24,0
3,4,782,2014-03-28 10:52:12,782.0,2014-03-28 10:52:42,782.0,2014-03-28 10:53:12,782.0,2014-03-28 10:53:42,782.0,...,2014-03-28 10:54:42,782.0,2014-03-28 10:55:12,782.0,2014-03-28 10:55:42,782.0,2014-03-28 10:56:12,782.0,2014-03-28 10:56:42,0
4,5,22,2014-02-28 10:53:05,177.0,2014-02-28 10:55:22,175.0,2014-02-28 10:55:22,178.0,2014-02-28 10:55:23,177.0,...,2014-02-28 10:55:59,175.0,2014-02-28 10:55:59,177.0,2014-02-28 10:55:59,177.0,2014-02-28 10:57:06,178.0,2014-02-28 10:57:11,0


In [3]:
train_session.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 253561 entries, 0 to 253560
Data columns (total 22 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   session_id  253561 non-null  int64  
 1   site1       253561 non-null  int64  
 2   time1       253561 non-null  object 
 3   site2       250098 non-null  float64
 4   time2       250098 non-null  object 
 5   site3       246919 non-null  float64
 6   time3       246919 non-null  object 
 7   site4       244321 non-null  float64
 8   time4       244321 non-null  object 
 9   site5       241829 non-null  float64
 10  time5       241829 non-null  object 
 11  site6       239495 non-null  float64
 12  time6       239495 non-null  object 
 13  site7       237297 non-null  float64
 14  time7       237297 non-null  object 
 15  site8       235224 non-null  float64
 16  time8       235224 non-null  object 
 17  site9       233084 non-null  float64
 18  time9       233084 non-null  object 
 19  si

Очень много пропусков, потому что site_i может и не быть, например, как в первой строке только 1 сайт был посещен, добавим в качестве признака количество посещенных сайтов

Также у нас много временных фичей, все их разобъём на год, день, месяц, день недели и час, минута, секунда

Проверим сбалансированность (спойлер нет) таргета

In [4]:
len(train_session.loc[(train_session['target'] == 1)]) / len(train_session.loc[(train_session['target'] == 0)])

0.009141779164544065

:))))))))))

# Data Preporation

In [4]:
time_features = ['time1', 'time2', 'time3', 'time4', 'time5', 
                 'time6', 'time7', 'time8', 'time9', 'time10']

for feature in time_features:
    train_session[feature] = pd.to_datetime(train_session[feature])


def calculate_time_differences(row):
    time_differences = []
    for i in range(1, len(time_features)):
        t_prev = row[f"time{i}"]
        t_next = row[f"time{i+1}"] if i + 1 <= len(time_features) - 1 else None
        if pd.notna(t_prev) and pd.notna(t_next):
            time_differences.append((t_next - t_prev).total_seconds())
        else:
            time_differences.append(None)
    return pd.Series(time_differences, index=[f"time_diff{i+1}" for i in range(len(time_features) - 1)])


for feature in time_features:
    train_session[f'year_{feature}'] = train_session[feature].dt.year
    train_session[f'month_{feature}'] = train_session[feature].dt.month
    train_session[f'day_{feature}'] = train_session[feature].dt.day
    train_session[f'weekday_{feature}'] = train_session[feature].dt.weekday
    train_session[f'hour_{feature}'] = train_session[feature].dt.hour
    train_session[f'minute_{feature}'] = train_session[feature].dt.minute
    train_session[f'second_{feature}'] = train_session[feature].dt.second

time_differences_df = train_session.apply(calculate_time_differences, axis=1)
train_session = pd.concat([train_session, time_differences_df], axis=1)


train_session = train_session.drop(time_features, axis=1)

In [5]:
train_session

Unnamed: 0,session_id,site1,site2,site3,site4,site5,site6,site7,site8,site9,...,second_time10,time_diff1,time_diff2,time_diff3,time_diff4,time_diff5,time_diff6,time_diff7,time_diff8,time_diff9
0,1,718,,,,,,,,,...,,,,,,,,,,
1,2,890,941.0,3847.0,941.0,942.0,3846.0,3847.0,3846.0,1516.0,...,16.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,23.0,
2,3,14769,39.0,14768.0,14769.0,37.0,39.0,14768.0,14768.0,14768.0,...,24.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,
3,4,782,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,...,42.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,30.0,
4,5,22,177.0,175.0,178.0,177.0,178.0,175.0,177.0,177.0,...,11.0,137.0,0.0,1.0,0.0,36.0,0.0,0.0,67.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253556,253557,3474,3474.0,141.0,2428.0,106.0,2428.0,2428.0,2428.0,2428.0,...,53.0,4.0,5.0,1.0,9.0,3.0,12.0,12.0,12.0,
253557,253558,12727,12727.0,2215.0,38.0,2215.0,23.0,25444.0,2215.0,23.0,...,18.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,
253558,253559,2661,15004.0,5562.0,5562.0,5562.0,,,,,...,,7.0,0.0,79.0,1.0,,,,,
253559,253560,812,676.0,814.0,22.0,39.0,812.0,814.0,570.0,22.0,...,24.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,


In [6]:
train_session.columns

Index(['session_id', 'site1', 'site2', 'site3', 'site4', 'site5', 'site6',
       'site7', 'site8', 'site9', 'site10', 'target', 'year_time1',
       'month_time1', 'day_time1', 'weekday_time1', 'hour_time1',
       'minute_time1', 'second_time1', 'year_time2', 'month_time2',
       'day_time2', 'weekday_time2', 'hour_time2', 'minute_time2',
       'second_time2', 'year_time3', 'month_time3', 'day_time3',
       'weekday_time3', 'hour_time3', 'minute_time3', 'second_time3',
       'year_time4', 'month_time4', 'day_time4', 'weekday_time4', 'hour_time4',
       'minute_time4', 'second_time4', 'year_time5', 'month_time5',
       'day_time5', 'weekday_time5', 'hour_time5', 'minute_time5',
       'second_time5', 'year_time6', 'month_time6', 'day_time6',
       'weekday_time6', 'hour_time6', 'minute_time6', 'second_time6',
       'year_time7', 'month_time7', 'day_time7', 'weekday_time7', 'hour_time7',
       'minute_time7', 'second_time7', 'year_time8', 'month_time8',
       'day_time8', 'we

In [7]:
features = ['year_time1', 'month_time1', 'day_time1', 'weekday_time1', 'hour_time1',
       'minute_time1', 'second_time1', 'year_time2', 'month_time2',
       'day_time2', 'weekday_time2', 'hour_time2', 'minute_time2',
       'second_time2', 'year_time3', 'month_time3', 'day_time3',
       'weekday_time3', 'hour_time3', 'minute_time3', 'second_time3',
       'year_time4', 'month_time4', 'day_time4', 'weekday_time4', 'hour_time4',
       'minute_time4', 'second_time4', 'year_time5', 'month_time5',
       'day_time5', 'weekday_time5', 'hour_time5', 'minute_time5',
       'second_time5', 'year_time6', 'month_time6', 'day_time6',
       'weekday_time6', 'hour_time6', 'minute_time6', 'second_time6',
       'year_time7', 'month_time7', 'day_time7', 'weekday_time7', 'hour_time7',
       'minute_time7', 'second_time7', 'year_time8', 'month_time8',
       'day_time8', 'weekday_time8', 'hour_time8', 'minute_time8',
       'second_time8', 'year_time9', 'month_time9', 'day_time9',
       'weekday_time9', 'hour_time9', 'minute_time9', 'second_time9',
       'year_time10', 'month_time10', 'day_time10', 'weekday_time10',
       'hour_time10', 'minute_time10', 'second_time10']


train_session = pd.get_dummies(train_session, columns=features, dtype=int)

In [8]:
train_session

Unnamed: 0,session_id,site1,site2,site3,site4,site5,site6,site7,site8,site9,...,second_time10_50.0,second_time10_51.0,second_time10_52.0,second_time10_53.0,second_time10_54.0,second_time10_55.0,second_time10_56.0,second_time10_57.0,second_time10_58.0,second_time10_59.0
0,1,718,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
1,2,890,941.0,3847.0,941.0,942.0,3846.0,3847.0,3846.0,1516.0,...,0,0,0,0,0,0,0,0,0,0
2,3,14769,39.0,14768.0,14769.0,37.0,39.0,14768.0,14768.0,14768.0,...,0,0,0,0,0,0,0,0,0,0
3,4,782,782.0,782.0,782.0,782.0,782.0,782.0,782.0,782.0,...,0,0,0,0,0,0,0,0,0,0
4,5,22,177.0,175.0,178.0,177.0,178.0,175.0,177.0,177.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253556,253557,3474,3474.0,141.0,2428.0,106.0,2428.0,2428.0,2428.0,2428.0,...,0,0,0,1,0,0,0,0,0,0
253557,253558,12727,12727.0,2215.0,38.0,2215.0,23.0,25444.0,2215.0,23.0,...,0,0,0,0,0,0,0,0,0,0
253558,253559,2661,15004.0,5562.0,5562.0,5562.0,,,,,...,0,0,0,0,0,0,0,0,0,0
253559,253560,812,676.0,814.0,22.0,39.0,812.0,814.0,570.0,22.0,...,0,0,0,0,0,0,0,0,0,0


# Data exploration

In [9]:
train_anomaly = train_session.loc[(train_session['target'] == 1)]

In [10]:
train_not_anomaly = train_session.loc[(train_session['target'] == 0)]

for feature in train_session.columns:
    plt.figure(figsize=(16, 8))
    sns.histplot(train_anomaly[feature], kde=True, color='red')
    sns.histplot(train_not_anomaly[feature][:3000], kde=True, color='blue')

In [11]:
train_not_anomaly = train_not_anomaly.drop('session_id', axis=1)

In [12]:
X = train_not_anomaly.drop('target', axis=1)
y = train_not_anomaly['target']

In [13]:
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

  return xp.asarray(numpy.nanmin(X, axis=axis))
  return xp.asarray(numpy.nanmax(X, axis=axis))


На этом этапе я понимаю, что в данных есть просто огромное число пропущенных значений и надо учитывать это и подавать в нейронку инфу о том что было пропущено

# Танцуем с пропусками

In [14]:
data_tensor = torch.tensor(X, dtype=torch.float32)
y = torch.tensor(y, dtype=torch.float32)

In [15]:
mask = ~torch.isnan(data_tensor)
mask = mask.float()

In [16]:
X = torch.nan_to_num(data_tensor, nan=0.0)

In [17]:
X_train = torch.cat((X, mask), dim=1)

In [18]:
train_dataset = TensorDataset(X_train, y)

In [19]:
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)

In [20]:
X_train.shape

torch.Size([251264, 3682])

# НейронО4кА

In [21]:
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        #dropout mb leaky relu, 
        self.encoder = nn.Sequential(
            nn.LazyLinear(1800),
            nn.ReLU(),
            nn.LazyLinear(900),
            nn.ReLU(),
            nn.LazyLinear(100),
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.LazyLinear(900),
            nn.ReLU(),
            nn.LazyLinear(1800),
            nn.ReLU(),
            nn.LazyLinear(3682)
        )
    def forward(self, x):
        out = self.encoder(x)
        out = self.decoder(out)
        return out
    

In [22]:
model = Encoder()

In [23]:
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [24]:
epochs = 15

for epoch in tqdm(range(epochs)):
    model.train()
    running_loss = 0.0

    for data, labels in train_loader:
        optimizer.zero_grad()

        outputs = model(data)
        loss = criterion(outputs, data)

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    
    print(f'epoch {epoch + 1} / {epochs}, Loss: {running_loss / len(train_loader)}')

  0%|          | 0/15 [00:00<?, ?it/s]

epoch 1 / 15, Loss: 0.026398317247264984
epoch 2 / 15, Loss: 0.015183754360147746
epoch 3 / 15, Loss: 0.012596791938265933
epoch 4 / 15, Loss: 0.009201497702561547
epoch 5 / 15, Loss: 0.007367164499739037
epoch 6 / 15, Loss: 0.0063091632426392034
epoch 7 / 15, Loss: 0.006115125063541222
epoch 8 / 15, Loss: 0.005971361858326403
epoch 9 / 15, Loss: 0.005668982372158292
epoch 10 / 15, Loss: 0.005623671331475138
epoch 11 / 15, Loss: 0.00554468728651249
epoch 12 / 15, Loss: 0.005443244867117851
epoch 13 / 15, Loss: 0.005363508007183762
epoch 14 / 15, Loss: 0.0053557929110593806
epoch 15 / 15, Loss: 0.005075307774439059


In [25]:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict()
}, 'model.pth')

In [26]:
model.eval()

Encoder(
  (encoder): Sequential(
    (0): Linear(in_features=3682, out_features=1800, bias=True)
    (1): ReLU()
    (2): Linear(in_features=1800, out_features=900, bias=True)
    (3): ReLU()
    (4): Linear(in_features=900, out_features=100, bias=True)
    (5): ReLU()
  )
  (decoder): Sequential(
    (0): Linear(in_features=100, out_features=900, bias=True)
    (1): ReLU()
    (2): Linear(in_features=900, out_features=1800, bias=True)
    (3): ReLU()
    (4): Linear(in_features=1800, out_features=3682, bias=True)
  )
)

# Analyizing losses

In [27]:
def get_ire(model, dataloader):
    criterion = nn.MSELoss(reduction='none')
    losses = []
    with torch.no_grad():

        for X_batch, _ in dataloader:
            output = model(X_batch)
            loss = criterion(X_batch, output)

            losses.extend(loss.mean(dim=1).tolist())

    return losses

In [28]:
losses = get_ire(model, train_loader)

In [30]:
max(losses)

0.012601460330188274

In [33]:
type(losses[90])

float

# Собираем новый датасет

In [31]:
y = train_session['target']
X = train_session.drop(['target', 'session_id'], axis=1)

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1488, train_size=0.8)

In [82]:
def inference(X, y, model):
    X = scaler.transform(X)
    y = torch.tensor(y, dtype=torch.float32)

    # делаем маску для данных
    data_tensor = torch.tensor(X, dtype=torch.float32)
    mask = ~torch.isnan(data_tensor)
    mask = mask.float()
    X = torch.nan_to_num(data_tensor, nan=0.0)

    # собираем train с маской
    X_train = torch.cat((X, mask), dim=1)
    train_dataset = TensorDataset(X_train, y)
    train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True)
    
    criterion = nn.MSELoss(reduction='none')

    model.eval()
    pred = []
    loss_data = []
    with torch.no_grad():
        for data, _ in train_loader:
            pred.append(model.encoder(data))
            output = model(data)
            loss = criterion(data, output)
            loss_data.extend(loss.mean(dim=1).tolist())

    
    pred = torch.cat(pred, dim=0).numpy()
    loss_frame = pd.DataFrame(data={'loss': loss_data})

    data_orig = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
    data_pred = pd.DataFrame(pred, columns=[f'reconstructed_{i}' for i in range(pred.shape[1])])
    data = pd.concat([data_orig, data_pred], axis=1)
    data = pd.concat([data, loss_frame], axis=1)

    return data

In [38]:
train_X = inference(X_train, y_train, model)

In [39]:
train_X

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,reconstructed_91,reconstructed_92,reconstructed_93,reconstructed_94,reconstructed_95,reconstructed_96,reconstructed_97,reconstructed_98,reconstructed_99,loss
0,0.001899,0.001899,0.001803,0.000505,0.001827,0.001803,0.001923,0.031394,0.001899,0.001851,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004581
1,0.511514,0.000697,0.511539,0.001226,0.001082,0.001250,0.001082,0.511514,0.015313,0.000865,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004718
2,0.001899,0.021154,0.001803,0.008029,0.067623,0.067646,0.008029,0.008029,0.033101,0.067644,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004770
3,0.019495,0.019496,0.019496,0.018583,0.019496,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004863
4,0.000481,0.000481,0.326915,0.000505,0.000481,0.326907,0.326907,0.023077,0.023077,0.326899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
202843,0.108726,0.009375,0.108635,0.000529,0.108827,0.108873,0.000481,0.108702,0.108750,0.841490,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003895
202844,0.376130,0.033655,0.536444,0.067984,0.376148,0.067982,0.372269,0.067981,0.372260,0.033654,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004781
202845,0.084014,0.009568,0.009568,0.011347,0.011347,0.018149,0.272434,0.018149,0.272428,0.018774,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004953
202846,0.092788,0.068968,0.000673,0.000769,0.000817,0.056275,0.056227,0.071587,0.000673,0.053341,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004679


# Data sampling

In [41]:
smote = SMOTE(random_state=42, sampling_strategy={1: 5_000})
X_smote, y_smote = smote.fit_resample(train_X, y_train)

In [42]:
X_smote

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,reconstructed_91,reconstructed_92,reconstructed_93,reconstructed_94,reconstructed_95,reconstructed_96,reconstructed_97,reconstructed_98,reconstructed_99,loss
0,0.001899,0.001899,0.001803,0.000505,0.001827,0.001803,0.001923,0.031394,0.001899,0.001851,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004581
1,0.511514,0.000697,0.511539,0.001226,0.001082,0.001250,0.001082,0.511514,0.015313,0.000865,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004718
2,0.001899,0.021154,0.001803,0.008029,0.067623,0.067646,0.008029,0.008029,0.033101,0.067644,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004770
3,0.019495,0.019496,0.019496,0.018583,0.019496,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004863
4,0.000481,0.000481,0.326915,0.000505,0.000481,0.326907,0.326907,0.023077,0.023077,0.326899,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004982
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205989,0.008649,0.016171,0.016936,0.016970,0.016947,0.016958,0.016958,0.009346,0.016238,0.016946,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004760
205990,0.068674,0.030350,0.277844,0.115355,0.084104,0.067099,0.332956,0.179576,0.286351,0.001693,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005099
205991,0.001803,0.002428,0.001803,0.001923,0.001899,0.001921,0.002309,0.025385,0.001827,0.001805,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004560
205992,0.022580,0.042774,0.043600,0.043616,0.085533,0.085555,0.043591,0.043606,0.043582,0.043606,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005093


# Котяшим

In [43]:
class_counts = Counter(y_smote)
total = sum(class_counts.values())
class_weights = {cls: total / count for cls, count in class_counts.items()}

class_weights_list = [class_weights[cls] for cls in sorted(class_weights.keys())]

In [44]:
cat = CatBoostClassifier(iterations=6_500, learning_rate=0.01, depth=6, verbose=100, random_state=69, class_weights=class_weights_list)

In [45]:
cat.fit(X_smote, y_smote)

0:	learn: 0.6834656	total: 133ms	remaining: 14m 25s
100:	learn: 0.3086108	total: 7.68s	remaining: 8m 6s
200:	learn: 0.2223669	total: 15.6s	remaining: 8m 9s
300:	learn: 0.1734775	total: 23.5s	remaining: 8m 3s
400:	learn: 0.1492673	total: 30.9s	remaining: 7m 50s
500:	learn: 0.1311349	total: 38.3s	remaining: 7m 38s
600:	learn: 0.1180017	total: 45.9s	remaining: 7m 30s
700:	learn: 0.1082932	total: 53.3s	remaining: 7m 21s
800:	learn: 0.1008067	total: 1m	remaining: 7m 13s
900:	learn: 0.0953443	total: 1m 8s	remaining: 7m 5s
1000:	learn: 0.0906314	total: 1m 16s	remaining: 6m 57s
1100:	learn: 0.0862307	total: 1m 23s	remaining: 6m 50s
1200:	learn: 0.0818948	total: 1m 31s	remaining: 6m 43s
1300:	learn: 0.0774121	total: 1m 39s	remaining: 6m 37s
1400:	learn: 0.0731772	total: 1m 47s	remaining: 6m 31s
1500:	learn: 0.0696278	total: 1m 55s	remaining: 6m 24s
1600:	learn: 0.0666001	total: 2m 3s	remaining: 6m 17s
1700:	learn: 0.0638751	total: 2m 11s	remaining: 6m 10s
1800:	learn: 0.0612079	total: 2m 19s	re

<catboost.core.CatBoostClassifier at 0x1790ba540>

In [46]:
test_X = inference(X_test, y_test.values, model)

In [47]:
test_X

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,reconstructed_91,reconstructed_92,reconstructed_93,reconstructed_94,reconstructed_95,reconstructed_96,reconstructed_97,reconstructed_98,reconstructed_99,loss
0,0.134784,0.134859,0.134886,0.134934,0.134790,0.134907,0.000505,0.000913,0.022885,0.022885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004485
1,0.006178,0.000865,0.124453,0.070869,0.123299,0.704440,0.123272,0.000865,0.070865,0.000505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005043
2,0.735072,0.004327,0.004351,0.735107,0.004327,0.737830,0.737902,0.015313,0.000889,0.737885,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006722
3,0.018678,0.018678,0.018679,0.018679,0.018679,0.018678,0.018678,0.018678,0.018678,0.018678,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006159
4,0.001226,0.001779,0.001827,0.001803,0.001803,0.001899,0.021106,0.001899,0.021106,0.001779,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004346
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50708,0.502620,0.502632,0.502644,0.018727,0.018727,0.018726,0.018678,0.018678,0.018726,0.018678,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004778
50709,0.002356,0.002428,0.002596,0.002524,0.002428,0.604245,0.002452,0.002500,0.000913,0.000505,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004819
50710,0.023029,0.022765,0.023030,0.022766,0.022766,0.023029,0.023029,0.023029,0.022764,0.001298,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004980
50711,0.390938,0.000481,0.390980,0.390980,0.390980,0.005481,0.000481,0.033918,0.000481,0.033894,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004200


# Cat results

In [48]:
predictions = cat.predict(test_X)

In [49]:
roc_auc_score(y_test, predictions)

0.9612411712643373

In [50]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99     50270
           1       0.41      0.93      0.57       443

    accuracy                           0.99     50713
   macro avg       0.70      0.96      0.78     50713
weighted avg       0.99      0.99      0.99     50713



In [51]:
cat.save_model('cat_model.cbm')

# Cat results with threshold

In [75]:
probabilities = cat.predict_proba(test_X)

class_probabilities = probabilities[:, 1]

threshold = 0.125

predictions = (class_probabilities >= threshold).astype(int)

print(roc_auc_score(y_test, predictions))
print(classification_report(y_test, predictions))

0.9796058395274995
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     50270
           1       0.20      0.99      0.34       443

    accuracy                           0.97     50713
   macro avg       0.60      0.98      0.66     50713
weighted avg       0.99      0.97      0.98     50713



Очень много ложных срабатываний, но метрика хорошая, хороший recall

# Trying Isolation Forest for detecting anomalies

In [76]:
iso_forest = IsolationForest(n_estimators=1_000, contamination=0.005, random_state=42)

In [77]:
iso_forest.fit(test_X)

In [78]:
predictions = iso_forest.predict(test_X)

In [79]:
predictions = (predictions == -1).astype(int)

In [80]:
roc_auc_score(y_test, predictions)

0.49861225679300175

In [81]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99     50270
           1       0.00      0.00      0.00       443

    accuracy                           0.99     50713
   macro avg       0.50      0.50      0.50     50713
weighted avg       0.98      0.99      0.98     50713



говно.

Латентное пространство: 500, catboost (5k estimators, lr=0.01, depth=6, class_weights) roc_auc - 0.95

Латентное пространство: 100 + loss, catboost(6.5k estimators, lr=0.01, depth=6, class_weights) roc_auc - 0.96

Латентное пространство: 100 + loss, catboost(6.5k estimators, lr=0.01, depth=6, class_weights) + threshold 0.125 roc_auc - 0.979