In [None]:
%autosave 60

Autosaving every 60 seconds


# Оглавление
* [0. Установка зависимостей и импорт библиотек](#chapter_0)
* [1. Построение итоговой модели](#chapter_1)
    * [Intro (*public F1=0.954641*, *privat F1=0.946733*)](#chapter_1_1)
        * [1. Обработка данных](#chapter_1_1_1)
        * [2. Модель](#chapter_1_1_2)
        * [3. Использование особенностей при обучении модели](#chapter_1_1_3)
        * [3. Валидация алгоритма](#chapter_1_1_4)
    * [Outro](#chapter_1_2)

## 0. Установка зависимостей и импорт библиотек <a class="anchor" id="chapter_0"></a>

In [None]:
!pip install -U -q kaggle
!mkdir -p ~/.kaggle
!echo '{"username":"dotyushka666","key":"unk"}' > ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json

!kaggle competitions download -c teta-ml-1-2025
!unzip /kaggle/working/teta-ml-1-2025.zip

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.2/173.2 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hArchive:  /kaggle/working/teta-ml-1-2025.zip
  inflating: sample_submition.csv    
  inflating: test.csv                
  inflating: train.csv               


In [3]:
!pip install reverse_geocoder
!pip install usaddress-scourgify

Collecting reverse_geocoder
  Downloading reverse_geocoder-1.5.1.tar.gz (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m43.3 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: reverse_geocoder
  Building wheel for reverse_geocoder (setup.py) ... [?25l[?25hdone
  Created wheel for reverse_geocoder: filename=reverse_geocoder-1.5.1-py3-none-any.whl size=2268068 sha256=efb4980aaa45664150681fe547a5c7379be95f335c5268f6fddf12bdfd76eaec
  Stored in directory: /root/.cache/pip/wheels/bd/e5/88/eb139b6d6a26b8022d370ab991f7a836802fed9871975ec6d9
Successfully built reverse_geocoder
Installing collected packages: reverse_geocoder
Successfully installed reverse_geocoder-1.5.1
Collecting usaddress-scourgify
  Downloading usaddress_scourgify-0.6.0-py3-none-any.whl.metadata (910 bytes)
Collecting usaddress>=0.5.9 (from usaddress-scourgify)
  Downloading usaddress-0.5.13-p

In [4]:
import usaddress
import numpy as np
import pandas as pd
import reverse_geocoder as rg
from catboost import CatBoostClassifier
from geopy.distance import great_circle
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from functools import lru_cache

In [5]:
RANDOM_STATE = 64

In [6]:
df_fraud = pd.read_csv("train.csv")
df_fraud.head()

Unnamed: 0,transaction_time,merch,cat_id,amount,name_1,name_2,gender,street,one_city,us_state,post_code,lat,lon,population_city,jobs,merchant_lat,merchant_lon,target
0,2019-12-27 15:21,fraud_Cormier LLC,health_fitness,148.04,Daniel,Martinez,M,8510 Acevedo Burgs,Kent,OR,97033,45.0838,-120.6649,60,Museum education officer,45.042827,-120.709327,0
1,2019-04-17 23:09,"fraud_Brown, Homenick and Lesch",health_fitness,39.4,Grace,Williams,F,28812 Charles Mill Apt. 628,Plantersville,AL,36758,32.6176,-86.9475,1412,Drilling engineer,31.872266,-87.828247,0
2,2019-09-23 15:02,fraud_Ruecker-Mayert,kids_pets,52.96,Kyle,Park,M,7507 Larry Passage Suite 859,Mount Perry,OH,43760,39.8788,-82.188,1831,Barrister's clerk,40.010874,-81.841249,0
3,2019-05-13 16:00,"fraud_Mante, Luettgen and Hackett",health_fitness,7.66,Monique,Martin,F,68276 Matthew Springs,Ratcliff,TX,75858,31.3833,-95.0619,43,"Engineer, production",30.888406,-95.141609,0
4,2019-08-18 07:27,fraud_Luettgen PLC,gas_transport,51.59,Christine,Johnson,F,8011 Chapman Tunnel Apt. 568,Blairsden-Graeagle,CA,96103,39.8127,-120.6405,1725,Chartered legal executive (England and Wales),39.376017,-121.311691,0


## 1. Построение итоговой модели <a id="chapter_1"></a>
### Intro (*public F1=0.954641*, *privat F1=0.946733*) <a id="chapter_1_1"></a>
#### 1. Обработка данных <a id="chapter_1_1_1"></a>
- Логарифмирование числовых признаков (amount, population_city) для уменьшения skewness.
- Генерация временных фичей: разбивка времени транзакции на час, часть дня, день недели.
- Агрегации по пользователям:
    - Кумулятивные суммы и средние за 7/30 предыдущих транзакций по потраченным суммам.
    - Сдвиги для анализа предыдущих трат (до 4 транзакций назад).
    - Сдвига для анализа предыдущей категории.
- Агрегация по индефикатору продавца и категории товара:
    - Cредние и стандартные отклонения за 7/30 предыдущих транзакций по потраченным суммам.
- Гео-фичи:
    - Расстояние между клиентом и мерчантом.
    - Флаги нахождения в разных административных округах.
- Обработка категориальных признаков:
    - Замена пропусков в сдвигах на "unk" (для категорий) и -1 (для числовых).
    - Разбор структурированных полей (адреса).
#### 2. Модель <a id="chapter_1_1_2"></a>
В качетсве модели был взят градиентный бустинг в исполнении *catboost*, т.к. есть следующие преимущества:
- Автоматическая обработка категориальных фичей и Nan'ов.
- Встроенная поддержка текстовых фичей.

#### 3. Использование особенностей при обучении модели <a id="chapter_1_1_3"></a>
- *Стратифицированная выборка* по *target*
- *Ранняя остановка*, если метрика не улучшается.
- Мониторинг важности фичей через *get_feature_importance*.


#### 4. Валидация алгоритма <a id="chapter_1_1_4"></a>
- Стратифицированное разбиение 80/10/10:
    *Train* (80%) → *Validation* (10%) → *Test* (10%).
- Метрика: *F1-score* (оптимизирована как основная).


### Outro <a id="chapter_1_2"></a>
- Генерация поведенческих паттернов
    - Добавить фичи типа "количество транзакций в последние 24 часа". Может улучшить детекцию аномальных всплесков активности. Но требует точной настройки временных окон
- Более детальная проработка гео-фичей
    - Добавить данные о плотности мерчантов в n километров. Может помочь выявлять подозрительные "одиночные" транзакции. Но требуется интеграция внешних API геоданных.
- Тонкая настройка подбора гиперпараметров.
    - Перенастроить разбивку данных и ее предобработку. До этого пытался подбирать через optuna, но в итоге шло к переобучению под трейн и валидационную выборки.
- Учет важности фичей через *permutation importance*. 
    - Дописать алгоритм для извлечения значимых признаков, т.к. их много и видно, что много из них ничего не привносят в целом.

In [7]:
@lru_cache(maxsize=128)
def calculate_distance(row_client: tuple[float], merchant_coords: tuple[float]):
    return great_circle(row_client, merchant_coords).km


@lru_cache(maxsize=256)
def get_info_coord(coord: tuple[tuple]):
    try:
        location = rg.search(coord)
        return location
    except:
        return {}


@lru_cache(maxsize=256)
def parse_address(address: str):
    try:
        parsed = usaddress.tag(address)[0]
        return {
            "street_number": parsed.get("AddressNumber", -1),
            "street_name": parsed.get("StreetName", "unk"),
            "street_suffix": parsed.get("StreetNamePostType", "unk"),
            "apartment": int(parsed.get("OccupancyIdentifier", -1)),
            "occupancy_identifier": parsed.get("OccupancyType", "unk"),
        }
    except:
        return {
            "street_number": -1,
            "street_name": "unk",
            "street_suffix": "unk",
            "apartment": -1,
            "occupancy_identifier": "unk",
        }

In [None]:
def preprocessing_data_catboost_v2(df_fraud: pd.DataFrame):
    df_train = df_fraud.copy(deep=True)
    df_train = df_train.sort_values("transaction_time").reset_index(drop=True)

    # Логарифмирование числовых переменных
    df_train["population_city"] = np.log(df_train["population_city"] + 1)
    df_train["is_round_amount"] = df_train["amount"].apply(lambda x: not bool(x % 10))
    df_train["amount"] = np.log(df_train["amount"] + 1)

    # Кумулятивные суммы и количество трат по каждому пользователю 
    df_train["user_cumsum"] = df_train.groupby(
        by=["name_1", "name_2", "one_city", "us_state", "post_code"]
    )["amount"].cumsum()
    df_train["user_cumcount"] = (
        df_train.groupby(by=["name_1", "name_2", "one_city", "us_state", "post_code"])[
            "amount"
        ].cumcount()
        + 1
    )

    # По каждому пользователю оконные функции за предыдущих 7, 30 транзакций (среднее)
    df_train["user_mean_amount_7_prev"] = (
        df_train.groupby(by=["name_1", "name_2", "one_city", "us_state", "post_code"])[
            "amount"
        ]
        .rolling(7)
        .mean()
        .values
    )
    df_train["user_mean_amount_7_prev"] = df_train["user_mean_amount_7_prev"].fillna(-1)

    df_train["user_mean_amount_30_prev"] = (
        df_train.groupby(by=["name_1", "name_2", "one_city", "us_state", "post_code"])[
            "amount"
        ]
        .rolling(30)
        .mean()
        .values
    )
    df_train["user_mean_amount_30_prev"] = df_train["user_mean_amount_30_prev"].fillna(
        -1
    )

    # Средние траты пользователя по кумулятивным суммам
    df_train["user_avg_amount"] = df_train["user_cumsum"] / df_train["user_cumcount"]

    # По идентификатору продавца и идентификатор категории товара 
    # оконные функции за предыдущих 30 транзакций (среднее, среднеквадратичное отклонение)
    df_train["mean_amount_merch_30_prev"] = (
        df_train.groupby("merch")["amount"].rolling(30).mean().values
    )
    df_train["mean_amount_merch_30_prev"] = df_train[
        "mean_amount_merch_30_prev"
    ].fillna(-1)

    df_train["std_amount_merch_30_prev"] = (
        df_train.groupby("merch")["amount"].rolling(30).std().values
    )
    df_train["std_amount_merch_30_prev"] = df_train["std_amount_merch_30_prev"].fillna(
        -1
    )

    df_train["mean_amount_cat_30_prev"] = (
        df_train.groupby("cat_id")["amount"].rolling(30).mean().values
    )
    df_train["mean_amount_cat_30_prev"] = df_train["mean_amount_cat_30_prev"].fillna(-1)

    df_train["std_amount_cat_30_prev"] = (
        df_train.groupby("cat_id")["amount"].rolling(30).std().values
    )
    df_train["std_amount_cat_30_prev"] = df_train["std_amount_cat_30_prev"].fillna(-1)


    # Предыдущая траты клиента (до 4-ех предыдущих)
    df_train["prev_amount"] = df_train.groupby(
        by=["name_1", "name_2", "one_city", "us_state", "post_code"]
    )["amount"].shift(1)
    df_train["prev_amount"] = df_train["prev_amount"].fillna(-1)

    df_train["prev_two_amount"] = df_train.groupby(
        by=["name_1", "name_2", "one_city", "us_state", "post_code"]
    )["amount"].shift(2)
    df_train["prev_two_amount"] = df_train["prev_two_amount"].fillna(-1)

    df_train["prev_three_amount"] = df_train.groupby(
        by=["name_1", "name_2", "one_city", "us_state", "post_code"]
    )["amount"].shift(3)
    df_train["prev_three_amount"] = df_train["prev_three_amount"].fillna(-1)

    df_train["prev_four_amount"] = df_train.groupby(
            by=["name_1", "name_2", "one_city", "us_state", "post_code"]
        )["amount"].shift(4)
    df_train["prev_four_amount"] = df_train["prev_four_amount"].fillna(-1)

    # Предыдущая категория траты клиента
    df_train["prev_cat_id"] = df_train.groupby(
        by=["name_1", "name_2", "one_city", "us_state", "post_code"]
    )["cat_id"].shift(1)
    df_train["prev_cat_id"] = df_train["prev_cat_id"].fillna("unk")

    # Извлечение фичей из времени
    df_train["transaction_time"] = pd.to_datetime(df_train["transaction_time"])
    df_train["hour"] = df_train["transaction_time"].dt.hour
    df_train["part_of_day"] = pd.cut(
        df_train["hour"],
        bins=[-1, 6, 12, 18, 24],
        labels=["night", "morning", "afternoon", "evening"],
    )
    df_train["year"] = df_train["transaction_time"].dt.year
    df_train["month"] = df_train["transaction_time"].dt.month
    df_train["day_of_month"] = df_train["transaction_time"].dt.day
    df_train["day_of_week"] = df_train["transaction_time"].dt.dayofweek

    # Извлечение фичей из адреса
    df_train = df_train.join(
        df_train["street"].apply(lambda x: pd.Series(parse_address(x)))
    )

    df_train["job_title_length"] = df_train["jobs"].str.len()

    # Рассчитываем расстояние от клиента до продавца
    df_train["distance_km_in_table_info"] = df_train.apply(
        lambda x: calculate_distance(
            (x["lat"], x["lon"]), (x["merchant_lat"], x["merchant_lon"])
        ),
        axis=1,
    )

    # Геокодирование координат (широта/долгота) с добавлением административных регионов
    uniq_coord = tuple(df_train[["lat", "lon"]].itertuples(index=False, name=None))
    merchant_uniq_coord = tuple(
        df_train[["merchant_lat", "merchant_lon"]].itertuples(index=False, name=None)
    )
    dict_coords = get_info_coord(uniq_coord)
    merchant_dict_coords = get_info_coord(merchant_uniq_coord)
    coords = pd.DataFrame(dict_coords)
    coords = coords.rename(
        columns={
            "lat": "lat_coords",
            "lon": "lon_coords",
            "name": "name_coords",
            "admin1": "admin1_coords",
            "admin2": "admin2_coords",
            "cc": "cc_coords",
        }
    )
    coords = coords[["name_coords", "admin2_coords"]]

    merchant_coords = pd.DataFrame(merchant_dict_coords)
    merchant_coords = merchant_coords.rename(
        columns={
            "lat": "merchant_lat_coords",
            "lon": "merchant_lon_coords",
            "name": "merchant_name_coords",
            "admin1": "merchant_admin1_coords",
            "admin2": "merchant_admin2_coords",
            "cc": "merchant_cc_coords",
        }
    )
    merchant_coords = merchant_coords[
        ["merchant_name_coords", "merchant_admin2_coords"]
    ]

    df_train = pd.concat([df_train, coords], axis=1)
    df_train = pd.concat([df_train, merchant_coords], axis=1)

    df_train["is_out_of_county"] = (
        df_train["admin2_coords"] != df_train["merchant_admin2_coords"]
    ).astype(int)
    df_train["is_out_of_name"] = (
        df_train["name_coords"] != df_train["merchant_name_coords"]
    ).astype(int)


    # Удаляем лишнее
    df_train.drop(
        [
            "lat",
            "lon",
            "merchant_lat",
            "merchant_lon",
            "admin2_coords",
            "merchant_admin2_coords",
            "name_coords",
            "merchant_name_coords",
            "transaction_time",
            "street",
        ],
        axis=1,
        inplace=True,
    )

    return df_train

In [9]:
df_train = preprocessing_data_catboost_v2(df_fraud)

df_train.head()

Loading formatted geocoded file...


Unnamed: 0,merch,cat_id,amount,name_1,name_2,gender,one_city,us_state,post_code,population_city,...,day_of_week,street_number,street_name,street_suffix,apartment,occupancy_identifier,job_title_length,distance_km_in_table_info,is_out_of_county,is_out_of_name
0,"fraud_Heller, Gutmann and Zieme",grocery_pos,4.684259,Stephanie,Gill,F,Orient,WA,99160,5.010635,...,1,43039,Riley,Greens,393,Suite,33,30.212218,0,0
1,fraud_Keeling-Crist,misc_pos,3.760269,Tyler,Garcia,M,Doe Hill,VA,24433,4.60517,...,1,408,Bradley,Rest,-1,unk,30,77.556853,1,1
2,fraud_Rowe-Vandervort,grocery_net,3.818591,Kelsey,Richards,F,Holcomb,KS,67851,7.89804,...,1,889,Sarah,Station,624,Suite,15,118.119942,1,1
3,"fraud_Stroman, Hudson and Erdman",gas_transport,4.560487,Jennifer,Conner,F,Dublin,PA,18917,7.6774,...,1,4655,David,Island,-1,unk,17,85.922764,1,1
4,fraud_Herzog Ltd,misc_pos,1.66203,Heather,Chase,F,Manor,PA,15665,7.295056,...,1,6888,Hicks,Stream,954,Suite,25,25.270529,1,1


In [10]:
df_train.columns

Index(['merch', 'cat_id', 'amount', 'name_1', 'name_2', 'gender', 'one_city',
       'us_state', 'post_code', 'population_city', 'jobs', 'target',
       'is_round_amount', 'user_cumsum', 'user_cumcount',
       'user_mean_amount_7_prev', 'user_mean_amount_30_prev',
       'user_avg_amount', 'mean_amount_merch_30_prev',
       'std_amount_merch_30_prev', 'mean_amount_cat_30_prev',
       'std_amount_cat_30_prev', 'prev_amount', 'prev_two_amount',
       'prev_three_amount', 'prev_four_amount', 'prev_cat_id', 'hour',
       'part_of_day', 'year', 'month', 'day_of_month', 'day_of_week',
       'street_number', 'street_name', 'street_suffix', 'apartment',
       'occupancy_identifier', 'job_title_length', 'distance_km_in_table_info',
       'is_out_of_county', 'is_out_of_name'],
      dtype='object')

In [11]:
X = df_train.drop("target", axis=1)
y = df_train["target"]

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42, shuffle=True
)

X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42, shuffle=True
)

In [12]:
text_features = ["jobs"]

cat_features = [
    column
    for column in X_train.select_dtypes(include=["object", "category"]).columns.tolist()
    if column not in text_features
]

In [13]:
catbclf = CatBoostClassifier(
    iterations=1024,
    learning_rate=0.06,
    depth=10,
    eval_metric="TotalF1:average=Macro",
    early_stopping_rounds=50,
    verbose=30,
    cat_features=cat_features,
    text_features=text_features,
    subsample=0.8,
    random_state=RANDOM_STATE,
    task_type="CPU",
)

In [14]:
catbclf.fit(X_train, y_train, eval_set=(X_valid, y_valid), use_best_model=True)

0:	learn: 0.4985642	test: 0.4985654	best: 0.4985654 (0)	total: 1.73s	remaining: 29m 25s
30:	learn: 0.9350436	test: 0.9351516	best: 0.9351516 (29)	total: 2m 4s	remaining: 1h 6m 25s
60:	learn: 0.9601588	test: 0.9541936	best: 0.9541936 (60)	total: 4m 5s	remaining: 1h 4m 41s
90:	learn: 0.9670775	test: 0.9589511	best: 0.9589511 (85)	total: 6m	remaining: 1h 1m 33s
120:	learn: 0.9729928	test: 0.9627834	best: 0.9627834 (117)	total: 7m 50s	remaining: 58m 34s
150:	learn: 0.9753651	test: 0.9646794	best: 0.9646794 (148)	total: 9m 37s	remaining: 55m 38s
180:	learn: 0.9783864	test: 0.9646794	best: 0.9659361 (161)	total: 11m 27s	remaining: 53m 21s
210:	learn: 0.9804037	test: 0.9671107	best: 0.9671107 (197)	total: 13m 6s	remaining: 50m 31s
240:	learn: 0.9821206	test: 0.9671107	best: 0.9671107 (197)	total: 14m 49s	remaining: 48m 9s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9671107258
bestIteration = 197

Shrink model to first 198 iterations.


<catboost.core.CatBoostClassifier at 0x7fde27ac8f70>

In [None]:
y_pred = catbclf.predict(X_valid)
y_proba = catbclf.predict_proba(X_valid)[:, 1]

print("Validation".center(50, "_"))
print(classification_report(y_valid, y_pred))
print(f"ROC-AUC: {roc_auc_score(y_valid, y_proba)}")

____________________Validation____________________
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     78193
           1       0.99      0.89      0.93       450

    accuracy                           1.00     78643
   macro avg       0.99      0.94      0.97     78643
weighted avg       1.00      1.00      1.00     78643

Valid ROC-AUC: 0.9977864457886966


In [20]:
y_pred = catbclf.predict(X_valid)
y_proba = catbclf.predict_proba(X_valid)[:, 1]

print("Test".center(50, "_"))
print(classification_report(y_valid, y_pred))
print(f"ROC-AUC: {roc_auc_score(y_valid, y_proba)}")

_______________________Test_______________________
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     78193
           1       0.99      0.89      0.93       450

    accuracy                           1.00     78643
   macro avg       0.99      0.94      0.97     78643
weighted avg       1.00      1.00      1.00     78643

ROC-AUC: 0.9977864457886966


In [21]:
importance_features_ctb = pd.DataFrame(
    data={
        "features": X_train.columns.to_list(),
        "score": catbclf.get_feature_importance(),
    }
)

importance_features_ctb.sort_values(by="score", ascending=False)

Unnamed: 0,features,score
2,amount,26.074788
26,hour,21.499586
1,cat_id,11.68857
21,prev_amount,7.646029
22,prev_two_amount,5.382224
16,user_avg_amount,4.310259
32,street_number,2.636487
27,part_of_day,2.363208
9,population_city,2.285002
23,prev_three_amount,1.982182


In [22]:
catbclf = CatBoostClassifier(**catbclf.get_params())
catbclf.set_params(iterations=197)

catbclf.fit(X, y)

0:	learn: 0.4992298	total: 2.83s	remaining: 9m 14s
30:	learn: 0.9348557	total: 1m 30s	remaining: 8m 6s
60:	learn: 0.9565841	total: 2m 59s	remaining: 6m 39s
90:	learn: 0.9665246	total: 4m 27s	remaining: 5m 11s
120:	learn: 0.9691340	total: 5m 54s	remaining: 3m 42s
150:	learn: 0.9719266	total: 7m 21s	remaining: 2m 14s
180:	learn: 0.9734471	total: 8m 48s	remaining: 46.7s
196:	learn: 0.9745446	total: 9m 34s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7fde8a373d30>

In [23]:
df_fraud_test = pd.read_csv("test.csv")

df_fraud_train = df_fraud.copy(deep=True).reset_index()
df_fraud_test = df_fraud_test.reset_index()

df_fraud_train["test_columns"] = 0
df_fraud_test["test_columns"] = 1

df_full = pd.concat(
    [df_fraud_train.drop(columns=["target"]), df_fraud_test], axis=0, ignore_index=True
)

df_test = preprocessing_data_catboost_v2(df_full)

df_test = (
    df_test[df_test["test_columns"] == 1]
    .sort_values(by="index")
    .drop(columns=["test_columns", "index"])
)

assert df_test.shape[1] == df_train.shape[1] - 1

In [24]:
y_proba_test = catbclf.predict_proba(df_test)[:, 1]

In [25]:
df_predict_test = pd.DataFrame(y_proba_test, columns=["prediction"]).reset_index()
df_predict_test["prediction"] = df_predict_test["prediction"].apply(
    lambda x: 1 if x >= 0.5 else 0
)

In [26]:
df_predict_test.to_csv(
    "catboost_additional_features_and_timeseries_features_v3.csv", index=None
)