In [None]:
import numpy as np
import pandas as pd
from geopy.distance import great_circle
from catboost import CatBoostClassifier

from sklearn.model_selection import train_test_split

In [None]:
# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv('teta-ml-1-2025/train.csv')

In [None]:
# Preview the first rows of the freshly loaded training data.
df.head()

In [None]:
# Impute missing values column by column:
#   * object (string) columns -> most frequent value
#   * numeric columns         -> median
# Columns of any other dtype are left as they are.
for col in df.columns:
    column = df[col]
    if not column.isna().any():
        # Nothing to fill; this also avoids computing a mode/median on a
        # zero-row frame.
        continue
    if column.dtype == 'object':
        modes = column.mode()
        # mode() ignores NaN, so an all-NaN column yields an empty result and
        # indexing it would raise; leave such a column untouched instead.
        if not modes.empty:
            df[col] = column.fillna(modes.iloc[0])
    elif np.issubdtype(column.dtype, np.number):
        df[col] = column.fillna(column.median())

In [None]:
def add_distance_features(df):
    """Replace the customer/merchant coordinates with their great-circle distance.

    The distance is computed for all rows at once with numpy, using the
    spherical (atan2) great-circle formula and a mean Earth radius of
    6371.009 km, instead of one Python-level call per row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``lat``, ``lon``, ``merchant_lat`` and
        ``merchant_lon`` (decimal degrees).

    Returns
    -------
    pandas.DataFrame
        A new frame with a ``distance`` column (kilometres) appended and the
        four coordinate columns removed. The input frame is not modified.

    Raises
    ------
    ValueError
        If any coordinate is NaN/infinite or a latitude lies outside
        [-90, 90].
    """
    earth_radius_km = 6371.009  # mean Earth radius used for the sphere model

    lat1 = df['lat'].to_numpy(dtype=float)
    lon1 = df['lon'].to_numpy(dtype=float)
    lat2 = df['merchant_lat'].to_numpy(dtype=float)
    lon2 = df['merchant_lon'].to_numpy(dtype=float)

    # Keep rejecting invalid points rather than silently producing NaN or
    # meaningless distances.
    for values in (lat1, lon1, lat2, lon2):
        if not np.isfinite(values).all():
            raise ValueError("Point coordinates must be finite.")
    if (np.abs(lat1) > 90).any() or (np.abs(lat2) > 90).any():
        raise ValueError("Latitude must be in the [-90; 90] range.")

    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    delta_lon = np.radians(lon2) - np.radians(lon1)

    sin1, cos1 = np.sin(phi1), np.cos(phi1)
    sin2, cos2 = np.sin(phi2), np.cos(phi2)
    sin_dl, cos_dl = np.sin(delta_lon), np.cos(delta_lon)

    # atan2 form of the central angle: numerically stable for both very small
    # and near-antipodal separations.
    central_angle = np.arctan2(
        np.hypot(cos2 * sin_dl, cos1 * sin2 - sin1 * cos2 * cos_dl),
        sin1 * sin2 + cos1 * cos2 * cos_dl,
    )

    # assign() builds a new frame, so the caller's DataFrame keeps its
    # original columns.
    return df.assign(distance=earth_radius_km * central_angle).drop(
        columns=['lat', 'lon', 'merchant_lat', 'merchant_lon']
    )

In [None]:
# Swap the four raw coordinate columns for a single 'distance' column.
df = add_distance_features(df)

In [None]:
# Inspect the frame after the coordinate columns were replaced by 'distance'.
df.head()

In [None]:
# Parse the timestamp column into datetimes so the .dt accessor can be used on it.
df['transaction_time'] = pd.to_datetime(df['transaction_time'])

In [None]:

# Derive calendar and time-of-day features from the parsed timestamp.
# Keyword order fixes the column order; the callable sees the columns
# assigned before it, so is_weekend can read tx_dow.
df = df.assign(
    tx_hour=df['transaction_time'].dt.hour,
    tx_minute=df['transaction_time'].dt.minute,
    tx_dow=df['transaction_time'].dt.dayofweek,  # 0 = Monday
    tx_day=df['transaction_time'].dt.day,
    tx_month=df['transaction_time'].dt.month,
    is_weekend=lambda frame: frame['tx_dow'].isin([5, 6]).astype(int),
    # Whole seconds elapsed since 1970-01-01.
    tx_unix=(df['transaction_time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s'),
)


In [None]:
# Drop the raw timestamp now that the tx_* features have been derived from it.
df = df.drop(columns=['transaction_time'])

In [None]:
# Check the frame after the time features were added and 'transaction_time' was removed.
df.head()

In [None]:
# Separate the feature matrix from the label column.
X = df.drop("target", axis=1)
y = df["target"]

# Hold out 20% of the rows for evaluation. Stratifying on the label keeps the
# class proportions identical in both partitions, which matters when one class
# is rare (presumably the case for fraud labels; verify against the data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Names of the object-dtype columns; these are handed to CatBoost as categorical features.
cat_features = list(X_train.select_dtypes(include='object').columns)

In [None]:
# Hyper-parameters chosen to keep the serialized model small.
model_params = dict(
    iterations=400,          # reduce if necessary
    depth=6,                 # 4-6 usually saves a lot of model size
    border_count=64,         # 32-64 shrinks the feature quantization
    l2_leaf_reg=6.0,
    one_hot_max_size=5,      # limits one-hot encoding of categorical features
    max_ctr_complexity=1,    # simplifies CTR combinations
    ctr_leaf_count_limit=8,  # limits the size of the CTR tables
)
model = CatBoostClassifier(cat_features=cat_features, **model_params)
model.fit(X_train, y_train)


In [None]:
# Predict class labels for the held-out rows.
class_predictions = model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Text summary of per-class precision, recall and F1 on the held-out split.
class_report = classification_report(y_true=y_test, y_pred=class_predictions)
print(class_report)

In [None]:
# Persist the fitted model to disk in CatBoost's "cbm" format.
model.save_model("base_fraudmodel.cbm", format="cbm")

In [None]:
# Display the predicted labels for the hold-out split.
class_predictions

In [None]:
# Wrap the hold-out predictions in a one-column frame and write it to disk.
# to_csv keeps its default index=True, so the positional row index is written
# as an unnamed first column.
predictions_df = pd.DataFrame(data=class_predictions, columns=['target'])
predictions_df.to_csv(path_or_buf='predictions.csv')

In [None]:
# Show the predictions frame that was written to predictions.csv.
predictions_df

In [None]:
# Display the hold-out predictions again (same array as shown earlier in the notebook).
class_predictions