In [7]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier 
from sklearn.preprocessing import StandardScaler 
import joblib 
from tqdm import tqdm
import sklearn
import matplotlib.pyplot as plt

In [8]:
def haversine_km(lat1, lon1, lat2, lon2): 
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2]) 
    dlat = lat2 - lat1 
    dlon = lon2 - lon1 
    a = np.sin(dlat/2.0)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2.0)**2 * np.sin(dlon/2.0)**2 
    c = 2 * np.arcsin(np.sqrt(a)) 
    return 6371.0088 * c

In [9]:
train = pd.read_csv('teta-ml-1-2025/train.csv') 
test = pd.read_csv('teta-ml-1-2025/test.csv') 
print(train.shape, test.shape) 
print(train['merch'].unique().shape, train['cat_id'].unique().shape, train['one_city'].unique().shape, train['us_state'].unique().shape)

(786431, 18) (262144, 17)
(693,) (14,) (879,) (51,)


In [10]:
for df in [train, test]: df['transaction_time'] = pd.to_datetime(df['transaction_time']) 
df['hour'] = df['transaction_time'].dt.hour 
df['dayofweek'] = df['transaction_time'].dt.dayofweek 
df['month'] = df['transaction_time'].dt.month
 # Гео-дистанция 
for df in [train, test]: 
    df['distance'] = haversine_km(df['lat'], df['lon'], df['merchant_lat'], df['merchant_lon']) 
    df['gender'] = df['gender'].apply(lambda x: 1 if x == 'M' else 0)

In [11]:
# Frequency encoding для категориальных фичей 
cat_cols = ['merch', 'cat_id', 'one_city', 'us_state', 'jobs'] 
for col in cat_cols:
    freq = train[col].value_counts() 
for df in [train, test]:
    df[col + '_freq'] = df[col].map(freq).fillna(0)

In [12]:
# Убираем ненужное 
drop_cols = ['transaction_time', 'name_1', 'name_2', 'merch', 'cat_id', 'one_city', 'us_state', 'lat', 'lon', 'merchant_lat', 'merchant_lon', 'street', 'jobs'] 
features = [c for c in train.columns if c not in drop_cols + ['target']] 
print("Используем фичей:", len(features)) 

X = train[features] 
y = train['target']

Используем фичей: 6


In [13]:
# Делим на train/val
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=False) 

model = HistGradientBoostingClassifier( 
    learning_rate=0.05, 
    max_depth=6, 
    max_iter=500, 
    random_state=42 
)

In [14]:
model.fit(X_train, y_train) 
# Предсказываем классы вместо вероятностей 
preds_val_classes = model.predict(X_val)
preds_val_proba = model.predict_proba(X_val)[:, 1] 

# если все же нужны вероятности для AUC 
print("AUC на валидации:", roc_auc_score(y_val, preds_val_proba)) 
preds_test = model.predict(test[features])

AUC на валидации: 0.9645008326000009


In [15]:
import joblib, pickle

joblib.dump(model, 'artifacts/model_sklearn.pkl')

with open('artifacts/features.txt', 'w') as f:
    for c in features:
        f.write(c + '\n')

freq_maps = {}
for col in ['merch','cat_id','one_city','us_state','jobs']:
    freq_maps[col] = train[col].value_counts()
pickle.dump(freq_maps, open('artifacts/freq_maps.pkl','wb'))
