In [None]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier 
from sklearn.preprocessing import StandardScaler 
import joblib 
from tqdm import tqdm
import sklearn
import matplotlib.pyplot as plt

In [None]:
def haversine_km(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Uses the haversine formula on a sphere of radius 6371.0088 km.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.

    Returns
    -------
    float or array-like
        Distance(s) in kilometres; inputs are combined element-wise, so the
        result has the broadcast shape of the inputs.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    # Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2).
    # The longitude factor must be squared exactly once.
    a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # near-antipodal points), which would turn sqrt/arcsin into NaN.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return 6371.0088 * c

In [None]:
# Load the train and test CSV files into DataFrames.
train = pd.read_csv('teta-ml-1-2025/train.csv')
test = pd.read_csv('teta-ml-1-2025/test.csv')

# Row/column counts of both frames.
print(train.shape, test.shape)

# Shape of the unique-value array for each of these train columns,
# i.e. how many distinct values each one has.
print(*(train[name].unique().shape for name in ('merch', 'cat_id', 'one_city', 'us_state')))

In [None]:
# Derive calendar, geo-distance and gender features on BOTH train and test.
# Everything lives inside a single loop so that no feature is computed only on
# whichever frame the loop variable happened to point at last.
# NOTE: this cell mutates `train`/`test` in place and is not idempotent —
# re-running it after `gender` has been encoded would map every row to 0.
for df in [train, test]:
    # Calendar features from the parsed transaction timestamp.
    df['transaction_time'] = pd.to_datetime(df['transaction_time'])
    df['hour'] = df['transaction_time'].dt.hour
    df['dayofweek'] = df['transaction_time'].dt.dayofweek
    df['month'] = df['transaction_time'].dt.month

    # Geo distance (km) between the (lat, lon) and (merchant_lat, merchant_lon) points.
    df['distance'] = haversine_km(df['lat'], df['lon'], df['merchant_lat'], df['merchant_lon'])

    # Binary gender flag: 1 for 'M', 0 for anything else (including missing values).
    df['gender'] = (df['gender'] == 'M').astype('int64')

In [None]:
# Frequency encoding for the categorical features.
# For every column in `cat_cols` a `<col>_freq` column is added to both frames.
cat_cols = ['merch', 'cat_id', 'one_city', 'us_state', 'jobs']
for col in cat_cols:
    # Counts are taken from train only and then applied to both frames.
    freq = train[col].value_counts()
    # The inner loop must be nested under `for col`; otherwise only the last
    # column in `cat_cols` would ever receive its `_freq` feature.
    for df in [train, test]:
        # Values without a count in train (unseen categories, NaN) map to NaN -> 0.
        df[col + '_freq'] = df[col].map(freq).fillna(0)