In [21]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


In [22]:
# Monthly target table; `mon` is parsed to datetime so it can be compared
# against the month-end timestamps built further down.
train_target = pd.read_parquet('/kaggle/input/yaprofi/yaprofi-hack/train_target.parquet')
train_target['mon'] = pd.to_datetime(train_target['mon'])

# Transaction log that the aggregate features are computed from.
trx_train = pd.read_parquet('/kaggle/input/yaprofi/yaprofi-hack/trx_train.parquet')

In [None]:
# geo_train    = pd.read_parquet('/kaggle/input/yaprofi/yaprofi-hack/geo_train.parquet')


In [None]:
# Dialog records for the training clients; the `embedding` column is expanded
# into separate numeric columns in a later cell.
dial_train = pd.read_parquet(
    '/kaggle/input/yaprofi/yaprofi-hack/dial_train.parquet'
)

In [None]:
# Store log(1 + amount) next to the raw amount; the aggregation cells below
# work on this column.
trx_train['amount_log'] = trx_train['amount'].pipe(np.log1p)

In [None]:
# Expand the list-valued `embedding` column into one numeric column per
# dimension (`emb_0`, `emb_1`, ...), keeping every other dialog column.
embedding_matrix = np.vstack(dial_train['embedding'].to_list())
embedding_frame = pd.DataFrame(
    embedding_matrix,
    # concat(axis=1) aligns on the index, so reuse the source index; a fresh
    # RangeIndex would misalign rows whenever dial_train is not indexed 0..n-1.
    index=dial_train.index,
    # Take the width from the data instead of hard-coding the embedding size.
    columns=[f'emb_{i}' for i in range(embedding_matrix.shape[1])],
)

dial_train_fix = pd.concat(
    [dial_train.drop(columns=['embedding']), embedding_frame],
    axis=1,
)

# The intermediates are only needed to build dial_train_fix; release them.
del embedding_matrix, embedding_frame

In [12]:
def generate_pivot_features(df, col, value, aggfunc):
    """Aggregate `value` per client and per category of `col`, one column per category.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain `client_id`, `col` and `value` columns.
    col : str
        Column whose distinct values become the output columns.
    value : str
        Column that is aggregated.
    aggfunc : str
        Aggregation name passed to `pivot_table`; it is also embedded in the
        generated column names.

    Returns
    -------
    pandas.DataFrame
        One row per `client_id`, with `client_id` as a regular column and
        feature columns named `{col}_{category}_{value}_{aggfunc}`.
    """
    wide = pd.pivot_table(
        df,
        values=[value],
        index=['client_id'],
        columns=[col],
        aggfunc=aggfunc,
    )

    # The pivot has (value, category) column pairs; flatten them to plain names.
    flat_names = []
    for _value_name, category in wide.columns:
        flat_names.append(f'{col}_{category}_{value}_{aggfunc}')
    wide.columns = flat_names

    return wide.reset_index()

In [None]:
start_date = '2021-11-01'
end_date = '2023-01-31'

# Generate a date range with the end of each month
dates = pd.date_range(start=start_date, end=end_date, freq='ME')

# Convert the dates to numpy datetime64 array
end_of_months = dates.to_numpy(dtype='datetime64[ns]')

print(end_of_months)

In [None]:
# Build one training frame per target month. Features are computed only from
# events inside a look-back window that ends before the target month; labels
# come from the `train_target` rows of that month.
dataframes = []

for i in range(2, len(end_of_months)):
    # Look-back window (month-end i-4, month-end i-1]; for the earliest target
    # months the lower bound is clipped to the first available month-end.
    # NOTE(review): the month-end stamps are at 00:00:00, so if event_time
    # carries a time of day, events later on the last day of month i-1 fall
    # outside the window — confirm this is intended.
    window_start = end_of_months[max(i-4, 0)]
    window_end = end_of_months[i-1]

    trx_before_date = trx_train[
        (trx_train['event_time'] <= window_end) &
        (trx_train['event_time'] >  window_start)
    ]

    dial_before_data = dial_train_fix[
        (dial_train_fix['event_time'] <= window_end) &
        (dial_train_fix['event_time'] > window_start)
    ]

    # Per-client summary statistics of the log-amount inside the window.
    amount_features = trx_before_date.groupby('client_id')['amount_log'].agg(['mean', 'std', 'min', 'max']).reset_index()

    # Per-client number of transactions for every event type seen in the window.
    # (The per-event-type mean pivot used to be computed here too, but it was
    # never merged into the result, so it has been dropped.)
    event_type_count = generate_pivot_features(trx_before_date, 'event_type', 'amount_log', 'count')

    # Per-client mean of every remaining dialog column.
    dial_mean = dial_before_data.drop(columns=['event_time', 'mon']).groupby('client_id').mean().reset_index()

    # Left joins keep every labelled client of the month; clients without
    # events in the window get NaN features.
    df = (
        train_target[train_target['mon'] == dates[i]]
        .merge(amount_features, on='client_id', how='left')
        .merge(event_type_count, on='client_id', how='left')
        .merge(dial_mean, on='client_id', how='left')
    )

    dataframes.append(df)

In [None]:
# Stack the per-month frames row-wise into a single training table.
df = pd.concat(dataframes)

In [None]:
# Cache the assembled features on disk (the row index is not written).
df.to_parquet(path='features_train.parquet', index=False)

In [2]:
# Reload the cached training features written by the previous cell.
df = pd.read_parquet(path='features_train.parquet')

In [3]:
from sklearn.model_selection import train_test_split

# Columns that are labels or identifiers rather than model inputs.
target_columns = ['target_1', 'target_2', 'target_3', 'target_4']
non_feature_columns = ['mon', *target_columns, 'client_id']

# Missing feature values (clients without events in the window) become 0.
X = df.drop(columns=non_feature_columns).fillna(0)
y = df[target_columns]

# Random 80/20 row split with a fixed seed.
# NOTE(review): rows of the same client from different months can land on both
# sides of this split — confirm that is acceptable for the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=69,
)

In [4]:
import pickle

def gini(y_true, y_pred):
    """Return the Gini score ``2 * ROC AUC - 1`` for labels `y_true` and scores `y_pred`."""
    auc = roc_auc_score(y_true, y_pred)
    return 2*auc - 1

# Fit, save and score one binary classifier per target column.
for target_idx in range(1, 5):
    target_name = f'target_{target_idx}'

    model = CatBoostClassifier(n_estimators=100, verbose=0, task_type="GPU", devices=['0', '1'])
    model.fit(X_train, y_train[target_name])

    # Persist the fitted model so the inference cells can reload it by number.
    with open(f'model_{target_idx}.pkl', 'wb') as model_file:
        pickle.dump(model, model_file)

    print(f'Target {target_idx}')

    # Positive-class probabilities are what the Gini score is computed on.
    train_scores = model.predict_proba(X_train)[:, 1]
    print("Train Gini:", gini(y_train[target_name], train_scores))

    test_scores = model.predict_proba(X_test)[:, 1]
    print("Test Gini:", gini(y_test[target_name], test_scores), end='\n\n')

Target 1
Train Gini: 0.43004196379281856
Test Gini: 0.41895532769029975

Target 2
Train Gini: 0.7093675087444806
Test Gini: 0.6516723046398405

Target 3
Train Gini: 0.6260210494938541
Test Gini: 0.6131484811751617

Target 4
Train Gini: 0.6101420408085416
Test Gini: 0.5850041440070775



In [5]:
# Importances of `model` as left by the training loop (i.e. the last target's
# model), labelled with the feature names and shown in ascending order.
importances = pd.Series(model.feature_importances_, index=X_train.columns)
importances.sort_values()

emb_653                            0.000000
emb_387                            0.000000
emb_388                            0.000000
emb_655                            0.000000
emb_390                            0.000000
                                    ...    
mean                               4.282828
event_type_56_amount_log_count     4.870840
event_type_40_amount_log_count     5.351470
event_type_37_amount_log_count    15.753955
event_type_54_amount_log_count    36.602654
Length: 827, dtype: float64

In [3]:
# Target table shipped with the test split.
test_target = pd.read_parquet('/kaggle/input/yaprofi/yaprofi-hack/test_target.parquet')

# Transaction log of the test clients, used to build the inference features.
trx_test = pd.read_parquet('/kaggle/input/yaprofi/yaprofi-hack/trx_test.parquet')

In [4]:
# Same log(1 + amount) transform that was applied to the training transactions.
trx_test['amount_log'] = trx_test['amount'].pipe(np.log1p)

In [5]:
# Dialog records for the test clients; the `embedding` column is expanded in
# the next cell.
dial_test = pd.read_parquet(
    '/kaggle/input/yaprofi/yaprofi-hack/dial_test.parquet'
)

In [6]:
# Expand the list-valued `embedding` column into one numeric column per
# dimension (`emb_0`, `emb_1`, ...), keeping every other dialog column.
test_embedding_matrix = np.vstack(dial_test['embedding'].to_list())
test_embedding_frame = pd.DataFrame(
    test_embedding_matrix,
    # concat(axis=1) aligns on the index, so reuse the source index; a fresh
    # RangeIndex would misalign rows whenever dial_test is not indexed 0..n-1.
    index=dial_test.index,
    # Take the width from the data instead of hard-coding the embedding size.
    columns=[f'emb_{i}' for i in range(test_embedding_matrix.shape[1])],
)

dial_test_fix = pd.concat(
    [dial_test.drop(columns=['embedding']), test_embedding_frame],
    axis=1,
)

# The intermediates are only needed to build dial_test_fix; release them.
del test_embedding_matrix, test_embedding_frame

In [7]:
start_date = '2021-11-01'
end_date = '2023-01-31'

# Generate a date range with the end of each month
dates = pd.date_range(start=start_date, end=end_date, freq='ME')

# Convert the dates to numpy datetime64 array
end_of_months = dates.to_numpy(dtype='datetime64[ns]')

print(end_of_months)

['2021-11-30T00:00:00.000000000' '2021-12-31T00:00:00.000000000'
 '2022-01-31T00:00:00.000000000' '2022-02-28T00:00:00.000000000'
 '2022-03-31T00:00:00.000000000' '2022-04-30T00:00:00.000000000'
 '2022-05-31T00:00:00.000000000' '2022-06-30T00:00:00.000000000'
 '2022-07-31T00:00:00.000000000' '2022-08-31T00:00:00.000000000'
 '2022-09-30T00:00:00.000000000' '2022-10-31T00:00:00.000000000'
 '2022-11-30T00:00:00.000000000' '2022-12-31T00:00:00.000000000'
 '2023-01-31T00:00:00.000000000']


In [8]:
# Build the inference frame for the last month-end only, using the same
# look-back window and the same features as the training loop.
dataframes = []

# The range yields a single index: the last entry of end_of_months.
for i in range(len(end_of_months) - 1, len(end_of_months)):
    # Look-back window (month-end i-4, month-end i-1].
    # NOTE(review): the month-end stamps are at 00:00:00, so if event_time
    # carries a time of day, events later on the last day of month i-1 fall
    # outside the window — confirm this is intended.
    window_start = end_of_months[max(i-4, 0)]
    window_end = end_of_months[i-1]

    trx_before_date = trx_test[
        (trx_test['event_time'] <= window_end) &
        (trx_test['event_time'] >  window_start)
    ]

    dial_before_data = dial_test_fix[
        (dial_test_fix['event_time'] <= window_end) &
        (dial_test_fix['event_time'] > window_start)
    ]

    # Per-client summary statistics of the log-amount inside the window.
    amount_features = trx_before_date.groupby('client_id')['amount_log'].agg(['mean', 'std', 'min', 'max']).reset_index()

    # Per-client number of transactions for every event type seen in the window.
    # (The per-event-type mean pivot used to be computed here too, but it was
    # never merged into the result, so it has been dropped.)
    event_type_count = generate_pivot_features(trx_before_date, 'event_type', 'amount_log', 'count')

    # Per-client mean of every remaining dialog column.
    dial_mean = dial_before_data.drop(columns=['event_time']).groupby('client_id').mean().reset_index()

    # Left joins keep every client id to be scored; clients without events in
    # the window get NaN features. `df` is reused by the cells below.
    df = (
        pd.read_parquet('/kaggle/input/yaprofi/yaprofi-hack/test_target_ids.parquet')
        .merge(amount_features, on='client_id', how='left')
        .merge(event_type_count, on='client_id', how='left')
        .merge(dial_mean, on='client_id', how='left')
    )

    dataframes.append(df)

In [9]:
# Columns of the saved training frame minus the label/date columns, in their
# saved order. Selecting by name (instead of skipping the first five positions)
# keeps this correct whatever the column order of the parquet file is.
# `client_id` is intentionally kept: the cell that builds X drops it itself.
train_columns = pd.read_parquet('/kaggle/working/features_train.parquet').columns
cols = train_columns[~train_columns.isin(['mon', 'target_1', 'target_2', 'target_3', 'target_4'])]

In [13]:
# Features that exist in the training frame but were not produced for the test
# window (e.g. an event type with no test transactions) are added as zeros, so
# the column selection below cannot fail. Columns already present are left
# untouched rather than overwritten.
for feature_name in cols:
    if feature_name not in df.columns:
        df[feature_name] = 0

In [14]:
# Replace missing values with 0, then keep the training columns in their
# training order and drop the identifier so only model inputs remain.
filled = df.fillna(0)
X = filled.loc[:, cols].drop(columns=['client_id'])

In [15]:
import pickle


# Score the test clients with each saved model and store the positive-class
# probability as a new `target_<n>` column.
for target_idx in range(1, 5):
    fitted_model = pd.read_pickle(f'model_{target_idx}.pkl')
    df[f'target_{target_idx}'] = fitted_model.predict_proba(X)[:, 1]


In [18]:
# Write the four predicted-probability columns, selected by name so the output
# does not depend on them happening to be the last four columns of `df`.
# NOTE(review): no client identifier is written — confirm the expected
# submission format.
df[['target_1', 'target_2', 'target_3', 'target_4']].to_csv('my_first_submission.csv', index=False)