In [None]:
import pandas as pd
import numpy as np
import datetime
import lightgbm as lgb
import fire

In [9]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that covers their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are touched; every other column (objects, int8, unsigned ints, ...) is left
    as-is. The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the resulting memory usage and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float columns are chosen by value *range* only, so a column that fits in
    float16's range is stored as float16 even though precision is lost.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are tried from narrowest to widest; the first that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range exceeds float32, or min/max are NaN (comparisons are False).
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [13]:
# Input file locations. train/test are resolved relative to the working
# directory; the two transaction dumps live under a machine-specific folder.
train_csv = 'train.csv'
test_csv = 'test.csv'
_elo_dir = 'C:/Users/user/Documents/Salamat/ELO'
hist_csv = _elo_dir + '/historical_transactions.csv'
new_csv = _elo_dir + '/new_merchant_transactions.csv'


In [14]:
def binarize(df):
    """Replace the 'Y'/'N' flags in ``authorized_flag`` and ``category_1`` with 1/0.

    The frame is modified in place and returned. Any value other than 'Y' or
    'N' becomes NaN, because ``Series.map`` with a dict leaves unmapped keys
    missing.
    """
    yes_no = {'Y': 1, 'N': 0}
    flag_columns = ('authorized_flag', 'category_1')
    for name in flag_columns:
        df[name] = df[name].map(yes_no)
    return df

def read_data(input_file):
    """Load a CSV that has a ``first_active_month`` column and add ``elapsed_time``.

    Parameters
    ----------
    input_file : str or file-like
        Anything ``pandas.read_csv`` accepts.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with ``first_active_month`` parsed to datetime and a
        new ``elapsed_time`` column: whole days from ``first_active_month`` to
        2018-02-01 (NaN where the month is missing).
    """
    df = pd.read_csv(input_file)
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    # Subtract as datetime64 directly: going through ``.dt.date`` produces an
    # object-dtype Series, which is slow and not reliably ``.dt``-accessible.
    reference = pd.Timestamp(2018, 2, 1)
    df['elapsed_time'] = (reference - df['first_active_month']).dt.days
    return df

In [23]:
def agg_trans(history):
    """Aggregate a transactions frame to one row of features per ``card_id``.

    Parameters
    ----------
    history : pandas.DataFrame
        Transactions containing ``card_id``, ``purchase_date`` and every column
        named in ``agg_func`` below (the one-hot ``category_2_*`` /
        ``category_3_*`` columns and the calendar columns included).

    Returns
    -------
    pandas.DataFrame
        Indexed by ``card_id``; columns are named ``<column>_<aggregation>``
        plus a final ``transactions_count`` column holding the row count.

    Notes
    -----
    Side effect: ``history.purchase_date`` is overwritten in place with float
    seconds (the datetime's int64 value scaled by 1e-9), so calling this twice
    on the same frame re-interprets those floats as datetimes.
    """
    # np.int64 rather than the removed ``np.int`` alias (gone since NumPy 1.24).
    history.purchase_date = pd.DatetimeIndex(history.purchase_date).astype(np.int64) * 1e-9
    agg_func = {
        'category_1': ['sum', 'mean'],
        'category_2_1': ['mean'],
        'category_2_2': ['mean'],
        'category_2_3': ['mean'],
        'category_2_4': ['mean'],
        'category_2_5': ['mean'],
        'category_3_A': ['mean'],
        'category_3_B': ['mean'],
        'category_3_C': ['mean'],
        'merchant_id': ['nunique'],
        'merchant_category_id': ['nunique'],
        'state_id': ['nunique'],
        'city_id': ['nunique'],
        'subsector_id': ['nunique'],
        'purchase_amount': ['sum', 'mean', 'max', 'min', 'std'],
        'installments': ['sum', 'mean', 'max', 'min', 'std'],
        'purchase_month': ['mean', 'max', 'min', 'std'],
        # ptp = max - min, i.e. the span between first and last purchase.
        'purchase_date': [np.ptp, 'min', 'max'],
        'month_lag': ['min', 'max'],
        'month_diff': ['mean'],
        'weekend': ['sum', 'mean'],
    }
    for col in ['month', 'hour', 'weekofyear', 'dayofweek', 'year']:
        agg_func[col] = ['nunique']

    agg_history = history.groupby(['card_id']).agg(agg_func)
    # Flatten the (column, aggregation) MultiIndex into 'column_aggregation'.
    agg_history.columns = ['_'.join(col).strip() for col in agg_history.columns.values]

    counts = history.groupby('card_id').size()
    counts.name = 'transactions_count'
    agg_history = agg_history.join(counts)

    return agg_history

def agg_per_month(history):
    """Summarise per-month spending statistics for each card.

    First computes count/sum/mean/min/max/std of ``purchase_amount`` and
    ``installments`` for every (``card_id``, ``month_lag``) pair, then takes
    the mean and std of those monthly figures (and of ``month_lag`` itself)
    per ``card_id``.

    Returns a frame indexed by ``card_id`` whose columns are named
    ``<monthly column>_<mean|std>``.
    """
    monthly_stats = ['count', 'sum', 'mean', 'min', 'max', 'std']
    spec = {name: list(monthly_stats) for name in ('purchase_amount', 'installments')}

    monthly = history.groupby(['card_id', 'month_lag']).agg(spec)
    monthly.columns = ['_'.join(pair).strip() for pair in monthly.columns.values]
    # Bring card_id / month_lag back as ordinary columns for the second pass.
    monthly = monthly.reset_index()

    per_card = monthly.groupby('card_id').agg(['mean', 'std'])
    per_card.columns = ['_'.join(pair).strip() for pair in per_card.columns.values]
    return per_card

def _week_of_year(dates):
    """Return the ISO week number of each timestamp in the datetime Series ``dates``."""
    try:
        return dates.dt.isocalendar().week
    except AttributeError:
        # Older pandas has no ``isocalendar()``; ``weekofyear`` is the same ISO week there.
        return dates.dt.weekofyear


def convert(train_csv, test_csv, hist_csv, new_csv, out_prefix, nrows=None, reference_date=None):
    """Build per-card feature tables from the raw CSVs and write them to disk.

    Parameters
    ----------
    train_csv, test_csv : str
        Card-level files read with ``read_data``; ``train_csv`` must contain a
        ``target`` column.
    hist_csv, new_csv : str
        Transaction-level files (historical and new-merchant transactions).
    out_prefix : str
        Prefix of the three output files: ``<prefix>.target.txt``,
        ``<prefix>.tr.csv`` and ``<prefix>.te.csv``.
    nrows : int or None, default None
        Row limit applied when reading the two transaction files only.
    reference_date : datetime-like or None, default None
        "Now" used for the ``month_diff`` feature. ``None`` keeps the original
        behaviour of using the current date, which makes the feature depend on
        when the function is run; pass a fixed date for reproducible output.

    Returns
    -------
    None
        Results are written to the files listed above; the train/test shapes
        are printed.
    """
    # read train/test
    train = read_data(train_csv)
    test = read_data(test_csv)
    target = train.target.values
    train = train.drop('target', axis=1)

    # read hist/new
    hist_trans = binarize(pd.read_csv(hist_csv, parse_dates=['purchase_date'], nrows=nrows))
    new_trans = binarize(pd.read_csv(new_csv, parse_dates=['purchase_date'], nrows=nrows))

    if reference_date is None:
        reference_date = datetime.datetime.today()
    reference_date = pd.Timestamp(reference_date)

    for df in [hist_trans, new_trans]:
        # fill missing values
        df['category_2'] = df['category_2'].fillna(0).astype(int)
        df['category_3'] = df['category_3'].fillna('A')
        df['merchant_id'] = df['merchant_id'].fillna('M_ID_00a6ca8a8a')

        # add datetime features
        df['year'] = df.purchase_date.dt.year
        df['weekofyear'] = _week_of_year(df.purchase_date)
        df['month'] = df.purchase_date.dt.month
        df['dayofweek'] = df.purchase_date.dt.dayofweek
        df['weekend'] = (df.purchase_date.dt.weekday >= 5).astype(int)
        df['hour'] = df.purchase_date.dt.hour
        df['month_diff'] = (reference_date - df.purchase_date).dt.days // 30
        df['month_diff'] += df.month_lag
        # Same value as ``month``; set before the frames are filtered so no
        # column is assigned on a filtered slice later on.
        df['purchase_month'] = df['month']

    # feature engineering
    hist_trans = pd.get_dummies(hist_trans, columns=['category_2', 'category_3'])
    new_trans = pd.get_dummies(new_trans, columns=['category_2', 'category_3'])

    auth_mean = hist_trans.groupby('card_id').agg({'authorized_flag': ['sum', 'mean']})
    auth_mean.columns = ['_'.join(col).strip() for col in auth_mean.columns]

    # Split history by authorisation: from here on ``hist_trans`` holds only the
    # rows with authorized_flag == 0 and ``authed_trans`` the rows with flag == 1.
    authed_trans = hist_trans[hist_trans.authorized_flag == 1]
    hist_trans = hist_trans[hist_trans.authorized_flag == 0]

    hist = agg_trans(hist_trans)
    hist.columns = [f'hist_{c}' for c in hist.columns]
    authed = agg_trans(authed_trans)
    authed.columns = [f'auth_{c}' for c in authed.columns]
    new = agg_trans(new_trans)
    new.columns = [f'new_{c}' for c in new.columns]

    final_group = agg_per_month(hist_trans)

    # Attach every feature table to both card-level frames, in a fixed order.
    for features in [hist, authed, new, final_group, auth_mean]:
        train = train.join(features, on='card_id')
        test = test.join(features, on='card_id')
    print(train.shape, test.shape)

    np.savetxt(f'{out_prefix}.target.txt', target)
    train.to_csv(f'{out_prefix}.tr.csv', index=False)
    test.to_csv(f'{out_prefix}.te.csv', index=False)


In [25]:
# Build the feature tables from the full transaction files (nrows=None) and
# write them under the 'adam_solution' prefix.
convert(
    train_csv=train_csv,
    test_csv=test_csv,
    hist_csv=hist_csv,
    new_csv=new_csv,
    out_prefix='adam_solution',
    nrows=None,
)

(201917, 163) (123623, 163)


In [26]:
def model(prefix, out_submit):
    """Train a LightGBM regressor on the saved features and write test predictions.

    Parameters
    ----------
    prefix : str
        Prefix of the input files ``<prefix>.target.txt``, ``<prefix>.tr.csv``
        and ``<prefix>.te.csv``.
    out_submit : str
        Path of the CSV to write, with columns ``card_id`` and ``target``.
    """
    target = np.loadtxt(f'{prefix}.target.txt')
    # index_col=0 turns the first CSV column into the index, so it is never used
    # as a feature (presumably first_active_month - confirm against the files).
    train = pd.read_csv(f'{prefix}.tr.csv', index_col=0)
    test = pd.read_csv(f'{prefix}.te.csv', index_col=0)
    features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
    categorical_feats = [c for c in features if 'feature_' in c]

    param = {'num_leaves': 31,
         'min_data_in_leaf': 150, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.8,
         "bagging_freq": 1,
         "bagging_fraction": 0.8 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         #"nthread": 4,
         #"verbosity": -1
    }

    tr_data = lgb.Dataset(train[features], label=target, categorical_feature=categorical_feats)

    # No validation set is passed, so there is nothing to log per iteration; the
    # former ``verbose_eval`` argument is omitted because LightGBM 4 rejects it.
    clf = lgb.train(param, tr_data, 3000)
    y_pred = clf.predict(test[features], num_iteration=clf.best_iteration)

    pd.DataFrame({
        'card_id': test.card_id,
        'target': y_pred
    }).to_csv(out_submit, index=False)

In [7]:
def outlier(prefix, out_pred):
    """Train a LightGBM binary classifier for outlier targets and write test scores.

    A training row is labelled as an outlier when its target is below -33.

    Parameters
    ----------
    prefix : str
        Prefix of the input files ``<prefix>.target.txt``, ``<prefix>.tr.csv``
        and ``<prefix>.te.csv``.
    out_pred : str
        Path of the CSV to write, with columns ``card_id`` and ``target``
        (``target`` holds the predicted outlier score).
    """
    target = np.loadtxt(f'{prefix}.target.txt')
    target_outlier = (target < -33).astype(int)
    train = pd.read_csv(f'{prefix}.tr.csv', index_col=0)
    test = pd.read_csv(f'{prefix}.te.csv', index_col=0)
    features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]
    categorical_feats = [c for c in features if 'feature_' in c]

    param = {'num_leaves': 31,
         'min_data_in_leaf': 150, 
         'objective':'binary',
         'max_depth': -1,
         'learning_rate': 0.005,
         "boosting": "gbdt",
         "feature_fraction": 0.8,
         "bagging_freq": 1,
         "bagging_fraction": 0.8 ,
         "bagging_seed": 11,
         "metric": 'auc',
         "lambda_l1": 0.1,
         'scale_pos_weight': 15
         #"nthread": 4,
         #"verbosity": -1
    }

    tr_data = lgb.Dataset(train[features], label=target_outlier, categorical_feature=categorical_feats)

    # No validation set is passed, so there is nothing to log per iteration; the
    # former ``verbose_eval`` argument is omitted because LightGBM 4 rejects it.
    clf = lgb.train(param, tr_data, 1000)
    y_pred = clf.predict(test[features], num_iteration=clf.best_iteration)

    # Write to the path given by the caller (``out_pred``), not to whatever a
    # notebook-global ``out_submit`` happens to hold.
    pd.DataFrame({
        'card_id': test.card_id,
        'target': y_pred
    }).to_csv(out_pred, index=False)

In [28]:
def model_bk(prefix, out_submit):
    """Fit a scikit-learn-style LightGBM regressor and write test predictions.

    Parameters
    ----------
    prefix : str
        Prefix of the input files ``<prefix>.target.txt``, ``<prefix>.tr.csv``
        and ``<prefix>.te.csv``.
    out_submit : str
        Path of the CSV to write, with columns ``card_id`` and ``target``.
    """
    target = np.loadtxt(f'{prefix}.target.txt')
    train = pd.read_csv(f'{prefix}.tr.csv')
    test = pd.read_csv(f'{prefix}.te.csv')
    features = [c for c in train.columns if c not in ['card_id', 'first_active_month']]

    # Reach the estimator through the ``lgb`` module alias: the bare name
    # ``LGBMRegressor`` is never imported in this notebook.
    lgbr = lgb.LGBMRegressor(n_estimators=1000, n_jobs=32)
    lgbr.fit(train[features], target)

    y_pred = lgbr.predict(test[features])

    pd.DataFrame({
        'card_id': test.card_id,
        'target': y_pred
    }).to_csv(out_submit, index=False)

# def main():
#     pass


# if __name__ == "__main__":
#     fire.Fire({
#         'convert': convert,
#         'model': model
#     })


In [30]:
# Train the regression model on the 'adam_solution' feature files and write
# its test predictions to the file 'with_outliers'.
prefix='adam_solution'
out_submit='with_outliers'
model(prefix, out_submit)



In [31]:
# Train the outlier classifier on the same feature files; 'outliers_pred' is
# the output path handed to outlier(). Note this rebinds the global out_submit.
prefix='adam_solution'
out_submit='outliers_pred'
outlier(prefix, out_submit)



In [32]:
# Load the 'outliers_pred' CSV, i.e. the path passed to outlier() in the cell above.
outliers=pd.read_csv('outliers_pred')

In [35]:
# Rank rows by predicted score, highest first; the original row index is kept.
outliers_sorted=outliers.sort_values('target',ascending=False)

In [38]:
# Number of rows (first element of the shape) whose predicted score exceeds 0.5.
outliers_sorted[outliers_sorted.target>0.5].shape

(4133, 2)

In [42]:
# Load a header-less, single-column score file for comparison.
# NOTE(review): 'outlier_prob.txt' is not written by any cell visible here - record where it comes from.
outliers_adam=pd.read_csv('outlier_prob.txt',header=None)

In [47]:
# Ten highest scores in the reference file (column 0 is its only column).
outliers_adam.sort_values(0,ascending=False).head(10)

Unnamed: 0,0
78078,0.862953
118268,0.860737
5008,0.860169
96354,0.858382
104991,0.857541
88754,0.854475
6026,0.84536
57626,0.845235
20556,0.844683
59178,0.844309


In [58]:
# Shape of the reference score file.
outliers_adam.shape

(123623, 1)

In [49]:
# Row indices of the ten highest scores in the reference file.
sent_idx=outliers_adam.sort_values(0,ascending=False).head(10).index.values

In [50]:
# Row indices of the ten highest scores produced by this notebook's classifier.
calc_idx=outliers_sorted.head(10).index.values

In [52]:
# Convert to a set so the two top-10 lists can be intersected (rebinds sent_idx).
sent_idx=set(sent_idx)

In [53]:
# Convert to a set so the two top-10 lists can be intersected (rebinds calc_idx).
calc_idx=set(calc_idx)

In [57]:
# Row indices that appear in both top-10 lists.
sent_idx.intersection(calc_idx)

{5008, 6026, 57626, 78078, 88754, 96354, 104991, 118268}

In [48]:
# Ten highest-scored cards from this notebook's classifier.
outliers_sorted.head(10)

Unnamed: 0,card_id,target
5008,C_ID_be92f84f5c,0.860618
78078,C_ID_922f9c5ea6,0.858936
96354,C_ID_b237ce01cb,0.853038
88754,C_ID_02871a2207,0.851883
104991,C_ID_86ddafb51c,0.850811
118268,C_ID_3420e285b9,0.849455
100556,C_ID_70c457436a,0.84405
32446,C_ID_ac114ef831,0.843812
6026,C_ID_91cc0c06ca,0.841243
57626,C_ID_944c62886f,0.83921


In [61]:
# Number of rows in the sorted prediction table (the next cell repeats this assignment).
T=outliers_sorted.shape[0]

In [64]:
# Scale the gap between two hard-coded values by sqrt(number of prediction rows).
# NOTE(review): 3.731 and 3.678 are unexplained here - presumably two RMSE scores; confirm and name them.
T=outliers_sorted.shape[0]
np.sqrt(T)*(3.731-3.678)

18.634833162655337