# Predicting

## Training

In [None]:
# Import libraries
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import platform

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Define global variables
# Default directory that the input CSV files are read from.
DATA_ROOT = '../input/data-fraud'

# Notebook shell escape (IPython `!` syntax, not plain Python): list the
# contents of the competition data directory.
!dir "../input/ieee-fraud-detection"

# A Windows host is treated as the local machine; every other platform is not.
local = platform.system() == "Windows"

# Local runs read the dataset from the competition folder instead of the
# default data directory.
if local:
    DATA_ROOT = '../input/ieee-fraud-detection'


In [None]:
# Splitting training data function
def split_data(data, sample: float = 1.0, fraction: float = 0.8):
    """Shuffle ``data`` and split it into a train part and a test part.

    Args:
        data: DataFrame to split.
        sample: Fraction of ``data`` to keep (in shuffled order) before
            splitting.
        fraction: Fraction of the kept rows that goes into the train part.

    Returns:
        A ``(train_data, test_data)`` tuple. The test part holds the kept
        rows whose index labels were not drawn for the train part.
    """
    # Fixed seeds make both the shuffle and the split reproducible.
    shuffled = data.sample(frac=sample, random_state=0)
    train_part = shuffled.sample(frac=fraction, random_state=0)

    # Whatever was not drawn for training becomes the held-out part.
    held_out = shuffled.drop(index=train_part.index)

    return train_part, held_out


In [None]:
def get_predictions(traindatafile, testdatafile, label, id):
    """Train a 5-fold LightGBM ensemble and predict the test set.

    One ``LGBMClassifier`` is fitted per stratified fold. Each fold's
    positive-class probabilities for the test set are averaged into the final
    prediction, and the out-of-fold probabilities are used to print an
    overall AUC score. A metric plot and a feature-importance plot are shown
    for every fold.

    Args:
        traindatafile: CSV with the training rows (must contain ``id`` and
            ``label`` columns).
        testdatafile: CSV with the rows to predict (must contain ``id``).
        label: Name of the binary target column.
        id: Name of the row identifier column.

    Returns:
        DataFrame with two columns, named ``id`` and ``label``, holding the
        test identifiers and the fold-averaged positive-class probability.
    """
    # Read data
    train = pd.read_csv(traindatafile)
    test = pd.read_csv(testdatafile)

    y = train[label].to_numpy()
    predictions = np.zeros(len(test))
    predictions_id = test[id].to_numpy()

    # Drop id column and label column so only features are fed to the model
    train = train.drop(columns=[id, label])
    test = test.drop(columns=[id])

    # Init kfold
    N_FOLD = 5
    folds = StratifiedKFold(n_splits=N_FOLD, shuffle=True, random_state=0)

    # Out-of-fold predictions: every training row is scored exactly once, by
    # the model that did not see it during fitting.
    oof = np.zeros(len(train))
    X = train.to_numpy()
    X_test = test.to_numpy()

    for train_idx, valid_idx in folds.split(X, y):
        X_train, X_valid = X[train_idx, :], X[valid_idx, :]
        y_train, y_valid = y[train_idx], y[valid_idx]

        model = lgb.LGBMClassifier(n_estimators=10000, metric='auc')
        # NOTE(review): the `verbose` / `early_stopping_rounds` fit keywords
        # appear to have been removed in LightGBM 4.0; on newer versions pass
        # callbacks=[lgb.early_stopping(250), lgb.log_evaluation(200)]
        # instead - confirm against the installed version.
        model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_valid, y_valid)], verbose=200,
                        early_stopping_rounds=250)

        # Use positive-class probabilities rather than hard 0/1 labels:
        # AUC is a ranking metric, and averaging labels across folds would
        # throw away the model's confidence.
        oof[valid_idx] = model.predict_proba(X_valid)[:, 1]
        predictions += model.predict_proba(X_test)[:, 1] / N_FOLD

        lgb.plot_metric(model.evals_result_, metric='auc')
        plt.show()
        lgb.plot_importance(model, max_num_features=50)
        plt.show()

    AUC_OOF = round(roc_auc_score(y, oof), 4)
    print('Model ensemble OOF AUC score: {}'.format(AUC_OOF))

    # Name the output columns after the caller-supplied id/label so callers
    # can index the result with the same names they passed in.
    result = pd.DataFrame({id: predictions_id, label: predictions})

    return result

In [None]:
# For testing
# get_predictions(
#     f'{DATA_ROOT}/final_train_transaction.csv',
#     f'{DATA_ROOT}/final_test_transaction.csv',
#     'isFraud',
#     'TransactionID'
# )

## Data post-processing
Note that we are not predicting whether an individual transaction is fraudulent, but rather whether the card used for those transactions is fraudulent, as the competition host mentioned [here](https://www.kaggle.com/c/ieee-fraud-detection/discussion/101203#589276).

So we group the predictions by card and take the average of each group: if the rounded group average is fraudulent, every transaction in that group is marked as fraudulent. Cards are identified by the combination of the `card1`, `card2`, `addr1` and `addr2` columns.

In [None]:
def group_by_card(datafile):
    """Group the transaction ids of a CSV file by card.

    A card is identified by the string ``card1_card2_addr1_addr2`` built from
    the string form of those four columns.

    Args:
        datafile: Path (or file-like object) of a CSV containing the columns
            ``TransactionID``, ``card1``, ``card2``, ``addr1`` and ``addr2``.

    Returns:
        Dict mapping each card id string to the list of ``TransactionID``
        values that share it. Cards appear in order of first occurrence and
        ids keep their file order.
    """
    # Read data
    df = pd.read_csv(datafile)

    # Build the card key as a standalone Series; the frame itself is not
    # modified, so no defensive copy is needed.
    card_id = (
        df['card1'].astype(str) + '_' + df['card2'].astype(str)
        + '_' + df['addr1'].astype(str) + '_' + df['addr2'].astype(str)
    )

    # Vectorised grouping instead of a per-row Python loop. sort=False keeps
    # the groups in first-appearance order.
    grouped_ids = df['TransactionID'].groupby(card_id, sort=False)

    return {card: ids.tolist() for card, ids in grouped_ids}


# Sanity check: print how many distinct card groups the test file contains.
print(len(group_by_card(f'{DATA_ROOT}/final_test_transaction.csv')))


In [None]:
def get_group_avg(df, group, label, id):
    """Return the rounded mean of ``label`` over the rows belonging to ``group``.

    Args:
        df: DataFrame holding at least the ``id`` and ``label`` columns.
        group: Collection of identifier values that make up the group.
        label: Name of the column to average.
        id: Name of the identifier column matched against ``group``.

    Returns:
        The group's mean ``label`` value passed through ``round``.
    """
    # Select the rows whose identifier is a member of the group.
    in_group = df[id].isin(group)
    group_mean = df.loc[in_group, label].mean()
    return round(group_mean)

In [None]:
# Export prediction function
def export_prediction(traindatafile, testdatafile, label, id):
    """Predict the test set, smooth the predictions per card and export both.

    The raw model output is written to ``./raw_result.csv``. Every
    prediction is then replaced by the rounded mean of its card group and
    the post-processed frame is written to ``./result.csv``.

    Args:
        traindatafile: CSV with the training rows.
        testdatafile: CSV with the rows to predict.
        label: Name of the target column.
        id: Name of the row identifier column.

    Returns:
        The post-processed prediction DataFrame.
    """
    # Group test data by cardID
    groups = group_by_card(testdatafile)

    # Predict
    result = get_predictions(traindatafile, testdatafile, label, id)

    result.to_csv('./raw_result.csv', index=False)

    # Post-processing with group average.
    # Invert the card -> ids mapping once so that all groups can be averaged
    # in a single vectorised pass instead of re-scanning the whole frame for
    # every card. Assumes each id belongs to a single card group.
    card_of_id = {
        row_id: card for card, row_ids in groups.items() for row_id in row_ids
    }
    card_key = result[id].map(card_of_id)

    # Rows whose id is not in any group keep their raw prediction.
    covered = card_key.notna()
    group_mean = (
        result.loc[covered, label]
        .groupby(card_key[covered])
        .transform('mean')
    )
    result.loc[covered, label] = group_mean.round()

    # Export result
    result.to_csv('./result.csv', index=False)

    return result


In [None]:
# Check output file function
def check_output(filepath, label):
    """Print a quick summary of an exported prediction file.

    Shows the frame info, the first rows and the value counts of the
    ``label`` column, each followed by a separator line.

    Args:
        filepath: Path (or file-like object) of the CSV to inspect.
        label: Name of the column whose value counts are printed.
    """
    separator = "---" * 10

    # Read data
    data = pd.read_csv(filepath)

    # Check data.
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be embedded in an f-string (that would print a stray "None").
    data.info()
    print(separator)
    print(f'{data.head()}\n{separator}')

    # Value counts of label column
    print(f'{data[label].value_counts()}\n{separator}')


In [None]:
# Run the whole pipeline on the prepared train/test files: train, predict,
# post-process per card, and write ./raw_result.csv and ./result.csv.
export_prediction(
    f'{DATA_ROOT}/final_train_transaction.csv',
    f'{DATA_ROOT}/final_test_transaction.csv',
    'isFraud',
    'TransactionID'
)


In [None]:
# Inspect the post-processed export written to ./result.csv.
check_output(f'./result.csv', 'isFraud')