In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

import os

In [None]:
# Training split: every transaction row is kept; identity columns are attached
# where a row with the same TransactionID exists (left join on the index).
train_transaction = pd.read_csv('../input/train_transaction.csv', index_col='TransactionID')
train_identity = pd.read_csv('../input/train_identity.csv', index_col='TransactionID')
train = pd.merge(train_transaction, train_identity,
                 how='left', left_index=True, right_index=True)
del train_transaction, train_identity  # only the merged frame is used from here on

# Test split: same left join on TransactionID.
test_transaction = pd.read_csv('../input/test_transaction.csv', index_col='TransactionID')
test_identity = pd.read_csv('../input/test_identity.csv', index_col='TransactionID')
test = pd.merge(test_transaction, test_identity,
                how='left', left_index=True, right_index=True)
del test_transaction, test_identity

sample_submission = pd.read_csv('../input/sample_submission.csv', index_col='TransactionID')

# Counting 'duplicate' transactions

It's been noted [publicly](https://www.kaggle.com/c/ieee-fraud-detection/discussion/105261#latest-605153) that there are some duplicate transactions in the datasets. In this kernel I show how you can make a feature which counts the number of duplicate transactions in a given time window.

**Notes:**
- This is not a fast implementation; a vectorized approach using pandas methods would be better. Please share if you are able to speed up this implementation.
- I haven't tested this feature yet. Still struggling with a CV mechanism to use! Please share if you find it improves your CV.
- You can tune the window_size.

In [None]:
def make_duplicate_feature(df, cols_to_match, window_size=3):
    """
    Counts, for every transaction, how many OTHER transactions have the
    same values in `cols_to_match` and lie inside a temporal window of
    width 2*window_size centred on that transaction (both ends inclusive).

    The input frame is not modified and does not need to be sorted by
    time. Missing values are treated as equal to each other when rows
    are compared.

    Parameters:
    -----------
    df : pd.DataFrame,
        The dataset. Must contain a 'TransactionDT' column, which is
        compared against the window after converting `window_size`
        to seconds.

    cols_to_match : list,
        Columns which are required to be equal to be considered a match.

    window_size : (float, int)
        Controls the size of the window (in minutes) to search for
        duplicates.

    Returns:
    --------
    duplicate_counts : list,
        List of length df.shape[0], in the row order of `df`, which
        counts the duplicate transactions (0 means no duplicate).

    Raises:
    -------
    ValueError
        If `cols_to_match` is empty.
    """
    cols_to_match = list(cols_to_match)
    if len(cols_to_match) == 0:
        raise ValueError('cols_to_match must contain at least one column.')

    n_rows = df.shape[0]
    if n_rows == 0:
        return []

    half_width = window_size * 60  # convert to seconds
    times = df['TransactionDT'].values

    # One integer code per distinct value in each column. pd.factorize gives
    # every missing value the code -1, so rows are compared exactly (no hash
    # collisions) and NaN matches NaN regardless of the Python version.
    codes = np.column_stack(
        [pd.factorize(df[col])[0] for col in cols_to_match]
    )

    # Bring identical keys next to each other, ordered by time inside each
    # key. np.lexsort uses its LAST key as the primary one, so the time is
    # passed first to make it the least significant key.
    order = np.lexsort((times,) + tuple(codes.T[::-1]))
    sorted_codes = codes[order]
    sorted_times = times[order]

    # Positions where the key differs from the previous row start a new group.
    key_changes = np.any(sorted_codes[1:] != sorted_codes[:-1], axis=1)
    starts = np.concatenate(([0], np.flatnonzero(key_changes) + 1))
    stops = np.concatenate((starts[1:], [n_rows]))

    counts = np.zeros(n_rows, dtype=np.int64)
    has_candidates = (stops - starts) > 1  # singleton groups keep a count of 0
    for start, stop in zip(starts[has_candidates], stops[has_candidates]):
        group_times = sorted_times[start:stop]
        # Index range of the rows whose time lies in [t - w, t + w].
        first = np.searchsorted(group_times, group_times - half_width, side='left')
        last = np.searchsorted(group_times, group_times + half_width, side='right')
        # -1 removes the transaction itself; write back in original row order.
        counts[order[start:stop]] = last - first - 1

    # Rolling-window duplicate idea originally discussed with WeNYoBen on SO:
    # https://stackoverflow.com/questions/57101482/counting-duplicate-row-within-a-rolling-window-of-a-pandas-df
    return counts.tolist()

In [None]:
# Columns that must all be equal for two transactions to count as duplicates.
cols = [
    'TransactionAmt', 'ProductCD',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
]

# Add the feature to both splits (test first, then train).
for frame in (test, train):
    frame['duplicate_count'] = make_duplicate_feature(frame, cols_to_match=cols)
del frame  # do not leave the loop variable behind in the notebook namespace

## Quick analysis of the new feature.

In [None]:
# First few training rows that have at least one duplicate in their window.
train.loc[train['duplicate_count'].gt(0)].head()

#### Percentage of transactions which have duplicates

A significant number of transactions have duplicates. There is a large difference between train and test sets.

In [None]:
# For each split, print the share of rows whose duplicate_count is positive.
for df, name in ((train, 'train'), (test, 'test')):
    # Rows with a duplicate divided by all rows (a fraction between 0 and 1).
    num_duplicates = int((df['duplicate_count'] > 0).sum()) / len(df)
    print(f'{num_duplicates*100:.2f} % transactions have duplicates in the {name} set.')

#### How fraudulent are duplicate transactions?

Transactions with duplicates appear more likely to be fraudulent. Some binning of this feature may help.

In [None]:
# Fraud rate for each duplicate_count value. 'isFraud' is selected BEFORE
# aggregating: averaging the whole frame does needless work and, in
# pandas >= 2.0, raises TypeError if the frame holds any non-numeric column.
train.groupby('duplicate_count')['isFraud'].mean()

In [None]:
# It's important to look at the count: small counts are unreliable.
# Only 'isFraud' is counted instead of counting every column and
# discarding all but one.
train.groupby('duplicate_count')['isFraud'].count()

##### And a graph of this table to help with visualisation

In [None]:
# Fraud rate against duplicate_count. The column is selected before the
# mean so the aggregation touches only 'isFraud' (a whole-frame mean fails
# in pandas >= 2.0 if the frame holds any non-numeric column).
plt.plot(train.groupby('duplicate_count')['isFraud'].mean(), color='k')
plt.ylabel('Fraction fraudulent')
plt.xlabel('Duplicate count')
plt.xlim(0, 40);  # trailing ';' keeps the (0.0, 40.0) repr out of the cell output