In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import re
import csv
from tqdm import tqdm


## Read the Data

In [3]:
data = "train_data.csv"
df1 = pd.read_csv(data)

## Data Preprocessing

In [4]:
#Prepare card id and ip id for plotting

# 1.Card ID
card_enc = LabelEncoder()
card_enc.fit(df1['card_id'])
df1['card_id'] = card_enc.transform(df1.card_id)

# 2.IP ID
ip_enc = LabelEncoder()
ip_enc.fit(df1['ip_id'])
df1['ip_id'] = ip_enc.transform(df1.ip_id)


## Data Preprocessing

In [5]:
data = "train_data.csv"
df1 = pd.read_csv(data)

#Prepare card id and ip id for plotting

# 1.Card ID
card_enc = LabelEncoder()
card_enc.fit(df1['card_id'])
df1['card_id'] = card_enc.transform(df1.card_id)

# 2.IP ID
ip_enc = LabelEncoder()
ip_enc.fit(df1['ip_id'])
df1['ip_id'] = ip_enc.transform(df1.ip_id)

#Encoding, Cleaning the data
df1.loc[df1['cardverificationcodesupplied'].isna(),'cardverificationcodesupplied'] = False
df1.loc[df1['issuercountrycode'].isna(),'issuercountrycode'] = 'ZZ'
df1.loc[df1['shoppercountrycode'].isna(),'shoppercountrycode'] = 'ZZ'

unique_issuer_cc = df1['issuercountrycode'].unique()
unique_shopper_cc = df1['shoppercountrycode'].unique()
both = np.append(unique_issuer_cc, unique_shopper_cc)
df_countrycodes = pd.DataFrame(both)
unique_codes = df_countrycodes[0].unique()
enc = LabelEncoder()
enc.fit(unique_codes)
df1['issuercountrycode'] = enc.transform(df1.issuercountrycode)
df1['shoppercountrycode'] = enc.transform(df1.shoppercountrycode)
def conv(row):
    currency_dict = {"BGN": 1.9558, "NZD": 1.6805, "ILS": 4.0448, "RUB": 72.2099, "CAD": 1.5075, "USD": 1.1218,
                     "PHP": 58.125, "CHF": 1.1437, "ZAR": 16.0224, "AUD": 1.5911, "JPY": 124.93, "TRY": 6.6913,
                     "HKD": 8.8007, "MYR": 4.6314, "THB": 35.802, "HRK": 7.413, "NOK": 9.6678, "IDR": 15953.68,
                     "DKK": 7.4646, "CZK": 25.659, "HUF": 322.97, "GBP": 0.86248, "MXN": 21.2829, "KRW": 1308.01,
                     "ISK": 136.2, "SGD": 1.5263, "BRL": 4.405, "PLN": 4.2868, "INR": 78.0615, "RON": 4.7596,
                     "CNY": 7.5541, "SEK": 10.635}
    return row['amount'] / (currency_dict[row['currencycode']]*100)

df1['amount_eur'] = df1.apply(lambda x: conv(x), axis=1)


enc1 = LabelEncoder()
enc1.fit(df1['txvariantcode'])
df1['txvariantcode'] = enc1.transform(df1.txvariantcode)

enc2 = LabelEncoder()
enc2.fit(df1['currencycode'])
df1['currencycode'] = enc2.transform(df1.currencycode)

enc3 = LabelEncoder()
enc3.fit(df1['shopperinteraction'])
df1['shopperinteraction'] = enc3.transform(df1.shopperinteraction)

df1['accountcode'] = df1['accountcode'].apply(lambda x: re.sub('Account','',x))
df1['accountcode_cc'] = 0
df1.loc[(df1['accountcode'] == 'UK'),'accountcode_cc'] = 'GB'
df1.loc[(df1['accountcode'] == 'Mexico'),'accountcode_cc'] = 'MX'
df1.loc[(df1['accountcode'] == 'Sweden'),'accountcode_cc'] = 'SE'
df1.loc[(df1['accountcode'] == 'APAC'),'accountcode_cc'] = 'APAC'

enc4 = LabelEncoder()
enc4.fit(df1['accountcode'])
df1['accountcode'] = enc4.transform(df1.accountcode)

enc5 = LabelEncoder()
enc5.fit(df1['cardverificationcodesupplied'])
df1['cardverificationcodesupplied'] = enc5.transform(df1.cardverificationcodesupplied)

df1.loc[df1['mail_id'].str.contains('na',case=False),'mail_id'] = 'email99999'

enc6 = LabelEncoder()
enc6.fit(df1['mail_id'])
df1['mail_id'] = enc6.transform(df1.mail_id)

df1.loc[df1['cvcresponsecode'] > 2,'cvcresponsecode'] = 3

#Feature Engineering
df1['countries_equal'] = (df1['shoppercountrycode'] == df1['issuercountrycode'])
df1.loc[df1['countries_equal'] == False,'countries_equal'] = 0
df1.loc[df1['countries_equal'] == True,'countries_equal'] = 1


X = df1[['issuercountrycode','txvariantcode','bin','amount','currencycode','shoppercountrycode','shopperinteraction','cardverificationcodesupplied','cvcresponsecode','accountcode','mail_id','ip_id','card_id','amount_eur','countries_equal']]


In [6]:
print(df1.head())

   Id  issuercountrycode  txvariantcode       bin   amount  currencycode  \
0   1                 49              2  546097.0   4500.0             1   
1   2                 49              3  557349.0   3395.0             1   
2   3                 49              2  542011.0   3895.0             1   
3   4                 49              8  465943.0  15345.0             1   
4   6                 49              8  475127.0   5095.0             1   

   shoppercountrycode  shopperinteraction  cardverificationcodesupplied  \
0                  49                   1                             1   
1                  49                   1                             1   
2                  49                   1                             1   
3                  49                   0                             0   
4                  49                   1                             1   

   cvcresponsecode  accountcode  mail_id   ip_id  card_id  label  amount_eur  \
0           

In [11]:
def rank_swapping(data_original, p, num_swaps, columns):
    data = data_original.copy()
    for i, column in enumerate(columns):
        data = data.sort_values(data.columns[column])
        data = data.to_numpy()
        print('data: ', data)
        for _ in tqdm(range(num_swaps)):
            swap_index1 = np.random.randint(0, len(data))
            max_index = min(len(data), swap_index1 + p)
            min_index = max(0, swap_index1 - p)
            swap_index2 = np.random.randint(min_index, max_index)
            temp = data[swap_index1][column]
            data[swap_index1][column] = data[swap_index2][column]
            data[swap_index2][column] = temp
        data = pd.DataFrame(data, columns=data_original.columns)

    data = data.sort_values(data.columns[0])
    return data

print(df1)
ranked_frame = rank_swapping(df1, 300, 10000000, [1, 4, 5, 6, 15])
print(ranked_frame)
print(np.equal(ranked_frame, df1.to_numpy()).all())

            Id  issuercountrycode  txvariantcode       bin   amount  \
0            1                 49              2  546097.0   4500.0   
1            2                 49              3  557349.0   3395.0   
2            3                 49              2  542011.0   3895.0   
3            4                 49              8  465943.0  15345.0   
4            6                 49              8  475127.0   5095.0   
...        ...                ...            ...       ...      ...   
189623  237030                 49              2  522948.0  10900.0   
189624  237031                 49              8  475128.0   1895.0   
189625  237032                 49              8  475130.0   3195.0   
189626  237034                 49              8  475130.0   7895.0   
189627  237035                  9              2  521729.0  11050.0   

        currencycode  shoppercountrycode  shopperinteraction  \
0                  1                  49                   1   
1                  

100%|██████████| 10000000/10000000 [01:03<00:00, 158388.14it/s]


data:  [[213174 9 8 ... 0.3142480045251713 'APAC' 1]
 [11678 9 4 ... 0.6284960090503426 'APAC' 1]
 [212769 49 4 ... 1.1594471755866802 'GB' 1]
 ...
 [148555 125 2 ... 1045.6041372825575 'SE' 1]
 [79742 100 8 ... 624.8208655775294 'MX' 1]
 [50318 100 2 ... 918.9372688872287 'MX' 1]]


100%|██████████| 10000000/10000000 [01:04<00:00, 155810.72it/s]


data:  [[213174 9 8 ... 0.3142480045251713 'APAC' 1]
 [93103 9 8 ... 74.79102507699076 'APAC' 1]
 [220948 9 2 ... 74.79102507699076 'APAC' 1]
 ...
 [49784 125 8 ... 35.166901739539256 'SE' 1]
 [48417 125 8 ... 80.77103902209684 'SE' 1]
 [60668 125 8 ... 29.99529854254819 'SE' 1]]


100%|██████████| 10000000/10000000 [01:03<00:00, 158677.32it/s]


data:  [[168361 9 2 ... 66.03051664966144 'GB' 1]
 [147 49 10 ... 158.61237362025784 'GB' 0]
 [206245 9 10 ... 520.4758371208608 'GB' 1]
 ...
 [34746 49 8 ... 75.53798348947221 'GB' 0]
 [142558 107 8 ... 142.81463850044628 'APAC' 0]
 [112424 49 8 ... 70.61033299322882 'GB' 0]]


100%|██████████| 10000000/10000000 [01:02<00:00, 160071.15it/s]


data:  [[213174 9 8 ... 0.3142480045251713 'APAC' 1]
 [221904 9 8 ... 0.6284960090503426 'APAC' 1]
 [118282 9 2 ... 0.6284960090503426 'APAC' 1]
 ...
 [230503 49 3 ... 2822.0364530192005 'GB' 1]
 [115236 49 3 ... 2884.7045728596604 'GB' 1]
 [234240 49 3 ... 3465.5876078285874 'GB' 1]]


100%|██████████| 10000000/10000000 [01:05<00:00, 152415.17it/s]


            Id issuercountrycode txvariantcode       bin   amount  \
67271        1                49             2  546097.0   4400.0   
43357        2                49             3  557349.0   3400.0   
54139        3                49             2  542011.0   3600.0   
181203       4                49             8  465943.0  17395.0   
80154        6                49             8  475127.0   5000.0   
...        ...               ...           ...       ...      ...   
167255  237030                49             2  522948.0  11000.0   
10890   237031                49             8  475128.0   1800.0   
38491   237032                49             8  475130.0   3395.0   
142594  237034                49             8  475130.0   8075.0   
101571  237035                 9             2  521729.0  11200.0   

       currencycode shoppercountrycode shopperinteraction  \
67271             1                 49                  1   
43357             1                 49           

In [12]:
print(ranked_frame.to_numpy()[:, 4])
print(df1.to_numpy()[:, 4])

[4400.0 3400.0 3600.0 ... 3395.0 8075.0 11200.0]
[4500.0 3395.0 3895.0 ... 3195.0 7895.0 11050.0]
