In [7]:
import io
import nbformat
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification
from sklearn.neighbors import NearestNeighbors
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from tqdm import tqdm
import re
import csv

In [8]:
data = "train_data.csv"
df1 = pd.read_csv(data)

In [9]:
data = "train_data.csv"
df1 = pd.read_csv(data)

#Prepare card id and ip id for plotting

# 1.Card ID
card_enc = LabelEncoder()
card_enc.fit(df1['card_id'])
df1['card_id'] = card_enc.transform(df1.card_id)

# 2.IP ID
ip_enc = LabelEncoder()
ip_enc.fit(df1['ip_id'])
df1['ip_id'] = ip_enc.transform(df1.ip_id)

#Encoding, Cleaning the data
df1.loc[df1['cardverificationcodesupplied'].isna(),'cardverificationcodesupplied'] = False
df1.loc[df1['issuercountrycode'].isna(),'issuercountrycode'] = 'ZZ'
df1.loc[df1['shoppercountrycode'].isna(),'shoppercountrycode'] = 'ZZ'

unique_issuer_cc = df1['issuercountrycode'].unique()
unique_shopper_cc = df1['shoppercountrycode'].unique()
both = np.append(unique_issuer_cc, unique_shopper_cc)
df_countrycodes = pd.DataFrame(both)
unique_codes = df_countrycodes[0].unique()
enc = LabelEncoder()
enc.fit(unique_codes)
df1['issuercountrycode'] = enc.transform(df1.issuercountrycode)
df1['shoppercountrycode'] = enc.transform(df1.shoppercountrycode)
def conv(row):
    currency_dict = {"BGN": 1.9558, "NZD": 1.6805, "ILS": 4.0448, "RUB": 72.2099, "CAD": 1.5075, "USD": 1.1218,
                     "PHP": 58.125, "CHF": 1.1437, "ZAR": 16.0224, "AUD": 1.5911, "JPY": 124.93, "TRY": 6.6913,
                     "HKD": 8.8007, "MYR": 4.6314, "THB": 35.802, "HRK": 7.413, "NOK": 9.6678, "IDR": 15953.68,
                     "DKK": 7.4646, "CZK": 25.659, "HUF": 322.97, "GBP": 0.86248, "MXN": 21.2829, "KRW": 1308.01,
                     "ISK": 136.2, "SGD": 1.5263, "BRL": 4.405, "PLN": 4.2868, "INR": 78.0615, "RON": 4.7596,
                     "CNY": 7.5541, "SEK": 10.635}
    return row['amount'] / (currency_dict[row['currencycode']]*100)

df1['amount_eur'] = df1.apply(lambda x: conv(x), axis=1)


enc1 = LabelEncoder()
enc1.fit(df1['txvariantcode'])
df1['txvariantcode'] = enc1.transform(df1.txvariantcode)

enc2 = LabelEncoder()
enc2.fit(df1['currencycode'])
df1['currencycode'] = enc2.transform(df1.currencycode)

enc3 = LabelEncoder()
enc3.fit(df1['shopperinteraction'])
df1['shopperinteraction'] = enc3.transform(df1.shopperinteraction)

df1['accountcode'] = df1['accountcode'].apply(lambda x: re.sub('Account','',x))
df1['accountcode_cc'] = 0
df1.loc[(df1['accountcode'] == 'UK'),'accountcode_cc'] = 'GB'
df1.loc[(df1['accountcode'] == 'Mexico'),'accountcode_cc'] = 'MX'
df1.loc[(df1['accountcode'] == 'Sweden'),'accountcode_cc'] = 'SE'
df1.loc[(df1['accountcode'] == 'APAC'),'accountcode_cc'] = 'APAC'

lb = OneHotEncoder()
df1['accountcode_cc'] = lb.fit_transform(df1['accountcode_cc'])

enc4 = LabelEncoder()
enc4.fit(df1['accountcode'])
df1['accountcode'] = enc4.transform(df1.accountcode)

enc5 = LabelEncoder()
enc5.fit(df1['cardverificationcodesupplied'])
df1['cardverificationcodesupplied'] = enc5.transform(df1.cardverificationcodesupplied)

df1.loc[df1['mail_id'].str.contains('na',case=False),'mail_id'] = 'email99999'

enc6 = LabelEncoder()
enc6.fit(df1['mail_id'])
df1['mail_id'] = enc6.transform(df1.mail_id)

df1.loc[df1['cvcresponsecode'] > 2,'cvcresponsecode'] = 3

#Feature Engineering
df1['countries_equal'] = (df1['shoppercountrycode'] == df1['issuercountrycode'])
df1.loc[df1['countries_equal'] == False,'countries_equal'] = 0
df1.loc[df1['countries_equal'] == True,'countries_equal'] = 1


X = df1[['issuercountrycode','txvariantcode','bin','amount','currencycode','shoppercountrycode','shopperinteraction','cardverificationcodesupplied','cvcresponsecode','accountcode','mail_id','ip_id','card_id','amount_eur','countries_equal']]


In [10]:
# classify majority and minority class
minority_class = df1[df1['label'] == 1]
majority_class = df1[df1['label'] == 0]

In [11]:
# calculate the difference between the two classes
nsamples = len(majority_class) - len(minority_class)

# get samples from the minority class
samples = minority_class.sample(n=nsamples, replace=True)

print(nsamples)
print(samples)

189076
            Id  issuercountrycode  txvariantcode       bin    amount  \
115952  144945                100              2  554629.0  109900.0   
75366    94040                107              6  499977.0   12850.0   
93817   117255                  9              2  552033.0   47500.0   
110908  138644                  9              2  552033.0   45300.0   
4188      5240                  9             10  455701.0   16000.0   
...        ...                ...            ...       ...       ...   
178896  223695                100             10  477214.0  119940.0   
182849  228586                  9              9  407220.0   54000.0   
178896  223695                100             10  477214.0  119940.0   
112623  140759                 49              8  476367.0   12700.0   
43175    53865                100              6  418080.0   83900.0   

        currencycode  shoppercountrycode  shopperinteraction  \
115952             2                 100                   1   


In [12]:
# find k nearest neighbors of each sample in the minority class
neigh = NearestNeighbors(n_neighbors=5)
nbrs = neigh.fit(minority_class)
distances, indices = nbrs.kneighbors(samples)

# create synthetic samples
synthetic_samples = []
for i in tqdm(range(len(samples))):
    nn = indices[i]
    nn_index = np.random.choice(nn)
    diff = minority_class.iloc[nn_index] - samples.iloc[i]
    r = np.random.rand()
    synthetic_sample = samples.iloc[i] + r * diff
    synthetic_samples.append(synthetic_sample)
    
# convert the synthetic samples to a dataframe
synthetic_samples = pd.DataFrame(synthetic_samples, columns=df1.columns)

# add the synthetic samples to the minority class
oversampled = pd.concat([minority_class, synthetic_samples])

In [21]:
oversampled.head()

Unnamed: 0,Id,issuercountrycode,txvariantcode,bin,amount,currencycode,shoppercountrycode,shopperinteraction,cardverificationcodesupplied,cvcresponsecode,accountcode,mail_id,ip_id,card_id,label,amount_eur,accountcode_cc,countries_equal
827,1031.0,9.0,2.0,552033.0,48000.0,0.0,9.0,1.0,0.0,0.0,0.0,131337.0,7864.0,70797.0,1.0,301.678084,1.0,1
1132,1399.0,9.0,2.0,537196.0,42000.0,0.0,9.0,1.0,0.0,0.0,0.0,27797.0,16851.0,26078.0,1.0,263.968324,1.0,1
1632,2022.0,9.0,2.0,532655.0,3850.0,0.0,9.0,1.0,0.0,0.0,0.0,112493.0,77607.0,60410.0,1.0,24.197096,1.0,1
3219,4032.0,100.0,2.0,520416.0,129900.0,2.0,100.0,1.0,0.0,0.0,1.0,127811.0,158579.0,171761.0,1.0,61.034915,0.0,1
3370,4217.0,107.0,8.0,483741.0,25200.0,3.0,107.0,1.0,0.0,0.0,0.0,94589.0,24404.0,157140.0,1.0,149.95537,1.0,1


In [None]:
# split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df1['label'], test_size=0.2, random_state=42)

# scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# train a logistic regression model on the oversampled data
clf = LogisticRegression()
clf.fit(X_train, y_train)
