In [6]:
#Import packages and define helper functions and objects
import datetime
import time
import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from operator import itemgetter
from itertools import groupby
import numpy as np

#Converts time string to float value
def string_to_timestamp(date_string):
    time_stamp = time.strptime(date_string, '%Y-%m-%d %H:%M:%S')
    return time.mktime(time_stamp)

#Helper objects for encoding the categorical data
(issuercountry_set, txvariantcode_set, currencycode_set, shoppercountry_set, interaction_set,
verification_set, accountcode_set, mail_id_set, ip_id_set, card_id_set) = [set() for _ in range(10)]
(issuercountry_dict, txvariantcode_dict, currencycode_dict, shoppercountry_dict, interaction_dict,
verification_dict, accountcode_dict, mail_id_dict, ip_id_dict, card_id_dict) = [{} for _ in range(10)]

In [3]:
#Reads the data from the given csv
def get_raw_data() : 
    ah = open('data_for_student_case.csv', 'r')
    data = []
    ah.readline()#skip first line
    for line_ah in ah:
        if line_ah.strip().split(',')[9]=='Refused':# remove the row with 'refused' label, since it's uncertain about fraud
            continue
        if 'na' in str(line_ah.strip().split(',')[14]).lower() or 'na' in str(line_ah.strip().split(',')[4].lower()):
            continue
        bookingdate = string_to_timestamp(line_ah.strip().split(',')[1])# date reported flaud
        issuercountry = line_ah.strip().split(',')[2]#country code
        issuercountry_set.add(issuercountry)
        txvariantcode = line_ah.strip().split(',')[3]#type of card: visa/master
        txvariantcode_set.add(txvariantcode)
        issuer_id = float(line_ah.strip().split(',')[4])#bin card issuer identifier
        amount = float(line_ah.strip().split(',')[5])#transaction amount in minor units
        currencycode = line_ah.strip().split(',')[6]
        currencycode_set.add(currencycode)
        shoppercountry = line_ah.strip().split(',')[7]#country code
        shoppercountry_set.add(shoppercountry)
        interaction = line_ah.strip().split(',')[8]#online transaction or subscription
        interaction_set.add(interaction)
        if line_ah.strip().split(',')[9] == 'Chargeback':
            label = 1#label fraud
        else:
            label = 0#label save
        verification = line_ah.strip().split(',')[10]#shopper provide CVC code or not
        verification_set.add(verification)
        cvcresponse = int(line_ah.strip().split(',')[11])#0 = Unknown, 1=Match, 2=No Match, 3-6=Not checked
        if cvcresponse > 2:
            cvcresponse = 3
        year_info = datetime.datetime.strptime(line_ah.strip().split(',')[12],'%Y-%m-%d %H:%M:%S').year
        month_info = datetime.datetime.strptime(line_ah.strip().split(',')[12],'%Y-%m-%d %H:%M:%S').month
        day_info = datetime.datetime.strptime(line_ah.strip().split(',')[12],'%Y-%m-%d %H:%M:%S').day
        creationdate = str(year_info)+'-'+str(month_info)+'-'+str(day_info)#Date of transaction 
        creationdate_stamp = string_to_timestamp(line_ah.strip().split(',')[12])#Date of transaction-time stamp
        accountcode = line_ah.strip().split(',')[13]#merchant’s webshop
        accountcode_set.add(accountcode)
        mail_id = int(float(line_ah.strip().split(',')[14].replace('email','')))#mail
        mail_id_set.add(mail_id)
        ip_id = int(float(line_ah.strip().split(',')[15].replace('ip','')))#ip
        ip_id_set.add(ip_id)
        card_id = int(float(line_ah.strip().split(',')[16].replace('card','')))#card
        card_id_set.add(card_id)
        data.append([issuercountry, txvariantcode, issuer_id, amount, currencycode,
                    shoppercountry, interaction, verification, cvcresponse, creationdate_stamp,
                     accountcode, mail_id, ip_id, card_id, label, creationdate])
    data = sorted(data, key = lambda k: k[-1])
    return data

In [12]:
#Process the raw data so one can apply ML to it 
def pre_process_data(data):
    x = []
    y = []
    
    for item in data:
        feats = item[0:-2]
        label = item[-2]
        amount_GBP = conv_curr_2_GBP (item[4], item[3])
        feats.append(amount_GBP)
        x.append(feats)
        y.append(label)
        
    x = encode_categorical_features(x)
    return (np.array(x),np.array(y))

#Encode the categorical features by mapping strings to integers 
def encode_categorical_features(x):
    for item in list(issuercountry_set):
        issuercountry_dict[item] = list(issuercountry_set).index(item)
    for item in list(txvariantcode_set):
        txvariantcode_dict[item] = list(txvariantcode_set).index(item)
    for item in list(currencycode_set):
        currencycode_dict[item] = list(currencycode_set).index(item)
    for item in list(shoppercountry_set):
        shoppercountry_dict[item] = list(shoppercountry_set).index(item)
    for item in list(interaction_set):
        interaction_dict[item] = list(interaction_set).index(item)
    for item in list(verification_set):
        verification_dict[item] = list(verification_set).index(item)
    for item in list(accountcode_set):
        accountcode_dict[item] = list(accountcode_set).index(item)
    for item in x:
        item[0] = issuercountry_dict[item[0]]
        item[1] = txvariantcode_dict[item[1]]
        item[4] = currencycode_dict[item[4]]
        item[5] = shoppercountry_dict[item[5]]
        item[6] = interaction_dict[item[6]]
        item[7] = verification_dict[item[7]]
        item[10] = accountcode_dict[item[10]]
    return x

#Convert currency to British Pounds
def conv_curr_2_GBP (currencycode, amount):
    rates = {'NZD':0.46, 'AUD':0.49, 'GBP':1, 'MXN':0.04, 'SEK':0.08}
    rate = rates[currencycode]
    return rate*amount


In [21]:
#Run the ML algorithm using cross-validation
def evaluate_classifier(x,y,clf):
    TP, FP, FN, TN = 0, 0, 0, 0
    kf = KFold(n_splits=10)
    for train_index, test_index in kf.split(x):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]
        clf.fit(x_train, y_train)
        y_predict = clf.predict(x_test)
        for i in range(len(y_predict)):
            if y_test[i]==1 and y_predict[i]==1:
                TP += 1
            if y_test[i]==0 and y_predict[i]==1:
                FP += 1
            if y_test[i]==1 and y_predict[i]==0:
                FN += 1
            if y_test[i]==0 and y_predict[i]==0:
                TN += 1
    print ('TP: '+ str(TP/10))
    print ('FP: '+ str(FP/10))
    print ('FN: '+ str(FN/10))
    print ('TN: '+ str(TN/10))



In [13]:
#Pipeline
data = get_raw_data()
(x,y) = pre_process_data(data)
clf = svm.LinearSVC()
print(currencycode_set)
#evaluate_classifier(x,y,clf)

{'NZD', 'AUD', 'GBP', 'MXN', 'SEK'}
[[9.80000e+01 2.00000e+00 5.54629e+05 ... 1.11262e+05 2.19189e+05
  7.69440e+03]
 [9.80000e+01 2.00000e+00 5.48234e+05 ... 3.96400e+04 2.58889e+05
  1.17432e+03]
 [9.80000e+01 2.00000e+00 5.47046e+05 ... 6.48110e+04 1.80723e+05
  6.79280e+03]
 ...
 [1.10000e+02 4.00000e+00 4.12497e+05 ... 1.83787e+05 1.56668e+05
  5.83200e+03]
 [1.10000e+02 6.00000e+00 5.54501e+05 ... 1.24731e+05 1.75855e+05
  1.03200e+04]
 [1.10000e+02 6.00000e+00 5.54501e+05 ... 3.32449e+05 1.06653e+05
  7.19200e+03]]


In [14]:
print(x[0])


[9.80000000e+01 2.00000000e+00 5.54629000e+05 1.92360000e+05
 3.00000000e+00 1.22000000e+02 0.00000000e+00 0.00000000e+00
 0.00000000e+00 1.44365086e+09 0.00000000e+00 1.54469000e+05
 1.11262000e+05 2.19189000e+05 7.69440000e+03]
