In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import datatable as dt

In [2]:
# Directory holding the competition CSVs; edit this single line to relocate the data.
DATA_DIR = "/home/rmodi/Documents/CSE519/hw2/dataset"

test_transaction = dt.fread(DATA_DIR + "/test_transaction.csv").to_pandas()
test_identity = dt.fread(DATA_DIR + "/test_identity.csv").to_pandas()

# Left join: every transaction row is kept and identity columns are attached
# where a matching TransactionID exists. Identity rows without a transaction
# would carry no TransactionDT/TransactionAmt and could not be scored.
testset = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

# Release the two source frames; only the merged frame is used from here on.
del test_transaction, test_identity

# Keep only the columns used downstream. .copy() makes the result independent
# of the merged frame, so the column assignments below act on a real frame
# rather than on a slice (no SettingWithCopyWarning).
testset = testset[[
    "TransactionID", "DeviceType", "DeviceInfo", "TransactionDT", "TransactionAmt",
    "ProductCD", "card4", "card6", "P_emaildomain", "R_emaildomain",
    "addr1", "addr2", "dist1", "dist2",
]].copy()

# TransactionDT is treated as a count of elapsed seconds. Split it into
# day / hour / minute / second with vectorised integer arithmetic: this avoids
# a Python-level call per row and the rounding error that repeated
# "fractional part times N" float steps can introduce before truncation.
SECONDS_PER_MINUTE = 60
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 86400

elapsed_seconds = testset['TransactionDT'].astype('int64')
testset['TransactionDT_day'] = elapsed_seconds // SECONDS_PER_DAY
testset['TransactionDT_hour'] = elapsed_seconds % SECONDS_PER_DAY // SECONDS_PER_HOUR
testset['TransactionDT_min'] = elapsed_seconds % SECONDS_PER_HOUR // SECONDS_PER_MINUTE
testset['TransactionDT_sec'] = elapsed_seconds % SECONDS_PER_MINUTE

In [18]:
# Work on a copy of testset so the preprocessing below can be re-run from a
# clean state by re-executing this cell (testset itself is left untouched).
dataset = testset.copy()

In [19]:
# In these categorical columns, turn empty strings into the literal 'unknown'.
# Only '' is replaced; other values are left as they are.
cols = ['card4', 'card6', 'DeviceType', 'DeviceInfo', 'P_emaildomain', 'R_emaildomain']
dataset[cols] = dataset[cols].replace(to_replace='', value='unknown')

# Reduce DeviceInfo to its five most frequent values; every other entry
# is relabelled 'Others'.
top5_deviceinfo = set(dataset['DeviceInfo'].value_counts().head(5).index)
is_top5_device = dataset['DeviceInfo'].isin(top5_deviceinfo)
dataset['DeviceInfo'] = dataset['DeviceInfo'].where(is_top5_device, 'Others')

# Merge e-mail domains that share a provider prefix into one category [4]:
# any value starting with one of these names is rewritten to "<name>.com".
regex_patterns = {
    r'^frontier.*$': 'frontier.com',
    r'^gmail.*$': 'gmail.com',
    r'^hotmail.*$': 'hotmail.com',
    r'^live.*$': 'live.com',
    r'^netzero.*$': 'netzero.com',
    r'^outlook.*$': 'outlook.com',
    r'^yahoo.*$': 'yahoo.com'
}

# The same pattern table is applied to the purchaser and recipient columns.
replacements = dict.fromkeys(['P_emaildomain', 'R_emaildomain'], regex_patterns)

# Column-scoped regex replacement; other columns are not touched.
dataset = dataset.replace(replacements, regex=True)

# Cyclic encoding of the time of day: (hour + minute/60) is the fractional
# hour, and pi/12 maps a 24-hour day onto a full 2*pi turn, so the sine/cosine
# pair places times just before and just after midnight next to each other.
fractional_hour = dataset['TransactionDT_hour'] + dataset['TransactionDT_min']/60.0
time_of_day_angle = fractional_hour*(np.pi/12.0)
dataset['hr_sin'] = np.sin(time_of_day_angle)
dataset['hr_cos'] = np.cos(time_of_day_angle)

# Natural logarithm of the transaction amount.
dataset['TransactionAmt_lg'] = np.log(dataset['TransactionAmt'])

from sklearn.preprocessing import RobustScaler

# Scale TransactionAmt with a RobustScaler.
# NOTE(review): fit_transform learns the scaling statistics from `dataset`
# itself (a copy of testset) -- confirm this matches how the features the
# model was trained on were scaled.
rob_scaler = RobustScaler()
# Double brackets give the (n_rows, 1) 2-D array the scaler is fed.
amount_column = dataset[['TransactionAmt']].values
dataset['TransactionAmt_scaled'] = rob_scaler.fit_transform(amount_column)

# One-hot encode the categorical features and append the indicator columns
# to the working frame.
categorical_cols = ['DeviceType', 'ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain']

# dtype is pinned to uint8 instead of relying on the library default: newer
# pandas releases default to bool indicators, and a frame mixing float and
# bool columns yields an object-dtype array from .values, which is not a
# usable numeric feature matrix.
ohe = pd.get_dummies(dataset[categorical_cols], dtype=np.uint8)

# Remove the indicator for the 'scranton.edu' purchaser domain.
# NOTE(review): presumably this category does not exist in the features the
# model was trained on -- verify against the training notebook. drop() raises
# KeyError if the column is absent from this data.
ohe = ohe.drop(columns='P_emaildomain_scranton.edu')
dataset = dataset.join(ohe)

# Model input columns: scaled amount, cyclic hour encoding, then every
# one-hot indicator in the order get_dummies produced them.
train_cols = ["TransactionAmt_scaled","hr_sin","hr_cos"] + list(ohe.columns)

In [30]:
import joblib

# Load the persisted classifier. joblib.load unpickles the file, so it should
# only ever be pointed at model files from a trusted source.
clf = joblib.load('models/rs_smote_xgb.pkl')

# Build the feature matrix once and reuse it for both calls.
test_features = dataset[train_cols].values
test_preds_proba = clf.predict_proba(test_features)  # per-class probabilities
test_preds = clf.predict(test_features)              # predicted labels

In [32]:
# Show how many rows were assigned to each predicted label.
pd.Series(test_preds, name=0).value_counts()

False    376794
True     129897
Name: 0, dtype: int64

In [33]:
# Submission frame: one row per TransactionID, with the second column of
# predict_proba's output as the isFraud score.
fraud_scores = test_preds_proba[:, 1]
output = pd.DataFrame({
    'TransactionID': testset['TransactionID'],
    'isFraud': fraud_scores,
})

In [34]:
# Write the submission file; index=False keeps the DataFrame index out of the
# CSV so it contains only the TransactionID and isFraud columns.
output.to_csv('dataset/submission_6.csv', index=False)