In [6]:
import numpy as np
import pandas as pd
import sklearn
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import dill
import gzip
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*"
# and newer releases no longer accept the old names, so use whichever name
# this installation actually provides.
plt.style.use('seaborn' if 'seaborn' in plt.style.available else 'seaborn-v0_8')

In [12]:
# Read the labelled transactions and keep everything except the final 100 rows.
# NOTE(review): the notebook does not say why the last 100 rows are excluded
# (held-out sample?) — confirm and document the reason.
train = pd.read_csv('training.csv').iloc[:-100]
train.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [13]:
# Read the unlabelled transactions (same columns as `train`, minus FraudResult).
# NOTE(review): `test` is not referenced by any later cell visible in this
# notebook — confirm whether it was meant to be scored.
test = pd.read_csv('test.csv')
test.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy
0,TransactionId_50600,BatchId_35028,AccountId_2441,SubscriptionId_4426,CustomerId_2857,UGX,256,ProviderId_5,ProductId_3,airtime,ChannelId_3,1000.0,1000,2019-02-13T10:01:40Z,4
1,TransactionId_95109,BatchId_45139,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_5,ProductId_15,financial_services,ChannelId_3,2000.0,2000,2019-02-13T10:02:12Z,2
2,TransactionId_47357,BatchId_74887,AccountId_4841,SubscriptionId_3829,CustomerId_2857,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-50.0,50,2019-02-13T10:02:30Z,2
3,TransactionId_28185,BatchId_11025,AccountId_2685,SubscriptionId_4626,CustomerId_3105,UGX,256,ProviderId_5,ProductId_10,airtime,ChannelId_3,3000.0,3000,2019-02-13T10:02:38Z,4
4,TransactionId_22140,BatchId_29804,AccountId_4841,SubscriptionId_3829,CustomerId_3105,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-60.0,60,2019-02-13T10:02:58Z,2


In [14]:
from sklearn.preprocessing import LabelEncoder

def preprocessdata(data):
    """Turn a raw transactions frame into model-ready features.

    The input frame is never modified: every step works on a new frame, so
    the function can safely be called more than once on the same input.

    Steps, in order:
      1. Drop fully duplicated rows, keeping the first occurrence.
      2. Parse ``TransactionStartTime`` and derive the ``hour``, ``minute``
         and ``day`` (``dt.dayofweek``, Monday = 0) columns, then drop the
         timestamp column itself.
      3. Label-encode every remaining ``object`` column except
         ``TransactionId`` and ``FraudResult``; the name of each encoded
         column is printed.
      4. Replace ``Value`` and ``Amount`` by their absolute values.
      5. Split off ``FraudResult`` and ``TransactionId``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``TransactionStartTime``, ``Value`` and ``Amount``.
        ``FraudResult`` and ``TransactionId`` are optional.

    Returns
    -------
    tuple
        ``(features, target, transaction_id)`` where ``target`` is the
        ``FraudResult`` column (``None`` if absent) and ``transaction_id`` is
        the ``TransactionId`` column (``None`` if absent).
    """
    # Non-inplace de-duplication returns a new frame and leaves `data` alone.
    frame = data.drop_duplicates(keep="first")

    start_time = frame["TransactionStartTime"]
    if not pd.api.types.is_datetime64_any_dtype(start_time):
        # Strip the ISO-8601 "T" separator and "Z" suffix so the values are
        # parsed as naive timestamps, then convert.
        start_time = (
            start_time.str.replace("T", " ", regex=False)
            .str.replace("Z", "", regex=False)
        )
        start_time = pd.to_datetime(start_time)

    # `assign` builds a new frame; the derived columns are appended in this
    # order (hour, minute, day) before the raw timestamp column is removed.
    frame = frame.assign(
        hour=start_time.dt.hour,
        minute=start_time.dt.minute,
        day=start_time.dt.dayofweek,
    ).drop(["TransactionStartTime"], axis=1)

    # Encode the categorical (object) columns as integer codes.
    # NOTE(review): a fresh LabelEncoder is fitted on every call, so the codes
    # produced for one frame (e.g. train) are not guaranteed to match those
    # produced for another (e.g. test) — confirm this is acceptable.
    for col in frame.columns:
        if frame[col].dtype == 'object' and col not in ['TransactionId', 'FraudResult']:
            print(col)
            lbl = LabelEncoder()
            frame[col] = lbl.fit_transform(list(frame[col].values.astype('str')))

    # Use magnitudes only: the sign of Amount/Value is discarded.
    frame["Value"] = frame["Value"].abs()
    frame["Amount"] = frame["Amount"].abs()

    # Separate the target (present in the training set only) from the features.
    target = None
    if "FraudResult" in frame.columns:
        target = frame["FraudResult"]
        frame = frame.drop(["FraudResult"], axis=1)

    # Separate the identifier; default to None so the return never hits an
    # unbound name when the column is missing.
    transaction_id = None
    if "TransactionId" in frame.columns:
        transaction_id = frame["TransactionId"]
        frame = frame.drop(["TransactionId"], axis=1)

    return frame, target, transaction_id

    

In [15]:
# Preprocess the training frame. The three results are, in order:
#   dt  - the feature frame,
#   de  - the FraudResult target column,
#   tid - the TransactionId column.
# The column names printed below are the ones that were label-encoded.
dt, de, tid = preprocessdata(train)
dt.head()

BatchId
AccountId
SubscriptionId
CustomerId
CurrencyCode
ProviderId
ProductId
ProductCategory
ChannelId


Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,PricingStrategy,hour,minute,day
0,46927,2489,3534,2583,0,256,5,1,0,2,1000.0,1000,2,2,18,3
1,31724,3218,2366,2583,0,256,3,19,2,1,20.0,20,2,2,19,3
2,60208,2712,996,2805,0,256,5,0,0,2,500.0,500,2,2,44,3
3,1796,3350,974,3732,0,256,0,11,8,2,20000.0,21800,2,3,32,3
4,48886,3218,2366,3732,0,256,3,19,2,1,644.0,644,2,3,34,3


In [16]:
# Sanity check: the transaction ids split off by preprocessdata.
tid.head()

0    TransactionId_76871
1    TransactionId_73770
2    TransactionId_26203
3      TransactionId_380
4    TransactionId_28195
Name: TransactionId, dtype: object

In [7]:
# Load the previously trained model from a gzip-compressed dill file.
# SECURITY: dill.load (like pickle.load) can execute arbitrary code while
# deserialising — only load model files that come from a trusted source.
with gzip.open("fraud_model.dill.gz", 'rb') as f:
    model = dill.load(f)

In [18]:

# Score the preprocessed features with the loaded model and pair every
# prediction with its transaction id.
# NOTE(review): `dt` comes from the training frame, so these are predictions
# on the training data, not on `test` — confirm that is intended.
print("Predicting...")
predictions = model.predict(dt)
submission = pd.DataFrame({'TransactionId': tid, 'FraudResult': predictions})

# Tally of rows predicted as fraud (True) versus not (False).
submission['FraudResult'].eq(1).value_counts()

Predicting...


False    95376
True       186
Name: FraudResult, dtype: int64

In [26]:
# Number of rows where the prediction equals the true label.
# Compare against `de` (the target returned by preprocessdata): it shares its
# index with `tid`/`dt` by construction, whereas train['FraudResult'] only
# lines up if `train` holds exactly the same rows as the preprocessed frame.
submission[submission['FraudResult'] == de].count()

TransactionId    95549
FraudResult      95549
dtype: int64

In [27]:
# Number of rows where the prediction differs from the true label.
# Compare against `de` (the target returned by preprocessdata): it shares its
# index with `tid`/`dt` by construction, whereas train['FraudResult'] only
# lines up if `train` holds exactly the same rows as the preprocessed frame.
submission[submission['FraudResult'] != de].count()

TransactionId    13
FraudResult      13
dtype: int64