In [1]:
#importing libraries
import pandas as pd

In [2]:
# Load the raw transactions; the timestamp column is parsed to datetime at read
# time because the later feature cells rely on its .dt accessor.
raw_path = '../data/training.csv'
df = pd.read_csv(raw_path, parse_dates=['TransactionStartTime'])
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0


## Feature engineering

In [3]:
# Direction flag: 1 (debit) when Amount is strictly positive, 0 (credit) otherwise.
amount = df["Amount"]
df["IsDebit"] = amount.gt(0).astype(int)

# Mismatch flag: 0 when abs(Amount) equals Value, 1 when it does not.
# The flag keeps the dtype of the subtraction result (float for a float Amount).
gap = amount.abs() - df["Value"]
df["difference"] = gap.ne(0).astype(gap.dtype)

# The raw Amount column is not used after this point in the notebook.
df = df.drop(columns="Amount")

In [4]:
# Derive calendar features from the transaction timestamp, then discard the raw column.
timestamp = df["TransactionStartTime"].dt

# Day of week as an integer (pandas convention: Monday = 0 ... Sunday = 6).
df["weekday"] = timestamp.dayofweek

# Seconds elapsed since midnight, in whatever timezone the parsed timestamps carry.
df["time_of_day"] = timestamp.hour * 3600 + timestamp.minute * 60 + timestamp.second

df = df.drop(columns="TransactionStartTime")

In [5]:
# Preview the first rows to check the newly engineered columns.
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Value,PricingStrategy,FraudResult,IsDebit,difference,weekday,time_of_day
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000,2,0,1,0.0,3,8329
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,20,2,0,0,0.0,3,8348
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500,2,0,1,0.0,3,9861
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,21800,2,0,1,1.0,3,12775
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,644,2,0,0,0.0,3,12861


In [6]:
# Summarize column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   TransactionId    95662 non-null  object 
 1   BatchId          95662 non-null  object 
 2   AccountId        95662 non-null  object 
 3   SubscriptionId   95662 non-null  object 
 4   CustomerId       95662 non-null  object 
 5   CurrencyCode     95662 non-null  object 
 6   CountryCode      95662 non-null  int64  
 7   ProviderId       95662 non-null  object 
 8   ProductId        95662 non-null  object 
 9   ProductCategory  95662 non-null  object 
 10  ChannelId        95662 non-null  object 
 11  Value            95662 non-null  int64  
 12  PricingStrategy  95662 non-null  int64  
 13  FraudResult      95662 non-null  int64  
 14  IsDebit          95662 non-null  int64  
 15  difference       95662 non-null  float64
 16  weekday          95662 non-null  int32  
 17  time_of_day 

In [7]:
# Treat the integer-coded PricingStrategy and weekday as categorical labels.
# NOTE: a CSV file carries no dtype information, so this cast only affects the
# in-memory frame; anything reading the exported file has to re-apply it.
df = df.astype({"PricingStrategy": "object", "weekday": "object"})

# Keep only the modelling columns: categoricals first, then the binary flags,
# the numeric features, and the target last.
column_order = [
    "ProviderId",
    "ProductCategory",
    "ChannelId",
    "PricingStrategy",
    "weekday",
    "difference",
    "IsDebit",
    "Value",
    "time_of_day",
    "FraudResult",
]
df = df[column_order]


In [8]:
# Preview the reduced, reordered frame before it is written out.
df.head()

Unnamed: 0,ProviderId,ProductCategory,ChannelId,PricingStrategy,weekday,difference,IsDebit,Value,time_of_day,FraudResult
0,ProviderId_6,airtime,ChannelId_3,2,3,0.0,1,1000,8329,0
1,ProviderId_4,financial_services,ChannelId_2,2,3,0.0,0,20,8348,0
2,ProviderId_6,airtime,ChannelId_3,2,3,0.0,1,500,9861,0
3,ProviderId_1,utility_bill,ChannelId_3,2,3,1.0,1,21800,12775,0
4,ProviderId_4,financial_services,ChannelId_2,2,3,0.0,0,644,12861,0


In [9]:
# Persist the engineered frame next to the raw data; the RangeIndex is not written.
output_path = '../data/training_new.csv'
df.to_csv(output_path, index=False)