In [2]:
import pandas as pd
import random

In [3]:
# Load the raw transaction log from disk into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='raw.csv')

In [4]:
# Count how many rows fall in each 'step' value, most frequent first.
# The number of distinct steps is reported as `Length` in the output footer,
# so no separate unique() call is needed.
df['step'].value_counts()

19     51352
18     49579
187    49083
235    47491
307    46968
       ...  
432        4
706        4
693        4
112        2
662        2
Name: step, Length: 743, dtype: int64

In [5]:
# There are 743 distinct steps; dividing by 24 gives roughly 31, so the steps
# are treated as hours spanning about a month and can be turned into timestamps.
x = 743 / 24
print(x)

30.958333333333332


In [6]:
# Count the missing (null/NaN) entries in every column of the dataset.
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [7]:
# Tally rows by their 'isFraud' value (1 = confirmed fraud).
df.loc[:, 'isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

In [8]:
# Tally rows by their 'isFlaggedFraud' value (1 = flagged as fraud).
df.loc[:, 'isFlaggedFraud'].value_counts()

0    6362604
1         16
Name: isFlaggedFraud, dtype: int64

In [9]:
# Among the rows flagged as fraud, count how many are confirmed fraud.
df.loc[df['isFlaggedFraud'] == 1, 'isFraud'].value_counts()

1    16
Name: isFraud, dtype: int64

In [10]:
# Show every row that is both flagged as fraud and confirmed fraud.
df.loc[df['isFraud'].eq(1) & df['isFlaggedFraud'].eq(1)]

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2736446,212,TRANSFER,4953893.08,C728984460,4953893.08,4953893.08,C639921569,0.0,0.0,1,1
3247297,250,TRANSFER,1343002.08,C1100582606,1343002.08,1343002.08,C1147517658,0.0,0.0,1,1
3760288,279,TRANSFER,536624.41,C1035541766,536624.41,536624.41,C1100697970,0.0,0.0,1,1
5563713,387,TRANSFER,4892193.09,C908544136,4892193.09,4892193.09,C891140444,0.0,0.0,1,1
5996407,425,TRANSFER,10000000.0,C689608084,19585040.37,19585040.37,C1392803603,0.0,0.0,1,1
5996409,425,TRANSFER,9585040.37,C452586515,19585040.37,19585040.37,C1109166882,0.0,0.0,1,1
6168499,554,TRANSFER,3576297.1,C193696150,3576297.1,3576297.1,C484597480,0.0,0.0,1,1
6205439,586,TRANSFER,353874.22,C1684585475,353874.22,353874.22,C1770418982,0.0,0.0,1,1
6266413,617,TRANSFER,2542664.27,C786455622,2542664.27,2542664.27,C661958277,0.0,0.0,1,1
6281482,646,TRANSFER,10000000.0,C19004745,10399045.08,10399045.08,C1806199534,0.0,0.0,1,1


In [None]:
# Rows that are confirmed fraud but were NOT flagged as fraud.
# Both conditions are spelled out: a plain `isFraud != isFlaggedFraud`
# comparison would also return rows that are flagged but not fraud.
df[(df['isFraud'] == 1) & (df['isFlaggedFraud'] == 0)]

In [None]:
# Count the fraudulent transactions per transaction type.
df.loc[df['isFraud'] == 1, 'type'].value_counts()

In [13]:
# Calendar constants used when splitting 'step' into date parts.
days = 7
hours = 24
# NOTE(review): 'days' is step modulo 7, which is not a day count. It is only
# a scratch column (a later cell drops it), so it is left as it was.
df['days'] = df['step'] % days
# 'step' is treated as a 1-based hour counter (the first rows show step 1 —
# TODO confirm there is no step 0), so shift it to 0-based before splitting.
# Modulo keeps the hour of day within 0..23 (never -1 at multiples of 24).
df['hours'] = (df['step'] - 1) % hours
# Floor division, not rounding, so all 24 hours of a day map to the same
# day offset instead of the afternoon hours spilling onto the next date.
df['day_convert'] = (df['step'] - 1) // hours

In [14]:
# Build a real timestamp: 2022-01-01 00:00:00 plus the whole-day offset,
# plus the hour of day. pd.to_timedelta with the lowercase 'h' unit is used
# because the `unit` keyword of pd.TimedeltaIndex and the uppercase 'H'
# alias are deprecated in recent pandas releases.
df['date'] = (
    pd.to_datetime(
        df['day_convert'],
        unit='D',
        origin=pd.Timestamp('2022-01-01 00:00:00'),
    )
    + pd.to_timedelta(df['hours'], unit='h')
)

In [16]:
# Draw one random whole-second offset in [0, 360] (both ends inclusive) per
# row and store it in a scratch 'seconds' column.
# A dedicated, seeded generator makes the offsets identical on every
# Restart & Run All without touching the global `random` state.
# NOTE(review): 360 s covers only the first 6 minutes of each hourly step —
# confirm whether 3599 (a full hour) was intended.
rng = random.Random(42)
# One choices() call replaces a per-row Python append loop.
empty_list = rng.choices(range(361), k=df.shape[0])
df["seconds"] = empty_list

In [17]:
# Add the per-row second offset to the timestamp. pd.to_timedelta is used
# because the `unit` keyword of pd.TimedeltaIndex is deprecated in recent
# pandas releases.
df['date'] = df['date'] + pd.to_timedelta(df['seconds'], unit='s')
# Remove the scratch columns that were only needed to build 'date'.
df = df.drop(columns=['days', 'hours', 'seconds', 'day_convert'])

In [18]:
# Preview the first five rows, including the new 'date' column.
df.head(n=5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,date
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,2022-01-01 00:00:05
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,2022-01-01 00:00:35
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,2022-01-01 00:03:31
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,2022-01-01 00:04:10
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,2022-01-01 00:02:30


In [19]:
# Print the frame summary: index range, column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 12 columns):
 #   Column          Dtype         
---  ------          -----         
 0   step            int64         
 1   type            object        
 2   amount          float64       
 3   nameOrig        object        
 4   oldbalanceOrg   float64       
 5   newbalanceOrig  float64       
 6   nameDest        object        
 7   oldbalanceDest  float64       
 8   newbalanceDest  float64       
 9   isFraud         int64         
 10  isFlaggedFraud  int64         
 11  date            datetime64[ns]
dtypes: datetime64[ns](1), float64(5), int64(3), object(3)
memory usage: 582.5+ MB


In [20]:
# Write the processed frame to CSV, leaving out the index column.
df.to_csv(path_or_buf='PS_20174392719_1491204439457_log.csv', index=False)