### Pre-processing Data

Before Running this notebook, navigate to https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud.
Download the data file and upload it into this directory as well as the folder titled 'raw'

In [1]:
import pandas as pd
from sklearn.impute import KNNImputer
from datetime import timedelta
import numpy as np

**Removing Na Values**

In [2]:
raw = pd.read_csv('creditcard.csv')
raw.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
print(raw.isnull().sum())
print(raw.isnull().sum().sum())

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64
0


In [4]:
raw.isnull().sum(axis=1)

0         0
1         0
2         0
3         0
4         0
         ..
284802    0
284803    0
284804    0
284805    0
284806    0
Length: 284807, dtype: int64

In [5]:
dropna = pd.DataFrame(raw.drop([259431]), columns = raw.columns)

In [6]:
#dropna is raw data w/o row containing Nas
dropna.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
#when I first imported dataset, row 259431 contained 19 Nan values. Not sure why this changed upon a rerun, but I have droped it just to be sure 

**Feature Engineering: Create Columns for Transaction Time**

In [8]:
first = dropna['Time'][0]
print(first)
last = dropna['Time'][259430]
print(last)

0.0
159112.0


In [9]:
#arbitrary start date 
df = dropna.copy()
start_date = pd.to_datetime("2023-01-01")

df['TransactionTime'] = start_date + pd.to_timedelta(dropna['Time'], unit='s')
df['HourOfDay'] = df['TransactionTime'].dt.hour
df['MinuteOfHour'] = df['TransactionTime'].dt.minute
df['TransactionTime'] = df['TransactionTime'].dt.time


In [10]:
#ordering columns
df = df[['Time', 'TransactionTime', 'HourOfDay', 'MinuteOfHour'] + [f'V{i}' for i in range(1, 29)] + ['Amount', 'Class']]


In [11]:
df.head()

Unnamed: 0,Time,TransactionTime,HourOfDay,MinuteOfHour,V1,V2,V3,V4,V5,V6,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,00:00:00,0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,00:00:00,0,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,00:00:01,0,0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,00:00:01,0,0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,00:00:02,0,0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


**Checking For Highly Correlated Features**

In [12]:
corr_matrix = dropna.corr().abs()
corr_matrix

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
Time,1.0,0.1173978,0.01059564,0.4196193,0.1052578,0.1730704,0.06301445,0.0847117,0.03694951,0.008661015,...,0.04473773,0.1440638,0.05114142,0.01617781,0.2330801,0.04140874,0.005137379,0.009414623,0.010595,0.012322
V1,0.117398,1.0,3.911733e-07,3.08252e-08,4.780445e-07,3.969157e-07,3.869835e-07,5.159552e-07,8.247681e-09,9.510919e-08,...,3.167514e-07,7.003373e-07,1.9102e-07,7.107723e-07,6.417753e-07,2.572706e-07,4.757618e-07,3.268673e-07,0.227709,0.101347
V2,0.010596,3.911733e-07,1.0,6.343124e-08,9.837066e-07,8.16762e-07,7.963238e-07,1.061718e-06,1.697185e-08,1.95713e-07,...,6.518022e-07,1.441135e-06,3.930757e-07,1.462607e-06,1.320627e-06,5.294042e-07,9.790094e-07,6.726184e-07,0.531409,0.091289
V3,0.419619,3.08252e-08,6.343124e-08,1.0,7.751796e-08,6.43624e-08,6.275184e-08,8.366542e-08,1.337415e-09,1.542256e-08,...,5.136326e-08,1.135642e-07,3.097512e-08,1.152563e-07,1.04068e-07,4.171806e-08,7.714781e-08,5.300362e-08,0.210881,0.192961
V4,0.105258,4.780445e-07,9.837066e-07,7.751796e-08,1.0,9.981473e-07,9.731703e-07,1.297503e-06,2.074093e-08,2.391767e-07,...,7.965536e-07,1.76118e-06,4.803694e-07,1.787421e-06,1.61391e-06,6.469736e-07,1.196426e-06,8.219926e-07,0.098731,0.133447
V5,0.17307,3.969157e-07,8.16762e-07,6.43624e-08,9.981473e-07,1.0,8.080138e-07,1.077304e-06,1.722099e-08,1.985861e-07,...,6.613706e-07,1.46229e-06,3.98846e-07,1.484078e-06,1.340014e-06,5.371758e-07,9.933812e-07,6.824924e-07,0.386356,0.094974
V6,0.063014,3.869835e-07,7.963238e-07,6.275184e-08,9.731703e-07,8.080138e-07,1.0,1.050346e-06,1.679007e-08,1.936168e-07,...,6.448209e-07,1.425699e-06,3.888655e-07,1.446942e-06,1.306482e-06,5.237339e-07,9.685234e-07,6.654141e-07,0.215981,0.043643
V7,0.084712,5.159552e-07,1.061718e-06,8.366542e-08,1.297503e-06,1.077304e-06,1.050346e-06,1.0,2.238577e-08,2.581443e-07,...,8.597232e-07,1.900848e-06,5.184645e-07,1.92917e-06,1.7419e-06,6.98281e-07,1.291308e-06,8.871797e-07,0.397312,0.187257
V8,0.03695,8.247681e-09,1.697185e-08,1.337415e-09,2.074093e-08,1.722099e-08,1.679007e-08,2.238577e-08,1.0,4.126506e-09,...,1.37429e-08,3.038556e-08,8.287792e-09,3.08383e-08,2.784473e-08,1.116221e-08,2.064189e-08,1.41818e-08,0.103079,0.019875
V9,0.008661,9.510919e-08,1.95713e-07,1.542256e-08,2.391767e-07,1.985861e-07,1.936168e-07,2.581443e-07,4.126506e-09,1.0,...,1.584781e-07,3.503949e-07,9.557174e-08,3.556158e-07,3.21095e-07,1.287184e-07,2.380346e-07,1.635393e-07,0.044245,0.097733


In [13]:
correlation_threshold = 0.85
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper.columns if any(upper[column] > correlation_threshold)]
print(to_drop)

[]


In [14]:
lower = corr_matrix.where(np.tril(np.ones(corr_matrix.shape), k=-1).astype(bool))
to_drop = [column for column in lower.columns if any(lower[column] > correlation_threshold)]
print(to_drop)

[]


There exists no highly correlated features

**Importing New Data to CSV**

In [None]:
df.to_csv('processed.csv', index=False)