## CREDIT CARD FRAUD DETECTION
- highly skewed
- very few fraud transactions compared the number of legitimate transactions
- Model sensitivity is highly important (actual positive vs predicted positive)


In [1]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from sklearn.metrics import confusion_matrix, auc, roc_auc_score, recall_score, precision_score, accuracy_score, f1_score

# import the dataset
data = pd.read_csv("../data/raw/credit_card_transactions.csv")
df = pd.DataFrame(data)

print(data.describe)

<bound method NDFrame.describe of          Unnamed: 0 trans_date_trans_time               cc_num  \
0                 0   2019-01-01 00:00:18     2703186189652095   
1                 1   2019-01-01 00:00:44         630423337322   
2                 2   2019-01-01 00:00:51       38859492057661   
3                 3   2019-01-01 00:01:16     3534093764340240   
4                 4   2019-01-01 00:03:06      375534208663984   
...             ...                   ...                  ...   
1296670     1296670   2020-06-21 12:12:08       30263540414123   
1296671     1296671   2020-06-21 12:12:19     6011149206456997   
1296672     1296672   2020-06-21 12:12:32     3514865930894695   
1296673     1296673   2020-06-21 12:13:36     2720012583106919   
1296674     1296674   2020-06-21 12:13:37  4292902571056973207   

                                    merchant       category     amt  \
0                 fraud_Rippin, Kub and Mann       misc_net    4.97   
1            fraud_Heller, Gutm

In [2]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 24 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [3]:
pie_values = data.is_fraud.value_counts()
pie_labels = ['Valid transactions', 'Fraud transactions']

fig = go.Figure(data=[go.Pie(labels=pie_labels, values=pie_values)])
fig.update_layout(title="Fraud vs Valid transactions")
fig.show()

In [4]:
import datetime

print(df['trans_date_trans_time'].min())
print(df['trans_date_trans_time'].max())

2019-01-01 00:00:18
2020-06-21 12:13:37


In [5]:
# Convert transaction_date to datetime if not already
df['transaction_date'] = pd.to_datetime(df['trans_date_trans_time'])

# Extract year and month
df['Year'] = df['transaction_date'].dt.year
df['Month'] = df['transaction_date'].dt.month

# Convert transaction_date and birth_date to datetime if they are not already
df['birth_date'] = pd.to_datetime(df['dob'])

# Calculate age
current_date = pd.to_datetime('today')
df['age'] = (df['transaction_date'] - df['birth_date']).dt.days // 365

# Filter only fraud transactions
fraud_df = df[df['is_fraud'] == 1]

# Count fraud transactions per month for each year
fraud_counts = fraud_df.groupby(['Year', 'Month']).size().reset_index(name='Fraud_Count')

# Create the plot
fig = go.Figure()

# Loop through each year and add a line to the plot
for year in fraud_counts['Year'].unique():
    yearly_data = fraud_counts[fraud_counts['Year'] == year]
    fig.add_trace(go.Scatter(
        x=yearly_data['Month'],
        y=yearly_data['Fraud_Count'],
        mode='lines+markers',
        name=f'Year {year}',
        line=dict(width=2)
    ))

# Update layout for better readability
fig.update_layout(
    title='Monthly Fraud Transactions Count by Year',
    xaxis_title='Month',
    yaxis_title='Number of Fraud Transactions',
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(1, 13)),
        ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'],
        title='Month'
    ),
    yaxis=dict(
        showline=True,
        showgrid=True,
        showticklabels=True,
        linecolor='black',
        linewidth=2,
        ticks='outside',
        tickfont=dict(size=10)
    ),
    margin=dict(l=40, r=20, t=40, b=80),
    template='plotly_dark'
)

# Show the plot
fig.show()

In [6]:
# Count fraud transactions per age
fraud_counts_by_age = fraud_df.groupby('age').size().reset_index(name='Fraud_Count')

# Create the plot
fig = go.Figure()

# Add a bar plot for fraud counts by age
fig.add_trace(go.Bar(
    x=fraud_counts_by_age['age'],
    y=fraud_counts_by_age['Fraud_Count'],
    name='Fraud Transactions by Age',
    marker=dict(color='red')
))

# Update layout for better readability
fig.update_layout(
    title='Number of Fraud Transactions by Age',
    xaxis_title='Age',
    yaxis_title='Number of Fraud Transactions',
    xaxis=dict(
        showline=True,
        showgrid=True,
        showticklabels=True,
        linecolor='black',
        linewidth=2,
        ticks='outside',
        tickfont=dict(size=10)
    ),
    yaxis=dict(
        showline=True,
        showgrid=True,
        showticklabels=True,
        linecolor='black',
        linewidth=2,
        ticks='outside',
        tickfont=dict(size=10)
    ),
    margin=dict(l=40, r=20, t=40, b=80),
    template='plotly_dark'
)

# Show the plot
fig.show()

- Data clearly has no obvious trend based on the user age or transaction date
- usinf correlation matrix to better understand the data

In [7]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# Select only numeric columns for correlation
numeric_df = df.select_dtypes(include=[np.number])

# Compute the correlation matrix
corr_matrix = numeric_df.corr()

# Display the correlation matrix
print(corr_matrix)

# Create the heatmap
fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,
    x=corr_matrix.columns,
    y=corr_matrix.columns,
    colorscale='Viridis',
    colorbar=dict(title='Correlation'),
    zmin=-1, zmax=1  # Correlation values range from -1 to 1
))

# Update layout for better readability
fig.update_layout(
    title='Correlation Heatmap of Dataset',
    xaxis_title='Features',
    yaxis_title='Features',
    xaxis=dict(tickangle=-45),
    yaxis=dict(tickangle=-45),
    template='plotly_dark'
)

# Show the plot
fig.show()

               Unnamed: 0    cc_num       amt       zip       lat      long  \
Unnamed: 0       1.000000  0.000386 -0.000251  0.000709  0.000602 -0.000676   
cc_num           0.000386  1.000000  0.001769  0.041459 -0.059271 -0.048278   
amt             -0.000251  0.001769  1.000000  0.001843 -0.001926 -0.000187   
zip              0.000709  0.041459  0.001843  1.000000 -0.114290 -0.909732   
lat              0.000602 -0.059271 -0.001926 -0.114290  1.000000 -0.015533   
long            -0.000676 -0.048278 -0.000187 -0.909732 -0.015533  1.000000   
city_pop        -0.001678 -0.008991  0.005818  0.078467 -0.155730 -0.052715   
unix_time        0.998971  0.000354 -0.000293  0.000670  0.000632 -0.000642   
merch_lat        0.000541 -0.058942 -0.001873 -0.113561  0.993592 -0.015452   
merch_long      -0.000671 -0.048252 -0.000151 -0.908924 -0.015509  0.999120   
is_fraud        -0.004767 -0.000981  0.219404 -0.002162  0.001894  0.001721   
merch_zipcode    0.001427  0.056402  0.002074  0.980

- notes from correlation graph
- identifying columns to clean/remove
- zipcode/lat/long seems to have significance i assume???????!

In [8]:

from sklearn.calibration import LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.discriminant_analysis import StandardScaler
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
def textTransform(columnNames):
    for columnName in columnNames:
        df[columnName] = label_encoder.fit_transform(df[columnName])

df['amt'] = StandardScaler().fit_transform(df['amt'].values.reshape(-1,1))

textTransform(["merchant", "merch_zipcode", "job", "street", "city", "state", "gender", "category", "first", "last", "dob", 'transaction_date'])

df.drop(columns=['trans_date_trans_time', 'trans_num', 'birth_date'], inplace=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 26 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1296675 non-null  int64  
 1   cc_num            1296675 non-null  int64  
 2   merchant          1296675 non-null  int64  
 3   category          1296675 non-null  int64  
 4   amt               1296675 non-null  float64
 5   first             1296675 non-null  int64  
 6   last              1296675 non-null  int64  
 7   gender            1296675 non-null  int64  
 8   street            1296675 non-null  int64  
 9   city              1296675 non-null  int64  
 10  state             1296675 non-null  int64  
 11  zip               1296675 non-null  int64  
 12  lat               1296675 non-null  float64
 13  long              1296675 non-null  float64
 14  city_pop          1296675 non-null  int64  
 15  job               1296675 non-null  int64  
 16  

In [9]:
def split_data(df, drop_list):
  df = df.drop(drop_list, axis=1)
  print(df.columns)

  from sklearn.model_selection import train_test_split
  y = df['is_fraud'].values
  X = df.drop(['is_fraud'], axis=1).values

  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

  print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
  return X_train, X_test, y_train, y_test

In [10]:
from sklearn.ensemble import RandomForestClassifier


def get_predictions(clf, X_train, y_train, X_test, y_test):
  # create classfier
  clf = clf
  # fit
  clf.fit(pd.DataFrame(X_train), pd.DataFrame(y_train))
  # predict
  y_pred = clf.predict(X_test)
  y_pred_prob = clf.predict_proba(X_test)
  train_pred = clf.predict(X_train)
  print('Training set confuction matrix: \n', confusion_matrix(y_train, train_pred))
  return y_pred, y_pred_prob

In [11]:
def print_scores(y_test, y_pred, y_pred_prob):
  print('test-set confusion matrix:\n', confusion_matrix(y_test, y_pred))
  print('recall score:\n', recall_score(y_test, y_pred))
  print('precision score:\n', precision_score(y_test, y_pred))
  print('f1 score:\n', f1_score(y_test, y_pred))
  print('accuracy score:\n', accuracy_score(y_test, y_pred))
  print('ROC AUC: {}'.format(roc_auc_score(y_test, y_pred_prob[:,1])))

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 26 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   Unnamed: 0        1296675 non-null  int64  
 1   cc_num            1296675 non-null  int64  
 2   merchant          1296675 non-null  int64  
 3   category          1296675 non-null  int64  
 4   amt               1296675 non-null  float64
 5   first             1296675 non-null  int64  
 6   last              1296675 non-null  int64  
 7   gender            1296675 non-null  int64  
 8   street            1296675 non-null  int64  
 9   city              1296675 non-null  int64  
 10  state             1296675 non-null  int64  
 11  zip               1296675 non-null  int64  
 12  lat               1296675 non-null  float64
 13  long              1296675 non-null  float64
 14  city_pop          1296675 non-null  int64  
 15  job               1296675 non-null  int64  
 16  

In [13]:
drop_list = []
X_train, X_test, y_train, y_test = split_data(df, drop_list)
y_pred, y_pred_prob = get_predictions(RandomForestClassifier(n_estimators=50), X_train, y_train, X_test, y_test)
print_scores(y_test, y_pred, y_pred_prob)

Index(['Unnamed: 0', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last',
       'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop',
       'job', 'dob', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'merch_zipcode', 'transaction_date', 'Year', 'Month', 'age'],
      dtype='object')
(1037340, 25) (259335, 25) (1037340,) (259335,)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples,), for example using ravel().



Training set confuction matrix: 
 [[1031335       0]
 [      7    5998]]
test-set confusion matrix:
 [[257748     86]
 [   458   1043]]
recall score:
 0.6948700866089274
precision score:
 0.9238263950398583
f1 score:
 0.7931558935361217
accuracy score:
 0.9979023271058669
ROC AUC: 0.987228120999429


In [14]:
from sklearn.tree import DecisionTreeClassifier

drop_list = []
X_train, X_test, y_train, y_test = split_data(df, drop_list)
y_pred, y_pred_prob = get_predictions(DecisionTreeClassifier(), X_train, y_train, X_test, y_test)
print_scores(y_test, y_pred, y_pred_prob)

Index(['Unnamed: 0', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last',
       'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop',
       'job', 'dob', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'merch_zipcode', 'transaction_date', 'Year', 'Month', 'age'],
      dtype='object')
(1037340, 25) (259335, 25) (1037340,) (259335,)
Training set confuction matrix: 
 [[1031335       0]
 [      0    6005]]
test-set confusion matrix:
 [[257390    444]
 [   419   1082]]
recall score:
 0.7208527648234511
precision score:
 0.709043250327654
f1 score:
 0.7148992401717873
accuracy score:
 0.9966722578903735
ROC AUC: 0.8595653633064096


In [15]:
from sklearn.ensemble import GradientBoostingClassifier

drop_list = []
X_train, X_test, y_train, y_test = split_data(df, drop_list)
y_pred, y_pred_prob = get_predictions(GradientBoostingClassifier(), X_train, y_train, X_test, y_test)
print_scores(y_test, y_pred, y_pred_prob)

Index(['Unnamed: 0', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last',
       'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop',
       'job', 'dob', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud',
       'merch_zipcode', 'transaction_date', 'Year', 'Month', 'age'],
      dtype='object')
(1037340, 25) (259335, 25) (1037340,) (259335,)



A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



Training set confuction matrix: 
 [[1030654     681]
 [   2314    3691]]
test-set confusion matrix:
 [[257651    183]
 [   586    915]]
recall score:
 0.6095936042638241
precision score:
 0.8333333333333334
f1 score:
 0.7041169680646402
accuracy score:
 0.9970347234272273
ROC AUC: 0.9767013535406791
