# **Transaction Anomaly Detection using Customer Profiling**

In [57]:
from IPython.display import clear_output
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
# pd.set_option("display.float_format","{:.2f}".format)
pio.templates["mod"] = go.layout.Template(layout=dict(font=dict(family="Fira Code")))
pio.templates.default = "plotly_dark+mod"
from scipy import stats
import statsmodels.api as sm
from sklearn.model_selection import StratifiedKFold,GridSearchCV,StratifiedShuffleSplit
from sklearn.metrics import confusion_matrix,roc_auc_score,f1_score
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,LabelEncoder,OneHotEncoder,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,AdaBoostClassifier,VotingClassifier
from xgboost import XGBClassifier
from zipfile import ZipFile
from glob import glob
import sys
import shutil
import warnings
warnings.filterwarnings(action="ignore")
!pip install scipy==1.9.3
!pip install numpy==1.25.1
clear_output()

In [None]:
# Recursively collect every CSV file below the dataset's Data directory.
# NOTE(review): the path is specific to one machine, and glob() makes no
# promise about ordering even though preprocess() selects files by their
# position in this list - confirm the order printed below before relying on it.
all_files = glob('C:/Users/Ameya Dabholkar/Downloads/fraud-detection-dataset/Data/**/*.csv',recursive=True)
all_files

In [None]:
def preprocess():
    """Read the ten source tables, join them and return one merged frame.

    Every table is loaded from a fixed position in the module-level
    ``all_files`` list and is also bound to a module-level global of the same
    name. The tables are then merged one after another (pandas' default join)
    on ``CustomerID``, ``TransactionID`` and finally ``MerchantID``; name,
    address, location, login and ID columns are dropped and ``Timestamp`` is
    parsed to datetime.

    Returns:
        pd.DataFrame: the merged table with the dropped columns removed.
    """
    global account_activity,amount_data,anomaly_scores,customer_data,fraud_indicators,merchant_data,suspicious_activity,transaction_category_labels,transaction_metadata,transaction_records

    # Positions refer to the order in which glob() returned the files.
    account_activity = pd.read_csv(all_files[0])
    amount_data = pd.read_csv(all_files[6])
    anomaly_scores = pd.read_csv(all_files[7])
    customer_data = pd.read_csv(all_files[1])
    fraud_indicators = pd.read_csv(all_files[9])
    merchant_data = pd.read_csv(all_files[5])
    suspicious_activity = pd.read_csv(all_files[8])
    transaction_category_labels = pd.read_csv(all_files[4])
    transaction_metadata = pd.read_csv(all_files[2])
    transaction_records = pd.read_csv(all_files[3])

    # (table, join key) pairs, applied in this exact order.
    join_plan = (
        (customer_data,"CustomerID"),
        (transaction_records,"CustomerID"),
        (suspicious_activity,"CustomerID"),
        (transaction_metadata,"TransactionID"),
        (amount_data,"TransactionID"),
        (fraud_indicators,"TransactionID"),
        (anomaly_scores,"TransactionID"),
        (transaction_category_labels,"TransactionID"),
        (merchant_data,"MerchantID"),
    )
    merged = account_activity
    for table,key in join_plan:
        merged = pd.merge(left=merged,right=table,on=key)

    unused_columns = ['Name','Address','MerchantName','Location','LastLogin','TransactionID','MerchantID','CustomerID']
    merged = merged.drop(unused_columns,axis=1)
    merged['Timestamp'] = pd.to_datetime(merged['Timestamp'])
    return merged

The LastLogin column has been omitted because it clashes with the Timestamp column:<br>
some LastLogin dates pre-date the corresponding Timestamp values, which should be statistically impossible.

In [None]:
# Build the merged transaction-level frame that the analysis below starts from.
df = preprocess()

<font size=4>

|Column Name|Type of column|Column Description|
|------|-----|-----|
|Customer ID|Categorical|Value counts will tell us how many transactions they have done|
|Account Balance|Continuous|The Amount of money left in their bank account|
|Age|Continuous|Age of the customer at the time of the transaction|
|Transaction ID|Categorical|Unique IDs given to transactions|
|Transaction Amount|Continuous|Amount of the transaction that was carried out|
|Suspicious Flag|Categorical|Suspicious flag: 0 for not suspicious, 1 for suspicious|
|Timestamp|Continuous|Time at which transaction has been carried out|
|Merchant ID|Categorical|Unique ID of the merchant at which the transaction was carried out|
|Amount|Continuous|Transaction Amount of the fraudulent activity|
|Fraud Indicator|Categorical|Whether the transaction has been flagged as fraud or not|
|Anomaly Score|Continuous|The score given for its potential fraud|
|Category|Categorical|Category in which the transaction was made|

</font>

In [None]:
# Hourly grid spanning the earliest to the latest transaction timestamp;
# only its first and last entries are shown here.
time_index = pd.date_range(start=df["Timestamp"].min(),end=df["Timestamp"].max(),freq="H")
display(time_index[0])
time_index[-1]

In [None]:
def _window_flag(index,start_time,end_time):
    """Return a float array that is 1.0 where the time of day of ``index``
    lies between ``start_time`` and ``end_time`` (both ends included) and
    0.0 everywhere else."""
    flag = np.zeros(len(index))
    flag[index.indexer_between_time(start_time,end_time)] = 1.0
    return flag

# EDA frame: indexed and sorted by transaction time. set_index() returns a
# new frame, so df itself is left untouched.
temp = df.set_index('Timestamp').sort_index()
temp["weekday"] = temp.index.day_name()
temp["Hour"] = temp.index.strftime("%H")
# The flags are written directly as 0/1 instead of NaN followed by a
# frame-wide fillna(0), which would also zero-fill genuine missing values in
# every other column. It also avoids np.NaN, an alias removed in NumPy 2.0.
temp["Working"] = _window_flag(temp.index,"9:00:00","17:00:00")
temp["Day"] = _window_flag(temp.index,"6:00:00","18:00:00")
temp.head()

### Account Balance

In [None]:
# Histogram of account balances (violin marginal) with the sample mean drawn
# as a dashed vertical line.
balance_mean = temp.AccountBalance.mean()
balance_mean_label = dict(text=f"mean = {balance_mean:.2f}",y=0.25,font=dict(color="#ffffff",size=20),align="center")
px.histogram(temp,x="AccountBalance",marginal="violin").add_vline(
    x=balance_mean,
    line=dict(dash="dash",color="#202ff5"),
    annotation=balance_mean_label,
)

In [None]:
# Shapiro-Wilk normality test on AccountBalance.
stats.shapiro(temp.AccountBalance)

In [None]:
# statsmodels draws the Q-Q plot with matplotlib; only its two line artists
# are kept, the matplotlib figure is closed, and the same data is redrawn
# with plotly.
data = sm.qqplot(temp.AccountBalance,line="s").gca().lines
plt.close()
sample_points,reference_line = data[0],data[1]
fig = go.Figure(data=[
    go.Scatter(x=sample_points.get_xdata(),y=sample_points.get_ydata(),mode="markers",name="Obtained<br>Quantiles"),
    go.Scatter(x=reference_line.get_xdata(),y=reference_line.get_ydata(),mode="lines",name="Expected<br>Quantiles"),
])
fig.update_layout(width=700)

#### Calculating goodness of fit with Different Distributions

In [None]:
# For each candidate distribution: fit its parameters to AccountBalance, then
# print the Kolmogorov-Smirnov p-value of the sample against that fit.
balance_values = temp.AccountBalance.to_numpy()
for dist_name in ("norm","lognorm","uniform","expon","weibull_min","gamma"):
    fitted_params = getattr(stats,dist_name).fit(temp.AccountBalance)
    print(dist_name,stats.kstest(balance_values,dist_name,args=fitted_params)[1])

### Age

In [None]:
# Histogram of customer ages (violin marginal) with the sample mean drawn as
# a dashed vertical line.
age_mean = temp.Age.mean()
age_mean_label = dict(text=f"mean = {age_mean:.2f}",y=0.25,font=dict(color="#ffffff",size=20),align="center")
px.histogram(temp,x="Age",marginal="violin").add_vline(
    x=age_mean,
    line=dict(dash="dash",color="#202ff5"),
    annotation=age_mean_label,
)

In [None]:
# Shapiro-Wilk normality test on Age.
stats.shapiro(temp.Age)

In [None]:
# Second normality check on Age using scipy's normaltest.
stats.normaltest(temp.Age.to_numpy())

In [None]:
# statsmodels draws the Q-Q plot with matplotlib; only its two line artists
# are kept, the matplotlib figure is closed, and the same data is redrawn
# with plotly.
data = sm.qqplot(temp.Age,line="s").gca().lines
plt.close()
sample_points,reference_line = data[0],data[1]
fig = go.Figure(data=[
    go.Scatter(x=sample_points.get_xdata(),y=sample_points.get_ydata(),mode="markers",name="Obtained<br>Quantiles"),
    go.Scatter(x=reference_line.get_xdata(),y=reference_line.get_ydata(),mode="lines",name="Expected<br>Quantiles"),
])
fig.update_layout(width=700)

#### Calculating goodness of fit with different Distributions

In [None]:
# For each candidate distribution: fit its parameters to Age, then print the
# Kolmogorov-Smirnov p-value of the sample against that fit.
age_values = temp.Age.to_numpy()
for dist_name in ("norm","lognorm","uniform","expon","weibull_min","gamma"):
    fitted_params = getattr(stats,dist_name).fit(temp.Age)
    print(dist_name,stats.kstest(age_values,dist_name,args=fitted_params)[1])

### Amount

In [None]:
# Histogram of Amount (violin marginal) with the sample mean drawn as a
# dashed vertical line.
amount_mean = temp.Amount.mean()
px.histogram(temp,x="Amount",marginal="violin").add_vline(
    x=amount_mean,
    line=dict(color="#202fff",dash='dash'),
    annotation=dict(text=f"mean : {amount_mean:.2f}",y=0.5),
)

In [None]:
# statsmodels draws the Q-Q plot with matplotlib; only its two line artists
# are kept, the matplotlib figure is closed, and the same data is redrawn
# with plotly.
data = sm.qqplot(temp.Amount.to_numpy(),line="s").gca().lines
plt.close()
fig = go.Figure()
# Legend labels were misspelt ("Observred" / "Expeceted"); corrected here.
fig.add_trace(go.Scatter(x=data[0].get_xdata(),y=data[0].get_ydata(),mode="markers",name="Observed<br>Quantiles"))
fig.add_trace(go.Scatter(x=data[1].get_xdata(),y=data[1].get_ydata(),mode="lines",name="Expected<br>Quantiles"))
fig.update_layout(width=700)
fig.show()

#### Testing goodness of fit with different distributions

In [None]:
# For each candidate distribution: fit its parameters to Amount, then print
# the Kolmogorov-Smirnov p-value of the sample against that fit.
amount_values = temp.Amount.to_numpy()
for dist_name in ("norm","lognorm","uniform","expon","weibull_min","gamma"):
    fitted_params = getattr(stats,dist_name).fit(temp.Amount)
    print(dist_name,stats.kstest(amount_values,dist_name,args=fitted_params)[1])

### Transaction Amount

In [None]:
# Histogram of TransactionAmount (violin marginal) with the sample mean drawn
# as a dashed vertical line.
transaction_mean = temp.TransactionAmount.mean()
px.histogram(temp,x="TransactionAmount",marginal="violin").add_vline(
    x=transaction_mean,
    line=dict(color="#202fff",dash='dash'),
    annotation=dict(text=f"mean : {transaction_mean:.2f}",y=0.5),
)

In [None]:
# Shapiro-Wilk normality test on TransactionAmount.
stats.shapiro(temp.TransactionAmount)

In [None]:
# Second normality check on TransactionAmount using scipy's normaltest.
stats.normaltest(temp.TransactionAmount)

In [None]:
# For each candidate distribution: fit its parameters to TransactionAmount,
# then print the Kolmogorov-Smirnov p-value of the sample against that fit.
for dist_name in ("norm","expon","weibull_min","uniform","lognorm","gamma"):
    fitted_params = getattr(stats,dist_name).fit(temp.TransactionAmount)
    ks_result = stats.kstest(temp.TransactionAmount,dist_name,args=fitted_params)
    print(dist_name,ks_result[1])

## Categorical Variables

In [None]:
# Quick look at the first two rows before moving on to the categorical columns.
temp.head(2)

In [None]:
# Count of transactions per SuspiciousFlag value; flagged rows are drawn in red.
flag_palette = {0:"#636efa",1:"red"}
px.histogram(
    temp,
    x="SuspiciousFlag",
    histfunc="count",
    title="Suspicious Flag",
    color="SuspiciousFlag",
    color_discrete_map=flag_palette,
)

In [None]:
# z-test on the SuspiciousFlag class counts, each taken against the total
# number of rows.
# NOTE(review): passing both class counts with a single nobs looks like a
# two-sample comparison of the two class proportions - confirm that this is
# the hypothesis intended.
sm.stats.proportions_ztest(count=temp.SuspiciousFlag.value_counts().to_numpy(),nobs=temp.shape[0])

In [None]:
# Count of transactions per FraudIndicator value; fraudulent rows are drawn in red.
fraud_palette = {0:"#636efa",1:"red"}
px.histogram(
    temp,
    x="FraudIndicator",
    histfunc="count",
    title="Fraud Indicator",
    color="FraudIndicator",
    color_discrete_map=fraud_palette,
)

In [None]:
# z-test on the FraudIndicator class counts, each taken against the total
# number of rows.
# NOTE(review): passing both class counts with a single nobs looks like a
# two-sample comparison of the two class proportions - confirm that this is
# the hypothesis intended.
sm.stats.proportions_ztest(temp.FraudIndicator.value_counts(),nobs=temp.shape[0])

In [None]:
# Number of transactions in each category.
temp.Category.value_counts()

In [None]:
# Category frequencies shown twice, side by side: bar chart on the left, pie
# chart on the right, both using the same colour sequence.
category_counts = temp.Category.value_counts()
palette = px.colors.qualitative.Plotly
fig = make_subplots(cols=2,specs=[[{"type":"xy"},{"type":"domain"}]])
category_bar = go.Bar(x=category_counts.index,y=category_counts,marker=dict(color=palette))
category_pie = go.Pie(
    values=category_counts.to_numpy(),
    labels=category_counts.index,
    textinfo="label+percent+value",
    marker=dict(colors=palette),
    sort=False,
)
fig.add_trace(category_bar,row=1,col=1)
fig.add_trace(category_pie,row=1,col=2)
fig.update_layout(showlegend=False,title="Category")
fig.update_yaxes(title="count")
fig.update_xaxes(title="category")
fig.show()

In [None]:
# Chi-square goodness-of-fit test of the category counts against equal
# frequencies (stats.chisquare assumes equally likely categories when no
# expected frequencies are given). The earlier chi2_contingency call treated
# a hand-built "expected" row as a second observed sample, hard-coded a total
# of 1000 rows and truncated the expected counts to integers.
category_counts = temp.Category.value_counts()
print("p-value  :",stats.chisquare(category_counts).pvalue)

In [None]:
# Transactions per day of the week; the legend is hidden because the x axis
# already names each bar.
weekday_hist = px.histogram(temp,x="weekday",color="weekday",title="Weekday")
weekday_hist.update_layout(showlegend=False)

In [None]:
# Chi-square goodness-of-fit test of the weekday counts against equal
# frequencies (stats.chisquare assumes equally likely categories when no
# expected frequencies are given). The earlier chi2_contingency call treated
# a hand-built "expected" row as a second observed sample, hard-coded a total
# of 1000 rows and truncated the expected counts to integers.
weekday_counts = temp.weekday.value_counts()
print("p-value  :",stats.chisquare(weekday_counts).pvalue)

In [None]:
# Transactions per hour of the day; the legend is hidden because the x axis
# already names each bar.
hour_hist = px.histogram(temp,x="Hour",color="Hour")
hour_hist.update_layout(showlegend=False)

In [None]:
# Chi-square goodness-of-fit test of the hourly counts against equal
# frequencies (stats.chisquare assumes equally likely categories when no
# expected frequencies are given). The earlier chi2_contingency call treated
# a hand-built "expected" row as a second observed sample, hard-coded a total
# of 1000 rows and truncated the expected counts to integers.
hour_counts = temp.Hour.value_counts()
print("p-value  :",stats.chisquare(hour_counts).pvalue)

### Between AccountBalance and TransactionAmount

In [None]:
# Continuous columns compared pairwise in the cells that follow.
cont_cols = [
    "TransactionAmount",
    "Amount",
    "AccountBalance",
    "AnomalyScore",
    "Age",
]

In [None]:
# Pairwise correlation of the continuous columns. The diagonal is set to zero
# so the colour scale reflects only the off-diagonal relationships.
# mask() returns a new frame; the earlier np.fill_diagonal(corr_arr.to_numpy(), 0)
# only worked while to_numpy() happened to hand back a writable view of the
# frame's own data, which pandas does not guarantee.
corr_arr = temp[cont_cols].corr()
corr_arr = corr_arr.mask(np.eye(len(corr_arr),dtype=bool),0.0)
fig = px.imshow(corr_arr,text_auto=".2f",color_continuous_scale="GnBu_r",title="Correlation between all the continuous Columns")
fig.add_annotation(text="same columns values are made zero<br>to understand <br>the relative dependence",x=1.05,y=0,xanchor="left",yanchor="bottom",xref="x domain",yref="y domain",showarrow=False)
fig.show()

In [None]:
# Empty square table that will hold one KS-test p-value per pair of columns.
# It is created as float so the p-values are stored numerically rather than
# in an object-dtype frame.
ksamp_df = pd.DataFrame(columns=cont_cols,index=cont_cols,dtype=float)

In [None]:
# Min-max scale every continuous column once, then fill ksamp_df with the
# two-sample Kolmogorov-Smirnov p-value for each (row, column) pair.
scaled = {
    name: ((temp[name]-temp[name].min())/(temp[name].max()-temp[name].min())).to_numpy()
    for name in cont_cols
}
for col in cont_cols:
    for ind in cont_cols:
        ksamp_df.loc[ind,col] = stats.ks_2samp(scaled[ind],scaled[col]).pvalue

In [None]:
# Zero the diagonal (each column compared with itself) and make sure the
# p-values are stored as floats. mask() returns a new frame; the earlier
# np.fill_diagonal(ksamp_df.to_numpy(), 0) only worked while to_numpy()
# happened to hand back a writable view of the frame's own data.
ksamp_df = ksamp_df.astype(float).mask(np.eye(len(ksamp_df),dtype=bool),0.0)

In [None]:
# Heat map of the pairwise KS p-values with two explanatory notes placed to
# the right of the plot area.
side_note = dict(x=1.05,xanchor="left",yanchor="bottom",xref="x domain",yref="y domain",showarrow=False)
fig = px.imshow(img=ksamp_df,text_auto=".2f",color_continuous_scale="GnBu_r",title="Kolmogorov Smirnov Test")
fig.add_annotation(
    text="same columns values are made zero<br>to understand <br>the relative dependence",
    y=0,
    **side_note,
)
fig.add_annotation(
    text="The Values are P-Values where<br>higher the value more the probability that<br>they came from same distribution<br>values greater than 0.05(significance) suggest<br>that they are indeed from same distribution",
    y=0.5,
    font=dict(size=10),
    **side_note,
)
fig.update_yaxes(showgrid=False)
fig.update_xaxes(showgrid=False)
fig.update_layout(margin=dict(l=0))

### Contingency test between FraudIndicator and the rest of the categorical columns

In [None]:
# List the available columns before choosing which ones to cross-tabulate.
temp.columns

In [None]:
def givemat(col):
    """Cross-tabulate FraudIndicator (rows) against ``temp[col]`` (columns).

    Reads the module-level ``temp`` frame.

    Args:
        col: name of the column of ``temp`` to tabulate against FraudIndicator.

    Returns:
        pd.DataFrame: table of joint counts.
    """
    fraud_flags = temp.FraudIndicator
    other_values = temp[col]
    return pd.crosstab(index=fraud_flags,columns=other_values)

In [None]:
# One p-value per column: McNemar's test for the two binary flags, chi-square
# test of independence for the multi-level columns.
# NOTE(review): McNemar's test compares the two marginal proportions of a
# paired 2x2 table rather than testing independence - confirm that this is
# the question intended for Working and Day.
binary_flags = ("Working","Day")
for column in ("Working","Hour","Day","Category"):
    table = givemat(column).to_numpy()
    if column in binary_flags:
        pvalue = sm.stats.mcnemar(table).pvalue
    else:
        pvalue = stats.chi2_contingency(table)[1]
    print(f'''p-value for {column} :{pvalue}''')

As we can see, Working and Day show a dependency with FraudIndicator; they are treated as paired observations here because both flags are derived from the time of day of the same transaction.

In [None]:
# One scatter trace per (FraudIndicator, Working) combination; marker size
# encodes AnomalyScore. Non-fraudulent traces start hidden ("legendonly") and
# are drawn semi-transparent without an outline.
# The legend hours now say 9 AM, matching the 09:00-17:00 window used to build
# the Working flag (they previously said 8 AM).
trace_specs = [
    # (FraudIndicator, Working, legend label, marker colour)
    (1,0,"Fraudulent at \n<br>Non-Working Hours\n<br>5 PM to 9 AM","blue"),
    (1,1,"Fraudulent at \n<br>Working Hours\n<br>9 AM to 5 PM","red"),
    (0,0,"Non-Fraudulent at \n<br>Non-Working Hours\n<br>5 PM to 9 AM","blue"),
    (0,1,"Non-Fraudulent at \n<br>Working Hours\n<br>9 AM to 5 PM","red"),
]
fig = go.Figure()
for fraud_flag,working_flag,label,colour in trace_specs:
    # Select the rows once per trace instead of re-running the same query
    # for x, y and marker size.
    subset = temp[(temp.FraudIndicator == fraud_flag) & (temp.Working == working_flag)]
    marker = dict(size=subset.AnomalyScore,sizeref=0.03,sizemin=4,color=colour)
    trace_options = {}
    if fraud_flag == 1:
        marker["line_color"] = "white"
    else:
        marker["opacity"] = 0.3
        trace_options["visible"] = "legendonly"
    fig.add_trace(go.Scatter(
        x=subset.index,
        y=subset.Amount,
        marker=marker,
        mode="markers",
        name=label,
        **trace_options
    ))
fig.add_annotation(text="*Toggle normal to see <br>the non-fraudulent transactions<br> that took place",x=1.25,y=-0.15,xanchor="right",yanchor="bottom",showarrow=False,xref="x domain",yref="y domain")
fig.add_annotation(text="*size indicates AnomalyScore",x=1.25,y=0,xanchor="right",yanchor="bottom",showarrow=False,xref="x domain",yref="y domain")
fig.update_layout(title=dict(text="Fraudulent Activity by Time from January 1,2022 to February 11,2022"),legend=dict(itemsizing="constant"))
fig.update_xaxes(title=dict(text="Time"))
fig.update_yaxes(title=dict(text="Amount"))
fig.show()

In [None]:
# Share of fraudulent transactions that fall outside the Working window.
temp.query("FraudIndicator == 1 and Working == 0").shape[0]/temp.query("FraudIndicator == 1").shape[0]

In [None]:
# Share of fraudulent transactions that fall inside the Working window.
temp.query("FraudIndicator == 1 and Working == 1").shape[0]/temp.query("FraudIndicator == 1").shape[0]

# Modelling

In [None]:
def week(x):
    """Map a day of the month to its seven-day block.

    Days 1-7 give "week 1", 8-14 "week 2", 15-21 "week 3", 22-28 "week 4";
    anything that matches none of these bounds gives "week 5".
    """
    upper_bounds = ((7,"week 1"),(14,"week 2"),(21,"week 3"),(28,"week 4"))
    for last_day,label in upper_bounds:
        if x <= last_day:
            return label
    return "week 5"

def Dayofmonth(x):
    """Return "first half" for days up to and including the 15th, otherwise "second half"."""
    return "first half" if x <= 15 else "second half"
    
def weekend(x):
    """Return "weekend" when the day-of-week number is greater than 4, otherwise "weekday"."""
    return "weekend" if x > 4 else "weekday"

In [None]:
class AppendTimeColumns(BaseEstimator,TransformerMixin):
    """Add time-of-day and calendar features derived from each row's timestamp.

    Timestamps are read from a ``Timestamp`` column when one is present,
    otherwise from an existing ``DatetimeIndex``. The transformer learns
    nothing, never modifies the frame it is given, and keeps the incoming row
    order, so a target vector split off beforehand stays aligned with the
    rows it returns.

    Columns added, in this order: ``Working`` (1.0 between 09:00 and 17:00),
    ``Day`` (1.0 between 06:00 and 18:00), ``Hour`` (two-digit string),
    ``Day_of_month`` (float32), ``week_of_month``, ``weekend``,
    ``Period_of_month`` and ``dayofweek`` (day name).
    """

    @staticmethod
    def _timestamps(X:pd.DataFrame) -> pd.DatetimeIndex:
        """Return the timestamps of X as a DatetimeIndex named "Timestamp".

        Raises:
            ValueError: if X has neither a ``Timestamp`` column nor a
                ``DatetimeIndex``.
        """
        if "Timestamp" in X.columns:
            return pd.DatetimeIndex(pd.to_datetime(X["Timestamp"]),name="Timestamp")
        if isinstance(X.index,pd.DatetimeIndex):
            return X.index
        raise ValueError("AppendTimeColumns needs a 'Timestamp' column or a DatetimeIndex")

    @staticmethod
    def _window_flag(stamps:pd.DatetimeIndex,start_time:str,end_time:str) -> np.ndarray:
        """Return 1.0 where the time of day lies in [start_time, end_time], else 0.0."""
        flag = np.zeros(len(stamps))
        flag[stamps.indexer_between_time(start_time,end_time)] = 1.0
        return flag

    def fit(self,X:pd.DataFrame,y=None):
        """Check that X carries timestamps; nothing is learned and X is not modified.

        Returns:
            self
        """
        self._timestamps(X)
        return self

    def transform(self,X:pd.DataFrame,y=None):
        """Return a copy of X indexed by timestamp with the derived columns added.

        Row order is preserved. The column index of the result is also stored
        on ``self.columns``.
        """
        stamps = self._timestamps(X)
        # drop() returns a new frame, so the caller's X is never touched.
        out = X.drop(columns=["Timestamp"],errors="ignore")
        out.index = stamps
        out["Working"] = self._window_flag(stamps,"9:00:00","17:00:00")
        out["Day"] = self._window_flag(stamps,"6:00:00","18:00:00")
        out["Hour"] = stamps.strftime("%H")
        out["Day_of_month"] = stamps.day.to_numpy().astype(np.float32)
        out["week_of_month"] = [week(day) for day in stamps.day]
        out["weekend"] = [weekend(day_number) for day_number in stamps.dayofweek]
        out["Period_of_month"] = [Dayofmonth(day) for day in stamps.day]
        out["dayofweek"] = stamps.day_name()
        self.columns = out.columns
        return out

In [None]:
# Put the rows in timestamp order BEFORE separating the target. Previously y
# was taken in the original row order while the feature rows were re-sorted by
# timestamp afterwards, which left labels and features misaligned.
temp_df = df.sort_values("Timestamp",kind="stable").reset_index(drop=True)
y = temp_df['FraudIndicator']
temp_df = temp_df.drop(columns=['FraudIndicator'])
CustomTransformerclass = AppendTimeColumns()
transformed_df = CustomTransformerclass.fit_transform(temp_df)
assert len(transformed_df) == len(y), "features and target must have the same number of rows"

In [None]:
# Columns that are min-max scaled by the preprocessing step.
num_columns = [
    "AccountBalance",
    "Age",
    "Amount",
    "TransactionAmount",
]
# Columns that are one-hot encoded by the preprocessing step.
cat_columns = [
    "Category",
    "Period_of_month",
    "week_of_month",
    "weekend",
    "Day_of_month",
    "dayofweek",
]

In [None]:
# Scale the numeric columns, one-hot encode the categorical ones and pass the
# remaining columns through unchanged.
full_pipe = ColumnTransformer([
    ("num",MinMaxScaler(),num_columns),
    ("cat",OneHotEncoder(),cat_columns)
],remainder="passthrough")
X = full_pipe.fit_transform(transformed_df)

# Hold out the test rows first so that hyper-parameter selection never sees
# them (the search used to run on all rows, test rows included).
strati = StratifiedShuffleSplit(1,test_size=0.3,random_state=44)
train_ind,test_ind = next(strati.split(X,y))
X_train = X[train_ind,:]
X_test = X[test_ind,:]
# The split yields positions, so index y by position rather than by label.
y_train = y.iloc[train_ind]
y_test = y.iloc[test_ind]

strat = StratifiedShuffleSplit(n_splits=3,test_size=0.2,random_state=44)
grid_params = {
    "n_estimators":np.arange(100,500,step=50),
    "bootstrap":[True,False],
    "max_depth":np.arange(3,16),
    # "log-loss" is not a valid criterion name, so every candidate using it
    # failed to fit. scikit-learn's spelling is "log_loss", which it documents
    # as equivalent to "entropy", so the entry is dropped rather than renamed.
    "criterion":["gini","entropy"]
    }
# A fixed random_state makes the search reproducible between runs.
grid = GridSearchCV(RandomForestClassifier(random_state=44),grid_params,scoring="accuracy",n_jobs=-1,cv=strat)
grid.fit(X_train,y_train)
# With GridSearchCV's default refit, best_estimator_ is already refitted on
# the full training split, so no separate fit call is needed.
rand = grid.best_estimator_

print("score : ",rand.score(X_test,y_test))

In [None]:
# Predictions of the selected random forest on the held-out test rows.
y_pred = rand.predict(X_test)

In [None]:
# Bar chart of the most important features of the fitted random forest.
# `rand` is read instead of `grid.best_estimator_` because a later cell
# rebinds `grid` to the SVC search, which has no feature_importances_.
# NOTE(review): the previous loop stopped after the 11th bar; set N_BARS to 10
# if a top-10 chart was intended.
N_BARS = 11
ft = rand.feature_importances_
cols = full_pipe.get_feature_names_out(transformed_df.columns)
fig = go.Figure()
for num,col in sorted(zip(ft,cols),reverse=True)[:N_BARS]:
    # Strip the "<transformer>__" prefix that ColumnTransformer adds.
    fig.add_trace(go.Bar(x=[col.split('__')[-1]],y=[num]))
# The model is a random forest; the title previously said "Gradient Boosting".
fig.update_layout(showlegend=False,title="Feature Importances for Random Forest")
fig.update_xaxes(tickangle=30)
fig.show()

In [None]:
# Grid search over the SVC regularisation strength and kernel.
grid_params = {
    # C must be strictly positive for SVC; the extra 0 that used to be
    # appended here made every candidate using it fail to fit.
    "C":np.logspace(-6,0,num=7),
    "kernel":["rbf","sigmoid"]
}
grid = GridSearchCV(SVC(),grid_params,scoring="accuracy",n_jobs=-1,cv=strat)
grid.fit(X,y)
print(grid.best_estimator_)
print(grid.best_score_)

In [None]:
# Confusion matrix of the held-out test rows. y_pred comes from the random
# forest stored in `rand`, not from the SVC search in the preceding cell.
px.imshow(confusion_matrix(y_test,y_pred),color_continuous_scale="GnBu_r")