<a href="https://colab.research.google.com/github/serendu10/Retail-Customer-Segmentation/blob/main/Finance_Freud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pyarrow.parquet as pq
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn.utils import resample
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import io
from tensorflow import keras

In [2]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Colab-only step: the google.colab package is available inside a Colab runtime.
# `uploaded` keeps whatever files.upload() returns.
# NOTE(review): presumably this is where the parquet files read by the next
# cell are uploaded — confirm the uploaded names match the paths used there.
from google.colab import files
uploaded = files.upload()

In [4]:
# Load the three source tables from parquet files in the working directory.
# NOTE(review): every file name ends in " copy" and the recorded run of this
# cell raised FileNotFoundError — confirm the names match the uploaded files.
def _parquet_to_frame(path):
    """Read one parquet file with pyarrow and return it as a pandas DataFrame."""
    return pq.read_table(path).to_pandas()


Transaction = _parquet_to_frame('credit_card_transaction_data_de.parquet copy')
# reset_index() turns the user table's index into a regular column, which the
# merge cell joins on as 'index'.
User = _parquet_to_frame('credit_card_users_de.parquet copy').reset_index()
Card = _parquet_to_frame('sd254_cards_de.parquet copy')

FileNotFoundError: ignored

In [None]:
# Build the modelling frame with two left joins, so every transaction row is
# kept even when no user or card record matches:
#   1. transactions -> users, on Transaction['User'] == User['index']
#   2. the result   -> cards, on (Card, User) == (CARD INDEX, User)
df = (
    Transaction
    .merge(User, how='left', left_on='User', right_on='index')
    .merge(Card, how='left', left_on=['Card', 'User'], right_on=['CARD INDEX', 'User'])
)

**Data Cleaning**

In [None]:
# Flag retirement status. Rows where either age is missing match neither
# comparison and therefore keep NaN in 'Retired'.
reached_retirement = df['Current Age'] >= df['Retirement Age']
before_retirement = df['Current Age'] < df['Retirement Age']
df.loc[reached_retirement, 'Retired'] = 'Y'
df.loc[before_retirement, 'Retired'] = 'N'

# Split each composite text column into a leading and a trailing part, then
# drop the source column.
# Each entry: (source column, head column, head slice, tail column, tail slice).
split_specs = [
    ('Time', 'Hour', slice(None, 2), 'Min', slice(-2, None)),
    ('Expires', 'Expires Month', slice(None, 2), 'Expires Year', slice(-4, None)),
    ('Acct Open Date', 'Open Month', slice(None, 2), 'Open Year', slice(-4, None)),
]
for source_col, head_col, head_slice, tail_col, tail_slice in split_specs:
    df[head_col] = df[source_col].str[head_slice]
    df[tail_col] = df[source_col].str[tail_slice]
    df = df.drop(columns=[source_col])

# Keep only the columns used downstream, in this order.
columns_to_keep = [
    'Year', 'Month', 'Day', 'Hour', 'Min', 'Amount', 'Use Chip',
    'Merchant City', 'Merchant State', 'Zip', 'MCC',
    'Errors?', 'Is Fraud?',
    'Current Age', 'Retired', 'Birth Year', 'Birth Month', 'Gender',
    'City', 'State', 'Zipcode', 'Per Capita Income - Zipcode',
    'Yearly Income - Person', 'Total Debt', 'FICO Score', 'Num Credit Cards',
    'Card Brand', 'Card Type', 'Card Number', 'Expires Month', 'Expires Year',
    'CVV', 'Has Chip', 'Cards Issued', 'Credit Limit', 'Open Month', 'Open Year',
    'Year PIN last Changed', 'Card on Dark Web',
]
df = df[columns_to_keep]

In [None]:
# Change data types
# Money columns are stored as text containing a '$' symbol: strip it, then
# parse as float. regex=False is passed explicitly because '$' is the
# end-of-string anchor when interpreted as a regular expression, and the
# default of the `regex` argument differs between pandas versions.
money_columns = [
    'Amount',
    'Per Capita Income - Zipcode',
    'Yearly Income - Person',
    'Total Debt',
    'Credit Limit',
]
for column in money_columns:
    df[column] = df[column].str.replace('$', '', regex=False).astype(float)

# The time/date parts were sliced out of strings, so convert them to integers.
integer_columns = [
    'Hour', 'Min',
    'Expires Month', 'Expires Year',
    'Open Month', 'Open Year',
]
for column in integer_columns:
    df[column] = df[column].astype(int)

**EDA**

In [None]:
# Subset of transactions labelled as fraud ('Is Fraud?' == 'Yes').
data = df.loc[df['Is Fraud?'].eq('Yes')]

In [None]:
# Plot Fraud Amount
# histplot replaces distplot, which seaborn has deprecated and scheduled for
# removal; with its default stat it draws the same count histogram that
# distplot(kde=False) produced.
sns.histplot(data['Amount'].astype(float), bins=80)
plt.title('Distribution of Fraudulent Transaction Amounts')
plt.xlabel('Amount')
plt.ylabel('Number of Transactions')
# Restrict the visible range; amounts outside it are not shown.
plt.xlim(-200, 2000)
plt.show()

In [None]:
# Plot fraud counts per merchant state, most frequent first.
# NOTE(review): no top-N cut-off is applied here — every distinct value of
# 'Merchant State' is drawn.
state_order = data['Merchant State'].value_counts().index
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(y='Merchant State', data=data, order=state_order, ax=ax)
ax.set_title('Number of Fraudulent Transactions by State')
plt.show()

In [None]:
# Plot the ten merchant cities with the most fraudulent transactions.
# value_counts() sorts by frequency, so the first ten labels are the top ten.
top_cities = data['Merchant City'].value_counts().index[:10]
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(y='Merchant City', data=data, order=top_cities, ax=ax)
ax.set_title('Number of Fraudulent Transactions by Top 10 Cities')
plt.show()

In [None]:
# Count fraudulent transactions for each value of 'Use Chip'.
ax = sns.countplot(x='Use Chip', data=data)
ax.set_title('Distribution of Fraudulent Transactions by Transaction Type')
plt.show()

In [None]:
# Count fraudulent transactions for each card brand.
ax = sns.countplot(x='Card Brand', data=data)
ax.set_title('Distribution of Fraudulent Transactions by card Brands')
plt.show()

In [None]:
# Count fraudulent transactions per transaction year on a wide figure.
fig, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x='Year', data=data, ax=ax)
ax.set_title('Number of Fraudulent Transactions by Year')
plt.show()

In [None]:
# Plot Fraud by Yearly Income
# Histogram of the cardholder's yearly income across fraudulent transactions.
# astype(float) is a no-op when the column is already numeric.
plt.figure(figsize=(15, 6))
sns.histplot(data['Yearly Income - Person'].astype(float), bins=50)
plt.title('Distribution of Fraudulent Transactions by Yearly Income')
plt.xlabel('Yearly Income - Person')
plt.ylabel('Number of Transactions')
plt.show()

**Data Pre-processing**

In [None]:
# Balance the two classes by undersampling: keep every fraud row and draw,
# without replacement, an equally sized random sample of non-fraud rows.
# NOTE(review): this runs before the train/test split, so the test set is
# balanced as well — confirm that is intended.
Normal = df.loc[df['Is Fraud?'].eq('No')]
Freud = df.loc[df['Is Fraud?'].eq('Yes')]

df_normal_downsampled = resample(
    Normal,
    n_samples=len(Freud),   # match the size of the fraud class
    replace=False,
    random_state=42,        # fixed seed for a reproducible sample
)

# Stack the sampled non-fraud rows on top of the fraud rows.
df = pd.concat([df_normal_downsampled, Freud])

In [None]:
# Change the 'Is Fraud?' column to binary: 'Yes' -> 1, anything else -> 0.
# Vectorised instead of a per-row lambda. Also accepting 1 makes the cell safe
# to re-run: an already converted column keeps its labels instead of being
# reset to all zeros.
df["Is Fraud?"] = df["Is Fraud?"].isin(['Yes', 1]).astype('int64')

In [None]:
# Column groups used by the preprocessing cells.

# Columns treated as numerical (zero-filled and standardised).
# NOTE(review): 'Zip', 'MCC', 'Zipcode', 'Card Number' and 'CVV' look like
# codes/identifiers rather than quantities — confirm they should be scaled.
Num = [
    'Year', 'Month', 'Day', 'Hour', 'Min', 'Amount',
    'Zip', 'MCC',
    'Current Age', 'Birth Year', 'Birth Month', 'Zipcode',
    'Per Capita Income - Zipcode', 'Yearly Income - Person',
    'Total Debt', 'FICO Score', 'Num Credit Cards',
    'Card Number', 'Expires Month', 'Expires Year', 'CVV', 'Cards Issued',
    'Credit Limit', 'Open Month', 'Open Year', 'Year PIN last Changed',
]

# Columns treated as categorical (one-hot encoded).
Cat = [
    'Use Chip', 'Merchant City', 'Merchant State', 'Errors?',
    'Retired', 'Gender', 'City', 'State',
    'Card Brand', 'Card Type', 'Has Chip', 'Card on Dark Web',
]

# Target column, kept as a one-element list so it can be used as a column
# selector alongside Num and Cat.
y = ['Is Fraud?']

In [None]:
# Replace missing values in the numerical columns with 0; the categorical
# columns are left untouched.
numeric_block = df[Num]
df[Num] = numeric_block.fillna(value=0)

In [None]:
# Standardise the numerical columns.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split, so test rows contribute to the scaling statistics.
scaler = StandardScaler()
scaled_df_Num = scaler.fit_transform(df[Num])

# fit_transform returns a bare array; restore the row index and column names.
scaled_features_df_Num = pd.DataFrame(scaled_df_Num, columns=Num, index=df.index)

# Reassemble the frame: scaled numericals, then categoricals, then the target
# (`y` is still the list of target column names at this point).
df = pd.concat([scaled_features_df_Num, df[Cat + y]], axis=1)

In [None]:
# One-hot encode every categorical column; the indicator columns are created
# as floats so the whole feature matrix stays numeric.
df = pd.get_dummies(
    data=df,
    columns=Cat,
    dtype=float,
)

**Feature Selection**

In [None]:
# Separate features from the target, then hold out 20% of the rows for testing.
# From here on `y` is the target Series (it was a list of column names before).
X = df.drop(columns='Is Fraud?')
y = df['Is Fraud?']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,   # fixed seed for a reproducible split
    test_size=0.2,
)

In [None]:
# Feature selection with Elastic Net: tune the model with a cross-validated
# grid search, then keep the features whose coefficient is not zero.
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1.0, 10.0],
    l1_ratio=[0.1, 0.3, 0.5, 0.7, 0.9],
)
elastic_net = ElasticNet()

# 5-fold search scored by negative MSE, using all available cores.
grid_search = GridSearchCV(
    estimator=elastic_net,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best hyperparameter combination and the estimator fitted with it.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Positions of the non-zero coefficients identify the retained columns.
coefficients = best_model.coef_
selected_features = X_train.columns[np.flatnonzero(coefficients)]

In [None]:
# Display the columns whose Elastic Net coefficient is non-zero.
selected_features

In [None]:
# Restrict both splits to the columns retained by the Elastic Net step.
X_selected_train = X_train.loc[:, selected_features]
X_selected_test = X_test.loc[:, selected_features]

**Logistic Regression**

In [None]:
# Initialize the Logistic Regression model.
# The grid below searches both 'l1' and 'l2' penalties. The default solver
# (lbfgs) does not support 'l1', so those fits would fail and be scored NaN,
# silently wasting half of the grid. liblinear supports both penalties.
logistic_reg = LogisticRegression(max_iter=1000, solver='liblinear', random_state=42)
# Define a grid of hyperparameters to search
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
              'penalty': ['l1', 'l2']}
# Initialize GridSearchCV to find the best hyperparameters (5-fold, accuracy)
grid_search = GridSearchCV(logistic_reg, param_grid, cv=5, scoring='accuracy')
# Fit the grid search on the training data
grid_search.fit(X_selected_train, y_train)
# Get the best hyperparameters
best_params = grid_search.best_params_
# Get the best model (refitted on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_
print(f"Best Hyperparameters: {best_params}")

In [None]:
# Score the tuned model on the held-out split: predict labels, then compute
# the fraction that match the true labels.
y_pred = best_model.predict(X_selected_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy on Test Set: {accuracy}")

**Random Forest**

In [None]:
# Random Forest: exhaustive grid search over the hyperparameters below, then
# evaluation of the best estimator on the held-out split.
# The grid has 3 * 4 * 2 * 3 * 3 * 2 = 432 combinations, each fitted on 3 folds.
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20, 30],
    max_features=['sqrt', 'log2'],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Base estimator with a fixed seed so repeated runs build the same forests.
rf = RandomForestClassifier(random_state=42)

# 3-fold search scored by macro-averaged F1, using all available cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='f1_macro',
    cv=3,
    n_jobs=-1,
    verbose=0,
)
grid_search.fit(X_selected_train, y_train)

# Report the winning hyperparameters.
best_params = grid_search.best_params_
print("Best hyperparameters:", best_params)

# Predict on the test split with the best estimator and summarise the result.
best_rf = grid_search.best_estimator_
y_pred_best_rf = best_rf.predict(X_selected_test)
print("Random Forest Classifier Results with Best Hyperparameters:")
print(classification_report(y_test, y_pred_best_rf))
print(confusion_matrix(y_test, y_pred_best_rf))

In [None]:
# Extract feature importances from the best random forest model
feature_importance = best_rf.feature_importances_
features = X_selected_train.columns

# argsort() orders ascending, so the most important features sit at the END
# of sorted_idx. Take the last 30 (fewer if there are fewer features); slicing
# the first 30 would plot the least important features instead.
sorted_idx = feature_importance.argsort()
top_idx = sorted_idx[-30:]

# Plot horizontal bar chart; barh draws the first entry at the bottom, so the
# ascending order puts the most important feature at the top.
plt.figure(figsize=(10, 7))
plt.barh(features[top_idx], feature_importance[top_idx], align='center', color='skyblue')
plt.xlabel('Importance')
plt.title('Feature Importances using Random Forest')
plt.show()

**XGBoost**