# Project — Fraud Detection in Financial Transcatons

Team:
- Prathamesh Lawand
- Harsh Bhandari
- Aditya Wagh
- Chaitanya Kalamkar




In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk(''):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import pandas as pd


transaction_df = pd.read_csv("train_transaction.csv")
identity_df = pd.read_csv("train_identity.csv")
train_df = transaction_df.merge(identity_df, on='TransactionID', how='left')

train_df.info()
train_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 434 entries, TransactionID to DeviceInfo
dtypes: float64(399), int64(4), object(31)
memory usage: 1.9+ GB


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


**Pre-processing**

In [3]:
missing_values = train_df.isnull().sum()
missing_percentage = (missing_values / len(train_df)) * 100
missing_df = pd.DataFrame({'missing_values': missing_values, 'percentage': missing_percentage})
print(missing_df[missing_df['missing_values'] > 0].sort_values(by='percentage', ascending=False))


       missing_values  percentage
id_24          585793   99.196159
id_25          585408   99.130965
id_07          585385   99.127070
id_08          585385   99.127070
id_21          585381   99.126393
...               ...         ...
V285               12    0.002032
V284               12    0.002032
V280               12    0.002032
V279               12    0.002032
V312               12    0.002032

[414 rows x 2 columns]


#### The below code removes columns with a high percentage of missing values, which are unlikely to contribute meaningfully to the analysis, thereby streamlining the dataset.


In [4]:
# Dropping columns 


#greater than 60%
columns_to_drop = missing_percentage[missing_percentage > 60].index


columns_to_drop = [col for col in columns_to_drop if col in train_df.columns]
train_df.drop(columns=columns_to_drop, axis=1, inplace=True)


## Display of the First Few Rows (`train_df.head()`):

Display the first few rows of the dataset using `train_df.head()` to provide a glimpse of its structure and content.





In [5]:
train_df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
from sklearn.impute import SimpleImputer

# Numerical imputation
numerical_features = train_df.select_dtypes(include=['int64', 'float64']).columns
numerical_imputer = SimpleImputer(strategy='median')
train_df[numerical_features] = numerical_imputer.fit_transform(train_df[numerical_features])

# Categorical imputation with 'Unknown'
categorical_features = train_df.select_dtypes(include=['object']).columns
categorical_imputer = SimpleImputer(strategy='constant', fill_value='Unknown')
train_df[categorical_features] = categorical_imputer.fit_transform(train_df[categorical_features])




### The following code provides a brief overview of the dataset's structure, data types, and memory usage. This sets the stage for further analysis and exploration.







In [7]:

train_df.head()
train_df.describe()
train_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 226 entries, TransactionID to V321
dtypes: float64(213), object(13)
memory usage: 1018.2+ MB


## Analysis of Fraudulent Transactions Distribution

The following analysis examines the distribution of fraudulent transactions within the dataset `train_df`.


In [8]:

fraud_counts = train_df['isFraud'].value_counts(normalize=True) * 100


print(fraud_counts)


isFraud
0.0    96.500999
1.0     3.499001
Name: proportion, dtype: float64


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split


categorical_cols = train_df.select_dtypes(include=['object']).columns.tolist()


train_df_encoded = pd.get_dummies(train_df, columns=categorical_cols)



In [10]:

X = train_df_encoded.drop('isFraud', axis=1)
y = train_df_encoded['isFraud']


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


****Random forest****

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score


forest_model = RandomForestClassifier(n_estimators=10)
forest_model.fit(X_train, y_train)

y_pred = forest_model.predict(X_test)
y_pred_proba = forest_model.predict_proba(X_test)[:, 1]

# Evaluation
print("\nRandom Forest Metrics:")

print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")



Random Forest Metrics:
Precision: 0.9115
Recall: 0.4212
F1 Score: 0.5762
ROC AUC: 0.8854


XGBoost

In [12]:
import numpy as np
import pandas as pd
import gc
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, precision_recall_curve
import xgboost as xgb
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score


In [13]:
X = train_df_encoded.drop('isFraud', axis=1)
y = train_df_encoded['isFraud']

# Standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [14]:
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Clear memory
del X_scaled, train_df_encoded
gc.collect()


0

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Clear memory
del X_pca
gc.collect()


0

In [16]:

negative_instances = sum(y == 0)
positive_instances = sum(y == 1)

# Now, use these counts to set the 'scale_pos_weight' parameter
model = xgb.XGBClassifier(scale_pos_weight=negative_instances / positive_instances)
model.fit(X_train, y_train)

gc.collect()

42

In [17]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print("XGBoost Metrics:")
print(f"Precision: {precision_score(y_test, y_pred):.4f}")
print(f"Recall: {recall_score(y_test, y_pred):.4f}")
print(f"F1 Score: {f1_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")

# Clear memory
gc.collect()


XGBoost Metrics:
Precision: 0.2397
Recall: 0.7329
F1 Score: 0.3612
ROC AUC: 0.9063


0

 **Applying Smote **

In [18]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [19]:
# Applying SMOTE
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Training the XGBoost model
model = xgb.XGBClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

In [20]:
26# Making predictions
y_pred = model.predict(X_test_scaled)
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Evaluating the model
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)

print("XGBoost Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

XGBoost Metrics:
Precision: 0.2245
Recall: 0.7315
F1 Score: 0.3436
ROC AUC: 0.9045


In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics import precision_recall_curve, f1_score, accuracy_score, roc_auc_score, confusion_matrix
import matplotlib.pyplot as plt
from xgboost import XGBClassifier, plot_importance

 

# Train the XGBoost model
model = XGBClassifier(random_state=42)
model.fit(X_train_smote, y_train_smote)

# Predict probabilities
y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

# Calculate precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_proba)

# Calculate F1 scores for each threshold
f1_scores = 2 * (precision * recall) / (precision + recall)
# Locate the index of the largest F1 score
ix = np.argmax(f1_scores)
print('Best Threshold=%f, F1 Score=%.3f' % (thresholds[ix], f1_scores[ix]))

# Plot the precision-recall vs threshold chart
plt.plot(thresholds, precision[:-1], 'r-', label='Precision')
plt.plot(thresholds, recall[:-1], 'b-', label='Recall')
plt.plot(thresholds, f1_scores[:-1], 'g-', label='F1 Score')
plt.xlabel('Threshold')
plt.legend()
plt.show()

# Adjust the classification threshold
threshold_adjusted = thresholds[ix]
y_pred_adjusted = (y_pred_proba >= threshold_adjusted).astype(int)

# Evaluate the model with the new threshold
print("Adjusted Metrics:")
print("Precision: {:.4f}".format(precision_score(y_test, y_pred_adjusted)))
print("Recall: {:.4f}".format(recall_score(y_test, y_pred_adjusted)))
print("F1 Score: {:.4f}".format(f1_score(y_test, y_pred_adjusted)))
print("ROC AUC: {:.4f}".format(roc_auc_score(y_test, y_pred_proba)))

# Feature importance
plt.figure(figsize=(10, 8))
plot_importance(model, max_num_features=10)  
plt.show()


NameError: name 'X_train_smote' is not defined