In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the 'kartik2112/fraud-detection' dataset through kagglehub and keep the returned value.
# NOTE(review): presumably this is the local directory holding the dataset files — confirm
# against the kagglehub documentation before relying on it as a path.
kartik2112_fraud_detection_path = kagglehub.dataset_download('kartik2112/fraud-detection')

# Simple marker so the cell output shows the download step finished.
print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install imbalanced-learn

In [None]:
!pip install --upgrade scikit-learn

## Load the data

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras

import warnings
# NOTE(review): with no category or module argument this suppresses every warning for the
# rest of the session, including deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [None]:
from pathlib import Path

# Read from the location kagglehub reported in the first cell when that variable exists;
# otherwise fall back to Kaggle's standard read-only mount for this dataset.
DATA_DIR = Path(globals().get("kartik2112_fraud_detection_path", "/kaggle/input/fraud-detection"))

train = pd.read_csv(DATA_DIR / "fraudTrain.csv")
test = pd.read_csv(DATA_DIR / "fraudTest.csv")

train.shape, test.shape

In [None]:
train.head()

### Clean the dataset

In [None]:
# Column labels of both frames, one Index per line, to check that they line up.
print(train.columns, test.columns, sep="\n")

In [None]:
# Drop unwanted columns

# Columns excluded from the feature set. A single list is shared by both frames so that
# train and test always lose exactly the same columns.
UNUSED_COLUMNS = [
    'Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'first', 'last', 'street',
    'city', 'zip', 'lat', 'long', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long',
]

# Rebind instead of mutating in place; a missing column still raises KeyError, as before.
train = train.drop(columns=UNUSED_COLUMNS)
test = test.drop(columns=UNUSED_COLUMNS)

In [None]:
train.head()

In [None]:
test.head()

In [None]:
# Missing-value count per column, first for train and then for test.
for frame in (train, test):
    print(frame.isna().sum())

In [None]:
# Column dtypes of both frames, separated by a row of asterisks.
print(train.dtypes, "*****************", test.dtypes, sep="\n")

In [None]:
# Combine both frames so the encoding steps are applied once to all rows.
# Each CSV was read with its own default 0..n-1 index, so stacking them as-is would repeat
# row labels; ignore_index=True gives the combined frame a fresh, unique index.
df = pd.concat([train, test], axis=0, ignore_index=True)

In [None]:
df.head()

In [None]:
df.shape

### Converting categorical values to numerical

In [None]:
df['category'].unique()

In [None]:
# One indicator column per distinct category value, stored as integers (0/1).
category_df = pd.get_dummies(df['category'], dtype=int)
category_df.head()

In [None]:
# The raw column is no longer needed once its indicator columns exist.
df = df.drop(columns='category')


In [None]:
# Append the category indicator columns side by side with the remaining columns.
df = pd.concat([df, category_df], axis="columns")


In [None]:
df['gender'].unique()

In [None]:
# Encode gender as 1 (F) / 0 (M).
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# selected column acts on an intermediate object and does not reliably update `df`.
gender_codes = df['gender'].map({'F': 1, 'M': 0})

# map() yields NaN for any value outside the mapping (including a second run of this cell),
# so fail loudly instead of passing NaNs on to the model.
assert gender_codes.notna().all(), "unexpected value in 'gender' (expected only 'F' or 'M')"

df['gender'] = gender_codes.astype(int)
df['gender'].unique()

In [None]:
df['state'].unique()

In [None]:
# One-hot encode the state column into integer (0/1) indicator columns.

state_df = pd.get_dummies(df['state'], dtype=int)
state_df.head()

In [None]:
# The raw column is no longer needed once its indicator columns exist.
df = df.drop(columns='state')

In [None]:
# Append the state indicator columns side by side with the remaining columns.
df = pd.concat([df, state_df], axis="columns")
df.head()

In [None]:
df.dtypes

# The job column still needs to be encoded

In [None]:
# Shape of the array of distinct job values, i.e. how many indicator columns encoding will add.
print(df['job'].unique().shape)

In [None]:
# One-hot encode the job column as well, again as integer (0/1) indicator columns.

job_df = pd.get_dummies(df['job'], dtype=int)

In [None]:
# The raw column is no longer needed once its indicator columns exist.
df = df.drop(columns='job')
df.head()

In [None]:
# Append the job indicator columns side by side with the remaining columns.
df = pd.concat([df, job_df], axis="columns")
df.shape

In [None]:
df.head()

### Dividing the dataset into features and target

In [None]:
# Target: the is_fraud label. Features: every other column.
y = df['is_fraud']
X = df.drop(columns='is_fraud')

X.shape

In [None]:
y.value_counts()

# The dataset is very imbalanced

### Handling the imbalanced dataset using SMOTE
Here the minority class is 1 (fraud) and the majority class is 0 (not fraud).

In [None]:
# The sub-package is spelled `over_sampling`; `imblearn.oversampling` does not exist.
from imblearn.over_sampling import SMOTE

# NOTE(review): resampling is applied to the full dataset here, before the train/test split
# further down, so synthetic minority rows also end up in the evaluation data. Reported
# metrics will therefore be optimistic; consider resampling only the training portion.

# Oversample only the minority class; a fixed seed makes the synthetic rows reproducible.
smote = SMOTE(sampling_strategy="minority", random_state=42)
X, y = smote.fit_resample(X, y)

In [None]:
y.value_counts()

In [None]:
X.shape

### Normalization

In [None]:
X.describe()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# NOTE(review): the scaler is fitted on every row, before the train/test split further down,
# so the held-out rows contribute to the min/max used for scaling.
scaler = MinMaxScaler()

# Build a new frame from the scaled array instead of assigning through `X[:] = ...`:
# that form writes fractional values into columns that were created as integers
# (the one-hot indicators), which pandas either upcasts with a warning or rejects.
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

### Build the model

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows, keeping the class ratio of y in both parts; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
y_train.value_counts()

In [None]:
# Take the input width from the data instead of hard-coding it: the number of columns depends
# on how many distinct category/state/job values the encoding steps produced.
n_features = X_train.shape[1]

# Two hidden ReLU layers followed by a single sigmoid unit for the binary fraud label.
model = keras.Sequential([
    keras.Input(shape=(n_features,)),
    keras.layers.Dense(300, activation='relu'),
    keras.layers.Dense(150, activation='relu'),
    keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

In [None]:
# Train for 10 passes over the training data in mini-batches of 200 rows.
model.fit(x=X_train, y=y_train, batch_size=200, epochs=10)

In [None]:
# evaluate() returns the loss followed by the compiled metrics (here: accuracy).
test_loss, acc = model.evaluate(X_test, y_test)

# A format spec such as `:.2f` is only valid inside an f-string / format() call.
print(f"{acc * 100:.2f}")

In [None]:
# Predict from the held-out features; the labels (y_test) are only used for scoring afterwards.
pred = model.predict(X_test)

In [None]:
# Turn the sigmoid outputs into hard 0/1 labels using a 0.5 cut-off.
y_pred = (pred > 0.5).astype(int)


In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Text report of per-class precision / recall / F1 on the held-out rows.
cr = classification_report(y_true=y_test, y_pred=y_pred)
# Confusion matrix, kept for plotting in the next cell.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cr)

In [None]:
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap with integer cell labels.
sns.heatmap(data=cm, annot=True, fmt='d', cmap="Blues")