<a href="https://colab.research.google.com/github/shobanj/genai_assignment_level3/blob/main/GenAI_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 2

Write a Python program to draw (visualize) the architecture of a Neural Network used to classify fraudulent and non-fraudulent credit card transactions.

Assume the fraud detection dataset contains the following columns (the first six are input features; `Fraud` is the target label):  

* Transaction Amount
* Transaction Time
* Merchant Category
* Customer Age
* Account Balance
* Number Of Transactions Today
* Fraud (0 = Genuine, 1 = Fraud)

### Creating a Synthetic Dataset

To simulate data for our fraud detection model, we create a synthetic dataset using `numpy` and `pandas`. The dataset contains the following columns:

*   `Transaction Amount`
*   `Transaction Time`
*   `Merchant Category`
*   `Customer Age`
*   `Account Balance`
*   `Number Of Transactions Today`
*   `Fraud` (target variable: 0 = Genuine, 1 = Fraud)


In [None]:
import pandas as pd
import numpy as np

# Seed for the random generator so that every "Restart & Run All" produces
# the same synthetic dataset (and therefore comparable downstream metrics).
SEED = 42
rng = np.random.default_rng(SEED)

# Number of samples
n_samples = 1000

# Generate synthetic data. Every column is drawn independently at first;
# the fraud rows are adjusted afterwards to carry some signal.
data = {
    'Transaction Amount': rng.uniform(10, 2000, n_samples),
    'Transaction Time': rng.integers(0, 24, n_samples),  # Hour of the day (0-23)
    'Merchant Category': rng.choice(['Electronics', 'Groceries', 'Clothing', 'Travel', 'Services'], n_samples),
    'Customer Age': rng.integers(18, 75, n_samples),  # upper bound is exclusive
    'Account Balance': rng.uniform(100, 10000, n_samples),
    'Number Of Transactions Today': rng.integers(1, 20, n_samples),
    'Fraud': rng.choice([0, 1], n_samples, p=[0.95, 0.05])  # ~5% fraud rate
}

df = pd.DataFrame(data)

# Introduce some correlation between the features and the 'Fraud' label so the
# model has something to learn: fraudulent rows get larger amounts and a higher
# number of transactions per day than genuine rows.
fraud_mask = df['Fraud'] == 1
n_fraud = int(fraud_mask.sum())
df.loc[fraud_mask, 'Transaction Amount'] = rng.uniform(500, 5000, n_fraud)
df.loc[fraud_mask, 'Number Of Transactions Today'] = rng.integers(15, 30, n_fraud)

df.head()

Unnamed: 0,Transaction Amount,Transaction Time,Merchant Category,Customer Age,Account Balance,Number Of Transactions Today,Fraud
0,484.260393,20,Services,28,6975.513095,7,0
1,1123.191148,22,Services,41,2257.781728,5,0
2,48.114975,17,Travel,23,8380.220382,13,0
3,530.704409,21,Clothing,20,2317.59907,7,0
4,1084.737172,6,Travel,41,3834.826935,6,0


In [None]:
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# 1. Separate the features (all columns except 'Fraud') from the target ('Fraud').
X = df.drop('Fraud', axis=1)
y = df['Fraud']
print(f"\nShape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")

# 2. Identify categorical and numerical features.
categorical_features = ['Merchant Category']
numerical_features = ['Transaction Amount', 'Transaction Time', 'Customer Age', 'Account Balance', 'Number Of Transactions Today']

# 3. Build a ColumnTransformer: StandardScaler for the numerical columns,
#    OneHotEncoder for the categorical column.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
print("\nColumnTransformer created for preprocessing.")

# 4. Split the RAW data first. Fitting the scaler/encoder before splitting
#    would leak test-set statistics (means, variances, categories) into
#    training. `stratify=y` keeps the rare fraud class at the same proportion
#    in both splits.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 5. Fit the preprocessor on the training split only, then apply the fitted
#    transformation to the test split.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Full transformed matrix, kept under its original name for any code that
# still refers to it. It is produced with the train-fitted preprocessor.
X_processed = preprocessor.transform(X)
print(f"Shape of preprocessed features (X_processed): {X_processed.shape}")

print(f"\nShapes after splitting:")
print(f"X_train: {X_train.shape}")
print(f"X_test: {X_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")


Shape of features (X): (1000, 6)
Shape of target (y): (1000,)

ColumnTransformer created for preprocessing.
Shape of preprocessed features (X_processed): (1000, 10)

Shapes after splitting:
X_train: (800, 10)
X_test: (200, 10)
y_train: (800,)
y_test: (200,)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Dense

# The network's input width equals the number of columns in the
# preprocessed training matrix.
input_features = X_train.shape[1]
print(f"Number of input features: {input_features}")

# Feed-forward binary classifier, declared in one go:
#   Input -> Dense(64, relu) -> Dense(32, relu) -> Dense(1, sigmoid)
# The single sigmoid unit outputs a value in (0, 1) for the positive class.
model = Sequential([
    Input(shape=(input_features,)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# Show the layer stack and parameter counts.
print("\nNeural Network Model Architecture:")
model.summary()

Number of input features: 10

Neural Network Model Architecture:


## Train the Neural Network Model

Compile and train the defined neural network model using the training data.


In [None]:
from tensorflow.keras.metrics import Precision, Recall

# 1. Compile the model.
# The metrics get explicit names: unnamed Precision()/Recall() instances are
# auto-suffixed by Keras (precision_1, precision_2, ...) each time this cell
# is re-run, which changes the keys of `history.history` between runs.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', Precision(name='precision'), Recall(name='recall')],
)
print("Model compiled successfully.")

# 2. Train the compiled model.
# NOTE(review): the test split is passed as validation data here, so the
# per-epoch val_* numbers are computed on the same rows used for the final
# evaluation. That is fine for monitoring, but do not tune hyper-parameters
# or stop early on these values without a separate validation split.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_test, y_test),
)
print("Model training completed.")

Model compiled successfully.
Epoch 1/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step - accuracy: 0.9488 - loss: 0.4515 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00 - val_accuracy: 0.9350 - val_loss: 0.2950 - val_precision_1: 0.0000e+00 - val_recall_1: 0.0000e+00
Epoch 2/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9596 - loss: 0.2131 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00 - val_accuracy: 0.9350 - val_loss: 0.2021 - val_precision_1: 0.0000e+00 - val_recall_1: 0.0000e+00
Epoch 3/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.9618 - loss: 0.1303 - precision_1: 0.0000e+00 - recall_1: 0.0000e+00 - val_accuracy: 0.9350 - val_loss: 0.1467 - val_precision_1: 0.0000e+00 - val_recall_1: 0.0000e+00
Epoch 4/10
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.9695 - loss: 0.1091 - precision_1: 0.9615 - recall_1: 0.3070 - val_accurac

## Evaluate Model Performance

Evaluate the trained neural network's performance on the test set using metrics such as precision, recall, F1-score, and ROC AUC.


In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report


In [None]:
import numpy as np

# 1. Use the trained model to predict probabilities on the X_test data.
#    The model ends in a single sigmoid unit, so this is an (n, 1) column.
y_pred_proba = model.predict(X_test)

# Flatten to a 1-D score vector so the sklearn metrics receive the same
# shape as y_test instead of relying on implicit column-vector handling.
y_scores = y_pred_proba.ravel()

# 2. Convert the predicted probabilities into binary predictions (0 or 1)
#    using a 0.5 threshold.
y_pred_binary = (y_scores > 0.5).astype(int)

# 3. Calculate the accuracy, precision, recall, and F1-score.
#    zero_division=0 makes the result explicit (0.0, without a warning) when
#    the model predicts no positive samples at all, which can happen with a
#    heavily imbalanced target.
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary, zero_division=0)
recall = recall_score(y_test, y_pred_binary, zero_division=0)
f1 = f1_score(y_test, y_pred_binary, zero_division=0)

# 4. Calculate the ROC AUC score from the continuous scores (not the
#    thresholded labels).
roc_auc = roc_auc_score(y_test, y_scores)

# 5. Print all the calculated evaluation metrics.
print("\nModel Evaluation on Test Set:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print(f"ROC AUC: {roc_auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_binary, zero_division=0))

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step

Model Evaluation on Test Set:
Accuracy: 0.9750
Precision: 0.9000
Recall: 0.6923
F1-Score: 0.7826
ROC AUC: 0.9761

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       187
           1       0.90      0.69      0.78        13

    accuracy                           0.97       200
   macro avg       0.94      0.84      0.88       200
weighted avg       0.97      0.97      0.97       200

