## IMPORTING LIBRARIES ##

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import seaborn as sns
import matplotlib.pyplot as plt


## ACQUIRING DATASET ##

In [2]:
dataset=pd.read_csv('CreditCardDataset.csv')

In [3]:
dataset

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,#3203 892,13-Oct-20,Tuesday,22,MasterCard,Tap,£15,POS,Electronics,United Kingdom,United Kingdom,United Kingdom,F,53.8,Halifax,0
99996,#3304 849,14-Oct-20,Wednesday,23,MasterCard,PIN,£7,ATM,Children,Russia,Russia,Russia,M,45.0,Barclays,0
99997,#3532 129,13-Oct-20,Tuesday,11,MasterCard,PIN,£21,ATM,Subscription,United Kingdom,United Kingdom,United Kingdom,F,46.5,HSBC,0
99998,#3107 092,14-Oct-20,Wednesday,22,Visa,Tap,£25,POS,Products,United Kingdom,United Kingdom,United Kingdom,M,48.2,Barclays,0


In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   Transaction ID          100000 non-null  object 
 1   Date                    100000 non-null  object 
 2   Day of Week             100000 non-null  object 
 3   Time                    100000 non-null  int64  
 4   Type of Card            100000 non-null  object 
 5   Entry Mode              100000 non-null  object 
 6   Amount                  99994 non-null   object 
 7   Type of Transaction     100000 non-null  object 
 8   Merchant Group          99990 non-null   object 
 9   Country of Transaction  100000 non-null  object 
 10  Shipping Address        99995 non-null   object 
 11  Country of Residence    100000 non-null  object 
 12  Gender                  99996 non-null   object 
 13  Age                     100000 non-null  float64
 14  Bank                 

## DATA PREPROCESSING ##

**Dropping Null Columns**

In [5]:
dataset=dataset.dropna(subset=['Amount'])
dataset=dataset.dropna(subset=['Merchant Group'])
dataset=dataset.dropna(subset=['Shipping Address'])
dataset=dataset.dropna(subset=['Gender'])

In [6]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 99977 entries, 0 to 99999
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Transaction ID          99977 non-null  object 
 1   Date                    99977 non-null  object 
 2   Day of Week             99977 non-null  object 
 3   Time                    99977 non-null  int64  
 4   Type of Card            99977 non-null  object 
 5   Entry Mode              99977 non-null  object 
 6   Amount                  99977 non-null  object 
 7   Type of Transaction     99977 non-null  object 
 8   Merchant Group          99977 non-null  object 
 9   Country of Transaction  99977 non-null  object 
 10  Shipping Address        99977 non-null  object 
 11  Country of Residence    99977 non-null  object 
 12  Gender                  99977 non-null  object 
 13  Age                     99977 non-null  float64
 14  Bank                    99977 non-null  obj

**One Hot Encoding**

In [7]:
# Identify columns for one-hot encoding
one_hot_columns = ['Type of Card', 'Entry Mode', 'Type of Transaction', 'Merchant Group',
                   'Country of Transaction', 'Country of Residence', 'Gender', 'Bank','Shipping Address']

# Perform one-hot encoding
encoded_dataset = pd.get_dummies(dataset, columns=one_hot_columns)


**Label Encoding**

In [8]:
le = LabelEncoder()
encoded_dataset['Day of week'] = le.fit_transform(encoded_dataset['Day of Week'])

encoded_dataset.drop('Day of Week', axis=1, inplace=True)

**Removing unwanted signs**

In [9]:
# Remove currency signs
encoded_dataset['Amount'] = encoded_dataset['Amount'].str.replace(r'[^\d.]', '', regex=True)

# Convert to numeric
encoded_dataset['Amount'] = pd.to_numeric(encoded_dataset['Amount'])


# Remove the '#' character from 'Transaction ID'
encoded_dataset['Transaction ID'] = encoded_dataset['Transaction ID'].str.replace('#', '')

**Extracting day/month/year**

In [10]:
# Converting the 'Date' column to datetime format
encoded_dataset['Date'] = pd.to_datetime(encoded_dataset['Date'], format='%d-%b-%y')

# Extracting day, month, and year into separate columns
encoded_dataset['Day'] = encoded_dataset['Date'].dt.day
encoded_dataset['Month'] = encoded_dataset['Date'].dt.month
encoded_dataset['Year'] = encoded_dataset['Date'].dt.year

encoded_dataset.drop('Date', axis=1, inplace=True)

**Converting True/False to 0/1**

In [11]:
boolean_columns = [
       'Type of Card_MasterCard', 'Type of Card_Visa', 'Entry Mode_CVC',
       'Entry Mode_PIN', 'Entry Mode_Tap', 'Type of Transaction_ATM',
       'Type of Transaction_Online', 'Type of Transaction_POS',
       'Merchant Group_Children', 'Merchant Group_Electronics',
       'Merchant Group_Entertainment', 'Merchant Group_Fashion',
       'Merchant Group_Food', 'Merchant Group_Gaming',
       'Merchant Group_Products', 'Merchant Group_Restaurant',
       'Merchant Group_Services', 'Merchant Group_Subscription',
       'Country of Transaction_China', 'Country of Transaction_India',
       'Country of Transaction_Russia', 'Country of Transaction_USA',
       'Country of Transaction_United Kingdom', 'Country of Residence_China',
       'Country of Residence_India', 'Country of Residence_Russia',
       'Country of Residence_USA', 'Country of Residence_United Kingdom',
       'Gender_F', 'Gender_M', 'Bank_Barclays', 'Bank_Barlcays', 'Bank_HSBC',
       'Bank_Halifax', 'Bank_Lloyds', 'Bank_Metro', 'Bank_Monzo', 'Bank_RBS',
       'Shipping Address_China', 'Shipping Address_India',
       'Shipping Address_Russia', 'Shipping Address_USA',
       'Shipping Address_United Kingdom',
]
encoded_dataset[boolean_columns] = encoded_dataset[boolean_columns].astype(int)

**Removing Extra Spaces from Id**

In [12]:
for column in encoded_dataset.columns:
    if encoded_dataset[column].dtype == 'object':  # Check if column type is object (string)
        encoded_dataset[column] = encoded_dataset[column].str.replace(' ', '')  # Remove spaces
        encoded_dataset[column] = pd.to_numeric(encoded_dataset[column], errors='coerce')  # Convert to numeric

In [13]:
encoded_dataset

Unnamed: 0,Transaction ID,Time,Amount,Age,Fraud,Type of Card_MasterCard,Type of Card_Visa,Entry Mode_CVC,Entry Mode_PIN,Entry Mode_Tap,...,Bank_RBS,Shipping Address_China,Shipping Address_India,Shipping Address_Russia,Shipping Address_USA,Shipping Address_United Kingdom,Day of week,Day,Month,Year
0,3577209,19,5,25.2,0,0,1,0,0,1,...,1,0,0,0,0,1,3,14,10,2020
1,3039221,17,288,49.6,0,1,0,0,1,0,...,0,0,0,0,1,0,3,14,10,2020
2,2694780,14,5,42.2,0,0,1,0,0,1,...,0,0,1,0,0,0,3,14,10,2020
3,2640960,14,28,51.0,0,0,1,0,0,1,...,0,0,1,0,0,0,2,13,10,2020
4,2771031,23,91,38.0,1,0,1,1,0,0,...,0,0,0,0,1,0,2,13,10,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3203892,22,15,53.8,0,1,0,0,0,1,...,0,0,0,0,0,1,2,13,10,2020
99996,3304849,23,7,45.0,0,1,0,0,1,0,...,0,0,0,1,0,0,3,14,10,2020
99997,3532129,11,21,46.5,0,1,0,0,1,0,...,0,0,0,0,0,1,2,13,10,2020
99998,3107092,22,25,48.2,0,0,1,0,0,1,...,0,0,0,0,0,1,3,14,10,2020


## Feature Engineering ##

In [14]:
# Coordinates for each country
country_coordinates = {
    'China': (35.8617, 104.1954),
    'India': (20.5937, 78.9629),
    'Russia': (61.5240, 105.3188),
    'USA': (37.0902, -95.7129),
    'United Kingdom': (55.3781, -3.4360)
}

# Function to map binary columns back to country names
def get_country_from_columns(row, prefix):
    for col in encoded_dataset.columns:
        if col.startswith(prefix) and row[col] == 1:
            return col.split('_')[-1]
    return None

encoded_dataset['Country of Residence'] = encoded_dataset.apply(lambda x: get_country_from_columns(x, 'Country of Residence_'), axis=1)
encoded_dataset['Country of Transaction'] = encoded_dataset.apply(lambda x: get_country_from_columns(x, 'Country of Transaction_'), axis=1)


In [15]:
def haversine(coord1, coord2):
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    R = 6371  # Earth radius in km
    dlat = np.radians(lat2 - lat1)
    dlon = np.radians(lon2 - lon1)
    a = np.sin(dlat/2) ** 2 + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2) ** 2
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return R * c

# Calculate Distance from Home
encoded_dataset['Distance from Home'] = encoded_dataset.apply(lambda x: haversine(country_coordinates[x['Country of Residence']], country_coordinates[x['Country of Transaction']]), axis=1)

In [16]:
encoded_dataset=encoded_dataset.drop('Country of Residence', axis=1)
encoded_dataset=encoded_dataset.drop('Country of Transaction', axis=1)

In [17]:
encoded_dataset

Unnamed: 0,Transaction ID,Time,Amount,Age,Fraud,Type of Card_MasterCard,Type of Card_Visa,Entry Mode_CVC,Entry Mode_PIN,Entry Mode_Tap,...,Shipping Address_China,Shipping Address_India,Shipping Address_Russia,Shipping Address_USA,Shipping Address_United Kingdom,Day of week,Day,Month,Year,Distance from Home
0,3577209,19,5,25.2,0,0,1,0,0,1,...,0,0,0,0,1,3,14,10,2020,0.000000
1,3039221,17,288,49.6,0,1,0,0,1,0,...,0,0,0,1,0,3,14,10,2020,0.000000
2,2694780,14,5,42.2,0,0,1,0,0,1,...,0,1,0,0,0,3,14,10,2020,0.000000
3,2640960,14,28,51.0,0,0,1,0,0,1,...,0,1,0,0,0,2,13,10,2020,0.000000
4,2771031,23,91,38.0,1,0,1,1,0,0,...,0,0,0,1,0,2,13,10,2020,6830.403541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,3203892,22,15,53.8,0,1,0,0,0,1,...,0,0,0,0,1,2,13,10,2020,0.000000
99996,3304849,23,7,45.0,0,1,0,0,1,0,...,0,0,1,0,0,3,14,10,2020,0.000000
99997,3532129,11,21,46.5,0,1,0,0,1,0,...,0,0,0,0,1,2,13,10,2020,0.000000
99998,3107092,22,25,48.2,0,0,1,0,0,1,...,0,0,0,0,1,3,14,10,2020,0.000000


In [18]:
target_column = 'Fraud'

# Move the target column to the end
columns = [col for col in encoded_dataset.columns if col != target_column] + [target_column]
encoded_dataset= encoded_dataset[columns]

## Balancing Data Before Splitting ##

In [19]:
final_Dataset=encoded_dataset

In [20]:
# Separate input features and target
X = final_Dataset.drop('Fraud', axis=1)
y = final_Dataset['Fraud']

# Define the SMOTE sampler
smote = SMOTE(random_state=42)

# Apply SMOTE to the data
X_res, y_res = smote.fit_resample(X, y)

# Combine the resampled data back into a DataFrame
resampled = pd.DataFrame(X_res, columns=X.columns)
resampled['Fraud'] = y_res


In [21]:
resampled

Unnamed: 0,Transaction ID,Time,Amount,Age,Type of Card_MasterCard,Type of Card_Visa,Entry Mode_CVC,Entry Mode_PIN,Entry Mode_Tap,Type of Transaction_ATM,...,Shipping Address_India,Shipping Address_Russia,Shipping Address_USA,Shipping Address_United Kingdom,Day of week,Day,Month,Year,Distance from Home,Fraud
0,3577209,19,5,25.200000,0,1,0,0,1,0,...,0,0,0,1,3,14,10,2020,0.000000,0
1,3039221,17,288,49.600000,1,0,0,1,0,0,...,0,0,1,0,3,14,10,2020,0.000000,0
2,2694780,14,5,42.200000,0,1,0,0,1,0,...,1,0,0,0,3,14,10,2020,0.000000,0
3,2640960,14,28,51.000000,0,1,0,0,1,0,...,1,0,0,0,2,13,10,2020,0.000000,0
4,2771031,23,91,38.000000,0,1,1,0,0,0,...,0,0,1,0,2,13,10,2020,6830.403541,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185565,3518297,16,14,51.588713,0,1,1,0,0,0,...,0,1,0,0,2,13,10,2020,5613.818821,1
185566,3001948,15,17,41.397438,0,1,1,0,0,0,...,0,0,0,0,3,14,10,2020,0.000000,1
185567,3115895,9,14,29.818190,0,1,0,0,0,0,...,0,0,0,0,3,14,10,2020,0.000000,1
185568,3135057,4,123,36.576454,0,0,0,0,0,0,...,0,0,0,0,2,13,10,2020,7690.177163,1


In [22]:
target_distribution =resampled['Fraud'].value_counts()

# Print the distribution
print("Target Variable Distribution:")
print(target_distribution)

Target Variable Distribution:
Fraud
0    92785
1    92785
Name: count, dtype: int64


**Creating Subset**

In [23]:
# Creating a subset of your DataFrame
subset = resampled.sample(n=60000, random_state=42) # Sample 3 random rows from dataframe

main_dataset = resampled.drop(subset.index)

In [24]:
main_dataset.reset_index(drop=True, inplace=True)

In [25]:
main_dataset

Unnamed: 0,Transaction ID,Time,Amount,Age,Type of Card_MasterCard,Type of Card_Visa,Entry Mode_CVC,Entry Mode_PIN,Entry Mode_Tap,Type of Transaction_ATM,...,Shipping Address_India,Shipping Address_Russia,Shipping Address_USA,Shipping Address_United Kingdom,Day of week,Day,Month,Year,Distance from Home,Fraud
0,3039221,17,288,49.600000,1,0,0,1,0,0,...,0,0,1,0,3,14,10,2020,0.000000,0
1,2694780,14,5,42.200000,0,1,0,0,1,0,...,1,0,0,0,3,14,10,2020,0.000000,0
2,2640960,14,28,51.000000,0,1,0,0,1,0,...,1,0,0,0,2,13,10,2020,0.000000,0
3,3446698,20,30,48.400000,1,0,0,0,1,0,...,1,0,0,0,2,13,10,2020,0.000000,0
4,3652191,18,231,39.500000,0,1,1,0,0,0,...,0,0,0,1,2,13,10,2020,0.000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125565,2671607,7,43,47.990385,0,1,0,0,0,0,...,0,0,0,0,2,13,10,2020,6262.452647,1
125566,3407522,9,20,35.116123,0,0,0,0,0,0,...,0,0,0,0,2,13,10,2020,0.000000,1
125567,3371075,7,17,51.963993,0,1,0,0,0,0,...,0,1,0,0,3,14,10,2020,5613.818821,1
125568,3518297,16,14,51.588713,0,1,1,0,0,0,...,0,1,0,0,2,13,10,2020,5613.818821,1


In [26]:
subset.to_csv('extraexamples.csv')

## MODEL DEVLOPMENT ##

**Splitting dataset**

In [27]:
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in split.split(main_dataset, main_dataset['Fraud']):
    strat_train_set = main_dataset.loc[train_index]
    strat_test_set = main_dataset.loc[test_index]

In [28]:
 strat_test_set['Fraud'].value_counts()

Fraud
0    12584
1    12530
Name: count, dtype: int64

**Standardization and Normalization**

In [29]:
X_train = strat_train_set.drop("Fraud", axis=1)
Y_train= strat_train_set["Fraud"].copy()

# Define the pipeline
pipeline = Pipeline([
    ('std_scaler', StandardScaler()),   # Step 1: Standardization
    ('min_max_scaler', MinMaxScaler())  # Step 2: Normalization
])

# Fit and transform the data using the pipeline
X_train_transformed = pipeline.fit_transform(X_train)

# Print the transformed data
print("Transformed Data:")
print(X_train_transformed)


Transformed Data:
[[0.89186175 0.         0.12151899 ... 0.         0.         0.92916951]
 [0.91039245 0.54166667 0.03037975 ... 0.         0.         0.92304571]
 [0.67153015 0.91666667 0.01518987 ... 0.         0.         0.87799412]
 ...
 [0.17716871 0.08333333 0.10126582 ... 0.         0.         0.        ]
 [0.1632902  0.66666667 0.68607595 ... 0.         0.         0.        ]
 [0.56141652 0.41666667 0.05316456 ... 0.         0.         0.        ]]


In [30]:
X_test = strat_test_set.drop("Fraud", axis=1)
Y_test= strat_test_set["Fraud"].copy()
X_test_transformed = pipeline.fit_transform(X_test)

**Random Forest**

In [31]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train_transformed, Y_train)

# Predict on the test data
Y_pred = rf_classifier.predict(X_test_transformed)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))

Accuracy: 0.9897666640121048

Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99     12584
           1       0.99      0.99      0.99     12530

    accuracy                           0.99     25114
   macro avg       0.99      0.99      0.99     25114
weighted avg       0.99      0.99      0.99     25114



**FNN MODEL**

In [None]:
# Initialize the FNN model
model = Sequential()

# Adding layers
model.add(Dense(units=64, activation='relu', input_dim=X_train_transformed.shape[1]))
model.add(Dense(units=128, activation='relu'))
model.add(Dropout(0.2))  # Optional: Dropout layer for regularization
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))  # Output layer for binary classification

# Compiling the model
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Training the model
hist = model.fit(X_train_transformed, Y_train, epochs=15, batch_size=32, validation_split=0.2)

# Evaluating the model
loss, accuracy = model.evaluate(X_test_transformed, Y_test)
print(f'Test Accuracy: {accuracy:.4f}')

# Making predictions
Y_pred_prob = model.predict(X_test_transformed)
Y_pred = (Y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions (0 or 1)

Y_true = Y_test.values  

# Compute confusion matrix
conf_matrix = confusion_matrix(Y_true, Y_pred)

print("Confusion Matrix:")
print(conf_matrix)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m2512/2512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 3ms/step - accuracy: 0.9617 - loss: 0.0982 - val_accuracy: 0.9807 - val_loss: 0.0497
Epoch 2/15
[1m2512/2512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9839 - loss: 0.0409 - val_accuracy: 0.9845 - val_loss: 0.0382
Epoch 3/15
[1m2512/2512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 3ms/step - accuracy: 0.9860 - loss: 0.0357 - val_accuracy: 0.9859 - val_loss: 0.0359
Epoch 4/15
[1m2512/2512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9881 - loss: 0.0316 - val_accuracy: 0.9872 - val_loss: 0.0339
Epoch 5/15
[1m2512/2512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9880 - loss: 0.0304 - val_accuracy: 0.9875 - val_loss: 0.0343
Epoch 6/15
[1m2512/2512[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9888 - loss: 0.0288 - val_accuracy: 0.9890 - val_loss: 0.0291
Epoch 7/15
[1m

**FNN EVALUATION**

In [None]:
# Make predictions
Y_pred_prob = model.predict(X_test_transformed)
Y_pred = (Y_pred_prob > 0.5).astype(int)  # Convert probabilities to binary predictions (0 or 1)

# Convert true labels from one-hot encoding to class labels if needed
Y_true = Y_test.values  

# Compute confusion matrix
conf_matrix = confusion_matrix(Y_true, Y_pred)

# Plot confusion matrix as a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=['Not Fraud', 'Fraud'], yticklabels=['Not Fraud', 'Fraud'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Retrieve metrics from the history object
epochs = range(1, len(hist.history['accuracy']) + 1)
train_accuracy = hist.history['accuracy']
val_accuracy = hist.history['val_accuracy']
train_loss = hist.history['loss']
val_loss = hist.history['val_loss']

# Plot training and validation accuracy
plt.figure(figsize=(14, 6))

plt.subplot(1, 2, 1)
plt.plot(epochs, train_accuracy, 'bo-', label='Training accuracy')
plt.plot(epochs, val_accuracy, 'ro-', label='Validation accuracy')
plt.title('Training and validation accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

# Plot training and validation loss
plt.subplot(1, 2, 2)
plt.plot(epochs, train_loss, 'bo-', label='Training loss')
plt.plot(epochs, val_loss, 'ro-', label='Validation loss')
plt.title('Training and validation loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.show()

**Logistic Regression**

In [None]:
log_reg = LogisticRegression()

log_reg.fit(X_train_transformed, Y_train)

# Predict on the test data
Y_pred = log_reg.predict(X_test_transformed)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))

**XGboost**

In [1]:
xgb_clf = xgb.XGBClassifier(objective='binary:logistic', colsample_bytree=0.3, learning_rate=0.1,
                            max_depth=5, alpha=10, n_estimators=10)


# Fit the model
xgb_clf.fit(X_train_transformed, Y_train)

# Predict on the test data
Y_pred = xgb_clf.predict(X_test_transformed)

# Evaluate the model
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(Y_test, Y_pred))

NameError: name 'xgb' is not defined