In [1]:
import pandas as pd

# Load dataset
df = pd.read_csv("C:/Users/satya/Downloads/fraud_detection_dataset.csv")

# Display first few rows
print(df.head())

# Check for missing values
print(df.isnull().sum())


   Transaction_ID   Amount    Type  Is_Fraud
0            1001  2289.89   Debit         0
1            1002  4575.03  Credit         1
2            1003  1127.76   Debit         0
3            1004  1776.56  Credit         0
4            1005  4463.70   Debit         0
Transaction_ID    0
Amount            0
Type              0
Is_Fraud          0
dtype: int64


In [2]:
from sklearn.preprocessing import LabelEncoder

# Encode categorical variable
label_encoder = LabelEncoder()
df['Type'] = label_encoder.fit_transform(df['Type'])

# Display encoded values
print(df['Type'].unique())


[1 0]


In [3]:
# Example: Creating a new feature 'Amount_Squared' as an engineered feature
df['Amount_Squared'] = df['Amount'] ** 2

# Display new feature
print(df[['Amount', 'Amount_Squared']].head())


    Amount  Amount_Squared
0  2289.89    5.243596e+06
1  4575.03    2.093090e+07
2  1127.76    1.271843e+06
3  1776.56    3.156165e+06
4  4463.70    1.992462e+07


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Define features and target variable
X = df.drop(columns=['Transaction_ID', 'Is_Fraud'])
y = df['Is_Fraud']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Decision Tree model
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)


In [5]:
from sklearn.model_selection import GridSearchCV

# Define parameter grid
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform GridSearchCV
grid_search = GridSearchCV(DecisionTreeClassifier(random_state=42), param_grid, cv=5, scoring='f1')
grid_search.fit(X_train, y_train)

# Display best parameters
print("Best Parameters:", grid_search.best_params_)

# Train with best parameters
best_clf = grid_search.best_estimator_


Best Parameters: {'max_depth': 5, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [6]:
from sklearn.metrics import classification_report

# Make predictions
y_pred = clf.predict(X_test)

# Evaluate model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.83      0.78      0.80       250
           1       0.66      0.73      0.69       150

    accuracy                           0.76       400
   macro avg       0.74      0.75      0.75       400
weighted avg       0.76      0.76      0.76       400



In [7]:
# Predict using the best model
y_pred_best = best_clf.predict(X_test)

# Evaluate performance
print(classification_report(y_test, y_pred_best))


              precision    recall  f1-score   support

           0       0.82      1.00      0.90       250
           1       0.99      0.65      0.78       150

    accuracy                           0.86       400
   macro avg       0.91      0.82      0.84       400
weighted avg       0.89      0.86      0.86       400

