In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Generate synthetic dataset
np.random.seed(42)  # For reproducibility
data = {
    'Math': np.random.randint(60, 100, size=100),     # Random Math scores
    'Science': np.random.randint(60, 100, size=100),  # Random Science scores
    'English': np.random.randint(60, 100, size=100),  # Random English scores
    'Target': np.random.randint(0, 2, size=100)       # Random binary target: 0 or 1 (Fail/Pass)
}

# Create a DataFrame
df = pd.DataFrame(data)

# Feature Engineering: Create new features
df['Total_Score'] = df['Math'] + df['Science'] + df['English']  # Sum of subject scores
df['Average_Score'] = df[['Math', 'Science', 'English']].mean(axis=1)  # Average of subject scores

# Display the first few rows of the dataset to see the new features
print("Dataset after feature engineering:")
print(df.head())

# Split dataset into features (X) and target (y)
X = df.drop('Target', axis=1)  # Features: all columns except 'Target'
y = df['Target']  # Target: 'Target' column

# Optionally scale the features (important for certain models like SVM)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Define model: Random Forest Classifier
model = RandomForestClassifier(random_state=42)

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the forest
    'max_depth': [5, 10, 15, None],  # Maximum depth of each tree
    'min_samples_split': [2, 5],     # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2],      # Minimum samples required to be at a leaf node
}

# Initialize GridSearchCV to search over the parameter grid with 5-fold cross-validation
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Output the best parameters and the best score
print("\nBest Hyperparameters found by GridSearchCV:")
print(grid_search.best_params_)
print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

# Evaluate the model with the best hyperparameters on the test set
best_model = grid_search.best_estimator_

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Evaluate performance: accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {accuracy:.4f}")



Dataset after feature engineering:
   Math  Science  English  Target  Total_Score  Average_Score
0    98       82       65       0          245      81.666667
1    88       83       75       1          246      82.000000
2    74       96       88       0          258      86.000000
3    67       94       62       0          223      74.333333
4    80       99       79       0          258      86.000000

Best Hyperparameters found by GridSearchCV:
{'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Accuracy: 0.6875

Test Accuracy: 0.5000


In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# Step 1: Generate Fraud Detection Dataset (fraud_detection.csv)
# Set random seed for reproducibility
np.random.seed(42)

# Number of rows in the dataset
n = 1000  # You can adjust this value to change the dataset size

# Generate random data for each column
transaction_ids = np.arange(1001, 1001 + n)  # Transaction ID from 1001 to n+1000
amounts = np.random.uniform(10, 1000, n)  # Random transaction amounts between 10 and 1000
types = np.random.choice(['credit', 'debit'], n)  # Random transaction type: credit or debit
is_fraud = np.random.choice([0, 1], n, p=[0.95, 0.05])  # 5% fraud, 95% legitimate

# Create a DataFrame
df = pd.DataFrame({
    'Transaction ID': transaction_ids,
    'Amount': amounts,
    'Type': types,
    'Is Fraud': is_fraud
})

# Save the DataFrame to a CSV file
df.to_csv('fraud_detection.csv', index=False)

# Output the first few rows to verify the dataset
print("Dataset generated and saved as 'fraud_detection.csv'")
print(df.head())

# Step 2: Load & Preprocess the Dataset
df = pd.read_csv('fraud_detection.csv')

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

# Fill missing values (if any) - For simplicity, let's fill with mean for numeric columns
df['Amount'] = df['Amount'].fillna(df['Amount'].mean())

# Step 3: Convert categorical variables using Label Encoding
# Convert 'Type' column (e.g., credit/debit) to numeric values using LabelEncoder
le = LabelEncoder()
df['Type'] = le.fit_transform(df['Type'])

# Step 4: Feature Engineering
# Create new features like transaction-to-amount ratio
df['Transaction_to_Amount_Ratio'] = df['Transaction ID'] / df['Amount']

# Step 5: Split the dataset into features (X) and target (y)
X = df.drop(columns=['Transaction ID', 'Is Fraud'])  # Drop unnecessary columns
y = df['Is Fraud']  # Target column: Is Fraud

# Step 6: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Step 7: Train a Decision Tree Classifier
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Step 8: Evaluate Model Performance
# Predict on the test set
y_pred = model.predict(X_test)

# Compute precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print confusion matrix for better understanding
conf_matrix = confusion_matrix(y_test, y_pred)

# Output performance metrics
print("\nPerformance Metrics:")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# Display confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)


Dataset generated and saved as 'fraud_detection.csv'
   Transaction ID      Amount    Type  Is Fraud
0            1001  380.794718  credit         0
1            1002  951.207163   debit         0
2            1003  734.674002   debit         0
3            1004  602.671899   debit         0
4            1005  164.458454   debit         0

Missing values in each column:
Transaction ID    0
Amount            0
Type              0
Is Fraud          0
dtype: int64

Performance Metrics:
Precision: 0.0333
Recall: 0.0714
F1-Score: 0.0455

Confusion Matrix:
[[257  29]
 [ 13   1]]
