In [None]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score, accuracy_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Upload the transactions CSV from the local machine and load it into `data`.
# Every later cell reads the global `data`, so it must be created here.
import io

from google.colab import files

uploaded = files.upload()
if not uploaded:
    raise RuntimeError("No file was uploaded; upload the credit-card CSV to continue.")

# files.upload() returns a {filename: file-bytes} mapping; parse the first
# uploaded file directly from memory so the result does not depend on the
# name Colab used when writing it to disk.
uploaded_name, uploaded_bytes = next(iter(uploaded.items()))
data = pd.read_csv(io.BytesIO(uploaded_bytes))
print(f"Loaded {uploaded_name}: {data.shape[0]} rows x {data.shape[1]} columns")

In [None]:
# First rows and summary statistics of the raw frame
print(data.head())
print(data.describe())

# Column dtypes and non-null counts. DataFrame.info() writes its report to
# stdout itself and returns None, so it must not be wrapped in print().
data.info()

# Largest number of missing values found in any single column
# (printed explicitly: a bare expression in the middle of a cell is discarded).
print('Max missing values in any column:', data.isnull().sum().max())

# Share of each class in the dataset; .get() keeps the cell working even if
# one of the two classes is absent from the loaded file.
class_counts = data['Class'].value_counts()
print('No Frauds', round(class_counts.get(0, 0) / len(data) * 100, 2), '% of the dataset')
print('Frauds', round(class_counts.get(1, 0) / len(data) * 100, 2), '% of the dataset')

In [None]:
# Visual representation of the dataset
import seaborn as sns
import matplotlib.pyplot as plt

# One color per class, listed in class order (0, 1)
colors = ["#0101DF", "#DF0101"]

# Class-balance bar chart. seaborn >= 0.13 deprecates `palette` without `hue`,
# so the x variable is also assigned to `hue`; `legend=False` hides the
# legend that would otherwise just repeat the x tick labels.
sns.countplot(x='Class', hue='Class', data=data, palette=colors, legend=False)
plt.title('Class Distributions \n (0: No Fraud || 1: Fraud)', fontsize=14)

# Side-by-side histograms of the two non-anonymised columns
fig, ax = plt.subplots(1, 2, figsize=(18, 4))

amount_val = data['Amount'].values
time_val = data['Time'].values

# Histogram for transaction amount
sns.histplot(amount_val, ax=ax[0], color='r', kde=True)
ax[0].set_title('Distribution of Transaction Amount', fontsize=14)
# ndarray.min()/max() are vectorised; the Python builtins iterate element by element
ax[0].set_xlim([amount_val.min(), amount_val.max()])

# Histogram for transaction time
sns.histplot(time_val, ax=ax[1], color='b', kde=True)
ax[1].set_title('Distribution of Transaction Time', fontsize=14)
ax[1].set_xlim([time_val.min(), time_val.max()])

# Show the plots
plt.tight_layout()
plt.show()

In [None]:
# Step 1: Preprocessing

# Make the cell safe to re-run: drop dummy columns left by a previous execution,
# otherwise get_dummies below would append a second copy of each.
data = data.drop(columns=[col for col in data.columns if col.startswith('Amount_Bucket')])

# Standardize 'Amount' and 'Time' into new columns (the raw columns are kept).
# NOTE(review): both scalers are fit on the full dataset, i.e. before the
# cross-validation split below, so fold test rows influence the scaling
# statistics. Fit inside each fold (e.g. via a Pipeline) for a leak-free estimate.
amount_scaler = StandardScaler()
time_scaler = StandardScaler()
data['scaled_Amount'] = amount_scaler.fit_transform(data['Amount'].values.reshape(-1, 1)).ravel()
data['scaled_Time'] = time_scaler.fit_transform(data['Time'].values.reshape(-1, 1)).ravel()

# Verify the standardization
print(data[['Time', 'Amount', 'scaled_Time', 'scaled_Amount']].head())

# Step 2: Feature Engineering - Add 'Hour' and 'Amount_Bucket'
# 'Hour' must come from the RAW 'Time' column: the standardized values are
# z-scores, so dividing them by 3600 does not yield an hour of day.
# Assumes 'Time' is measured in seconds - TODO confirm against the data source.
data['Hour'] = (data['Time'] // 3600) % 24

# Bucket amounts at the 25/50/75/95th percentiles. Standardization is monotonic,
# so bucketing the scaled column gives the same groups as bucketing raw amounts.
# duplicates='drop' avoids a ValueError when two percentile edges coincide.
percentiles = np.percentile(data['scaled_Amount'], [25, 50, 75, 95])
data['Amount_Bucket'] = pd.cut(
    data['scaled_Amount'],
    bins=[-float('inf')] + list(percentiles) + [float('inf')],
    labels=False,
    duplicates='drop',
)
# Show a sample only; printing the whole frame floods the cell output
print(data.head())

# One-hot encode 'Amount_Bucket' (first bucket dropped as the reference level)
data = pd.get_dummies(data, columns=['Amount_Bucket'], drop_first=True)

In [None]:
# Step 3: Separate features and target
# The raw 'Time' and 'Amount' columns are excluded: their standardized copies
# ('scaled_Time', 'scaled_Amount') are already in the frame, and leaving the
# unscaled originals in X would dominate distance-based models such as KNN
# and slow convergence of logistic regression.
X = data.drop(columns=['Class', 'Time', 'Amount'])
y = data['Class']

In [None]:
# Candidate classifiers, keyed by the display name used in logs and plots.
# Insertion order is the order in which they are evaluated.
models = {}
models['Random Forest'] = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
)
models['Logistic Regression'] = LogisticRegression(
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
models['Decision Tree'] = DecisionTreeClassifier(
    class_weight='balanced',
    random_state=42,
)
models['KNN'] = KNeighborsClassifier(n_neighbors=5)

In [None]:
from sklearn.metrics import accuracy_score, roc_auc_score
from collections import defaultdict

# Step 4: Perform Stratified K-Fold Cross-Validation
from sklearn.base import clone

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Per-model lists of fold scores: results[name]['accuracy'] / ['roc_auc']
results = defaultdict(lambda: {"accuracy": [], "roc_auc": []})

# Evaluate each model
for model_name, model in models.items():
    print(f"Evaluating {model_name}...")

    for fold, (train_idx, test_idx) in enumerate(kfold.split(X, y), start=1):
        print(f"  Fold {fold}...")
        # Subset the data for the current fold
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Fit a fresh, unfitted copy per fold so no state can carry over from
        # one fold to the next and the templates in `models` stay untouched.
        fold_model = clone(model)
        fold_model.fit(X_train, y_train)

        # Predict on the held-out part of the fold
        y_pred = fold_model.predict(X_test)

        # Accuracy (note: with a heavily imbalanced target this is dominated
        # by the majority class; ROC AUC below is the more informative score)
        accuracy = accuracy_score(y_test, y_pred)
        results[model_name]['accuracy'].append(accuracy)

        # ROC AUC needs class probabilities, which not every estimator exposes
        if hasattr(fold_model, "predict_proba"):
            y_proba = fold_model.predict_proba(X_test)[:, 1]
            roc_auc = roc_auc_score(y_test, y_proba)
            results[model_name]['roc_auc'].append(roc_auc)

    # Average metrics across folds
    avg_accuracy = np.mean(results[model_name]['accuracy'])
    avg_roc_auc = np.mean(results[model_name]['roc_auc']) if results[model_name]['roc_auc'] else None

    print(f"  Average Accuracy: {avg_accuracy:.4f}")
    # Compare against None explicitly: a legitimate score of 0.0 is falsy
    if avg_roc_auc is not None:
        print(f"  Average ROC AUC: {avg_roc_auc:.4f}")

# Summarize results
for model_name, metrics in results.items():
    print(f"\nFinal Results for {model_name}:")
    print(f"  Accuracy: {np.mean(metrics['accuracy']):.4f}")
    if metrics['roc_auc']:
        print(f"  ROC AUC: {np.mean(metrics['roc_auc']):.4f}")

In [None]:
# Step 6: Visualize Results
# Build one average per model from the per-fold scores in `results`.
# (The scalars avg_accuracy / avg_roc_auc left over from the CV loop only hold
# the LAST model's value, so they cannot be plotted against all model names.)
model_names = list(models.keys())
mean_accuracy_by_model = [np.mean(results[name]['accuracy']) for name in model_names]
mean_roc_auc_by_model = [
    np.mean(results[name]['roc_auc']) if results[name]['roc_auc'] else np.nan
    for name in model_names
]

# Accuracy Bar Plot
acc_fig, acc_ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=model_names, y=mean_accuracy_by_model, ax=acc_ax)
acc_ax.set_title('Average Accuracy Across Models')
acc_ax.set_ylabel('Accuracy')
plt.show()

# ROC-AUC Bar Plot
auc_fig, auc_ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=model_names, y=mean_roc_auc_by_model, ax=auc_ax)
auc_ax.set_title('Average ROC-AUC Across Models')
auc_ax.set_ylabel('ROC-AUC')
plt.show()

# Step 7: Display Classification Reports
# X_train / X_test / y_train / y_test are the split of the LAST cross-validation
# fold, left in scope by the CV cell. Each model is (re)fit on that split so the
# report does not depend on whatever state the estimator was left in.
for report_name, report_model in models.items():
    print(f"\nDetailed Classification Report for {report_name} (Last Fold):")
    report_model.fit(X_train, y_train)
    print(classification_report(y_test, report_model.predict(X_test)))

In [None]:
# Pairwise correlations between all columns of the processed frame
correlation_matrix = data.corr()

# Render the matrix as an annotated heatmap on an explicitly created axes
heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Correlation Matrix of Features")

plt.show()

In [None]:
# Write the processed frame to an Excel workbook, without the index column
processed_xlsx_path = '/content/processed_creditcard_data.xlsx'
data.to_excel(processed_xlsx_path, index=False)

In [None]:
# Hand the exported workbook to the Colab file helper for download
download_path = '/content/processed_creditcard_data.xlsx'
files.download(download_path)

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Read the transactions file from the working directory
data = pd.read_csv('creditcard.csv')

# Target is the 'Class' column; every other column is a feature
y = data['Class']
X = data.drop(columns='Class')

# Stratified 70/30 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Step 1: Train and Save the Model and Scaler
# The scaler learns its statistics from the training rows only and is applied
# to just the 'Time' and 'Amount' columns of a copy of the training frame.
scaler = StandardScaler()
scaler.fit(X_train[['Time', 'Amount']])
X_train_scaled = X_train.copy()
X_train_scaled[['Time', 'Amount']] = scaler.transform(X_train[['Time', 'Amount']])

# Fit the Random Forest on the scaled training frame
model = RandomForestClassifier(random_state=42).fit(X_train_scaled, y_train)

# Persist both fitted objects so the prediction helpers can load them from disk
for artifact, artifact_path in ((model, 'random_forest_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, artifact_path)

print("Model and scaler saved successfully.")

# Step 2: Define a function to preprocess the user input
def preprocess_input(user_input):
    """Turn one raw transaction into a single-row frame ready for the model.

    Args:
        user_input: Sequence of feature values, positionally matching the
            columns of the global ``X_train`` frame.

    Returns:
        A one-row DataFrame with the columns of ``X_train`` whose 'Time' and
        'Amount' values have been transformed by the scaler stored in
        'scaler.pkl'.
    """
    scaled_columns = ['Time', 'Amount']

    # Fitted scaler persisted by the training step
    fitted_scaler = joblib.load('scaler.pkl')

    # Wrap the values in a one-row frame so they carry the training column names
    frame = pd.DataFrame([user_input], columns=X_train.columns)

    # Only these two columns were scaled at training time
    frame[scaled_columns] = fitted_scaler.transform(frame[scaled_columns])
    return frame

# Step 3: Define a function to make a prediction
def predict_fraud(user_input):
    """Classify one transaction with the saved Random Forest model.

    Args:
        user_input: Sequence of raw feature values, forwarded unchanged to
            ``preprocess_input``.

    Returns:
        "Fraudulent Transaction" when the model predicts class 1, otherwise
        "Non-Fraudulent Transaction".
    """
    # Model persisted by the training step
    classifier = joblib.load('random_forest_model.pkl')

    # Same scaling as applied to the training data
    features = preprocess_input(user_input)

    # predict() returns one label per row; there is exactly one row here
    predicted_label = classifier.predict(features)[0]
    if predicted_label == 1:
        return "Fraudulent Transaction"
    return "Non-Fraudulent Transaction"

# Step 4: Score one hand-written sample transaction
# The values are positional and must follow the feature-column order of the
# training frame. This sample is meant to represent a non-fraudulent transaction.
user_input = [
    150000,
    -2.123, 0.345, -0.654, 0.567, 1.234, -0.987, 0.456,
    0.652, -0.718, 1.385, 0.962, -0.616, 0.238, 0.2489,
    0.456, 0.654, 0.567, 1.234, -0.956, 0.3155, 0.168,
    0.652, -0.718, -0.123, 0.789, 0.4462, 0.568, 1.205,
    100.0,
]

# Run the sample through preprocessing + model and show the verdict
result = predict_fraud(user_input)
print(result)

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

# Score a second sample with the artifacts produced by the previous cell.
#
# This cell used to be a verbatim copy of the previous one: it re-read the CSV,
# re-trained the identical Random Forest (same data, same seeds), re-saved the
# same pickles and re-defined preprocess_input / predict_fraud. None of that
# changes any result, so only the new sample and the prediction call remain.
# Requires the previous cell to have been run (it defines `predict_fraud`,
# `preprocess_input`, `X_train` and writes the model/scaler files).

# Step 4: Get user input and make a prediction
# The values are positional and must follow the feature-column order of the
# training frame. This sample is meant to represent a fraudulent transaction.
user_input = [
    406,
    -2.312226542, 1.951992011, -1.609850732, 3.997905588, -0.522187865,
    -1.426545319, -2.537387306, 1.391657248, -2.770089277, -2.772272145,
    3.202033207, -2.899907388, -0.595221881, -4.289253782, 0.38972412,
    -1.14074718, -2.830055675, -0.016822468, 0.416955705, 0.126910559,
    0.517232371, -0.035049369, -0.465211076, 0.320198199, 0.044519167,
    0.177839798, 0.261145003, -0.143275875,
    0,
]

# Call the prediction function
result = predict_fraud(user_input)
print(result)