### Detecting & Handling Imbalanced Data: Visualizing Class Imbalance
**Question**: Load the Credit Card Fraud Detection dataset and visualize the class imbalance. Then apply random undersampling to balance it.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the Credit Card Fraud Detection dataset into `df`.
#
# A missing file is re-raised rather than handled with exit(): in a notebook
# exit() did not stop the cell, so the following statements still ran and
# failed later with an unrelated "NameError: name 'df' is not defined".
# Raising here stops execution at the real cause and keeps the helpful hint.
try:
    df = pd.read_csv('creditcard.csv')  # Replace 'creditcard.csv' with the actual path if needed
except FileNotFoundError as err:
    raise FileNotFoundError(
        "Error: creditcard.csv not found. Please make sure the file is in the correct directory."
    ) from err
else:
    # Only report success when the read actually succeeded.
    print("Credit Card Fraud Detection dataset loaded successfully.\n")

# --- Step 1: Visualize Class Imbalance ---
print("--- Visualizing Class Imbalance ---")

# Tally how many rows carry each value of the 'Class' column
class_counts = df['Class'].value_counts()
print("Class distribution:\n", class_counts)

# Bar chart of the tallies: one bar per class label, bar height = row count
plt.figure(figsize=(8, 6))
ax = sns.barplot(x=class_counts.index, y=class_counts.to_numpy())
ax.set_title('Class Distribution in Credit Card Fraud Detection Dataset')
ax.set_xlabel('Class (0: Not Fraud, 1: Fraud)')
ax.set_ylabel('Number of Transactions')
plt.show()

# Share of each class, expressed as a percentage of all rows in df
total_transactions = df.shape[0]
fraud_percentage = class_counts.loc[1] / total_transactions * 100
non_fraud_percentage = class_counts.loc[0] / total_transactions * 100
print(f"\nPercentage of Fraudulent Transactions: {fraud_percentage:.2f}%")
print(f"Percentage of Non-Fraudulent Transactions: {non_fraud_percentage:.2f}%")

# --- Step 2: Apply Random Undersampling to Balance Data ---
print("\n--- Applying Random Undersampling ---")

# Split the rows by label: Class == 1 (fraud) and Class == 0 (non-fraud)
is_fraud = df['Class'] == 1
is_non_fraud = df['Class'] == 0
fraud_df = df.loc[is_fraud]
non_fraud_df = df.loc[is_non_fraud]

# The fraud row count sets the size of the non-fraud sample drawn below
num_fraud = fraud_df.shape[0]
print(f"Number of fraud transactions: {num_fraud}")

# Draw that many non-fraud rows at random (fixed seed for reproducibility)
undersampled_non_fraud = non_fraud_df.sample(n=num_fraud, random_state=42)

# Stack the sampled non-fraud rows on top of every fraud row
undersampled_df = pd.concat([undersampled_non_fraud, fraud_df])

# Shuffle the combined rows (fixed seed), then renumber the index from 0
undersampled_df = undersampled_df.sample(frac=1, random_state=42)
undersampled_df = undersampled_df.reset_index(drop=True)

# --- Step 3: Visualize the Balanced Class Distribution ---
print("\n--- Visualizing Balanced Class Distribution After Undersampling ---")

# Tally rows per class label in the undersampled frame
undersampled_class_counts = undersampled_df['Class'].value_counts()
print("Class distribution after undersampling:\n", undersampled_class_counts)

# Bar chart of the tallies after undersampling
plt.figure(figsize=(8, 6))
ax = sns.barplot(
    x=undersampled_class_counts.index,
    y=undersampled_class_counts.to_numpy(),
)
ax.set_title('Class Distribution After Random Undersampling')
ax.set_xlabel('Class (0: Not Fraud, 1: Fraud)')
ax.set_ylabel('Number of Transactions')
plt.show()

# Share of each class, as a percentage of all rows in the undersampled frame
total_undersampled = undersampled_df.shape[0]
undersampled_fraud_percentage = undersampled_class_counts.loc[1] / total_undersampled * 100
undersampled_non_fraud_percentage = undersampled_class_counts.loc[0] / total_undersampled * 100
print(f"\nPercentage of Fraudulent Transactions (Undersampled): {undersampled_fraud_percentage:.2f}%")
print(f"Percentage of Non-Fraudulent Transactions (Undersampled): {undersampled_non_fraud_percentage:.2f}%")

print("\nClass imbalance visualized and random undersampling applied.")

Error: creditcard.csv not found. Please make sure the file is in the correct directory.
--- Visualizing Class Imbalance ---


NameError: name 'df' is not defined

: 