In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv('Creditcard_data.csv')

In [17]:
data.shape

(772, 31)

In [12]:
data['Class'].value_counts()

Class
0    763
1      9
Name: count, dtype: int64

In [None]:
# Generating synthetic samples for minority class to balance the dataset...
from imblearn.over_sampling import SMOTE

X = data.drop('Class', axis=1)
y = data['Class']

smote = SMOTE(random_state=42, k_neighbors=5)
X_smote, y_smote = smote.fit_resample(X, y)
smote_data = pd.DataFrame(X_smote, columns=X.columns)
smote_data['Class'] = y_smote

print("SMOTE Oversampling - Class Distribution:")
print(smote_data['Class'].value_counts())
print(f"Total Sample Size: {len(smote_data)}")
print(f"New Synthetic Samples Created: {len(smote_data) - len(data)}")
print(f"All original records preserved and minority class duplicated/synthesized\n")

SMOTE Oversampling - Class Distribution:
Class
0    763
1    763
Name: count, dtype: int64
Total Sample Size: 1526
New Synthetic Samples Created: 754
All original records preserved and minority class duplicated/synthesized



[WinError 2] The system cannot find the file specified
  File "c:\Users\bhati\AppData\Local\Programs\Python\Python312\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "c:\Users\bhati\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\bhati\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "c:\Users\bhati\AppData\Local\Programs\Python\Python312\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^


In [26]:
# 1. Simple Random Sampling
# Randomly select samples without replacement
sample_size = 100
random_sample = smote_data.sample(n=sample_size, random_state=202)
print("Random Sampling - Class Distribution:")
print(random_sample['Class'].value_counts())
print(f"Sample Size: {len(random_sample)}\n")

Random Sampling - Class Distribution:
Class
0    51
1    49
Name: count, dtype: int64
Sample Size: 100



In [27]:
# 2. Systematic Sampling
# Select every kth sample
k = len(smote_data) // 100  # Calculate interval
systematic_sample = smote_data.iloc[::k]
print("Systematic Sampling - Class Distribution:")
print(systematic_sample['Class'].value_counts())
print(f"Sample Size: {len(systematic_sample)}\n")

Systematic Sampling - Class Distribution:
Class
0    52
1    50
Name: count, dtype: int64
Sample Size: 102



In [28]:
# 3. Stratified Sampling
# Sample proportionally from each class (best for unbalanced data)
stratified_sample = smote_data.groupby('Class', group_keys=False).apply(
    lambda x: x.sample(frac=0.5, random_state=42)
)
print("Stratified Sampling - Class Distribution:")
print(stratified_sample['Class'].value_counts())
print(f"Sample Size: {len(stratified_sample)}")
print(f"Class 0 Percentage: {len(stratified_sample[stratified_sample['Class'] == 0]) / len(stratified_sample) * 100:.2f}%")
print(f"Class 1 Percentage: {len(stratified_sample[stratified_sample['Class'] == 1]) / len(stratified_sample) * 100:.2f}%\n")

Stratified Sampling - Class Distribution:
Class
0    382
1    382
Name: count, dtype: int64
Sample Size: 764
Class 0 Percentage: 50.00%
Class 1 Percentage: 50.00%



  stratified_sample = smote_data.groupby('Class', group_keys=False).apply(


In [30]:
# 4. Cluster Sampling
# Divide data into clusters and randomly select clusters
from sklearn.cluster import KMeans

n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
smote_data['Cluster'] = kmeans.fit_predict(smote_data.drop('Class', axis=1))

# Randomly select clusters
selected_clusters = np.random.choice(n_clusters, size=3, replace=False)
cluster_sample = smote_data[smote_data['Cluster'].isin(selected_clusters)].drop('Cluster', axis=1)

print("Cluster Sampling - Class Distribution:")
print(cluster_sample['Class'].value_counts())
print(f"Sample Size: {len(cluster_sample)}\n")

Cluster Sampling - Class Distribution:
Class
1    396
0    297
Name: count, dtype: int64
Sample Size: 693



In [32]:
# 5. Cross-Validation (K-Fold Split)
# Divide data into k-folds for validation
from sklearn.model_selection import KFold

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
fold_number = 1

print("Cross-Validation (K-Fold) - Class Distribution per Fold:")
for train_idx, test_idx in kfold.split(smote_data):
    train_fold = smote_data.iloc[train_idx]
    test_fold = smote_data.iloc[test_idx]
    print(f"\nFold {fold_number}:")
    print(f"  Training Set Size: {len(train_fold)}, Test Set Size: {len(test_fold)}")
    print(f"  Training - Class 0: {len(train_fold[train_fold['Class'] == 0])}, Class 1: {len(train_fold[train_fold['Class'] == 1])}")
    print(f"  Test - Class 0: {len(test_fold[test_fold['Class'] == 0])}, Class 1: {len(test_fold[test_fold['Class'] == 1])}")
    fold_number += 1

Cross-Validation (K-Fold) - Class Distribution per Fold:

Fold 1:
  Training Set Size: 1220, Test Set Size: 306
  Training - Class 0: 620, Class 1: 600
  Test - Class 0: 143, Class 1: 163

Fold 2:
  Training Set Size: 1221, Test Set Size: 305
  Training - Class 0: 603, Class 1: 618
  Test - Class 0: 160, Class 1: 145

Fold 3:
  Training Set Size: 1221, Test Set Size: 305
  Training - Class 0: 603, Class 1: 618
  Test - Class 0: 160, Class 1: 145

Fold 4:
  Training Set Size: 1221, Test Set Size: 305
  Training - Class 0: 601, Class 1: 620
  Test - Class 0: 162, Class 1: 143

Fold 5:
  Training Set Size: 1221, Test Set Size: 305
  Training - Class 0: 625, Class 1: 596
  Test - Class 0: 138, Class 1: 167


In [33]:
# 6. Bootstrap Sampling
# Sampling with replacement
sample_size = 100
bootstrap_sample = smote_data.sample(n=sample_size, replace=True, random_state=42)

print("\nBootstrap Sampling - Class Distribution:")
print(bootstrap_sample['Class'].value_counts())
print(f"Sample Size: {len(bootstrap_sample)}")
print(f"Unique records: {len(bootstrap_sample.drop_duplicates())}")
print("\nBootstrap Sampling allows repeated selection of the same records")


Bootstrap Sampling - Class Distribution:
Class
1    58
0    42
Name: count, dtype: int64
Sample Size: 100
Unique records: 97

Bootstrap Sampling allows repeated selection of the same records


In [None]:
# Import required libraries for model training and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, sensitivity_score, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Define 5 different models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Support Vector Machine': SVC(kernel='rbf', random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    'Decision Tree': DecisionTreeClassifier(random_state=42)
}

# Dictionary to store all samples
samples = {
    'Original Data': data,
    'Random Sampling': random_sample,
    'Systematic Sampling': systematic_sample,
    'Stratified Sampling': stratified_sample,
    'Cluster Sampling': cluster_sample,
    'Bootstrap Sampling': bootstrap_sample,
    'SMOTE Oversampling': smote_data
}

print(f"Models to train: {list(models.keys())}")
print(f"Samples available: {list(samples.keys())}")

In [None]:
# Function to train and evaluate a model
def train_and_evaluate(model, X_train, X_test, y_train, y_test):
    """Train model and calculate evaluation metrics"""
    # Scale the features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Train the model
    model.fit(X_train_scaled, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_scaled)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    sensitivity = recall  # Sensitivity = Recall for binary classification
    
    return {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'Sensitivity': sensitivity
    }

print("Training and evaluation function defined!")

In [None]:
# Train and evaluate all models on all samples
results_summary = {}

for sample_name, sample_data in samples.items():
    results_summary[sample_name] = {}
    
    # Prepare data (80% train, 20% test)
    X = sample_data.drop('Class', axis=1)
    y = sample_data['Class']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    print(f"\n{'='*70}")
    print(f"Training Models on: {sample_name}")
    print(f"Training Set Size: {len(X_train)}, Test Set Size: {len(X_test)}")
    print(f"{'='*70}")
    
    for model_name, model in models.items():
        metrics = train_and_evaluate(model, X_train, X_test, y_train, y_test)
        results_summary[sample_name][model_name] = metrics
        
        print(f"\n{model_name}:")
        print(f"  Accuracy:   {metrics['Accuracy']:.4f}")
        print(f"  Precision:  {metrics['Precision']:.4f}")
        print(f"  Recall:     {metrics['Recall']:.4f}")
        print(f"  Sensitivity:{metrics['Sensitivity']:.4f}")

print("\n" + "="*70)
print("Training Complete!")
print("="*70)

In [None]:
# Create comprehensive comparison tables
import pandas as pd

print("\n" + "="*100)
print("MODEL PERFORMANCE COMPARISON ACROSS ALL SAMPLING TECHNIQUES")
print("="*100)

# For each metric, create a table
metrics_list = ['Accuracy', 'Precision', 'Recall', 'Sensitivity']

for metric in metrics_list:
    print(f"\n{metric.upper()} SCORES:")
    print("-" * 100)
    
    metric_data = []
    for sample_name in samples.keys():
        row = {'Sampling Method': sample_name}
        for model_name in models.keys():
            if sample_name in results_summary and model_name in results_summary[sample_name]:
                row[model_name] = f"{results_summary[sample_name][model_name][metric]:.4f}"
            else:
                row[model_name] = "N/A"
        metric_data.append(row)
    
    metric_df = pd.DataFrame(metric_data)
    print(metric_df.to_string(index=False))

print("\n" + "="*100)

In [None]:
# Summary: Best performing combinations
print("\n" + "="*100)
print("BEST PERFORMING MODEL-SAMPLING COMBINATIONS")
print("="*100)

for metric in metrics_list:
    print(f"\nBest {metric} Score:")
    
    best_score = 0
    best_combo = None
    
    for sample_name in samples.keys():
        for model_name in models.keys():
            if sample_name in results_summary and model_name in results_summary[sample_name]:
                score = results_summary[sample_name][model_name][metric]
                if score > best_score:
                    best_score = score
                    best_combo = (sample_name, model_name, score)
    
    if best_combo:
        print(f"  Sampling: {best_combo[0]}")
        print(f"  Model: {best_combo[1]}")
        print(f"  Score: {best_combo[2]:.4f}")