In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Load dataset
data = pd.read_csv('/content/Creditcard_data.csv')

# Separate majority and minority classes
majority_class = data[data['Class'] == 0]
minority_class = data[data['Class'] == 1]

# Balance the dataset using oversampling the minority class
minority_oversampled = resample(
    minority_class,
    replace=True,
    n_samples=len(majority_class),
    random_state=42
)

balanced_data = pd.concat([majority_class, minority_oversampled])
balanced_data = shuffle(balanced_data, random_state=42)

# Split features and target
X = balanced_data.drop('Class', axis=1)
y = balanced_data['Class']

# Scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Function to create samples using various techniques
def create_samples(X, y, technique, sample_size):
    if technique == 'simple_random':
        X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, random_state=42)
    elif technique == 'systematic':
        step = len(X) // sample_size
        indices = np.arange(0, len(X), step)
        X_sample = X[indices]
        y_sample = y.iloc[indices]
    elif technique == 'stratified':
        X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, stratify=y, random_state=42)
    elif technique == 'cross_validation':
        skf = StratifiedKFold(n_splits=5)
        for train_index, test_index in skf.split(X, y):
            X_sample = X[train_index]
            y_sample = y.iloc[train_index]
            break
    elif technique == 'bootstrap':
        X_sample, y_sample = resample(X, y, replace=True, n_samples=sample_size, random_state=42)
    else:
        raise ValueError("Unknown sampling technique")
    return X_sample, y_sample

# Initialize models
models = {
    'M1': RandomForestClassifier(random_state=42),
    'M2': LogisticRegression(random_state=42, max_iter=1000),  # Increased max_iter
    'M3': SVC(random_state=42),
    'M4': DecisionTreeClassifier(random_state=42),
    'M5': GaussianNB()
}

# Sampling techniques
sampling_techniques = ['simple_random', 'systematic', 'stratified', 'cross_validation', 'bootstrap']

# Sample size detection formula (assume 10% of the dataset as an example)
sample_size = int(0.1 * len(X))

results = []

# Apply sampling techniques and train models
for technique in sampling_techniques:
    X_sample, y_sample = create_samples(X, y, technique, sample_size)
    for model_name, model in models.items():

        X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results.append({'Model': model_name, 'Sampling_Technique': technique, 'Accuracy': accuracy})

# Convert results to DataFrame and display
results_df = pd.DataFrame(results)
results_pivot = results_df.pivot(index='Model', columns='Sampling_Technique', values='Accuracy')
print(results_pivot)

output_path = 'sampling_results.csv'
results_pivot.to_csv(output_path)
print(f"Results saved to {output_path}")


Sampling_Technique  bootstrap  cross_validation  simple_random  stratified  \
Model                                                                        
M1                   0.967742          1.000000       1.000000    1.000000   
M2                   0.838710          0.897541       0.935484    0.903226   
M3                   0.935484          0.967213       0.935484    1.000000   
M4                   1.000000          1.000000       0.967742    1.000000   
M5                   0.774194          0.823770       0.774194    0.709677   

Sampling_Technique  systematic  
Model                           
M1                    0.903226  
M2                    0.870968  
M3                    0.838710  
M4                    0.935484  
M5                    0.806452  
Results saved to sampling_results.csv
