In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Read the raw dataset from disk.
data = pd.read_csv('Creditcard_data.csv')

# Split into the label column ('Class') and the remaining feature columns.
y = data['Class']
X = data.drop(columns=['Class'])

# Resample with SMOTE (seeded so repeated runs give the same balanced set).
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

def sample_size_formula(population_size, confidence_level, margin_of_error):
    """Return the sample size for estimating a proportion in a finite population.

    Uses n0 = z^2 * p * (1 - p) / e^2 with the most conservative proportion
    p = 0.5, then applies the finite-population correction
    n = n0 / (1 + (n0 - 1) / N) and rounds up.

    Args:
        population_size: Number of items in the population (N); must be > 0.
        confidence_level: Two-sided confidence level as a fraction in (0, 1),
            e.g. 0.95.
        margin_of_error: Acceptable margin of error (e) as a fraction; must
            be > 0.

    Returns:
        The required sample size as an ``int`` (rounded up).

    Raises:
        ValueError: If any argument is outside its valid range.
    """
    # Local import so the function works without touching the file's imports.
    from statistics import NormalDist

    if population_size <= 0:
        raise ValueError("population_size must be positive")
    if not 0 < confidence_level < 1:
        raise ValueError("confidence_level must be a fraction in (0, 1)")
    if margin_of_error <= 0:
        raise ValueError("margin_of_error must be positive")

    # Conventional rounded z-scores; keeping 1.96 for 0.95 preserves the
    # results this function produced when z was hard-coded.
    conventional_z = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
    z = conventional_z.get(round(confidence_level, 6))
    if z is None:
        # Two-sided interval: put (1 - confidence_level) / 2 in each tail.
        z = NormalDist().inv_cdf((1 + confidence_level) / 2)

    p = 0.5  # maximises p * (1 - p), i.e. the worst-case variance
    e = margin_of_error
    n = (z**2 * p * (1 - p)) / (e**2)
    n = n / (1 + ((n - 1) / population_size))
    return int(np.ceil(n))

# Size of each sample: 95% confidence, 5% margin of error, over the balanced set.
sample_size = sample_size_formula(len(y_balanced), 0.95, 0.05)

# Draw five random samples, one per seed 42..46, pairing each feature
# sample with the labels that share its row index.
samples = []
for seed in range(42, 47):
    sampled_features = X_balanced.sample(n=sample_size, random_state=seed)
    sampled_labels = y_balanced.loc[sampled_features.index]
    samples.append((sampled_features, sampled_labels))

# Display labels for the five samples: 'Sample1' .. 'Sample5'.
sampling_techniques = [f'Sample{number}' for number in range(1, 6)]

# Classifiers to compare, keyed by the name used in the results table.
# Seeded where the estimator accepts a random_state.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'KNN': KNeighborsClassifier(),
}

def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and return its test accuracy.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Held-out features to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        The value of ``accuracy_score(y_test, predictions)``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return accuracy_score(y_test, predictions)

# Evaluate every classifier on every sample with an 80/20 train/test split.
# Rows are collected in a list and turned into a DataFrame once, instead of
# concatenating onto an initially empty DataFrame on each iteration: that
# copied the whole table every time and is the pattern behind the pandas
# warning emitted at the `pd.concat` call.
result_rows = []
for sample_label, (X_sample, y_sample) in zip(sampling_techniques, samples):
    X_train, X_test, y_train, y_test = train_test_split(
        X_sample, y_sample, test_size=0.2, random_state=42
    )
    for model_name, classifier in models.items():
        # Reuse the shared helper rather than repeating fit/predict/score inline.
        accuracy = evaluate_model(classifier, X_train, y_train, X_test, y_test)
        result_rows.append(
            {"Sample": sample_label, "Classifier": model_name, "Accuracy": accuracy}
        )

# Passing `columns` keeps the expected schema even if no rows were produced.
results_df = pd.DataFrame(result_rows, columns=["Sample", "Classifier", "Accuracy"])

# Persist the accuracy table next to the notebook/script.
results_df.to_csv('model_evaluation_result.csv', index=False)
print("Results saved to 'model_evaluation_result.csv'")

# Optional: Download the results file (for Colab or similar environments).
# The import is guarded so the rest of the script still runs where the
# `google.colab` package is not installed.
try:
    from google.colab import files
except ImportError:
    pass
else:
    files.download('model_evaluation_result.csv')

# For each classifier, pick the row with the highest accuracy and record
# which sample produced it, as a (sample label, accuracy) tuple.
final_results = {}
for model_name in models:
    model_rows = results_df[results_df["Classifier"] == model_name]
    best_sampling = model_rows.loc[model_rows["Accuracy"].idxmax()]
    final_results[model_name] = (best_sampling["Sample"], best_sampling["Accuracy"])

print("Results for each Sampling Technique:")
print(results_df)
print("\nBest Sampling Technique for each Model:")
print(final_results)

  results_df = pd.concat([


Results saved to 'model_evaluation_result.csv'


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Results for each Sampling Technique:
     Sample           Classifier  Accuracy
0   Sample1        Random Forest  0.951613
1   Sample1  Logistic Regression  0.919355
2   Sample1                  SVM  0.645161
3   Sample1        Decision Tree  0.903226
4   Sample1                  KNN  0.806452
5   Sample2        Random Forest  1.000000
6   Sample2  Logistic Regression  0.822581
7   Sample2                  SVM  0.629032
8   Sample2        Decision Tree  0.935484
9   Sample2                  KNN  0.645161
10  Sample3        Random Forest  0.983871
11  Sample3  Logistic Regression  0.935484
12  Sample3                  SVM  0.661290
13  Sample3        Decision Tree  0.919355
14  Sample3                  KNN  0.822581
15  Sample4        Random Forest  1.000000
16  Sample4  Logistic Regression  0.887097
17  Sample4                  SVM  0.612903
18  Sample4        Decision Tree  0.935484
19  Sample4                  KNN  0.870968
20  Sample5        Random Forest  0.967742
21  Sample5  Logi