In [None]:
import pandas as pd
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
data = pd.read_csv(url)
data.head()

In [None]:
data.shape

In [None]:
data['Class'].value_counts()

In [None]:
import matplotlib.pyplot as plt

data['Class'].value_counts().plot(kind='bar')
plt.title("Class Distribution (Imbalanced Dataset)")
plt.xlabel("Class")
plt.ylabel("Count")
plt.show()

In [None]:
X = data.drop('Class', axis=1)
y = data['Class']

X.shape, y.shape


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3, #30% of the data for testing & 70% for training
    random_state=42,
    stratify=y
)
X_train.shape, X_test.shape


In [None]:
print("Training set:")
print(y_train.value_counts())

print("\nTesting set:")
print(y_test.value_counts())

In [None]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

In [24]:
#SAMPLING TECHINQUE 1

ros = RandomOverSampler(random_state=42)    #Randomly duplicates fraud cases

# Apply on training data ONLY
X_ros, y_ros = ros.fit_resample(X_train, y_train)

y_ros.value_counts()


Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,534
1,534


In [23]:
#SAMPLING TECHNIQUE 2

from imblearn.under_sampling import RandomUnderSampler  #Removes normal transactions and keeps fraud transactions

rus = RandomUnderSampler(random_state=42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)
y_rus.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,6
1,6


In [25]:
#SAMPLING TECHNIQUE 3

from imblearn.over_sampling import SMOTE    #Creates new fraud samples

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)
y_smote.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,534
1,534


In [20]:
#SAMPLING TECHNIQUE 4

from imblearn.under_sampling import NearMiss   #Removes majority samples closest to minority

nearmiss = NearMiss()
X_nm, y_nm = nearmiss.fit_resample(X_train, y_train)
y_nm.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,6
1,6


In [22]:
#SAMPLING TECHNIQUE 5

from imblearn.combine import SMOTETomek    #SMOTE -> Adds frauds samples && Tomek -> removes noisy overlaps

smt = SMOTETomek(random_state=42)
X_smt, y_smt = smt.fit_resample(X_train, y_train)
y_smt.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,518
1,518


In [26]:
datasets = {
    "Sampling1_ROS": (X_ros, y_ros),
    "Sampling2_RUS": (X_rus, y_rus),
    "Sampling3_SMOTE": (X_smote, y_smote),
    "Sampling4_NearMiss": (X_nm, y_nm),
    "Sampling5_SMOTETomek": (X_smt, y_smt)
}

for name, (X_s, y_s) in datasets.items():
    print(name)
    print(y_s.value_counts(), "\n")


Sampling1_ROS
Class
0    534
1    534
Name: count, dtype: int64 

Sampling2_RUS
Class
0    6
1    6
Name: count, dtype: int64 

Sampling3_SMOTE
Class
0    534
1    534
Name: count, dtype: int64 

Sampling4_NearMiss
Class
0    6
1    6
Name: count, dtype: int64 

Sampling5_SMOTETomek
Class
0    518
1    518
Name: count, dtype: int64 



In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [28]:
models = {
    "M1_LogisticRegression": LogisticRegression(max_iter=1000),
    "M2_DecisionTree": DecisionTreeClassifier(random_state=42),
    "M3_RandomForest": RandomForestClassifier(n_estimators=100, random_state=42),
    "M4_SVM": SVC(),
    "M5_KNN": KNeighborsClassifier()
}

In [31]:
from sklearn.metrics import accuracy_score

In [None]:
import pandas as pd

results = pd.DataFrame(
    index=models.keys(),
    columns=datasets.keys()
)
results

In [34]:
for model_name, model in models.items():  #looping through each ML model
    for sampling_name, (X_s, y_s) in datasets.items():  #looping through each sampled dataset
        model.fit(X_s, y_s)   #train model on balanced training data
        y_pred = model.predict(X_test)    #train on original unseen data
        acc = accuracy_score(y_test, y_pred)   #calculate accuracy
        results.loc[model_name, sampling_name] = round(acc * 100, 2)   #convert to percentage
results

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,Sampling1_ROS,Sampling2_RUS,Sampling3_SMOTE,Sampling4_NearMiss,Sampling5_SMOTETomek
M1_LogisticRegression,91.81,57.76,93.53,18.53,92.67
M2_DecisionTree,96.98,38.79,98.71,15.95,97.84
M3_RandomForest,99.14,66.81,98.71,50.86,98.71
M4_SVM,87.5,74.57,43.53,38.79,43.53
M5_KNN,97.84,75.0,72.41,39.66,73.71


In [35]:
results.to_csv("sampling_results.csv")