<a href="https://colab.research.google.com/github/prarthana127/Sampling-Prarthana-102383015/blob/main/Sampling_Prarthana_102383015.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sampling Techniques and Model Evaluation

1. **Data Preprocessing**: Normalize the 'Amount' feature and prepare the dataset for analysis.

2. **Sampling Techniques**: Implement Simple Random Sampling, Systematic Sampling, Cluster Sampling, Stratified Sampling, and Bootstrap Sampling.

3. **Machine Learning Models**: Test and evaluate models including Random Forest, Logistic Regression, Support Vector Machine (SVM), Decision Trees, and AdaBoost for each sampling method.

4. **Model Evaluation**: Compute accuracy scores for all models to compare and assess their performance.









In [132]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.preprocessing import normalize

In [133]:
url = "https://raw.githubusercontent.com/AnjulaMehto/Sampling_Assignment/main/Creditcard_data.csv"
df = pd.read_csv(url)

In [134]:
df.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,1
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


In [135]:
df.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,...,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0,772.0
mean,283.005181,-0.176963,0.217169,0.875172,0.285628,-0.005029,0.159081,0.123329,-0.057547,-0.030384,...,0.004888,-0.096995,-0.040344,-0.002501,0.114337,0.022782,0.023353,-0.017045,68.66829,0.011658
std,171.834196,1.294724,1.173401,1.031878,1.258758,1.098143,1.225682,0.852075,0.830144,0.878183,...,0.609335,0.607228,0.358724,0.621507,0.429667,0.484227,0.300934,0.278332,197.838269,0.107411
min,0.0,-6.093248,-12.114213,-5.694973,-4.657545,-6.631951,-3.498447,-4.925568,-7.494658,-2.770089,...,-4.134608,-2.776923,-3.553381,-1.867208,-1.389079,-1.243924,-2.377933,-2.735623,0.0,0.0
25%,126.5,-0.896416,-0.174684,0.308677,-0.460058,-0.534567,-0.630717,-0.296289,-0.16788,-0.517068,...,-0.213746,-0.525289,-0.176915,-0.379766,-0.166227,-0.313631,-0.047868,-0.033083,5.9875,0.0
50%,282.0,-0.382618,0.285843,0.905435,0.395919,-0.116612,-0.109581,0.116329,0.034755,-0.08227,...,-0.075802,-0.076551,-0.048353,0.091886,0.143723,-0.026414,0.023199,0.021034,16.665,0.0
75%,432.0,1.110739,0.885745,1.532969,1.117559,0.452818,0.482972,0.57539,0.252395,0.412261,...,0.095149,0.307438,0.070085,0.426339,0.425798,0.260408,0.112199,0.087023,55.5275,0.0
max,581.0,1.586093,5.267376,3.772857,4.075817,7.672544,5.122103,4.808426,2.134599,5.459274,...,5.27342,1.57475,3.150413,1.215279,1.13672,3.087444,2.490503,1.57538,3828.04,1.0


In [136]:
df.Class.value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,763
1,9


The dataset is clearly unbalanced. Used **SMOTE** to balance the dataset. **SMOTE** creates synthetic samples of the minority class by interpolating between existing samples.

In [137]:
from sklearn.preprocessing import StandardScaler
df.iloc[:, 1:30] = StandardScaler().fit_transform(df.iloc[:, 1:30])
data_matrix = df.values

In [138]:
x = df.drop('Class', axis=1)
y = df['Class']

In [139]:
x.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0,-0.91418,-0.247263,1.6109,0.868503,-0.303702,0.24762,0.136543,0.188336,0.449139,...,0.323467,-0.038091,0.617686,-0.195625,0.111783,0.033076,-0.437881,0.366449,-0.01441,0.409447
1,0,1.057914,0.04177,-0.687243,0.1292,0.059272,-0.197113,-0.237378,0.171947,-0.256424,...,-0.232448,-0.378795,-0.892627,0.395078,-0.543138,0.123043,0.213082,-0.107521,0.114215,-0.333712
2,1,-0.913057,-1.328056,0.870859,0.074846,-0.453941,1.340056,0.784631,0.367913,-1.691257,...,0.797984,0.399235,1.431487,2.649315,-1.105739,-1.029321,-0.334519,-0.261707,-0.153538,1.56791
3,1,-0.61003,-0.343153,0.890044,-0.913332,-0.004811,0.888345,0.134206,0.524325,-1.545829,...,-0.473472,-0.185878,0.168529,-0.418355,-1.88869,1.24139,-0.505691,0.130911,0.28223,0.277334
4,2,-0.75839,0.563316,0.653161,0.093332,-0.36646,-0.051563,0.551496,-0.256731,0.966396,...,0.596017,-0.023514,1.475319,-0.270897,0.231472,-0.746053,0.990903,0.651959,0.83479,0.006685


In [140]:
from imblearn.over_sampling import SMOTE
sampler = SMOTE(sampling_strategy=0.95)
x_resample, y_resample = sampler.fit_resample(x, y)
print(y_resample.value_counts())

Class
0    763
1    724
Name: count, dtype: int64


In [141]:
resample = pd.concat([x_resample, y_resample], axis=1)

## Simple Random Sampling

In [142]:
n = int((1.96*1.96 * 0.5*0.5)/(0.05**2))
SimpleSampling = resample.sample(n=n, random_state=42)
SimpleSampling.shape

X = SimpleSampling.drop('Class', axis=1)
y = SimpleSampling['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=1000)
svm_model = SVC(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
ada_model = AdaBoostClassifier(random_state=42)

models = [rf_model, lr_model, svm_model, dt_model, ada_model]
model_names = ['Random Forest', 'Logistic Regression', 'SVM', 'Decision Trees', 'AdaBoost']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

results = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies
})
print(results)
best_model_idx = np.argmax(accuracies)
print(f"\nBest Model for Simple Random Sampling: {model_names[best_model_idx]} with Accuracy: {accuracies[best_model_idx]:.4f}")


                 Model  Accuracy
0        Random Forest  0.987013
1  Logistic Regression  0.883117
2                  SVM  0.753247
3       Decision Trees  0.935065
4             AdaBoost  0.935065

Best Model for Simple Random Sampling: Random Forest with Accuracy: 0.9870


## Systematic Sampling

In [143]:
import random

SystematicSampling = resample.sample(frac=1, random_state=42).reset_index(drop=True)

sampling_interval = 2
SystematicSample = SystematicSampling.iloc[::sampling_interval]
SystematicSample.shape

X = SystematicSample.drop('Class', axis=1)
y = SystematicSample['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=1000)
svm_model = SVC(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
ada_model = AdaBoostClassifier(random_state=42)

models = [rf_model, lr_model, svm_model, dt_model, ada_model]
model_names = ['Random Forest', 'Logistic Regression', 'SVM', 'Decision Trees', 'AdaBoost']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

results = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies
})
print(results)
best_model_idx = np.argmax(accuracies)
print(f"\nBest Model for Systematic Sampling: {model_names[best_model_idx]} with Accuracy: {accuracies[best_model_idx]:.4f}")


                 Model  Accuracy
0        Random Forest  0.993289
1  Logistic Regression  0.926174
2                  SVM  0.718121
3       Decision Trees  0.979866
4             AdaBoost  0.966443

Best Model for Systematic Sampling: Random Forest with Accuracy: 0.9933


## Cluster Sampling

In [144]:
from sklearn.cluster import KMeans

num_clusters = 10

kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)

clusters = kmeans.fit_predict(resample)
clusters = pd.Series(clusters)

selected_clusters = random.sample(range(num_clusters), 3)
ClusterSample = resample.loc[clusters.isin(selected_clusters)]
print(ClusterSample.shape)

X = ClusterSample.drop('Class', axis=1)
y = ClusterSample['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=1000)
svm_model = SVC(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
AdaBoostClassifier_model = AdaBoostClassifier(random_state=42)

models = [rf_model, lr_model, svm_model, dt_model, ada_model]
model_names = ['Random Forest', 'Logistic Regression', 'SVM', 'Decision Trees', 'AdaBoost']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

results = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies
})
print(results)
best_model_idx = np.argmax(accuracies)
print(f"\nBest Model for Cluster Sampling: {model_names[best_model_idx]} with Accuracy: {accuracies[best_model_idx]:.4f}")

(466, 31)
                 Model  Accuracy
0        Random Forest  0.989362
1  Logistic Regression  0.925532
2                  SVM  0.765957
3       Decision Trees  0.989362
4             AdaBoost  0.989362

Best Model for Cluster Sampling: Random Forest with Accuracy: 0.9894


## Stratified Sampling

In [145]:
n = int((1.96*1.96 * 0.5*0.5)/((0.05)**2))
StratifiedSampling = resample.groupby('Class')
StratifiedSample=StratifiedSampling.sample(frac= 0.45)
StratifiedSample.shape

X = StratifiedSample.drop('Class', axis=1)
y = StratifiedSample['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=750)
svm_model = SVC(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
ada_model = AdaBoostClassifier(random_state=42)

models = [rf_model, lr_model, svm_model, dt_model, ada_model]
model_names = ['Random Forest', 'Logistic Regression', 'SVM', 'Decision Trees', 'AdaBoost']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

results = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies
})
print(results)
best_model_idx = np.argmax(accuracies)
print(f"\nBest Model for Stratified Sampling: {model_names[best_model_idx]} with Accuracy: {accuracies[best_model_idx]:.4f}")



                 Model  Accuracy
0        Random Forest  1.000000
1  Logistic Regression  0.902985
2                  SVM  0.626866
3       Decision Trees  0.917910
4             AdaBoost  0.985075

Best Model for Stratified Sampling: Random Forest with Accuracy: 1.0000


## Bootstrap Sampling

In [146]:
n_bootstrap = 100
desired_sample_size = 400
BootstrapSamples = pd.DataFrame()
for _ in range(n_bootstrap):
    resampled_data = resample.sample(n=len(df), replace=True, random_state=42)
    BootstrapSamples = pd.concat([BootstrapSamples, resampled_data])
    if BootstrapSamples.shape[0] >= desired_sample_size:
        break
BootstrapSamples = BootstrapSamples.iloc[:desired_sample_size, :]
print("Final Shape of Bootstrap Samples DataFrame:", BootstrapSamples.shape)

X = BootstrapSamples.drop('Class', axis=1)
y = BootstrapSamples['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models
rf_model = RandomForestClassifier(random_state=42)
lr_model = LogisticRegression(max_iter=750)
svm_model = SVC(random_state=42)
dt_model = DecisionTreeClassifier(random_state=42)
ada_model = AdaBoostClassifier(random_state=42)

models = [rf_model, lr_model, svm_model, dt_model, ada_model]
model_names = ['Random Forest', 'Logistic Regression', 'SVM', 'Decision Trees', 'AdaBoost']

accuracies = []

for model, name in zip(models, model_names):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

results = pd.DataFrame({
    'Model': model_names,
    'Accuracy': accuracies
})
print(results)
best_model_idx = np.argmax(accuracies)
print(f"\nBest Model for Bootstrap Sampling: {model_names[best_model_idx]} with Accuracy: {accuracies[best_model_idx]:.4f}")



Final Shape of Bootstrap Samples DataFrame: (400, 31)
                 Model  Accuracy
0        Random Forest     1.000
1  Logistic Regression     0.925
2                  SVM     0.675
3       Decision Trees     0.950
4             AdaBoost     0.950

Best Model for Bootstrap Sampling: Random Forest with Accuracy: 1.0000
