Step 1: Load the Dataset

In [1]:
import pandas as pd

# Load the credit-card dataset from the CSV file into a DataFrame
file_path = "/content/Creditcard_data.csv"
data = pd.read_csv(file_path)

# Display the first few rows
print(data.head())

# Check basic info (column dtypes, non-null counts, memory usage).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- that would add a stray "None" line.
data.info()


   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26       V27       V28 

Step 2: Check Class Distribution

In [2]:
# Count how many rows fall into each value of the target column.
# 'Class' is the target column here; substitute the real name if it differs.
class_counts = data['Class'].value_counts()
print(class_counts)


Class
0    763
1      9
Name: count, dtype: int64


Step 3: Balance the Dataset

In [3]:
from imblearn.over_sampling import SMOTE

# Split the frame into the feature matrix (everything except 'Class')
# and the target vector ('Class'); change the name if the target differs.
X = data.drop(columns='Class')
y = data['Class']

# Oversample with SMOTE using a fixed seed so the result is repeatable
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, y)

# Report the per-class row counts after resampling
balanced_counts = pd.Series(y_balanced).value_counts()
print(balanced_counts)


Class
0    763
1    763
Name: count, dtype: int64


Step 4: Calculate the Sample Size

In [4]:
import math

# Inputs to the sample-size formula
N = len(X_balanced)  # population size: number of rows in the balanced data
Z = 1.96             # z-score used for a 95% confidence level
p = 0.5              # estimated proportion
E = 0.05             # margin of error

# sample_size = ceil( N*Z^2*p*(1-p) / ( E^2*(N-1) + Z^2*p*(1-p) ) )
numerator = N * Z**2 * p * (1 - p)
denominator = (E**2 * (N - 1)) + (Z**2 * p * (1 - p))
sample_size = math.ceil(numerator / denominator)
print("Calculated Sample Size:", sample_size)


Calculated Sample Size: 308


Step 5: Create Five Samples

In [5]:
from sklearn.utils import resample

# Join features and target into one frame. This frame is the same for every
# draw, so it is built once here instead of being rebuilt inside the loop.
balanced_data = pd.concat([X_balanced, y_balanced], axis=1)

# Create five samples of `sample_size` rows each; the loop index is used as
# the random seed so each sample is different but reproducible.
# NOTE: sklearn.utils.resample draws with replacement unless replace=False is
# passed, so a sample may contain repeated rows (and repeated index labels).
samples = []
for i in range(5):
    sample = resample(balanced_data, n_samples=sample_size, random_state=i)
    samples.append(sample)

# Check one sample
print(samples[0].head())


      Time        V1        V2        V3        V4        V5        V6  \
684    517  1.314713 -0.328688  0.002645 -0.805044 -0.467260 -0.522747   
559    417 -2.680348  1.872052  1.144712 -0.693664  0.155172  0.601325   
1216   424 -1.977808  1.576583 -0.800228  3.076116 -0.162537 -1.299571   
835    532  0.244796  0.380242  0.971548  0.455242  0.255845 -0.990334   
763    574 -0.402057  0.584300  2.474227  0.929684  0.014314  0.297490   

            V7        V8        V9  ...       V21       V22       V23  \
684  -0.180850 -0.093472  1.795353  ... -0.122540 -0.029521 -0.250848   
559   0.904201 -0.520079  3.013065  ... -0.459592  0.485421 -0.365437   
1216 -1.711958  1.017589 -2.135151  ...  0.408297  0.041402 -0.415139   
835   0.457930 -0.185410 -0.058102  ... -0.123176 -0.309964 -0.051041   
763   0.715195 -0.257153  0.593868  ... -0.072812  0.445733 -0.245103   

           V24       V25       V26       V27       V28     Amount  Class  
684  -0.427629  0.917790 -0.534370  0.062

Step 6: Apply Five Sampling Techniques

Simple Random Sampling:

In [6]:
# Randomly reorder the first sample with a fixed seed; frac=1 keeps every row.
first_sample = samples[0]
simple_random_sample = first_sample.sample(frac=1, random_state=42)


Stratified Sampling:

In [7]:
from sklearn.model_selection import train_test_split

# Keep one half of the second sample, preserving the 'Class' proportions;
# the other half returned by the split is discarded.
stratified_source = samples[1]
stratified_sample, _ = train_test_split(
    stratified_source,
    test_size=0.5,
    stratify=stratified_source['Class'],
    random_state=42,
)


Cluster Sampling:

In [15]:
# Cluster sampling: partition the sample into clusters, randomly pick whole
# clusters, and keep every row that belongs to a picked cluster.
#
# 'V2' is a continuous float feature, so grouping on its raw values makes
# (almost) every row its own cluster, and picking 2 keys would leave only a
# couple of rows. Binning 'V2' into quantile ranges yields clusters that each
# hold many rows. Replace 'V2' with another column if a better one exists.
N_CLUSTERS = 10          # number of quantile bins used as clusters
N_SAMPLED_CLUSTERS = 2   # how many clusters to draw; adjust as needed

cluster_source = samples[2]

# Integer cluster id per row. duplicates='drop' merges bins whose edges
# coincide, so fewer than N_CLUSTERS clusters may come back.
# Converted to a NumPy array so grouping/masking is positional and does not
# depend on the (possibly repeated) index labels of the resampled frame.
cluster_ids = pd.qcut(
    cluster_source['V2'], q=N_CLUSTERS, labels=False, duplicates='drop'
).to_numpy()

# Group rows by cluster id
clusters = cluster_source.groupby(cluster_ids)

# Get all unique cluster keys
cluster_keys = list(clusters.groups.keys())

# Sample the keys safely: never ask for more clusters than exist
sampled_keys = (
    pd.Series(cluster_keys)
    .sample(n=min(N_SAMPLED_CLUSTERS, len(cluster_keys)), random_state=42)
    .tolist()
)

# Keep every row whose cluster was drawn
cluster_sample = cluster_source[pd.Series(cluster_ids).isin(sampled_keys).to_numpy()]


Systematic Sampling:

In [11]:
# Systematic sampling: take every `step`-th row of the fourth sample.
# Each sample was drawn with n_samples=sample_size, so the ratio below is 1 and
# every row is kept. max(1, ...) guards the case where the frame is shorter
# than sample_size: the ratio would be 0 and a zero slice step raises
# "ValueError: slice step cannot be zero".
step = max(1, len(samples[3]) // sample_size)
systematic_sample = samples[3].iloc[::step]


Convenience Sampling:

In [12]:
# Convenience sampling: simply take the leading `sample_size` rows of the
# fifth sample, in the order they already appear.
fifth_sample = samples[4]
convenience_sample = fifth_sample.head(sample_size)


 Logistic Regression Code

In [17]:
# Imports used by this cell. LogisticRegression and accuracy_score are not
# imported in any visible cell, so they are brought in here to keep the cell
# runnable on a fresh kernel.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Features and target used for modelling. These names are not defined in any
# visible cell, so they are derived here from one of the sampling results.
# NOTE(review): the simple random sample is used; swap in stratified_sample,
# cluster_sample, systematic_sample or convenience_sample to compare techniques.
X_sample = simple_random_sample.drop(columns='Class')
y_sample = simple_random_sample['Class']

# Scale the data (zero mean, unit variance per feature)
scaler = StandardScaler()
X_sample_scaled = scaler.fit_transform(X_sample)

# Initialize Logistic Regression; the 'saga' solver is stochastic, so a fixed
# random_state makes the fit reproducible between runs.
log_reg = LogisticRegression(max_iter=1000, solver='saga', random_state=42)

# Train the model
log_reg.fit(X_sample_scaled, y_sample)

# Evaluate the model on the same rows it was trained on (training accuracy)
y_pred = log_reg.predict(X_sample_scaled)
accuracy = accuracy_score(y_sample, y_pred)
print(f"LogisticRegression Accuracy: {accuracy}")


LogisticRegression Accuracy: 0.9545454545454546


Scaling for Other Models

In [18]:
# Imports used by this cell. The classifier classes and accuracy_score are not
# imported in any visible cell, so they are brought in here to keep the cell
# runnable on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# Scale the data once; every model below is trained on the same scaled matrix
scaler = StandardScaler()
X_sample_scaled = scaler.fit_transform(X_sample)

# Models to compare. The estimators with a random component get a fixed
# random_state so repeated runs print the same numbers.
models = [
    LogisticRegression(max_iter=1000, solver='saga', random_state=42),
    DecisionTreeClassifier(random_state=42),
    RandomForestClassifier(random_state=42),
    SVC(),
    KNeighborsClassifier()
]

# Train each model and report accuracy on the same rows it was trained on
# (training accuracy, not an estimate of performance on unseen data).
for model in models:
    model.fit(X_sample_scaled, y_sample)
    y_pred = model.predict(X_sample_scaled)
    accuracy = accuracy_score(y_sample, y_pred)
    print(f"{model.__class__.__name__} Accuracy: {accuracy}")


LogisticRegression Accuracy: 0.9545454545454546
DecisionTreeClassifier Accuracy: 1.0
RandomForestClassifier Accuracy: 1.0
SVC Accuracy: 0.9837662337662337
KNeighborsClassifier Accuracy: 0.9123376623376623


Validate Results on a Test Set
Split the data into training and test sets to evaluate model performance on unseen data.

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the scaled sample as a test set (fixed seed for repeatability).
# NOTE(review): X_sample_scaled was produced by a scaler fitted on the whole
# sample before this split, so the test rows influenced the scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X_sample_scaled,
    y_sample,
    test_size=0.2,
    random_state=42,
)

# Refit every model on the training part and score it on the held-out part
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
    print(f"{model.__class__.__name__} Test Accuracy: {accuracy}")


LogisticRegression Test Accuracy: 0.9354838709677419
DecisionTreeClassifier Test Accuracy: 0.9516129032258065
RandomForestClassifier Test Accuracy: 0.9838709677419355
SVC Test Accuracy: 0.9838709677419355
KNeighborsClassifier Test Accuracy: 0.9354838709677419


Cross-Validation

Perform 5-fold cross-validation to check whether the models are overfitting: each model is scored on folds it was not trained on, and the mean accuracy is reported.


In [20]:
from sklearn.model_selection import cross_val_score

# Score each model with 5-fold cross-validation on the scaled sample and
# print the mean of the five fold scores.
for model in models:
    scores = cross_val_score(
        estimator=model,
        X=X_sample_scaled,
        y=y_sample,
        cv=5,
    )
    print(f"{model.__class__.__name__} Cross-Validation Accuracy: {scores.mean()}")


LogisticRegression Cross-Validation Accuracy: 0.9221047065044949
DecisionTreeClassifier Cross-Validation Accuracy: 0.9576414595452143
RandomForestClassifier Cross-Validation Accuracy: 0.9967213114754099
SVC Cross-Validation Accuracy: 0.9675304071919619
KNeighborsClassifier Cross-Validation Accuracy: 0.8670015864621894


 Hyperparameter Tuning

Use GridSearchCV or RandomizedSearchCV to optimize model hyperparameters for better performance.

In [21]:
from sklearn.model_selection import GridSearchCV

# RandomForestClassifier is not imported in any visible cell; import it here
# so this cell runs on a fresh kernel.
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter values to try: 3 * 4 * 3 = 36 combinations
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10]
}

# Exhaustive search scored by 5-fold cross-validated accuracy. The forest gets
# a fixed random_state so the selected parameters and best score are the same
# on every run; without it, near-tied combinations can swap places.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(X_sample_scaled, y_sample)
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)


Best Parameters: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 50}
Best Accuracy: 0.9967213114754099
