<a href="https://colab.research.google.com/github/rhy004/Sampling/blob/main/Balance_Class_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
  Downloading imbalanced_learn-0.12.0-py3-none-any.whl (257 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.7/257.7 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: imbalanced-learn
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.10.1
    Uninstalling imbalanced-learn-0.10.1:
      Successfully uninstalled imbalanced-learn-0.10.1
Successfully installed imbalanced-learn-0.12.0


# **SMOTE (SYNTHETIC MINORITY OVERSAMPLING TECHNIQUE)**

In [None]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Read Creditcard_data.csv from the Kaggle input mount into a DataFrame
df = pd.read_csv(filepath_or_buffer='/kaggle/input/mydataset/Creditcard_data.csv')

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

In [None]:
# List the column labels; the next cell drops/selects the 'Class' column by name
df.columns

In [None]:
# The label is the 'Class' column; the features are every other column.
y = df['Class']
X = df.drop(columns='Class')

# Hold out 20% of the rows for evaluation. The seed is fixed so the split is
# repeatable; later sections repeat this call with the same arguments and
# compare against predictions made on this split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Preview the first five training labels
y_train.head(n=5)

In [None]:
# Oversample with SMOTE using the training split only; the test split is not
# passed to the sampler.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)

In [None]:
# Reassemble the SMOTE-resampled features and labels into a single frame:
# feature columns first (named after X's columns), then the 'Class' column.
balanced_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='Class')], axis=1)

# index=False keeps the row index from being written as an extra CSV column.
balanced_df.to_csv('balanced_data.csv', index=False)

# A bare expression is only rendered when it is the last statement in a cell,
# so the preview lives here; head() keeps the output short.
balanced_df.head()

In [None]:
# Class counts of the training split before and after SMOTE resampling.
print("original class distribution:\n",y_train.value_counts())
print("\nBalanced class distribution:\n",pd.Series(y_resampled).value_counts())

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Baseline: random forest fitted on the un-resampled training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
model_original = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred_original = model_original.predict(X_test)

In [None]:
# Same classifier configuration, fitted on the SMOTE-resampled training data
# and evaluated on the same test split as the baseline.
model_balanced = RandomForestClassifier(random_state=42).fit(X_resampled, y_resampled)
y_pred_balanced = model_balanced.predict(X_test)

In [None]:
# Accuracy of the baseline and the SMOTE-trained model on the same test split
acc_original = accuracy_score(y_test, y_pred_original)
acc_balanced = accuracy_score(y_test, y_pred_balanced)
print("original accuracy:", acc_original, "vs \n balanced accuracy", acc_balanced)

# Per-class precision / recall / F1 tables, baseline first
report_original = classification_report(y_test, y_pred_original)
print("original: ", report_original)
report_balanced = classification_report(y_test, y_pred_balanced)
print("\nbalanced\n", report_balanced)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score

def _print_binary_metrics(heading, model, y_pred):
    """Print precision, recall, F1 and ROC AUC for one fitted classifier.

    Evaluation uses the notebook-level ``X_test`` / ``y_test``.

    Args:
        heading: Line printed before the metrics.
        model: Fitted classifier exposing ``predict_proba``.
        y_pred: Hard predictions of ``model`` on ``X_test``.
    """
    print(heading)
    print("Precision:", precision_score(y_test, y_pred))
    print("Recall:", recall_score(y_test, y_pred))
    print("F1 Score:", f1_score(y_test, y_pred))
    # ROC AUC needs scores rather than hard labels: take the second column of
    # predict_proba (presumably the probability of class 1 - confirm via model.classes_).
    print("ROC AUC:", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))


_print_binary_metrics("Original Model Metrics:", model_original, y_pred_original)
_print_binary_metrics("\nBalanced Model Metrics:", model_balanced, y_pred_balanced)

# **RANDOM OVER SAMPLING**

In [None]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from collections import Counter

# Load your dataset
df1 = pd.read_csv('/kaggle/input/mydataset/Creditcard_data.csv')

# Separate features (X) and target variable (y)
X = df1.drop('Class', axis=1)
y = df1['Class']
# Same arguments as the split in the SMOTE section, so the baseline
# predictions computed there are compared against this y_test further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use RandomOverSampler to oversample the minority class.
# Fit on the TRAINING split only: resampling the full (X, y) would put the
# held-out test rows (and duplicates of them) into the training data, so any
# score computed on X_test afterwards would be inflated by leakage.
ros = RandomOverSampler(random_state=42)
X_resampled1, y_resampled1 = ros.fit_resample(X_train, y_train)

# Display class distribution of the training split before and after resampling
print('Original dataset shape:', Counter(y_train))
print('Resampled dataset shape:', Counter(y_resampled1))


In [None]:
# Reassemble the randomly over-sampled features and labels into a single
# frame: feature columns first (named after X's columns), then 'Class'.
balanced_df1 = pd.concat([pd.DataFrame(X_resampled1, columns=X.columns), pd.Series(y_resampled1, name='Class')], axis=1)

# index=False keeps the row index from being written as an extra CSV column.
balanced_df1.to_csv('balanced_data1.csv', index=False)

# A bare expression is only rendered when it is the last statement in a cell,
# so the preview lives here; head() keeps the output short.
balanced_df1.head()

In [None]:
# Random forest fitted on the randomly over-sampled data, then scored on X_test
# (fit() returns the estimator itself, so construction and fitting can be chained).
model_balanced1 = RandomForestClassifier(random_state=42).fit(X_resampled1, y_resampled1)
y_pred_balanced1 = model_balanced1.predict(X_test)

In [None]:
# y_pred_original comes from the baseline model fitted in the SMOTE section;
# y_test here comes from this section's split, which uses the same arguments.
acc_original = accuracy_score(y_test, y_pred_original)
acc_balanced1 = accuracy_score(y_test, y_pred_balanced1)
print("original accuracy:", acc_original, "vs \n balanced accuracy", acc_balanced1)

# Per-class precision / recall / F1 tables, baseline first
report_original = classification_report(y_test, y_pred_original)
print("original: ", report_original)
report_balanced1 = classification_report(y_test, y_pred_balanced1)
print("\nbalanced\n", report_balanced1)

# **RANDOM UNDER SAMPLING**

In [None]:
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

# Load your dataset
df2 = pd.read_csv('/kaggle/input/mydataset/Creditcard_data.csv')

# Separate features (X) and target variable (y)
X = df2.drop('Class', axis=1)
y = df2['Class']
# Same arguments as the split in the SMOTE section, so the baseline
# predictions computed there are compared against this y_test further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use RandomUnderSampler to undersample the majority class.
# Fit on the TRAINING split only: resampling the full (X, y) would let
# held-out test rows into the training data, so any score computed on X_test
# afterwards would be inflated by leakage.
# (The variable keeps its original name `ros` although it holds an under-sampler.)
ros = RandomUnderSampler(random_state=42)
X_resampled2, y_resampled2 = ros.fit_resample(X_train, y_train)

# Display class distribution of the training split before and after resampling
print('Original dataset shape:', Counter(y_train))
print('Resampled dataset shape:', Counter(y_resampled2))


In [None]:
# Reassemble the randomly under-sampled features and labels into a single
# frame: feature columns first (named after X's columns), then 'Class'.
balanced_df2 = pd.concat([pd.DataFrame(X_resampled2, columns=X.columns), pd.Series(y_resampled2, name='Class')], axis=1)

# index=False keeps the row index from being written as an extra CSV column.
balanced_df2.to_csv('balanced_data2.csv', index=False)

# A bare expression is only rendered when it is the last statement in a cell,
# so the preview lives here; head() keeps the output short.
balanced_df2.head()

In [None]:
# Random forest fitted on the randomly under-sampled data, then scored on X_test
# (fit() returns the estimator itself, so construction and fitting can be chained).
model_balanced2 = RandomForestClassifier(random_state=42).fit(X_resampled2, y_resampled2)
y_pred_balanced2 = model_balanced2.predict(X_test)

In [None]:
# y_pred_original comes from the baseline model fitted in the SMOTE section;
# y_test here comes from this section's split, which uses the same arguments.
acc_original = accuracy_score(y_test, y_pred_original)
acc_balanced2 = accuracy_score(y_test, y_pred_balanced2)
print("original accuracy:", acc_original, "vs \n balanced accuracy", acc_balanced2)

# Per-class precision / recall / F1 tables, baseline first
report_original = classification_report(y_test, y_pred_original)
print("original: ", report_original)
report_balanced2 = classification_report(y_test, y_pred_balanced2)
print("\nbalanced\n", report_balanced2)

**Out of the above three methods (SMOTE, RANDOM OVER SAMPLING, RANDOM UNDER SAMPLING):**

**RANDOM OVER SAMPLING GIVES THE HIGHEST ACCURACY.**

**HENCE balanced_data1.csv is the best balanced class dataset.**
