In [None]:
#Task 2

For this week, you can find an imbalanced dataset with few classes and ask them to apply oversampling or undersampling techniques
(random over- and undersampling, Tomek links, SMOTE, and class weighting). Then ask them to train the model on the balanced dataset, 
find the performance metrics (accuracy, F1 score, and AUC), and compare which technique improves model performance. 
Find the dataset on GitHub or Kaggle.

In [19]:
#!python.exe -m pip install --upgrade pip
#!pip install scikit-learn 
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.12.4-py3-none-any.whl.metadata (8.3 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.12.4-py3-none-any.whl (258 kB)
Installing collected packages: imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.12.4 imblearn-0.0


In [34]:
# Imports: one consolidated block (each module imported once).
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the dataset (read once; the file is not modified afterwards).
data = pd.read_csv('data.csv')

# Class distribution of the target. It is kept in a variable so it can be
# inspected: a bare expression in the middle of a cell is evaluated but not
# displayed, so it would otherwise be silently discarded.
class_counts = data['diagnosis'].value_counts()

# Prepare the data: the features are every column except the identifier,
# the target and the 'Unnamed: 32' column.
X = data.drop(columns=['id', 'diagnosis', 'Unnamed: 32'])
# LabelEncoder assigns integer codes in sorted label order: B=0, M=1.
y = LabelEncoder().fit_transform(data['diagnosis'])

# Split into training and testing sets. `stratify=y` keeps the class ratio the
# same in both splits; 20% of the rows are held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Random forest classifier shared by evaluate_model; the fixed seed keeps runs repeatable.
model = RandomForestClassifier(random_state=42)

# Dictionary to store results for each sampling technique
# (technique name -> {'Accuracy', 'F1 Score', 'AUC'}).
results = {}
# Function to train and evaluate the model
def evaluate_model(X_train_res, y_train_res, technique_name, estimator=None):
    """Fit a classifier on (re)sampled training data and score it on the test split.

    The metrics are computed against the module-level ``X_test`` / ``y_test``
    and recorded in the module-level ``results`` dict under ``technique_name``.

    Parameters
    ----------
    X_train_res, y_train_res
        Training features and labels to fit on (e.g. the output of a
        sampler's ``fit_resample``).
    technique_name : str
        Key under which the metrics are stored in ``results``.
    estimator : optional
        Classifier exposing ``fit``, ``predict`` and ``predict_proba``.
        When omitted, the shared module-level ``model`` is used, which is the
        original behaviour. Passing one lets variants such as a
        class-weighted model reuse the same evaluation code.

    Returns
    -------
    dict
        ``{'Accuracy': ..., 'F1 Score': ..., 'AUC': ...}`` -- the same object
        that was stored in ``results[technique_name]``.
    """
    clf = model if estimator is None else estimator
    clf.fit(X_train_res, y_train_res)

    y_pred = clf.predict(X_test)
    # AUC needs a continuous score, not hard labels: take the second
    # predict_proba column (the class encoded as 1 under a 0/1 labelling).
    y_pred_proba = clf.predict_proba(X_test)[:, 1]

    metrics = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred),
        'AUC': roc_auc_score(y_test, y_pred_proba),
    }
    results[technique_name] = metrics
    return metrics
# Every sampler below is fitted on the training split only; X_test / y_test are
# never resampled, so all techniques are scored on the same untouched test data.

# 1. Random Oversampling
ros = RandomOverSampler(random_state=42)
X_resampled_ros, y_resampled_ros = ros.fit_resample(X_train, y_train)
evaluate_model(X_resampled_ros, y_resampled_ros, "Random Oversampling")

# 2. Random Undersampling
rus = RandomUnderSampler(random_state=42)
X_resampled_rus, y_resampled_rus = rus.fit_resample(X_train, y_train)
evaluate_model(X_resampled_rus, y_resampled_rus, "Random Undersampling")

# 3. SMOTE
smote = SMOTE(random_state=42)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X_train, y_train)
evaluate_model(X_resampled_smote, y_resampled_smote, "SMOTE")

# 4. Tomek Links (constructed without a seed)
tomek = TomekLinks()
X_resampled_tomek, y_resampled_tomek = tomek.fit_resample(X_train, y_train)
evaluate_model(X_resampled_tomek, y_resampled_tomek, "Tomek Links")

# 5. Class Weighting (no resampling, but using class weights in the model)
# Trained on the original training split with a separate, class-weighted
# forest, and scored with the same three metrics as the samplers above.
model_weighted = RandomForestClassifier(random_state=42, class_weight='balanced')
model_weighted.fit(X_train, y_train)
y_pred_weighted = model_weighted.predict(X_test)
y_pred_proba_weighted = model_weighted.predict_proba(X_test)[:, 1]

accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
f1_weighted = f1_score(y_test, y_pred_weighted)
auc_weighted = roc_auc_score(y_test, y_pred_proba_weighted)

results["Class Weighting"] = {'Accuracy': accuracy_weighted, 'F1 Score': f1_weighted, 'AUC': auc_weighted}

# Display the results: one block per technique, in the order they were run,
# with every metric rounded to four decimals.
for technique, metrics in results.items():
    print(f"Technique: {technique}")
    for metric, score in metrics.items():
        print(f"{metric}: {score:.4f}")
    print("-" * 30)

Technique: Random Oversampling
Accuracy: 0.9561
F1 Score: 0.9383
AUC: 0.9974
------------------------------
Technique: Random Undersampling
Accuracy: 0.9649
F1 Score: 0.9512
AUC: 0.9977
------------------------------
Technique: SMOTE
Accuracy: 0.9737
F1 Score: 0.9630
AUC: 0.9985
------------------------------
Technique: Tomek Links
Accuracy: 0.9649
F1 Score: 0.9512
AUC: 0.9947
------------------------------
Technique: Class Weighting
Accuracy: 0.9737
F1 Score: 0.9630
AUC: 0.9965
------------------------------
