<a href="https://colab.research.google.com/github/saiteja-ms/DAL-Project/blob/main/Assignment_7_ME21B171_Notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the essential Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import resample

# Task - 1
1. Splitting the data file into train and test partitions.
2. Build baseline classifiers (SVC, LogReg, and DecisionTree) by cross-validating the best hyper-parameters of the respective models, using GridSearchCV

In [None]:
# Loading the preprocessing the Data
df = pd.read_csv('/content/aps_failure_training_set.csv')
df.head()

Unnamed: 0,class,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
0,neg,76698,na,2130706438,280,0,0,0,0,0,...,1240520,493384,721044,469792,339156,157956,73224,0,0,0
1,neg,33058,na,0,na,0,0,0,0,0,...,421400,178064,293306,245416,133654,81140,97576,1500,0,0
2,neg,41040,na,228,100,0,0,0,0,0,...,277378,159812,423992,409564,320746,158022,95128,514,0,0
3,neg,12,0,70,66,0,10,0,0,0,...,240,46,58,44,10,0,0,0,4,32
4,neg,60874,na,1368,458,0,0,0,0,0,...,622012,229790,405298,347188,286954,311560,433954,1218,0,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60000 entries, 0 to 59999
Columns: 171 entries, class to eg_000
dtypes: int64(1), object(170)
memory usage: 78.3+ MB


In [None]:
class_counts = df['class'].value_counts()

# Print the counts
print(class_counts)

class
neg    59000
pos     1000
Name: count, dtype: int64


In [None]:
# let's replace all the 'na' with 0.
df[df == 'na'] = 0
df_us = df

In [None]:
# let's try some random undersampling for faster training
df_pos = df[df['class']=='pos']
df_neg = df[df['class']=='neg']
df_pos_us = df_pos.sample(100)
df_neg_us = df_neg.sample(900)
df_us = pd.concat((df_pos_us ,df_neg_us))
df_us.shape

(1000, 171)

In [None]:
# extract the features and labels
y = df_us['class']
X = df_us.drop(columns=['class'])
print("X dims =", X.shape)
print("Y distrib =", y.value_counts())

X dims = (1000, 170)
Y distrib = class
neg    900
pos    100
Name: count, dtype: int64


In [None]:
# y.values gets us (N,1) and ravel() gets us (N,) as the shapes.
ybin = y.values.ravel()
print(ybin)


['pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos'
 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos'
 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos'
 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos'
 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos'
 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos'
 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos'
 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos' 'pos'
 'pos' 'pos' 'pos' 'pos' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg'
 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'neg' 'n

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float32))
X_test_scaled = scaler.transform(X_test)

#Build the Baseline Classifiers

In [None]:
baseline_results = {}

# 1. Support Vector Machine (SVM) Classifier
print("\nTraining Support Vector Machine (SVM) Classifier...")
svm_clf = SVC()
svm_param_grid = {
        'kernel': ['rbf'],# Train linear also(but takes lot of time)
        'C': [0.1, 1, 10, 100, 500, 100],
        'gamma': [1E-5, 0.0001, 0.001]
    }

# Perform grid search to find the best hyperparameters
svm_grid_search = GridSearchCV(svm_clf, svm_param_grid, cv=5, scoring='f1_macro')
svm_grid_search.fit(X_train, y_train)


Training Support Vector Machine (SVM) Classifier...


In [None]:
# Get the best model and make predictions
best_svm = svm_grid_search.best_estimator_
svm_train_pred = best_svm.predict(X_train)
svm_test_pred = best_svm.predict(X_test)

In [None]:
# Calculate F1 scores and store results
baseline_results['SVM'] = {
    'best_params': svm_grid_search.best_params_,
    'train_f1': f1_score(y_train, svm_train_pred, average='macro'),
    'test_f1': f1_score(y_test, svm_test_pred, average='macro')
    }

# Print results
print("SVM Results:")
print(f"Best Parameters: {baseline_results['SVM']['best_params']}")
print(f"Train F1 Score: {baseline_results['SVM']['train_f1']:.4f}")
print(f"Test F1 Score: {baseline_results['SVM']['test_f1']:.4f}")
print("Classification Report (Test Set):")
print(classification_report(y_test, svm_test_pred))

SVM Results:
Best Parameters: {'C': 0.1, 'gamma': 1e-05, 'kernel': 'rbf'}
Train F1 Score: 0.4737
Test F1 Score: 0.4737
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.90      1.00      0.95       180
         pos       0.00      0.00      0.00        20

    accuracy                           0.90       200
   macro avg       0.45      0.50      0.47       200
weighted avg       0.81      0.90      0.85       200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Macro-average f1-score of SVM Classifier is 0.47

In [None]:
print("\nTraining Logistic Regression Classifier...")
logreg_clf = LogisticRegression()
logreg_param_grid = {
        'penalty': ['l1', 'l2'],
        'C': [0.1, 1, 10],
        'solver': ['liblinear']
    }


Training Logistic Regression Classifier...


In [None]:
# Perform grid search to find the best hyperparameters
logreg_grid_search = GridSearchCV(logreg_clf, logreg_param_grid, cv=5, scoring='f1_macro')
logreg_grid_search.fit(X_train, y_train)

# Get the best model and make predictions
best_logreg = logreg_grid_search.best_estimator_
logreg_train_pred = best_logreg.predict(X_train)
logreg_test_pred = best_logreg.predict(X_test)



In [None]:
# Calculate F1 scores and store results
baseline_results['LogReg'] = {
    'best_params': logreg_grid_search.best_params_,
    'train_f1': f1_score(y_train, logreg_train_pred, average='macro'),
    'test_f1': f1_score(y_test, logreg_test_pred, average='macro')
}

In [None]:
# Print results
print("Logistic Regression Results:")
print(f"Best Parameters: {baseline_results['LogReg']['best_params']}")
print(f"Train F1 Score: {baseline_results['LogReg']['train_f1']:.4f}")
print(f"Test F1 Score: {baseline_results['LogReg']['test_f1']:.4f}")
print("Classification Report (Test Set):")
print(classification_report(y_test, logreg_test_pred))


Logistic Regression Results:
Best Parameters: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
Train F1 Score: 0.9792
Test F1 Score: 0.8722
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.97      0.98      0.98       180
         pos       0.79      0.75      0.77        20

    accuracy                           0.95       200
   macro avg       0.88      0.86      0.87       200
weighted avg       0.95      0.95      0.95       200



# Macro-average f1-score of Logistic Regression Classifier is 0.87

In [None]:
# 3. Decision Tree Classifier
print("\nTraining Decision Tree Classifier...")
dt_clf = DecisionTreeClassifier()
dt_param_grid = {
    'max_depth': [5, 10, None],
    'min_samples_leaf': [1, 2, 4]
}


Training Decision Tree Classifier...


In [None]:
# Perform grid search to find the best hyperparameters
dt_grid_search = GridSearchCV(dt_clf, dt_param_grid, cv=5, scoring='f1_macro')
dt_grid_search.fit(X_train, y_train)

In [None]:
# Get the best model and make predictions
best_dt = dt_grid_search.best_estimator_
dt_train_pred = best_dt.predict(X_train)
dt_test_pred = best_dt.predict(X_test)

In [None]:
# Calculate F1 scores and store results
baseline_results['DecisionTree'] = {
    'best_params': dt_grid_search.best_params_,
    'train_f1': f1_score(y_train, dt_train_pred, average='macro'),
    'test_f1': f1_score(y_test, dt_test_pred, average='macro')
}

In [None]:
# Print results
print("Decision Tree Results:")
print(f"Best Parameters: {baseline_results['DecisionTree']['best_params']}")
print(f"Train F1 Score: {baseline_results['DecisionTree']['train_f1']:.4f}")
print(f"Test F1 Score: {baseline_results['DecisionTree']['test_f1']:.4f}")
print("Classification Report (Test Set):")
print(classification_report(y_test, dt_test_pred))

Decision Tree Results:
Best Parameters: {'max_depth': 5, 'min_samples_leaf': 1}
Train F1 Score: 0.9965
Test F1 Score: 0.8837
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.97      0.98      0.98       180
         pos       0.83      0.75      0.79        20

    accuracy                           0.96       200
   macro avg       0.90      0.87      0.88       200
weighted avg       0.96      0.96      0.96       200



# Macro-average of decision tree classifier is 0.88

# Task - 2
Addressing the class imbalance using the following approaches:
1. Undersampling the majority class
2. Oversampling the minority class
3. Using class_weights which is inversely proportional to the class population
4. Using sample_weights, to assign penalty for misclassification of datapoints.
5. Using SMOTE(Synthetic Minority Over-sampling Technique) method

In [None]:
imbalance_results = {}

# 1. Undersampling
print("\nApplying Undersampling...")

# Separate majority and minority classes
X_majority = X_train[y_train == 'neg']
X_minority = X_train[y_train == 'pos']
y_majority = y_train[y_train == 'neg']
y_minority = y_train[y_train == 'pos']


Applying Undersampling...


# 1. Undersampling Majority class

In [None]:
# Undersample majority class
X_majority_downsampled = resample(X_majority,
                                      n_samples=len(X_minority),
                                      random_state=42)
y_majority_downsampled = resample(y_majority,
                                      n_samples=len(y_minority),
                                      random_state=42)

# Combine minority class with downsampled majority class
X_undersampled = pd.concat([X_majority_downsampled, X_minority])
y_undersampled = pd.concat([y_majority_downsampled, y_minority])

In [None]:
for clf_name, clf in [('SVM', SVC()), ('LogReg', LogisticRegression()), ('DecisionTree', DecisionTreeClassifier())]:
    print(f"\nTraining {clf_name} with Undersampling...")
    clf.fit(X_undersampled, y_undersampled)

    train_pred = clf.predict(X_undersampled)
    test_pred = clf.predict(X_test)

    train_f1 = f1_score(y_undersampled, train_pred, average='macro')
    test_f1 = f1_score(y_test, test_pred, average='macro')

    imbalance_results[f"{clf_name}_Undersampling"] = {
            'train_f1': train_f1,
            'test_f1': test_f1
        }
    print(f"Train F1 Score: {train_f1:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print("Classification Report (Test Set):")
    print(classification_report(y_test, test_pred))


Training SVM with Undersampling...
Train F1 Score: 0.8471
Test F1 Score: 0.9362
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.97      1.00      0.98       120
         pos       1.00      0.80      0.89        20

    accuracy                           0.97       140
   macro avg       0.98      0.90      0.94       140
weighted avg       0.97      0.97      0.97       140


Training LogReg with Undersampling...
Train F1 Score: 1.0000
Test F1 Score: 0.9440
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.99      0.97      0.98       120
         pos       0.86      0.95      0.90        20

    accuracy                           0.97       140
   macro avg       0.93      0.96      0.94       140
weighted avg       0.97      0.97      0.97       140


Training DecisionTree with Undersampling...
Train F1 Score: 1.0000
Test F1 Score: 0.9038
Classification Report (

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#Macro - average F1-Score of the following classifiers (using undersampling) are:
1. SVM Classifier: 0.94
2. Logistic Regression: 0.94
3. Decision Tree Classifier: 0.90

# 2. Oversampling Minority class

In [None]:
# 2. Oversampling
print("\nApplying Oversampling...")

# Oversample minority class
X_minority_upsampled = resample(X_minority,
                                    n_samples=len(X_majority),
                                    random_state=42)
y_minority_upsampled = resample(y_minority,
                                    n_samples=len(y_majority),
                                    random_state=42)

# Combine majority class with upsampled minority class
X_oversampled = pd.concat([X_majority, X_minority_upsampled])
y_oversampled = pd.concat([y_majority, y_minority_upsampled])


Applying Oversampling...


In [None]:
# Train and evaluate classifiers on oversampled data
for clf_name, clf in [('SVM', SVC()), ('LogReg', LogisticRegression()), ('DecisionTree', DecisionTreeClassifier())]:
    print(f"\nTraining {clf_name} with Oversampling...")
    clf.fit(X_oversampled, y_oversampled)

    train_pred = clf.predict(X_oversampled)
    test_pred = clf.predict(X_test)

    train_f1 = f1_score(y_oversampled, train_pred, average='macro')
    test_f1 = f1_score(y_test, test_pred, average='macro')

    imbalance_results[f"{clf_name}_Oversampling"] = {
            'train_f1': train_f1,
            'test_f1': test_f1
        }

    print(f"Train F1 Score: {train_f1:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print("Classification Report (Test Set):")
    print(classification_report(y_test, test_pred))


Training SVM with Oversampling...
Train F1 Score: 0.9217
Test F1 Score: 0.9533
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.98      1.00      0.99       120
         pos       1.00      0.85      0.92        20

    accuracy                           0.98       140
   macro avg       0.99      0.93      0.95       140
weighted avg       0.98      0.98      0.98       140


Training LogReg with Oversampling...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train F1 Score: 0.9771
Test F1 Score: 0.9087
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.97      0.98      0.98       120
         pos       0.89      0.80      0.84        20

    accuracy                           0.96       140
   macro avg       0.93      0.89      0.91       140
weighted avg       0.96      0.96      0.96       140


Training DecisionTree with Oversampling...
Train F1 Score: 1.0000
Test F1 Score: 0.8996
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.95      1.00      0.98       120
         pos       1.00      0.70      0.82        20

    accuracy                           0.96       140
   macro avg       0.98      0.85      0.90       140
weighted avg       0.96      0.96      0.95       140



#Macro - average F1-Score of the following classifiers (using oversampling) are:
1. SVM Classifier: 0.95
2. Logistic Regression: 0.91
3. Decision Tree Classifier: 0.90

# 3. Assigning Class weights

In [None]:
# 3. Class Weights
print("\nApplying Class Weights...")

# Calculate class weights
class_weights = {
        'neg': len(y_train) / (2 * (y_train == 'neg').sum()),
        'pos': len(y_train) / (2 * (y_train == 'pos').sum())
    }

# Train and evaluate classifiers with class weights
for clf_name, clf in [('SVM', SVC(class_weight='balanced')),
                          ('LogReg', LogisticRegression(class_weight='balanced')),
                          ('DecisionTree', DecisionTreeClassifier(class_weight='balanced'))]:
    print(f"\nTraining {clf_name} with Class Weights...")
    clf.fit(X_train, y_train)

    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    train_f1 = f1_score(y_train, train_pred, average='macro')
    test_f1 = f1_score(y_test, test_pred, average='macro')

    imbalance_results[f"{clf_name}_ClassWeights"] = {
            'train_f1': train_f1,
            'test_f1': test_f1
        }

    print(f"Train F1 Score: {train_f1:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print("Classification Report (Test Set):")
    print(classification_report(y_test, test_pred))


Applying Class Weights...

Training SVM with Class Weights...
Train F1 Score: 0.8934
Test F1 Score: 0.9533
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.98      1.00      0.99       120
         pos       1.00      0.85      0.92        20

    accuracy                           0.98       140
   macro avg       0.99      0.93      0.95       140
weighted avg       0.98      0.98      0.98       140


Training LogReg with Class Weights...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train F1 Score: 0.9646
Test F1 Score: 0.8910
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.96      0.98      0.97       120
         pos       0.88      0.75      0.81        20

    accuracy                           0.95       140
   macro avg       0.92      0.87      0.89       140
weighted avg       0.95      0.95      0.95       140


Training DecisionTree with Class Weights...
Train F1 Score: 1.0000
Test F1 Score: 0.8857
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.95      0.99      0.97       120
         pos       0.93      0.70      0.80        20

    accuracy                           0.95       140
   macro avg       0.94      0.85      0.89       140
weighted avg       0.95      0.95      0.95       140



#Macro - average F1-Score of the following classifiers (using class-weights method) are:
1. SVM Classifier: 0.95
2. Logistic Regression: 0.89
3. Decision Tree Classifier: 0.89

# 4. Assigning sample weights

In [None]:
# Calculate class weights based on cost and frequency
n_samples = len(y_train)
n_class_1 = (y_train == 'neg').sum()
n_class_2 = n_samples - n_class_1

weight_class_1 = 1000 / (n_class_1)
weight_class_2 = 50000 / (n_class_2)

sample_weights = np.where(y_train == 'pos', weight_class_2, weight_class_1)

# Train and evaluate classifiers with improved sample weights
for clf_name, clf in [('SVM', SVC()), ('LogReg', LogisticRegression()), ('DecisionTree', DecisionTreeClassifier())]:
    print(f"\nTraining {clf_name} with Improved Sample Weights...")
    clf.fit(X_train, y_train, sample_weight=sample_weights)

    train_pred = clf.predict(X_train)
    test_pred = clf.predict(X_test)

    train_f1 = f1_score(y_train, train_pred, average='macro')
    test_f1 = f1_score(y_test, test_pred, average='macro')

    imbalance_results[f"{clf_name}_ImprovedSampleWeights"] = {
        'train_f1': train_f1,
        'test_f1': test_f1
    }

    print(f"Train F1 Score: {train_f1:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print("Classification Report (Test Set):")
    print(classification_report(y_test, test_pred))


Training SVM with Improved Sample Weights...
Train F1 Score: 0.4105
Test F1 Score: 0.3905
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       1.00      0.36      0.52       180
         pos       0.15      1.00      0.26        20

    accuracy                           0.42       200
   macro avg       0.57      0.68      0.39       200
weighted avg       0.91      0.42      0.50       200


Training LogReg with Improved Sample Weights...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train F1 Score: 0.5536
Test F1 Score: 0.5611
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       1.00      0.61      0.76       180
         pos       0.22      1.00      0.36        20

    accuracy                           0.65       200
   macro avg       0.61      0.81      0.56       200
weighted avg       0.92      0.65      0.72       200


Training DecisionTree with Improved Sample Weights...
Train F1 Score: 1.0000
Test F1 Score: 0.7606
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.96      0.94      0.95       180
         pos       0.55      0.60      0.57        20

    accuracy                           0.91       200
   macro avg       0.75      0.77      0.76       200
weighted avg       0.91      0.91      0.91       200



#Macro - average F1-Score of the following classifiers (using sample weights method) are:
1. SVM Classifier: 0.87
2. Logistic Regression: 0.91
3. Decision Tree Classifier: 0.76

# 5. SMOTE Method - Creating synthetic samples of the minority class(using nearest neighbour method) to balance an imbalanced dataset.

In [None]:
# Implement SMOTE Method on this dataset
from imblearn.over_sampling import SMOTE

print("\nApplying SMOTE...")

smote = SMOTE(random_state=42)
X_smote, y_smote = smote.fit_resample(X_train, y_train)

for clf_name, clf in [('SVM', SVC()), ('LogReg', LogisticRegression()), ('DecisionTree', DecisionTreeClassifier())]:
    print(f"\nTraining {clf_name} with SMOTE...")
    clf.fit(X_smote, y_smote)

    train_pred = clf.predict(X_smote)
    test_pred = clf.predict(X_test)

    train_f1 = f1_score(y_smote, train_pred, average='macro')
    test_f1 = f1_score(y_test, test_pred, average='macro')

    imbalance_results[f"{clf_name}_SMOTE"] = {
        'train_f1': train_f1,
        'test_f1': test_f1
    }

    print(f"Train F1 Score: {train_f1:.4f}")
    print(f"Test F1 Score: {test_f1:.4f}")
    print("Classification Report (Test Set):")
    print(classification_report(y_test, test_pred))



Applying SMOTE...

Training SVM with SMOTE...
Train F1 Score: 0.9174
Test F1 Score: 0.9391
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.98      0.99      0.98       120
         pos       0.94      0.85      0.89        20

    accuracy                           0.97       140
   macro avg       0.96      0.92      0.94       140
weighted avg       0.97      0.97      0.97       140


Training LogReg with SMOTE...
Train F1 Score: 0.9823
Test F1 Score: 0.8910
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.96      0.98      0.97       120
         pos       0.88      0.75      0.81        20

    accuracy                           0.95       140
   macro avg       0.92      0.87      0.89       140
weighted avg       0.95      0.95      0.95       140


Training DecisionTree with SMOTE...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train F1 Score: 1.0000
Test F1 Score: 0.8957
Classification Report (Test Set):
              precision    recall  f1-score   support

         neg       0.97      0.97      0.97       120
         pos       0.84      0.80      0.82        20

    accuracy                           0.95       140
   macro avg       0.90      0.89      0.90       140
weighted avg       0.95      0.95      0.95       140



#Macro - average F1-Score of the following classifiers (using SMOTE method) are:
1. SVM Classifier: 0.94
2. Logistic Regression: 0.89
3. Decision Tree Classifier: 0.90

In [None]:
data = {
    'Classifier': ['SVM', 'Logistic Regression', 'Decision Tree'],
    'Baseline': [0.47, 0.87, 0.88],
    'Undersampling': [0.94, 0.94, 0.90],
    'Oversampling': [0.95, 0.91, 0.90],
    'Class Weights': [0.95, 0.89, 0.89],
    'Sample Weights': [0.87, 0.91, 0.76],
    'SMOTE': [0.87, 0.91, 0.76]  # Assuming SMOTE results are the same as Sample Weights
}

df = pd.DataFrame(data)

# Set 'Classifier' as the index for better readability
df.set_index('Classifier', inplace=True)

# Display the DataFrame
print(df)

                     Baseline  Undersampling  Oversampling  Class Weights  \
Classifier                                                                  
SVM                      0.47           0.94          0.95           0.95   
Logistic Regression      0.87           0.94          0.91           0.89   
Decision Tree            0.88           0.90          0.90           0.89   

                     Sample Weights  SMOTE  
Classifier                                  
SVM                            0.87   0.87  
Logistic Regression            0.91   0.91  
Decision Tree                  0.76   0.76  


# We observe that the macro-average obtained using methods that address the class-imablance methods, are better than that of the baseline models which we have obtained earlier.