In [9]:
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE, KMeansSMOTE, SVMSMOTE, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids, TomekLinks, EditedNearestNeighbours, RepeatedEditedNearestNeighbours, AllKNN, NearMiss, OneSidedSelection, CondensedNearestNeighbour
from imblearn.combine import SMOTEENN, SMOTETomek

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os
import dagshub
import pickle

In [3]:
# Point MLflow at the DagsHub-hosted tracking server for this repository, then
# select the experiment that groups the runs logged below (created on first use).
dagshub.init(
    repo_owner='satyajeetrai007',
    repo_name='Youtube-Comment-Sentiment-Analysis',
    mlflow=True,
)
mlflow.set_experiment("Handling Imbalanced Data")

2025/08/25 20:36:03 INFO mlflow.tracking.fluent: Experiment with name 'Handling Imbalanced Data' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/4957100cdb83401aa6db764ab63e1e32', creation_time=1756134365045, experiment_id='4', last_update_time=1756134365045, lifecycle_stage='active', name='Handling Imbalanced Data', tags={}>

In [4]:
# Load the preprocessed comments, discard rows with a missing `clean_comment`
# and exact duplicate rows, then display the resulting (rows, columns) shape.
df = (
    pd.read_csv('data_preprocessed.csv')
    .dropna(subset=['clean_comment'])
    .drop_duplicates()
)
df.shape

(36243, 2)

In [10]:
# Step 1: Function to run the experiment
def run_imbalanced_experiment(imbalance_method):
    """Train and evaluate a TF-IDF + RandomForest model under one imbalance strategy.

    Reads the module-level DataFrame ``df`` (columns ``clean_comment`` and
    ``category``), makes a stratified 80/20 split, fits a TF-IDF vectorizer on
    the training text only, applies the requested imbalance strategy to the
    training set only, trains a ``RandomForestClassifier`` and logs parameters,
    metrics, a confusion-matrix image and the pickled model to a new MLflow run.

    Parameters
    ----------
    imbalance_method : str
        ``"class_weights"`` (no resampling; the forest uses
        ``class_weight='balanced'``) or the name of a resampler:
        ``random_oversampling``, ``smote``, ``adasyn``, ``borderline_smote``,
        ``kmeans_smote``, ``svm_smote``, ``random_undersampling``,
        ``cluster_centroids``, ``tomek_links``, ``enn``, ``renn``, ``allknn``
        (alias ``all_knn``), ``nearmiss`` (alias ``near_miss``), ``oss``,
        ``cnn``, ``smote_enn``, ``smote_tomek``.

    Raises
    ------
    ValueError
        If ``imbalance_method`` is not one of the names above. Without this
        check an unknown name would train on un-resampled data while the run
        is still labelled with that name.

    Side effects
    ------------
    Writes ``confusion_matrix_{imbalance_method}.png`` and
    ``rf_tfidf_trigrams_{imbalance_method}`` to the working directory and
    uploads both as MLflow artifacts.
    """
    # Validate before any expensive work or MLflow side effects so that a typo
    # in the method name cannot produce a mislabeled run.
    class_weight, resampler = _resolve_imbalance_strategy(imbalance_method)

    ngram_range = (1, 3)  # Unigrams up to trigrams
    max_features = 2000   # Limit TF-IDF features

    # Split first so that neither the vectorizer nor the resampler sees test data.
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_comment'], df['category'],
        test_size=0.2, random_state=42, stratify=df['category']
    )

    # TF-IDF vocabulary and IDF weights come from the training text only.
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Resample the training set only; the test set keeps its original distribution.
    if resampler is not None:
        X_train_vec, y_train = resampler.fit_resample(X_train_vec, y_train)

    with mlflow.start_run():
        mlflow.set_tag("mlflow.runName", f"Imbalance_{imbalance_method}_RandomForest_TFIDF_Trigrams")
        mlflow.set_tag("experiment_type", "imbalance_handling")
        mlflow.set_tag("model_type", "RandomForestClassifier")
        mlflow.set_tag("description", f"RandomForest with TF-IDF Trigrams, imbalance={imbalance_method}")

        mlflow.log_param("vectorizer_type", "TF-IDF")
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", max_features)

        n_estimators = 200
        max_depth = 15
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)
        mlflow.log_param("imbalance_method", imbalance_method)

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
            class_weight=class_weight
        )
        model.fit(X_train_vec, y_train)

        # Predictions + metrics
        y_pred = model.predict(X_test_vec)
        mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))

        # Flatten the per-class / averaged report into "<label>_<metric>" metrics.
        # The scalar "accuracy" entry is not a dict and was already logged above.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        conf_matrix = confusion_matrix(y_test, y_pred)
        cm_filename = f"confusion_matrix_{imbalance_method}.png"
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", ax=ax)
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"Confusion Matrix: Imbalance={imbalance_method}")
            fig.savefig(cm_filename)
        finally:
            # Always release the figure; this function is called once per method in a loop.
            plt.close(fig)
        mlflow.log_artifact(cm_filename)

        # NOTE(review): only the classifier is persisted; the fitted vectorizer is
        # not saved, so this artifact alone cannot score raw text - confirm intent.
        model_filename = f"rf_tfidf_trigrams_{imbalance_method}"
        with open(model_filename, "wb") as f:
            pickle.dump(model, f)
        mlflow.log_artifact(model_filename, "model")


# Alternate spellings accepted for a method name, mapped to the canonical key.
_IMBALANCE_METHOD_ALIASES = {
    "all_knn": "allknn",
    "near_miss": "nearmiss",
}

# Canonical method name -> zero-argument factory building the resampler.
# Lambdas defer construction until a method is actually selected.
_RESAMPLER_FACTORIES = {
    # Oversampling methods
    "random_oversampling": lambda: RandomOverSampler(random_state=42),
    "smote": lambda: SMOTE(random_state=42),
    "adasyn": lambda: ADASYN(random_state=42),
    "borderline_smote": lambda: BorderlineSMOTE(random_state=42, kind='borderline-1'),
    "kmeans_smote": lambda: KMeansSMOTE(random_state=42),
    "svm_smote": lambda: SVMSMOTE(random_state=42),

    # Undersampling methods
    "random_undersampling": lambda: RandomUnderSampler(random_state=42),
    "cluster_centroids": lambda: ClusterCentroids(random_state=42),
    "tomek_links": lambda: TomekLinks(),
    "enn": lambda: EditedNearestNeighbours(),
    "renn": lambda: RepeatedEditedNearestNeighbours(),
    "allknn": lambda: AllKNN(),
    "nearmiss": lambda: NearMiss(),
    "oss": lambda: OneSidedSelection(random_state=42),
    "cnn": lambda: CondensedNearestNeighbour(random_state=42),

    # Hybrid methods
    "smote_enn": lambda: SMOTEENN(random_state=42),
    "smote_tomek": lambda: SMOTETomek(random_state=42),
}


def _resolve_imbalance_strategy(imbalance_method):
    """Map a method name to ``(class_weight, resampler)``; raise ValueError if unknown.

    Returns ``("balanced", None)`` for ``"class_weights"`` and
    ``(None, <resampler instance>)`` for every resampling method.
    """
    canonical = _IMBALANCE_METHOD_ALIASES.get(imbalance_method, imbalance_method)
    if canonical == "class_weights":
        return "balanced", None

    factory = _RESAMPLER_FACTORIES.get(canonical)
    if factory is None:
        valid = sorted({"class_weights", *_RESAMPLER_FACTORIES, *_IMBALANCE_METHOD_ALIASES})
        raise ValueError(
            f"Unknown imbalance_method {imbalance_method!r}; expected one of: {', '.join(valid)}"
        )
    return None, factory()


In [12]:
# Step 7: Run experiments for different imbalance methods.
# Each name must match, character for character, a method key checked inside
# run_imbalanced_experiment. Keep a trailing comma after every entry: adjacent
# string literals without a comma are concatenated into a single string.
imbalance_methods = [
    # Class weighting
    "class_weights",

    # Random resampling
    "random_oversampling",
    "random_undersampling",

    # SMOTE family
    "smote",
    "borderline_smote",
    "svm_smote",
    "adasyn",

    # Undersampling techniques
    "tomek_links",
    "enn",
    "allknn",
    "nearmiss",
    "cnn",
    "cluster_centroids",

    # Hybrid methods
    "smote_tomek",
    "smote_enn",
]


for method in imbalance_methods:
    print(f"\nRunning experiment with method: {method}")
    run_imbalanced_experiment(method)



Running experiment with method: class_weights
🏃 View run Imbalance_class_weights_RandomForest_TFIDF_Trigrams at: https://dagshub.com/satyajeetrai007/Youtube-Comment-Sentiment-Analysis.mlflow/#/experiments/4/runs/cab6f560a41c4c15aafd289a2d8231f0
🧪 View experiment at: https://dagshub.com/satyajeetrai007/Youtube-Comment-Sentiment-Analysis.mlflow/#/experiments/4

Running experiment with method: random_oversampling
🏃 View run Imbalance_random_oversampling_RandomForest_TFIDF_Trigrams at: https://dagshub.com/satyajeetrai007/Youtube-Comment-Sentiment-Analysis.mlflow/#/experiments/4/runs/6eaea8fa1f6c4503acd37d2f27b71c69
🧪 View experiment at: https://dagshub.com/satyajeetrai007/Youtube-Comment-Sentiment-Analysis.mlflow/#/experiments/4

Running experiment with method: random_undersampling
🏃 View run Imbalance_random_undersampling_RandomForest_TFIDF_Trigrams at: https://dagshub.com/satyajeetrai007/Youtube-Comment-Sentiment-Analysis.mlflow/#/experiments/4/runs/6e6a26758c7d421498818837b2e07b4e
🧪 Vi