In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the preprocessed comment dataset; the bare `df` below renders a preview of it.
# NOTE(review): the "Unnamed: 0.1" / "Unnamed: 0" columns in the rendered output look like
# row indices written out by earlier to_csv calls — confirm, and drop them upstream if unused.
df = pd.read_csv("processed.csv")
df

Unnamed: 0.1,Unnamed: 0,clean_comment,category
0,0,family mormon never tried explain still stare ...,1
1,1,buddhism much lot compatible christianity espe...,1
2,2,seriously say thing first get complex explain ...,-1
3,3,learned want teach different focus goal not wr...,0
4,4,benefit may want read living buddha living chr...,1
...,...,...,...
36241,37243,agree push make nation either pity pakistan in...,-1
36242,37244,jesus,0
36243,37245,kya bhai pure saal chutiya banaya modi aur jab...,1
36244,37246,downvote karna tha par upvote hogaya,0


In [4]:
# Discard every row that has a missing value in any column, rebinding `df`
# to the filtered frame.
df = df.dropna()

In [4]:
import dagshub

# Connect this notebook to the DagsHub repository, with its MLflow integration switched on.
dagshub.init(
    repo_name='Youtube-Comment-Analysis-Chrome-Plugin',
    repo_owner='rajatchauhan99',
    mlflow=True,
)

In [5]:
import mlflow

# Select the experiment that subsequent runs are recorded under
# (MLflow creates it when the name does not exist yet).
mlflow.set_experiment(experiment_name="Exp4 handling imbalanced dataset")

2024/10/19 17:01:19 INFO mlflow.tracking.fluent: Experiment with name 'Exp4 handling imbalanced dataset' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/811eea3d5aad4b89a7d3480bf056efe4', creation_time=1729337484649, experiment_id='3', last_update_time=1729337484649, lifecycle_stage='active', name='Exp4 handling imbalanced dataset', tags={}>

In [8]:
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.combine import SMOTEENN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os

We will try several techniques for handling class imbalance and see which one gives the best result.

In [9]:
# Imbalance-handling strategies accepted by run_imbalanced_experiment.
_SUPPORTED_IMBALANCE_METHODS = ('class_weights', 'oversampling', 'adasyn', 'undersampling', 'smote_enn')


def _make_resampler(imbalance_method):
    """Return the imblearn resampler for `imbalance_method`, or None when no resampling applies."""
    if imbalance_method == 'oversampling':
        return SMOTE(random_state=42)
    if imbalance_method == 'adasyn':
        return ADASYN(random_state=42)
    if imbalance_method == 'undersampling':
        return RandomUnderSampler(random_state=42)
    if imbalance_method == 'smote_enn':
        return SMOTEENN(random_state=42)
    return None  # 'class_weights' is handled by the classifier, not by resampling


def run_imbalanced_experiment(imbalance_method):
    """Train and log one Random Forest run using the given imbalance-handling strategy.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``), makes a
    stratified 80/20 train/test split, fits a TF-IDF vectorizer on the training text only,
    applies the imbalance strategy to the training set only, trains a
    ``RandomForestClassifier``, and logs parameters, metrics, a confusion-matrix image and
    the fitted model to the active MLflow experiment.

    Parameters
    ----------
    imbalance_method : str
        One of ``'class_weights'`` (``class_weight='balanced'`` on the forest),
        ``'oversampling'`` (SMOTE), ``'adasyn'``, ``'undersampling'``
        (RandomUnderSampler) or ``'smote_enn'`` (SMOTEENN).

    Raises
    ------
    ValueError
        If ``imbalance_method`` is not one of the supported names. Previously such a value
        silently trained without any imbalance handling while still being logged under
        that name.

    Returns
    -------
    None
        Results are recorded in MLflow; ``confusion_matrix_<method>.png`` is also written
        to the current working directory.
    """
    # Step 1: Validate the requested strategy before doing any work
    if imbalance_method not in _SUPPORTED_IMBALANCE_METHODS:
        raise ValueError(
            f"Unknown imbalance_method {imbalance_method!r}; "
            f"expected one of {_SUPPORTED_IMBALANCE_METHODS}"
        )

    ngram_range = (1, 3)  # Trigram setting
    max_features = 1000  # Set max_features to 1000 for TF-IDF
    n_estimators = 200
    max_depth = 15

    # Step 2: Train-test split before vectorization and resampling
    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_comment'], df['category'], test_size=0.2, random_state=42, stratify=df['category']
    )

    # Step 3: Vectorization using TF-IDF, fit on training data only
    vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=max_features)
    X_train_vec = vectorizer.fit_transform(X_train)  # Fit on training data
    X_test_vec = vectorizer.transform(X_test)  # Transform test data

    # Step 4: Handle class imbalance (only ever applied to the training set)
    class_weight = 'balanced' if imbalance_method == 'class_weights' else None
    resampler = _make_resampler(imbalance_method)
    if resampler is not None:
        X_train_vec, y_train = resampler.fit_resample(X_train_vec, y_train)

    # Step 5: Define and train a Random Forest model inside an MLflow run
    with mlflow.start_run():
        # Tags describing the run (sent in one call)
        mlflow.set_tags({
            "mlflow.runName": f"Imbalance_{imbalance_method}_RandomForest_TFIDF_Trigrams",
            "experiment_type": "imbalance_handling",
            "model_type": "RandomForestClassifier",
            "description": f"RandomForest with TF-IDF Trigrams, imbalance handling method={imbalance_method}",
        })

        # Vectorizer and model parameters (sent in one call)
        mlflow.log_params({
            "vectorizer_type": "TF-IDF",
            "ngram_range": ngram_range,
            "vectorizer_max_features": max_features,
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "imbalance_method": imbalance_method,
        })

        # Initialize and train the model
        model = RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth, random_state=42, class_weight=class_weight
        )
        model.fit(X_train_vec, y_train)

        # Step 6: Make predictions and log metrics
        y_pred = model.predict(X_test_vec)

        metrics_to_log = {"accuracy": accuracy_score(y_test, y_pred)}
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, label_metrics in classification_rep.items():
            # The top-level "accuracy" entry is a bare float, not a dict; it is logged above.
            if isinstance(label_metrics, dict):
                for metric, value in label_metrics.items():
                    metrics_to_log[f"{label}_{metric}"] = float(value)
        mlflow.log_metrics(metrics_to_log)

        # Step 7: Log the confusion matrix with the real class values on both axes
        # (without explicit tick labels the heatmap shows positional indices 0..n-1).
        class_labels = sorted(set(y_test) | set(y_pred))
        conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
        confusion_matrix_filename = f"confusion_matrix_{imbalance_method}.png"
        fig, ax = plt.subplots(figsize=(8, 6))
        try:
            sns.heatmap(
                conf_matrix, annot=True, fmt="d", cmap="Blues",
                xticklabels=class_labels, yticklabels=class_labels, ax=ax,
            )
            ax.set_xlabel("Predicted")
            ax.set_ylabel("Actual")
            ax.set_title(f"Confusion Matrix: TF-IDF Trigrams, Imbalance={imbalance_method}")
            fig.savefig(confusion_matrix_filename)
            mlflow.log_artifact(confusion_matrix_filename)
        finally:
            # Always release the figure, even if saving or uploading fails.
            plt.close(fig)

        # Step 8: Log the model
        mlflow.sklearn.log_model(model, f"random_forest_model_tfidf_trigrams_imbalance_{imbalance_method}")

# Run one tracked experiment per imbalance-handling strategy, in the order listed.
imbalance_methods = [
    'class_weights',
    'oversampling',
    'adasyn',
    'undersampling',
    'smote_enn',
]

for method in imbalance_methods:
    run_imbalanced_experiment(method)


2024/10/19 17:11:38 INFO mlflow.tracking._tracking_service.client: 🏃 View run Imbalance_class_weights_RandomForest_TFIDF_Trigrams at: https://dagshub.com/rajatchauhan99/Youtube-Comment-Analysis-Chrome-Plugin.mlflow/#/experiments/3/runs/414c417a67fa40e191f9f51bb0624bcd.
2024/10/19 17:11:38 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/rajatchauhan99/Youtube-Comment-Analysis-Chrome-Plugin.mlflow/#/experiments/3.
2024/10/19 17:12:35 INFO mlflow.tracking._tracking_service.client: 🏃 View run Imbalance_oversampling_RandomForest_TFIDF_Trigrams at: https://dagshub.com/rajatchauhan99/Youtube-Comment-Analysis-Chrome-Plugin.mlflow/#/experiments/3/runs/ce9019b182764e1cb393b1fcbd136e4c.
2024/10/19 17:12:35 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/rajatchauhan99/Youtube-Comment-Analysis-Chrome-Plugin.mlflow/#/experiments/3.
2024/10/19 17:13:41 INFO mlflow.tracking._tracking_service.client: 🏃 View run Imbalance

Oversampling with SMOTE (Synthetic Minority Over-sampling Technique) gives the best result.