## Vectorizations
We will explore which types of vectorization work best and provide the most lift for the model.
For instance, with BoW we can use:
1. unigrams
2. bigrams
3. trigrams

In [None]:
# Install the MLflow client together with boto3 and the AWS CLI.
# NOTE(review): boto3/awscli are presumably needed for an S3-backed MLflow
# artifact store -- confirm.
!pip install mlflow boto3 awscli



In [None]:
from google.colab import userdata

In [None]:
# Run the AWS CLI's `configure` command from the notebook shell.
# NOTE(review): presumably this sets up the credentials used when artifacts
# are uploaded -- confirm.
!aws configure

In [None]:
import mlflow

# Step 2: Set up the MLflow tracking server.
# The URI is read from the "MLFLOW_SERVER" entry of google.colab's `userdata`
# and passed through unchanged; wrapping it in an f-string added nothing and
# would have turned a non-string value into literal text such as "None".
mlflow.set_tracking_uri(userdata.get('MLFLOW_SERVER'))

In [None]:
# Make "Exp 2 - BoW vs TfIdf" the active MLflow experiment for the runs
# started later in this notebook.
mlflow.set_experiment(experiment_name="Exp 2 - BoW vs TfIdf")

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import os


In [None]:
# Load the preprocessed Reddit comments into the notebook-wide DataFrame `df`.
df = pd.read_csv(filepath_or_buffer="/content/reddit_preprocessing (1).csv")
# Last expression of the cell: displays (rows, columns).
df.shape

(36793, 2)

In [None]:
# Remove, in place, every row that has at least one missing value.
df.dropna(axis=0, how="any", inplace=True)
# Last expression of the cell: displays the shape after dropping rows.
df.shape

(36662, 2)

In [None]:
## TF-IDF stands for term frequency-inverse document frequency:
## a statistic that shows how important a word is to a document in a collection.

# Step 1: function to run an experiment

def run_experiment(vectorizer_type, ngram_range, vectorizer_max_features, vectorizer_name):
    """Train one Random Forest on vectorized comments and log the run to MLflow.

    Reads the module-level ``df`` (columns ``clean_comment`` and ``category``),
    makes a stratified 80/20 train/test split, vectorizes the text, fits a
    ``RandomForestClassifier`` and logs tags, parameters, metrics and a
    confusion-matrix image inside a single MLflow run.

    Args:
        vectorizer_type: ``"BoW"`` selects ``CountVectorizer``; any other
            value selects ``TfidfVectorizer``. Also logged as a run parameter.
        ngram_range: Passed to the vectorizer as ``ngram_range``.
        vectorizer_max_features: Passed to the vectorizer as ``max_features``.
        vectorizer_name: Label used in the run name, the description tag and
            the plot title.

    Returns:
        None. Side effects: an MLflow run is recorded and
        ``confusion_matrix.png`` is written to the current working directory.
    """
    # Step 2: vectorize
    if vectorizer_type == "BoW":
        vectorizer = CountVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)
    else:
        vectorizer = TfidfVectorizer(ngram_range=ngram_range, max_features=vectorizer_max_features)

    X_train, X_test, y_train, y_test = train_test_split(
        df['clean_comment'],
        df['category'],
        test_size=0.2,
        random_state=42,
        stratify=df['category'],
    )

    # The vectorizer is fitted on the training split only; the test split is
    # transformed with that already-fitted vectorizer.
    X_train = vectorizer.fit_transform(X_train)
    X_test = vectorizer.transform(X_test)

    # Step 3: train, evaluate and log inside one MLflow run
    with mlflow.start_run():
        mlflow.set_tag("mlflow.runName", f"{vectorizer_name}_{ngram_range}_RandomForest")
        mlflow.set_tag("experiment_type", "feature_engineering")
        # NOTE(review): "RandomForestClassifer" is misspelled; left unchanged
        # so the tag value stays the same as before -- confirm before renaming.
        mlflow.set_tag("model_type", "RandomForestClassifer")

        # Human-readable description of the run
        mlflow.set_tag("description", f"Random Forest with {vectorizer_name}, ngram range {ngram_range}, max features {vectorizer_max_features}")

        # Vectorizer parameters
        mlflow.log_param("vectorizer_type", vectorizer_type)
        mlflow.log_param("ngram_range", ngram_range)
        mlflow.log_param("vectorizer_max_features", vectorizer_max_features)

        # Random Forest hyperparameters (fixed for every run)
        n_estimators = 100
        max_depth = 15
        mlflow.log_param("n_estimators", n_estimators)
        mlflow.log_param("max_depth", max_depth)

        # Initialise and train the model
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
        model.fit(X_train, y_train)

        # Predict on the held-out split
        y_pred = model.predict(X_test)

        # Overall accuracy
        accuracy = accuracy_score(y_test, y_pred)
        mlflow.log_metric("accuracy", accuracy)

        # Classification report: only entries whose value is a dict of
        # metrics are logged, one MLflow metric per "<label>_<metric>" pair;
        # scalar entries are skipped.
        classification_rep = classification_report(y_test, y_pred, output_dict=True)
        for label, metrics in classification_rep.items():
            if isinstance(metrics, dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}", value)

        # Confusion matrix, saved as an image and attached to the run.
        conf_matrix = confusion_matrix(y_test, y_pred)
        fig = plt.figure(figsize=(8, 6))
        try:
            sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
            plt.xlabel("Predicted")
            plt.ylabel("Actual")
            plt.title(f"Confusion Matrix: {vectorizer_name}, {ngram_range}")
            plt.savefig("confusion_matrix.png")
            mlflow.log_artifact("confusion_matrix.png")
        finally:
            # Close the figure even if plotting, saving or uploading fails,
            # so repeated runs do not accumulate open figures.
            plt.close(fig)

# Each tuple is (min_n, max_n): unigrams only, unigrams + bigrams,
# unigrams + bigrams + trigrams.
ngram_ranges = [(1, 1), (1, 2), (1, 3)]
max_features = 5000  # Example max feature size

try:
    for ngram_range in ngram_ranges:
        # BoW Experiments
        run_experiment("BoW", ngram_range, max_features, vectorizer_name="BoW")

        # TF-IDF Experiments
        run_experiment("TF-IDF", ngram_range, max_features, vectorizer_name="TF-IDF")
finally:
    # Placed in `finally` so it really does run even when an experiment
    # raises; it ends whatever MLflow run is still active at that point.
    mlflow.end_run()
