Assumptions and Notes:
* Data already has a 'Processed Text' column, to ensure similar process between the 2 notebooks - This one, for model training, and the other one for unsupervised analysis and finalization of xlsx file with the predicted classes.

* In the tagged training data, labels appear in 'Themes' column, seperated by ', '.

* If you wish to further test the models created, you should have an additional test.csv tagged file to upload for the 'Evaluation' section. The quality of these models is assured using cross validation on a full dataset.

# Imports

In [11]:
!pip install -q optuna
!pip install scikit-optimize



In [2]:
# Basic Python Libraries
import numpy as np
import pandas as pd
import string
import ast  # For converting string representations of lists to actual lists

import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

# Machine Learning and Text Analysis
from copy import deepcopy
import optuna
import logging
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import precision_score, accuracy_score, f1_score, classification_report, recall_score, confusion_matrix, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import cross_val_score, StratifiedKFold
from skopt import gp_minimize

# Optional: Google Colab Specific Library (if using Google Colab)
import os
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.discovery import build
from google.colab import files

# Export Model:
import pickle

# Get Train Data

In [3]:
import os
from google.colab import files

# Define the filename
filename = 'SecretKeys.py'

# Check if the file exists
if not os.path.exists(filename):
    print(f"{filename} not found. Please upload the file.")
    uploaded = files.upload()

# Import the module if it exists
if os.path.exists(filename):
    from SecretKeys import *
    print("File uploaded and module imported successfully.")
else:
    print(f"{filename} not found after upload attempt. Please check the file and try again.")

SecretKeys.py not found. Please upload the file.


Saving SecretKeys.py to SecretKeys.py
File uploaded and module imported successfully.


In [4]:
scope = ['https://spreadsheets.google.com/feeds', 'https://www.googleapis.com/auth/drive', 'https://www.googleapis.com/auth/documents.readonly']
credentials = ServiceAccountCredentials.from_json_keyfile_dict(GoogleDriveCreds, scope)
gc = gspread.authorize(credentials)

# 1. Access sending list as DataFrame
file_id = "1BEq6zEIWtBXT0zAk2ECQt-zHCN1CQ4EpRyEqigQ01UY"
sh = gc.open_by_key(file_id)

worksheet_name = "Train Data"
tagged_data = pd.DataFrame(sh.worksheet(worksheet_name).get_all_records())
# Keep only relevant columns and drop nulls - will affect the integrity of the automation
tagged_data.replace("", np.nan, inplace=True)
tagged_data.dropna(how='any', inplace=True)
tagged_data = tagged_data.drop_duplicates()

In [5]:
tagged_data.head()

Unnamed: 0,Source,Original Text,Processed Text,KeyWords,Themes
0,Undocumented,i appreciate that most of my managers are ther...,appreciate manager answer question provide ass...,"['manager', 'provide', 'appreciate', 'question...",management
1,Undocumented,my manager: in this case would be my team lead...,manager case team leader sure currently manage...,"['team', 'manager', 'best', 'person', 'helpful...",management
2,Undocumented,my manager hands down has been the best manage...,manager hand best manager year quick . approac...,"['need', 'manager', 'help', 'quick', 'year', '...","management, employee care and listening"
3,Undocumented,"my manager has always been ready, willing & ab...",manager ready willing able answer question ari...,"['work', 'manager', 'able', 'order', 'willing'...",management
4,Undocumented,mr. gray is always available to answer questio...,mr. gray available answer question unsure hand...,"['issue', 'available', 'shipment', 'internatio...",management


# Train Preprocess and Split (Train & Test)

Binarize Columns, split data, balance train

In [6]:
def _BinaryThemeColumns(tagged_data, themes_column='Themes'):
    """
    This function takes a DataFrame and creates binary columns for each unique theme.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    themes_column (str): The name of the column containing the list of themes.

    Returns:
    pd.DataFrame: The DataFrame with binary columns for each theme.
    """
    # Create a copy of the DataFrame to avoid modifying the original one
    df_copy = tagged_data.copy()

    # Split themes by comma and strip whitespace
    df_copy[themes_column] = df_copy[themes_column].apply(lambda x: [theme.strip() for theme in x.split(',')])

    # Get all unique themes
    unique_themes = set(theme for sublist in df_copy[themes_column] for theme in sublist)

    # Create binary columns for each unique theme
    for theme in unique_themes:
        df_copy[theme] = df_copy[themes_column].apply(lambda x: 1 if theme in x else 0)

    return df_copy


def TrainTestThemeSplit(tagged_data, themes_column = 'Themes', test_size=0.2, ratio = None, random_state=42):
    """
    Binarize the themes column using _BinaryThemeColumns()
    Split data into train and test sets for each theme, ensuring balance.

    Parameters:
    df (pd.DataFrame): The input DataFrame with binary theme columns.
    themes_column (str): The column containing the list of themes for each row.
    test_size (float): The proportion of the dataset to include in the test split.
    ratio (int): The ratio of non-theme rows you want in the data, for example, 3 for 1:3.
                  If input is None, will train the model compared to all of the elsewise tagged data.
    random_state (int): The seed used by the random number generator.

    Returns:
    dict: Dictionary with training and testing sets for each theme.
    """
    theme_datasets = {}
    tagged_data_b = _BinaryThemeColumns(tagged_data, themes_column)
    unique_themes = set(theme for sublist in tagged_data_b[themes_column] for theme in sublist)

    for theme in unique_themes:
        # Get all rows with the current theme
        theme_df = tagged_data_b[tagged_data_b[theme] == 1].copy()

        # Get all rows without the current theme
        non_theme_df = tagged_data_b[tagged_data_b[theme] == 0].copy()

        # Sample negative samples
        if ratio:
            non_theme_sample_size = min(len(theme_df) * ratio, len(non_theme_df))
            non_theme_df = non_theme_df.sample(n=non_theme_sample_size, random_state=random_state)

        # Split into train and test sets, ensure both positive and negative samples in both
        if len(theme_df) > 1:
            train_df_p, test_df_p = train_test_split(theme_df, test_size=test_size, random_state=random_state)
        else:
            train_df_p, test_df_p = theme_df, pd.DataFrame()

        if len(non_theme_df) > 1:
            train_df_n, test_df_n = train_test_split(non_theme_df, test_size=test_size, random_state=random_state)
        else:
            train_df_n, test_df_n = non_theme_df, pd.DataFrame()

        train_df = pd.concat([train_df_p, train_df_n]).sample(frac=1, random_state=random_state).reset_index(drop=True)
        test_df = pd.concat([test_df_p, test_df_n]).sample(frac=1, random_state=random_state).reset_index(drop=True)

        # Store in dictionary
        theme_datasets[theme] = {
            'train': train_df,
            'test': test_df,
            'full_ds': tagged_data_b
        }

    return theme_datasets

In [7]:
theme_datasets = TrainTestThemeSplit(tagged_data, ratio = None)

In [8]:
for theme in theme_datasets:
  print(f'{theme}:', ' Train Size=', len(theme_datasets[theme]['train']),
        '| Test Size=', len(theme_datasets[theme]['test']), '| DS Size=',len(theme_datasets[theme]['full_ds']))

innovation:  Train Size= 1045 | Test Size= 263 | DS Size= 1308
stability:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
hr:  Train Size= 1045 | Test Size= 263 | DS Size= 1308
company culture:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
overall engaged/satisfied:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
fairness / equality / inclusion:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
work processes:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
challenging work:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
cross-functional collaboration:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
within team collaboration:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
recognition:  Train Size= 1045 | Test Size= 263 | DS Size= 1308
flexibility:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
salary / compensation:  Train Size= 1045 | Test Size= 263 | DS Size= 1308
workload / work-life balance:  Train Size= 1046 | Test Size= 262 | DS Size= 1308
sick / vac

# Model Creation

## Regular Training Flow

Each model will output a probabilities per class prediction. Then, the following process takes place:

Normalize Predictions -> Find Optimal Tresholds per label -> Apply Thresholds (Get Binary Preds) -> Evaluation (All using train-test sets).

This process is a relatively fast process, and it gets good results, using a minimal optimization. I do recommend using the Optuna based process, as it will attempt to find the best model possible for the classification problem.

In [None]:
def FindOptimalThreshold(y_true, y_probs, metric, border_thresh_for_scan = (0.1, 1)):
  """
  Find the optimal threshold to maximize the given metric.

  Parameters:
  y_true (array-like): True labels.
  y_probs (array-like): Predicted probabilities.
  metric (str): Metric to maximize ('precision', 'accuracy', 'f1').

  Returns:
  float: Optimal threshold.
  """
  best_threshold = 0.5
  best_score = 0

  min_thresh, max_thresh = border_thresh_for_scan
  thresholds = np.arange(min_thresh, max_thresh, 0.01)
  for threshold in thresholds:
      y_pred = (y_probs >= threshold).astype(int)
      if metric == 'precision':
          score = precision_score(y_true, y_pred)
      elif metric == 'accuracy':
          score = accuracy_score(y_true, y_pred)
      elif metric == 'f1':
          score = f1_score(y_true, y_pred)
      else:
          raise ValueError("Unsupported metric: choose from 'precision', 'accuracy', 'f1'")

      if score > best_score:
          best_score = score
          best_threshold = threshold

  return best_threshold



def TrainModels(theme_datasets, text_column='Processed Text', metric='precision', min_quality_thresh = 0.5, data_size_thresh = 30):
  """
  Train multiple models for each theme, evaluate their performance, and find optimal thresholds.

  Parameters:
  theme_datasets (dict): Dictionary containing training and testing sets for each theme.
  text_column (str): The column containing the text data.
  metric (str): Metric to maximize ('precision', 'accuracy', 'f1').
  min_quality_thresh (float): The minimum acceptable classification score by the model, to keep as a valid model.
  data_size_thresh (int): Min number of positive samples in the train to deem this model sufficient.

  Returns:
  dict: Dictionary containing the best models, classification reports, optimal thresholds, and vectorizer.
  """
  results = {}

  # Fit the TF-IDF vectorizer on the entire training dataset
  vectorizer = TfidfVectorizer()
  all_train_texts = pd.concat([datasets['train'][text_column] for datasets in theme_datasets.values()]).drop_duplicates()
  vectorizer.fit(all_train_texts)

  # Define the models to evaluate
  base_models = [
      ('SVM', SVC(probability=True, random_state=42)),
      ('RandomForest', RandomForestClassifier(random_state=42)),
      ('AdaBoost', AdaBoostClassifier(random_state=42))
  ]

  for theme, datasets in theme_datasets.items():
      print(f"\n[Runtime Status]: Training for theme {theme}")
      train_worthy = True
      train_df = datasets['train']
      test_df = datasets['test']
      full_ds = datasets['full_ds']

      best_model = None
      best_threshold = min_quality_thresh
      best_score = 0
      best_model_name = None
      best_conf_mat_test = None
      best_conf_mat_train = None
      best_conf_mat_full = None

      if len(train_df[train_df[theme] == 1]) <= data_size_thresh:
        train_worthy = False

      if train_worthy:
        X_train = vectorizer.transform(train_df[text_column])
        X_test = vectorizer.transform(test_df[text_column])
        X_full = vectorizer.transform(full_ds[text_column])

        for model_name, base_model in base_models:

            # Train the model
            model = deepcopy(base_model)
            model.fit(X_train, train_df[theme])

            # Find optimal thresh based on train & Predict probabilities (Train)
            train_proba = model.predict_proba(X_train)[:, 1]
            optimal_threshold = FindOptimalThreshold(train_df[theme], train_proba, metric)
            train_pred = (train_proba >= optimal_threshold).astype(int)

            # Predict probabilities (Test)
            test_proba = model.predict_proba(X_test)[:, 1]
            test_pred = (test_proba >= optimal_threshold).astype(int)

            # Predict probabilities (Full DS)
            full_data_proba = model.predict_proba(X_full)[:, 1]
            full_data_pred = (full_data_proba >= optimal_threshold).astype(int)

            # Calculate the score for the current model
            if metric == 'precision':
                score = precision_score(test_df[theme], test_pred)
            elif metric == 'accuracy':
                score = accuracy_score(test_df[theme], test_pred)
            elif metric == 'f1':
                score = f1_score(test_df[theme], test_pred)

            # Select the best model based on the score, increase thresh if possible
            if score >= best_score:
                best_model = model
                best_threshold = optimal_threshold
                best_score = score
                best_model_name = model_name
                tn, fp, fn, tp = confusion_matrix(test_df[theme], test_pred).ravel()
                best_conf_mat_test = {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn}
                tn, fp, fn, tp = confusion_matrix(train_df[theme], train_pred).ravel()
                best_conf_mat_train = {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn}
                tn, fp, fn, tp = confusion_matrix(full_ds[theme], full_data_pred).ravel()
                best_conf_mat_full = {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn}

      print(f"[Runtime Status]: Best model was {best_model_name}, with thresh: {best_threshold}")
      print(f"[Runtime Status]: Best model's score in the metric is {round(best_score,3)}")

      if best_conf_mat_test:
        print(f"[Runtime Status]: Confusion matrix test: {best_conf_mat_test}")
        print(f"[Runtime Status]: Confusion matrix train: {best_conf_mat_train}")
        print(f"[Runtime Status]: Confusion matrix full tagged dataset: {best_conf_mat_full}")

      # Store the best model, report, and optimal threshold
      results[theme] = {
          'model': best_model,
          'optimal_threshold': best_threshold,
          'confusion_matrix_test': best_conf_mat_test,
          'confusion_matrix_train': best_conf_mat_train,
          'confusion_matrix_full': best_conf_mat_full,
      }

  results['vectorizer'] = vectorizer
  return results

In [None]:
training_results = TrainModels(theme_datasets, metric='precision')
with open('ThemeClassifierModels.pkl', 'wb') as f:
    pickle.dump(training_results, f)


[Runtime Status]: Training for theme growth / career development
[Runtime Status]: Best model was SVM, with thresh: 0.9099999999999996
[Runtime Status]: Best model's score in the metric is 1.0
[Runtime Status]: Confusion matrix test: {'TP': 5, 'FP': 0, 'FN': 16, 'TN': 242}
[Runtime Status]: Confusion matrix train: {'TP': 80, 'FP': 0, 'FN': 0, 'TN': 965}
[Runtime Status]: Confusion matrix full tagged dataset: {'TP': 85, 'FP': 0, 'FN': 16, 'TN': 1207}

[Runtime Status]: Training for theme office environment
[Runtime Status]: Best model was SVM, with thresh: 0.6899999999999997
[Runtime Status]: Best model's score in the metric is 0.943
[Runtime Status]: Confusion matrix test: {'TP': 33, 'FP': 2, 'FN': 15, 'TN': 212}
[Runtime Status]: Confusion matrix train: {'TP': 192, 'FP': 0, 'FN': 0, 'TN': 854}
[Runtime Status]: Confusion matrix full tagged dataset: {'TP': 225, 'FP': 2, 'FN': 15, 'TN': 1066}

[Runtime Status]: Training for theme hr
[Runtime Status]: Best model was None, with thresh: 0

## Optimal Training using Optuna

This is an enhanced training process, aimed at creating the most accurate model for each theme. This process is a lot more expensive computationally, and should be considered as such.

For each theme, an optimization flow will take place, in which it will attamt to optimize it's hyperparameters for stable yet great results. After each model type it trained to it's best result, the best model will be chosen -> The model for which the best threshold yields the best results in the cross validation. This will be the final model chosen.

NOTES:  

*   Using ThreadPool might speedup the process, but might also mess the log prints at the current version. For me it's not important enough, but feel free to explore.
*   This training process is built on very small training set, so the models experimented with are relatively simple models.
* Given the nature of the problem as a multi-class multi-label problem, I chose to maximize the Percision score per class, while allowing the model to improve in recall, if reached percision = 100%. For computation considerations, I chose to stop training with a simple model, if the model reached percision >= 90% and recall >= 50%. This of course is flexible.




In [9]:
# Set up logging to minimize Optuna's output verbosity
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Hyperparameter search space for different models
model_hyperparameters = {
    'LogisticRegression': {
        'C': (1e-4, 1e2, 'loguniform'),
        'penalty': (['l2'], 'categorical'),
        'fit_intercept': ([True, False], 'categorical'),
    },
    'SVM': {
        'C': (1e-4, 1e2, 'loguniform'),
        'kernel': (['linear', 'poly', 'rbf', 'sigmoid'], 'categorical'),
        'degree': (2, 5, 'int'),  # Only relevant for 'poly' kernel
        'gamma': (['scale', 'auto'], 'categorical'),
        'coef0': (-1, 1, 'uniform'),  # Used for 'poly' and 'sigmoid'
    },
    'RandomForest': {
        'n_estimators': (50, 200, 'int'),
        'max_depth': (5, 30, 'int'),
        'min_samples_split': (2, 10, 'int'),
        'min_samples_leaf': (1, 5, 'int'),
    },
    'AdaBoost': {
        'n_estimators': (50, 200, 'int'),
        'learning_rate': (0.01, 1.0, 'loguniform'),
    }
}

def _suggest_hyperparameters(trial, hyperparams):
    """
    Suggest hyperparameters for a model using the Optuna trial object.
    """
    params = {}
    for key, value in hyperparams.items():
        if len(value) == 2 and value[1] == 'categorical':
            params[key] = trial.suggest_categorical(key, value[0])
        elif len(value) == 3:
            if value[2] == 'loguniform':
                params[key] = trial.suggest_float(key, value[0], value[1], log=True)
            elif value[2] == 'uniform':
                params[key] = trial.suggest_float(key, value[0], value[1])
            elif value[2] == 'int':
                params[key] = trial.suggest_int(key, value[0], value[1])
            elif value[2] == 'categorical':
                params[key] = trial.suggest_categorical(key, value[0])
            else:
                raise ValueError(f"Unknown value format for hyperparameter {key}: {value}")
    return params

def _find_optimal_threshold(y_true, y_probs, scoring_metric):
    """
    Find the optimal threshold using Bayesian optimization to maximize the given metric.
    """

    def objective(threshold):
        y_pred = (y_probs >= threshold).astype(int)
        return -scoring_metric(y_true, y_pred)

    result = gp_minimize(objective, [(0.1, 1.0)], n_calls=20)

    return result.x[0]  # The best threshold found


def _objective(trial, model, model_name, X_full, y_full, model_hyperparameters, scoring_metric):
    """
    Objective function for Optuna to optimize based on both model parameters and threshold.
    """

    # Get hyperparameters for the model
    hyperparams = model_hyperparameters[model_name]
    params = _suggest_hyperparameters(trial, hyperparams)

    # Set model parameters
    model.set_params(**params)

    # Train the model
    model.fit(X_full, y_full)

    # Predict probabilities
    y_probs = model.predict_proba(X_full)[:, 1]

    # Find the best threshold for this set of hyperparameters
    best_threshold = _find_optimal_threshold(y_full, y_probs, scoring_metric)

    # Convert probabilities to binary predictions based on the best threshold
    y_pred = (y_probs >= best_threshold).astype(int)

    # Evaluate the model with the chosen threshold
    score = scoring_metric(y_full, y_pred)

    # Log the threshold and score for tracking
    trial.set_user_attr("threshold", best_threshold)

    return score


def _optimize_model_with_optuna(model, model_name, X_full, y_full, model_hyperparameters, scoring_metric, n_trials=100, timeout=1200):
    """
    Optimize a machine learning model using Optuna for hyperparameter and threshold tuning.
    """
    study = optuna.create_study(direction='maximize')

    print(f"--> [Model Selection Status]: Starting Optuna optimization for {model_name} with {n_trials} trials.")

    # Optimize both hyperparameters and the threshold
    study.optimize(lambda trial: _objective(trial, model, model_name, X_full, y_full, model_hyperparameters, scoring_metric),
                   n_trials=n_trials, timeout=timeout)

    # Retrieve the best parameters and threshold from the study
    best_params = study.best_params
    best_value = study.best_value
    best_threshold = study.best_trial.user_attrs["threshold"]

    print(f"--> [Model Selection Status]: Finished Optuna optimization for {model_name}. Best metric score: {best_value}, Best threshold: {best_threshold}")

    return best_params, best_threshold, best_value


def TrainModelsWithOptuna(theme_datasets, text_column='Processed Text', metric=precision_score, min_quality_thresh=0.5,
                          sufficient_quality_thresh=0.9, sufficient_recall_thresh=0.5, data_size_thresh=30, n_trials=50, timeout=180):
    """
    Train models for each theme using Optuna to find the best hyperparameters and threshold.

    Args:
    - theme_datasets (dict): Datasets for each theme, with text and labels.
    - text_column (str): The column containing the processed text. Default is 'Processed Text'.
    - metric (callable): Metric to optimize (e.g., f1_score). Default is F1.
    - min_quality_thresh (float): Minimum acceptable score to keep the model. Default is 0.5.
    - sufficient_quality_thresh (float): Score where there is no need to test additional models types (refrain from relying on heavy models). Default is 0.9.
    - sufficient_recall_thresh (float): Ratio of TP/(TP+FN) for stopping early. Default is 0.5.
    - data_size_thresh (int): Minimum number of samples required to train a model. Default is 30.
    - n_trials (int): Number of Optuna trials to run. Default is 50.
    - timeout (int): Maximum time (seconds) to spend on optimization per theme. Default is 180.

    Returns:
    - results (dict): Best model, hyperparameters, threshold, and performance for each theme.
    """
    results = {}

    # Fit the TF-IDF vectorizer on the entire training dataset
    vectorizer = TfidfVectorizer()
    all_train_texts = pd.concat([datasets['train'][text_column] for datasets in theme_datasets.values()]).drop_duplicates()
    vectorizer.fit(all_train_texts)

    # Define the models to evaluate with Optuna
    candidate_models = {
        'LogisticRegression': LogisticRegression(random_state=42, max_iter=1000),
        'SVM': SVC(probability=True, random_state=42),
        'RandomForest': RandomForestClassifier(random_state=42),
        'AdaBoost': AdaBoostClassifier(random_state=42)
    }

    for theme, datasets in theme_datasets.items():
        print(f"\n[Runtime Status]: Starting training for theme: {theme}")

        train_df = datasets['train']
        full_ds = datasets['full_ds']

        best_model = None
        best_threshold = 0
        best_score = min_quality_thresh
        best_model_recall = 0
        best_model_name = None
        best_conf_mat_full = None

        if len(train_df[train_df[theme] == 1]) <= data_size_thresh:
            print(f"[Runtime Status]: Skipping theme {theme} due to insufficient positive samples (less than {data_size_thresh} positive records).")
            continue

        X_full = vectorizer.transform(full_ds[text_column])
        y_full = full_ds[theme]

        for model_name, base_model in candidate_models.items():
            print(f"[Runtime Status]: Optimizing model {model_name} for theme {theme}")

            model_copy = deepcopy(base_model)

            # Optimize model with Optuna (both hyperparameters and threshold)
            best_params, best_optuna_threshold, model_optuna_score = _optimize_model_with_optuna(
                model_copy, model_name, X_full, y_full, model_hyperparameters, metric, n_trials=n_trials, timeout=timeout
            )

            if model_optuna_score >= best_score:
              # Train the model with the best parameters
              optimized_model = model_copy.set_params(**best_params)
              optimized_model.fit(X_full, y_full)

              # Predict probabilities (Full DS)
              full_data_proba = optimized_model.predict_proba(X_full)[:, 1]
              full_data_pred = (full_data_proba >= best_optuna_threshold).astype(int)

              # Calculate recall for the optimised model
              total_model_recall_score = recall_score(y_full, full_data_pred) # For stopping criteria

              # Select the best model based on the score (CV Optuna), while allowing model's with similar score but higher recall to be picked
              if (model_optuna_score > best_score) or (total_model_recall_score > best_model_recall):
                best_model = optimized_model
                best_threshold = best_optuna_threshold
                best_score = model_optuna_score
                best_model_recall = total_model_recall_score
                best_model_name = model_name
                tn, fp, fn, tp = confusion_matrix(y_full, full_data_pred).ravel()
                best_conf_mat_full = {'TP': tp, 'FP': fp, 'FN': fn, 'TN': tn}

                # Early stop for the optimization process if a good enough result found:
                if best_score >= sufficient_quality_thresh and total_model_recall_score > sufficient_recall_thresh:
                  print('[Runtime Status]: Model score exceeded the required threshold, stopping training for this theme')
                  break  # Stop further model evaluation

        if not best_model_name:
          print(f"[Runtime Status]: Finished training for theme {theme}, but no model achieved sufficient results. No instance will be saved.")
        else:
          # After the loop ends (whether early stopped or not), store the final model and its results
          print(f"[Runtime Status]: Finished training for theme {theme}. Chose model: {best_model_name}")
          print(f"[Runtime Status]: Best model score on full data set: {round(best_score, 3)}, Recall={best_model_recall}")
          print(f"[Runtime Status]: Optimal threshold: {best_threshold}")

          if best_conf_mat_full:
              print(f"--> [Runtime Status]: Confusion matrix full tagged dataset: {best_conf_mat_full}")

          # Store the best model, report, and optimal threshold for the theme
          best_model_instance = deepcopy(best_model)
          results[theme] = {
              'model': deepcopy(best_model),
              'optimal_threshold': deepcopy(best_threshold),
              'confusion_matrix_full': deepcopy(best_conf_mat_full)
          }
    results['vectorizer'] = vectorizer
    return results

In [10]:
training_results = TrainModelsWithOptuna(theme_datasets)
with open('ThemeClassifierModels.pkl', 'wb') as f:
    pickle.dump(training_results, f)


[Runtime Status]: Starting training for theme: innovation
[Runtime Status]: Skipping theme innovation due to insufficient positive samples (less than 30 positive records).

[Runtime Status]: Starting training for theme: stability
[Runtime Status]: Skipping theme stability due to insufficient positive samples (less than 30 positive records).

[Runtime Status]: Starting training for theme: hr
[Runtime Status]: Skipping theme hr due to insufficient positive samples (less than 30 positive records).

[Runtime Status]: Starting training for theme: company culture
[Runtime Status]: Optimizing model LogisticRegression for theme company culture
--> [Model Selection Status]: Starting Optuna optimization for LogisticRegression with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for LogisticRegression. Best metric score: 1.0, Best threshold: 0.592859049685727
[Runtime Status]: Optimizing model SVM for theme company culture
--> [Model Selection Status]: Starting Optuna optim



--> [Model Selection Status]: Finished Optuna optimization for LogisticRegression. Best metric score: 1.0, Best threshold: 0.745396592017376
[Runtime Status]: Model score exceeded the required threshold, stopping training for this theme
[Runtime Status]: Finished training for theme cross-functional collaboration. Chose model: LogisticRegression
[Runtime Status]: Best model score on full data set: 1.0, Recall=1.0
[Runtime Status]: Optimal threshold: 0.745396592017376
--> [Runtime Status]: Confusion matrix full tagged dataset: {'TP': 53, 'FP': 0, 'FN': 0, 'TN': 1255}

[Runtime Status]: Starting training for theme: within team collaboration
[Runtime Status]: Optimizing model LogisticRegression for theme within team collaboration
--> [Model Selection Status]: Starting Optuna optimization for LogisticRegression with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for LogisticRegression. Best metric score: 1.0, Best threshold: 0.6977615938777294
[Runtime Status]: Optimi



--> [Model Selection Status]: Finished Optuna optimization for LogisticRegression. Best metric score: 1.0, Best threshold: 0.6629272283320496
[Runtime Status]: Optimizing model SVM for theme recognition
--> [Model Selection Status]: Starting Optuna optimization for SVM with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for SVM. Best metric score: 1.0, Best threshold: 0.7508685633395493
[Runtime Status]: Model score exceeded the required threshold, stopping training for this theme
[Runtime Status]: Finished training for theme recognition. Chose model: SVM
[Runtime Status]: Best model score on full data set: 1.0, Recall=1.0
[Runtime Status]: Optimal threshold: 0.7508685633395493
--> [Runtime Status]: Confusion matrix full tagged dataset: {'TP': 67, 'FP': 0, 'FN': 0, 'TN': 1241}

[Runtime Status]: Starting training for theme: flexibility
[Runtime Status]: Optimizing model LogisticRegression for theme flexibility
--> [Model Selection Status]: Starting Optuna optimiz



--> [Model Selection Status]: Finished Optuna optimization for LogisticRegression. Best metric score: 1.0, Best threshold: 0.5042268189708289
[Runtime Status]: Optimizing model SVM for theme salary / compensation
--> [Model Selection Status]: Starting Optuna optimization for SVM with 50 trials.




--> [Model Selection Status]: Finished Optuna optimization for SVM. Best metric score: 1.0, Best threshold: 0.9258108497145747
[Runtime Status]: Model score exceeded the required threshold, stopping training for this theme
[Runtime Status]: Finished training for theme salary / compensation. Chose model: SVM
[Runtime Status]: Best model score on full data set: 1.0, Recall=0.5373134328358209
[Runtime Status]: Optimal threshold: 0.9258108497145747
--> [Runtime Status]: Confusion matrix full tagged dataset: {'TP': 36, 'FP': 0, 'FN': 31, 'TN': 1241}

[Runtime Status]: Starting training for theme: workload / work-life balance
[Runtime Status]: Optimizing model LogisticRegression for theme workload / work-life balance
--> [Model Selection Status]: Starting Optuna optimization for LogisticRegression with 50 trials.




--> [Model Selection Status]: Finished Optuna optimization for LogisticRegression. Best metric score: 1.0, Best threshold: 0.5164249377073273
[Runtime Status]: Optimizing model SVM for theme workload / work-life balance
--> [Model Selection Status]: Starting Optuna optimization for SVM with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for SVM. Best metric score: 1.0, Best threshold: 0.5493207392090165
[Runtime Status]: Model score exceeded the required threshold, stopping training for this theme
[Runtime Status]: Finished training for theme workload / work-life balance. Chose model: SVM
[Runtime Status]: Best model score on full data set: 1.0, Recall=1.0
[Runtime Status]: Optimal threshold: 0.5493207392090165
--> [Runtime Status]: Confusion matrix full tagged dataset: {'TP': 125, 'FP': 0, 'FN': 0, 'TN': 1183}

[Runtime Status]: Starting training for theme: sick / vacation days
[Runtime Status]: Skipping theme sick / vacation days due to insufficient positive sa



--> [Model Selection Status]: Finished Optuna optimization for LogisticRegression. Best metric score: 1.0, Best threshold: 0.5430691805586467
[Runtime Status]: Optimizing model SVM for theme management
--> [Model Selection Status]: Starting Optuna optimization for SVM with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for SVM. Best metric score: 1.0, Best threshold: 0.8462131984953278
[Runtime Status]: Optimizing model RandomForest for theme management
--> [Model Selection Status]: Starting Optuna optimization for RandomForest with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for RandomForest. Best metric score: 1.0, Best threshold: 0.362925453128371
[Runtime Status]: Optimizing model AdaBoost for theme management
--> [Model Selection Status]: Starting Optuna optimization for AdaBoost with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for AdaBoost. Best metric score: 1.0, Best threshold: 0.6379002915523491
[Runtim



--> [Model Selection Status]: Finished Optuna optimization for LogisticRegression. Best metric score: 1.0, Best threshold: 0.6133188665846248
[Runtime Status]: Optimizing model SVM for theme communication
--> [Model Selection Status]: Starting Optuna optimization for SVM with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for SVM. Best metric score: 1.0, Best threshold: 0.8888735541044995
[Runtime Status]: Model score exceeded the required threshold, stopping training for this theme
[Runtime Status]: Finished training for theme communication. Chose model: SVM
[Runtime Status]: Best model score on full data set: 1.0, Recall=1.0
[Runtime Status]: Optimal threshold: 0.8888735541044995
--> [Runtime Status]: Confusion matrix full tagged dataset: {'TP': 109, 'FP': 0, 'FN': 0, 'TN': 1199}

[Runtime Status]: Starting training for theme: office environment
[Runtime Status]: Optimizing model LogisticRegression for theme office environment
--> [Model Selection Status]: Star



--> [Model Selection Status]: Finished Optuna optimization for SVM. Best metric score: 1.0, Best threshold: 0.47772624668875396
[Runtime Status]: Optimizing model RandomForest for theme employee care and listening
--> [Model Selection Status]: Starting Optuna optimization for RandomForest with 50 trials.




--> [Model Selection Status]: Finished Optuna optimization for RandomForest. Best metric score: 1.0, Best threshold: 0.2628889407738741
[Runtime Status]: Optimizing model AdaBoost for theme employee care and listening
--> [Model Selection Status]: Starting Optuna optimization for AdaBoost with 50 trials.
--> [Model Selection Status]: Finished Optuna optimization for AdaBoost. Best metric score: 1.0, Best threshold: 0.5278951823786457
[Runtime Status]: Finished training for theme employee care and listening. Chose model: RandomForest
[Runtime Status]: Best model score on full data set: 1.0, Recall=0.18518518518518517
[Runtime Status]: Optimal threshold: 0.2628889407738741
--> [Runtime Status]: Confusion matrix full tagged dataset: {'TP': 10, 'FP': 0, 'FN': 44, 'TN': 1254}

[Runtime Status]: Starting training for theme: work process
[Runtime Status]: Skipping theme work process due to insufficient positive samples (less than 30 positive records).

[Runtime Status]: Starting training for 

### Total preformance

In [12]:
def Evaluate(tagged_data, results, text_column='Processed Text'):
    """
    Evaluate all themes using the trained models and calculate classification reports.

    Parameters:
    tagged_data (pd.DataFrame): The input DataFrame.
    results (dict): Dictionary containing the best models, classification reports, optimal thresholds, and vectorizer.
    text_column (str): The column containing the text data.

    Returns:
    pd.DataFrame: DataFrame containing detailed metrics and averages for each theme.
    """
    metrics = []
    tagged_data_b = _BinaryThemeColumns(tagged_data, themes_column='Themes')
    vectorizer = results['vectorizer']
    X = vectorizer.transform(tagged_data_b[text_column])
    for theme, model_packet in results.items():
        if theme == 'vectorizer':
            continue

        y_true = tagged_data_b[theme].copy()
        model = model_packet['model']
        optimal_threshold = model_packet['optimal_threshold']

        if model is None:
            y_pred = np.zeros(len(tagged_data_b), dtype=int)
        else:
            # Predict theme for all samples
            y_probs = model.predict_proba(X)[:, 1]
            y_pred = (y_probs >= optimal_threshold).astype(int)

        # Calculate confusion matrix and metrics
        tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
        precision = round(precision_score(y_true, y_pred, zero_division=0), 3)
        recall = round(recall_score(y_true, y_pred, zero_division=0), 3)
        accuracy = round(accuracy_score(y_true, y_pred), 3)
        f1 = round(f1_score(y_true, y_pred, zero_division=0), 3)
        support = sum(y_true)
        total_predictions = sum(y_pred)

        # Store metrics for the current theme
        theme_res = {
            'theme': theme,
            'TP': tp,
            'FP': fp,
            'FN': fn,
            'TN': tn,
            'precision': precision,
            'recall': recall,
            'accuracy': accuracy,
            'f1': f1,
            'support': support,
            'total predictions': total_predictions
        }
        metrics.append(theme_res)

    # Create DataFrame from metrics
    metrics_df = pd.DataFrame(metrics)
    metrics_df[['TP', 'FP', 'FN', 'TN']] = metrics_df[['TP', 'FP', 'FN', 'TN']].astype(int)

    # Calculate simple averages
    simple_avg = metrics_df[['precision', 'recall', 'accuracy', 'f1']].mean().to_dict()
    simple_avg['theme'] = 'Simple Average'
    simple_avg['support'] = metrics_df['support'].sum().astype(int)
    simple_avg['total predictions'] = metrics_df['total predictions'].sum().astype(int)

    # Calculate weighted averages
    weighted_avg = metrics_df.apply(lambda x: x[['precision', 'recall', 'accuracy', 'f1']] * x['support'], axis=1).sum() / metrics_df['support'].sum()
    weighted_avg = weighted_avg.to_dict()
    weighted_avg['theme'] = 'Weighted Average'
    weighted_avg['total predictions'] = f"{round(metrics_df['total predictions'].sum().astype(int)/ metrics_df['support'].sum().astype(int),2)*100}%"

    # Append averages to metrics using pd.concat
    metrics_df = pd.concat([metrics_df, pd.DataFrame([simple_avg]), pd.DataFrame([weighted_avg])], ignore_index=True)

    return metrics_df

In [16]:
Evaluate(tagged_data, training_results, text_column='Processed Text')

Unnamed: 0,theme,TP,FP,FN,TN,precision,recall,accuracy,f1,support,total predictions
0,company culture,18.0,0.0,76.0,1214.0,1.0,0.191,0.942,0.321,94.0,18
1,cross-functional collaboration,53.0,0.0,0.0,1255.0,1.0,1.0,1.0,1.0,53.0,53
2,within team collaboration,43.0,0.0,6.0,1259.0,1.0,0.878,0.995,0.935,49.0,43
3,recognition,67.0,0.0,0.0,1241.0,1.0,1.0,1.0,1.0,67.0,67
4,flexibility,113.0,0.0,1.0,1194.0,1.0,0.991,0.999,0.996,114.0,113
5,salary / compensation,36.0,0.0,31.0,1241.0,1.0,0.537,0.976,0.699,67.0,36
6,workload / work-life balance,125.0,0.0,0.0,1183.0,1.0,1.0,1.0,1.0,125.0,125
7,systems,189.0,0.0,0.0,1119.0,1.0,1.0,1.0,1.0,189.0,189
8,training,201.0,0.0,0.0,1107.0,1.0,1.0,1.0,1.0,201.0,201
9,management,20.0,0.0,57.0,1231.0,1.0,0.26,0.956,0.412,77.0,20


# Predict on New Data

In [18]:
def Predict(tagged_data, results, text_column='Processed Text'):
    """
    Predict themes for the input DataFrame using the trained models and add a 'Themes' column.

    Parameters:
    tagged_data (pd.DataFrame): The input DataFrame without theme columns.
    results (dict): Dictionary containing the best models, classification reports, optimal thresholds, and vectorizer.
    text_column (str): The column containing the text data.

    Returns:
    pd.DataFrame: DataFrame with an added 'Themes' column containing the predicted themes.
    """
    df = tagged_data.copy()
    df = df.reset_index(drop=True)
    vectorizer = results['vectorizer']
    X = vectorizer.transform(df[text_column])

    theme_predictions = []

    for index, row in df.iterrows():
        predicted_themes = []
        for theme, model_packet in results.items():
            if theme == 'vectorizer':
                continue

            model = model_packet['model']
            optimal_threshold = model_packet['optimal_threshold']

            if model is not None:
                # Predict probabilities
                y_probs = model.predict_proba(X[index])[0, 1]

                # Predict label based on optimal threshold
                y_pred = (y_probs >= optimal_threshold).astype(int)

                if y_pred == 1:
                    predicted_themes.append(theme)

        theme_predictions.append(", ".join(predicted_themes))

    df['Themes_Predicted'] = theme_predictions
    df.replace('', np.nan, inplace=True)
    return df

In [None]:
# Input does not have to be tagged, but must have a 'Processed Text' column
Predict(tagged_data, training_results, text_column='Processed Text')