In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import pickle
import string, re, copy
from tqdm import tqdm
from wordcloud import WordCloud

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, DataCollatorWithPadding
import torch
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

# In-memory cache of computed vocabularies, so they are not recomputed on every call
vocabulary_cache = {}
# In-memory cache of computed embeddings, so they are not recomputed on every call
embeddings_cache = {}

# Report whether a CUDA device is available to PyTorch
torch.cuda.is_available()

True

In [8]:
############################# STEP 1 - PREPROCESS DATA ######################################


# Read the Excel file containing the reviews
# NOTE(review): absolute, machine-specific path — consider a configurable data directory
df = pd.read_excel("/media/cattiaux/DATA/Wassati/team_data/schneider/OEM_Verbatims_base.xlsx", engine='openpyxl')

# Countries for which the original comment is copied into the translation column when the
# translation is missing (presumably accounts whose comments are already in English — confirm)
countries_to_update = ['Australia', 'Canada', 'Egypt','India','Ireland','Kuwait','Malaysia','New Zealand','Oman','Pakistan','Qatar','Saudi Arabia','Singapore','South Africa','USA','Taiwan','United Arab Emirates','United Kingdom']
# Translation columns that are concatenated into one comment per row
columns_to_join = ['Translation_Customer_Comments', 'Translation_Overall_Additional_Comments', 'Translation_Anything_Else_Comment', 'Translation_Reason_for_Score_Comment']
# Words that make a comment uninformative when the comment consists only of them
# (possibly repeated and mixed with punctuation); such rows are filtered out
words_to_filter = ['no','not','none','nil','ok','okay','ras','done','nothing','hi','all','thanks','well','everything','right','very','no problem','great','good','bad','excellent','particularly']

# Substring replacements applied to the lowercased comments, in insertion order
# NOTE(review): several entries map a term to a longer form of itself
# (e.g. 'circuit breaker' -> 'circuit breakers'); the replacement code must take care
# not to re-expand text that is already in the target form
replacements = {' se ': ' schneider electric ',
                'delivery dates': 'delivery date',
                'time delivery': 'delivery time',
                'delivered time': 'delivery time',
                'solved problem': 'solve problem',
                'solve problems': 'solve problem',
                'tech support': 'technical support',
                'circuit breaker': 'circuit breakers',
                'called back': 'call back',
                'lead times': 'lead time',
                'leadtime': 'lead time',
                'support service': 'service support',
                'sales representatives': 'sales representative',
                'service sales': 'sales service',
                'account managers': 'account manager',
                'power supplies': 'power supply',
                'touch panels': 'touch panel',
                'touch screens': 'touch screen',
                'frequency inverter': 'frequency inverters',
                'frequency converter': 'frequency converters',
                'servo motor': 'servo motors',
                'servomotor': 'servo motors',
                'servomotors': 'servo motors',
                'control system': 'control systems',
                'electrical component': 'electrical components',
                'control cabinet': 'control cabinets',
                'speed drive': 'speed drives',
                'push button': 'push buttons',
                'touchscreen': 'touch screen',
                'spare parts': 'spare part',
                'becuase': 'because',
                'xxxx': ''
                }

def filter_and_join_columns(df, columns_to_join, join_column_name, words_to_filter, sep):
    """
    Drop rows that hold an uninformative comment, then join the comment columns into one.

    A row is dropped when ANY of the specified columns holds a non-null value that, once its
    non-ASCII characters are removed, consists only of non-word characters (punctuation,
    whitespace) and/or occurrences of the words to filter (case-insensitive). For the remaining
    rows, the non-empty values of the specified columns are stripped and concatenated: ``sep``
    is inserted between two values unless the previous value already ends with a punctuation
    mark, in which case a single space is used. Rows whose joined text is empty are dropped.

    The input DataFrame is not modified; non-string cell values are converted with ``str``.

    :param df: A DataFrame to filter and join
    :type df: pandas.DataFrame
    :param columns_to_join: A list of column names to join
    :type columns_to_join: list of str
    :param join_column_name: Name of the column that receives the joined text
    :type join_column_name: str
    :param words_to_filter: A list of words to filter
    :type words_to_filter: list of str
    :param sep: A separator string to use between values when joining columns
    :type sep: str
    :return: A new DataFrame containing the filtered rows and the joined column
    :rtype: pandas.DataFrame
    """
    # Anchored pattern: optional non-word characters, then any number of filter words, each
    # followed by optional non-word characters. Written without nested ambiguous quantifiers so
    # that a long comment starting with many filter words cannot trigger exponential backtracking.
    word_alternatives = '|'.join(re.escape(word) for word in words_to_filter if word)
    if word_alternatives:
        noise_re = re.compile(r'^\W*(?:\b(?:' + word_alternatives + r')\b\W*)*$', re.IGNORECASE)
    else:
        noise_re = re.compile(r'^\W*$')

    def is_noise(value):
        # Null cells never disqualify a row
        if pd.isnull(value):
            return False
        # Non-ASCII characters are ignored for the purpose of the check only
        ascii_text = str(value).encode('ascii', 'ignore').decode('ascii')
        return noise_re.search(ascii_text) is not None

    def join_values(row_values):
        pieces = []
        for value in row_values:
            if pd.isnull(value):
                continue
            text = str(value).strip()
            if not text:
                continue
            if pieces:
                # Do not add the separator after a value that already ends with punctuation
                pieces.append(' ' if pieces[-1][-1] in string.punctuation else sep)
            pieces.append(text)
        return ''.join(pieces)

    # Plain tuples of the comment cells, one per row, in row order
    comment_rows = list(df[list(columns_to_join)].itertuples(index=False, name=None))

    # Positional selection works for empty frames and non-unique indexes alike
    kept_positions = [
        position for position, row_values in enumerate(comment_rows)
        if not any(is_noise(value) for value in row_values)
    ]
    # Explicit copy so that adding the joined column never writes into a view of `df`
    filtered_df = df.iloc[kept_positions].copy()

    joined = [join_values(comment_rows[position]) for position in kept_positions]
    filtered_df[join_column_name] = pd.Series(joined, index=filtered_df.index, dtype=object)

    # Keep only the rows that actually have some joined text
    return filtered_df.iloc[[position for position, text in enumerate(joined) if text]]

def preprocess_data(df, countries_to_update, columns_to_join, join_column_name, words_to_filter, replacements):
    """
    Preprocess a DataFrame of data.

    The function renames the comment columns for readability, adds the 'year' and 'id' columns,
    copies the original comment into the translation column (only where the translation is
    missing and the account country is in `countries_to_update`), filters and joins the comment
    columns into `join_column_name`, removes non-ASCII characters, and finally lowercases the
    joined comment and applies the word replacements. The input DataFrame is left unmodified.

    :param df: A DataFrame of input data.
    :param countries_to_update: A list of countries for which to update the translation columns.
    :param columns_to_join: A list of columns to join.
    :param join_column_name: Name of the column that receives the joined comment.
    :param words_to_filter: A list of words to filter.
    :param replacements: A dictionary mapping substrings to their replacement, applied in
        insertion order to the lowercased joined comment.
    :return: A modified DataFrame with additional columns, renamed columns, updated translation columns, and concatenated comment columns.
    """

    # Rename columns for readability. rename() returns a new DataFrame, so the columns added
    # below do not leak into the caller's frame.
    df = df.rename(columns={
        'Translation to English for: Customer Comments (edited)': 'Translation_Customer_Comments',
        'Customer Comments (edited)': 'Customer_Comments',
        'Translation to English for: Overall Additional Comments (edited)': 'Translation_Overall_Additional_Comments',
        'Overall Additional Comments (edited)': 'Overall_Additional_Comments',
        'Translation to English for: Anything else comment': 'Translation_Anything_Else_Comment',
        'Anything else comment': 'Anything_Else_Comment',
        'Translation to English for: Reason for score comment': 'Translation_Reason_for_Score_Comment',
        'Reason for score comment': 'Reason_for_Score_Comment'
    })

    # add some additional info as columns
    df['year'] = pd.DatetimeIndex(df["Creation Date"]).year
    df['id'] = range(len(df))

    # Original comment column -> translation column that receives it when the translation is missing
    translation_for = {
        'Customer_Comments': 'Translation_Customer_Comments',
        'Overall_Additional_Comments': 'Translation_Overall_Additional_Comments',
        'Anything_Else_Comment': 'Translation_Anything_Else_Comment',
        'Reason_for_Score_Comment': 'Translation_Reason_for_Score_Comment',
    }

    # Rows whose 'Account Country' is in the list of countries to update
    in_target_country = df['Account Country'].isin(countries_to_update)

    for original_col, translation_col in translation_for.items():
        has_original = (df[original_col] != '') & df[original_col].notnull()
        needs_copy = in_target_country & has_original & df[translation_col].isnull()
        df.loc[needs_copy, translation_col] = df.loc[needs_copy, original_col]

    # Filter the uninformative rows and concatenate all comment columns with English content
    df = filter_and_join_columns(df, columns_to_join, join_column_name, words_to_filter, '. ')

    # Drop comments made only of the punctuation added while joining.
    # Explicit copies keep the assignments below from writing into a slice.
    df = df[df[join_column_name].str.strip('. ').str.len() > 0].copy()

    # Filter out non-ASCII characters for comments that have both original language and translation in the same spot
    df[join_column_name] = df[join_column_name].str.encode('ascii', 'ignore').str.decode('ascii')
    df = df[df[join_column_name].str.len() > 0].copy()

    def replace_words(series):
        # Lowercase the documents
        series = series.str.lower()

        for old_word, new_word in replacements.items():
            if not old_word:
                # An empty key would match between every character
                continue
            # Do not replace an occurrence that is really the start of a longer key
            # ('servomotor' inside 'servomotors') or that is already in its target form
            # ('circuit breaker' inside 'circuit breakers'): a plain substring replacement
            # would produce 'servo motorss' / 'circuit breakerss'.
            protected_tails = {
                key[len(old_word):] for key in replacements
                if key != old_word and key.startswith(old_word)
            }
            if new_word != old_word and new_word.startswith(old_word):
                protected_tails.add(new_word[len(old_word):])
            pattern = re.escape(old_word)
            if protected_tails:
                pattern += '(?!' + '|'.join(re.escape(tail) for tail in sorted(protected_tails)) + ')'
            # A callable replacement keeps `new_word` literal (no backslash/group expansion)
            series = series.str.replace(pattern, lambda _match, _new=new_word: _new, regex=True)

        return series

    # Lowercase the joined comment and apply the replacements
    df[join_column_name] = replace_words(df[join_column_name])

    return df

def replace_words(docs, replacements):
    """
    Replace words in a list of documents.

    Each document is lowercased, then every entry of `replacements` is applied in insertion
    order. Unlike a plain substring replacement, an occurrence of a key is left alone when it
    is actually the beginning of a longer key (e.g. 'servomotor' inside 'servomotors') or when
    the text at that position is already the replacement (e.g. 'circuit breaker' inside
    'circuit breakers'). This makes the function safe to apply more than once and prevents
    outputs such as 'circuit breakerss' or 'servo motorss'.

    :param docs: A list of input documents, where each document is a string.
    :param replacements: A dictionary of word replacements, where the keys are the old words to be replaced and the values are the new words to replace them with. Empty keys are ignored.
    :return: A list of modified documents, where the old words have been replaced with the new words.
    """

    # Compile one guarded pattern per replacement, once for all documents
    compiled = []
    for old_word, new_word in replacements.items():
        if not old_word:
            # An empty key would match between every character
            continue
        # Continuations after which the key must NOT be replaced
        protected_tails = {
            key[len(old_word):] for key in replacements
            if key != old_word and key.startswith(old_word)
        }
        if new_word != old_word and new_word.startswith(old_word):
            protected_tails.add(new_word[len(old_word):])
        pattern = re.escape(old_word)
        if protected_tails:
            pattern += '(?!' + '|'.join(re.escape(tail) for tail in sorted(protected_tails)) + ')'
        compiled.append((re.compile(pattern), new_word))

    new_docs = []
    for doc in docs:
        doc = doc.lower()
        for matcher, new_word in compiled:
            # A callable replacement keeps `new_word` literal (no backslash/group expansion)
            doc = matcher.sub(lambda _match, _new=new_word: _new, doc)
        new_docs.append(doc)

    return new_docs


# Run the preprocessing: fill missing translations for the listed countries, filter out the
# uninformative comments and build the joined, lowercased "allComment" column
df = preprocess_data(df, countries_to_update, columns_to_join, "allComment", words_to_filter, replacements)

# Convert the comments to a list of strings, the input format used for BERTopic below
docs = df["allComment"].astype(str).tolist()

# Apply the replacements dictionary to the documents
# NOTE(review): preprocess_data already lowercases "allComment" and applies `replacements`
# to it, so this is a second pass over the same text — confirm it is intentional
docs = replace_words(docs, replacements)

In [None]:
############################# STEP 2 - COMPUTE CUSTOM VOCABULARY ###############################


# Multi-word expressions (n-grams) to keep together as single vocabulary terms
# NOTE(review): 'tehnical assistance' looks like a typo for 'technical assistance' —
# confirm whether it deliberately mirrors a spelling found in the data
ngrams_list = ['schneider electric','supply chain','solve problem','lead time','price performance ratio','price quality ratio','pro face','spare part',
           'product range','product line','user friendly','data sheets','allen bradley',
           'technical support','tehnical assistance','technical team','technical service',
           'delivery time','long time','delivery date','response time','quick response',
           'customer service','customer center','customer support','contact person','service support','call center','call back',
           'sales service','sales team','account manager','pre sales','sales representative',
           'circuit breakers','frequency converters','low voltage','touch panel','frequency inverters','push buttons','electrical components','control cabinets','touch screen','control systems','servo motors','power supply','speed drives',
           'alles tip top','alles tip','sinan chalabi']

def create_custom_vocabulary(docs, ngrams_list, model_name="all-MiniLM-L6-v2", **kwargs):
    """
    Create a custom vocabulary from a list of documents using KeyBERT.

    The documents are preprocessed by replacing every custom n-gram with a single token in which
    spaces become underscores. If a vocabulary was already computed for the same preprocessed
    documents, n-grams, model name and keyword arguments, the cached vocabulary is returned
    without loading any model. Otherwise a SentenceTransformer model is loaded (on the GPU when
    one is available, on the CPU otherwise), KeyBERT extracts keywords from the preprocessed
    documents, underscore tokens that correspond to a custom n-gram are converted back to the
    original n-gram, and the de-duplicated, sorted vocabulary is cached and returned.

    :param docs: A list of input documents, where each document is a string.
    :param ngrams_list: A list of custom n-grams to be replaced with single tokens during preprocessing.
    :param model_name: An optional string specifying the name of the SentenceTransformer model to use. Defaults to "all-MiniLM-L6-v2".
    :param kwargs: Additional keyword arguments to be passed to the KeyBERT `extract_keywords` method.
    :return: A sorted list of unique strings representing the custom vocabulary created from the input documents.
    """

    # Preprocess documents by replacing custom n-grams with single tokens with underscore "_"
    preprocessed_docs = []
    for doc in docs:
        for ngram in ngrams_list:
            if ngram in doc:
                doc = doc.replace(ngram, ngram.replace(" ", "_"))
        preprocessed_docs.append(doc)

    # Nothing to extract keywords from
    if not preprocessed_docs:
        return []

    # Look the vocabulary up in the cache BEFORE loading the model, so a cache hit costs nothing
    cache_key = (hash(str(preprocessed_docs)), hash(str(ngrams_list)), hash(model_name), hash(str(kwargs)))
    if cache_key in vocabulary_cache:
        # Return a copy so callers cannot alter the cached vocabulary
        return list(vocabulary_cache[cache_key])

    # Use the GPU when available, fall back to the CPU otherwise
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(model_name, device=device)

    # Extract keywords
    kw_model = KeyBERT(model=model)
    keywords = kw_model.extract_keywords(preprocessed_docs, **kwargs)
    # For a single document KeyBERT returns a flat list of (keyword, score) tuples
    # instead of one list per document
    if keywords and isinstance(keywords[0], tuple):
        keywords = [keywords]

    # Convert underscore tokens back to their n-gram and drop duplicates; duplicates must be
    # removed AFTER the conversion because a vectorizer vocabulary may not contain repeated terms
    known_ngrams = set(ngrams_list)
    unique_terms = set()
    for doc_keywords in keywords:
        for keyword, _score in doc_keywords:
            space_keyword = keyword.replace("_", " ")
            unique_terms.add(space_keyword if space_keyword in known_ngrams else keyword)

    # Sorted so that the vocabulary order is the same from one run to the next
    postprocessed_vocab = sorted(unique_terms)

    vocabulary_cache[cache_key] = postprocessed_vocab
    return list(postprocessed_vocab)

# Download (if needed) and load the NLTK English stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Project-specific stopwords added to the NLTK list (brand terms, filler tokens, placeholder
# strings and what appear to be people's names — TODO confirm)
more_stopwords = ['schneider electric','se','schneider','schneiders','also','xxx','xxxx','xxxxx','ok','okk','okay','ras','na','nil','none','mr','mrs','monsieur','thank','esther','ester','paulo','paolo','sadao','carlos','pereira','ken','benoit','sergio','catalina','cesar','rufo','moraleda','ferrer','guido','smekens','castelli','muiz','roberto','matteo','guerriera','mike','elena','isabel','jurrie','javier','anna','fernandez','reyes','cichinelli','inicio','incio','jos','fabio','canedo','mituo','eduardo','roberto','santos','inicio','silva','arnaldo','sgueglia','squeglia','sandrine','laroche','lavinia','salerno','fahler','rodriguez','perez','prieto','heleni','henri','henrique','henrik','sammy','gregoire','denis','thomas','divani','flavio','rosetti','fabbri','danilo','evandro','sahil','kundli','maggico','cindy','martin','gabrielsson','edoardo','martha','ponte','aponte','pinkowitz','cortese','nicole','gahner','maulady','ahmad','heidi','okino','wang','jason','james','rhandzi','cecil','went','goes','thanks','thank','alles tip','alles tip top','66666666666666','000000000000000','666666666','eng','particular','particularly','alles']
stopwords.extend(more_stopwords)

# Create the vocabulary with KeyBERT using the custom n-grams list; the keyword arguments
# after `ngrams_list` are forwarded to KeyBERT's `extract_keywords`. Only single tokens are
# extracted (keyphrase_ngram_range=(1,1)); the custom n-grams are passed to KeyBERT as
# underscore-joined single tokens by create_custom_vocabulary.
vocabulary = create_custom_vocabulary(docs, ngrams_list, top_n=4, use_mmr=True, stop_words=stopwords, keyphrase_ngram_range=(1,1), nr_candidates=12)

In [None]:
############################# STEP 3 - RUN BERTOPIC MODEL ###############################


# Download (if needed) and load the NLTK English stopwords
# NOTE(review): this rebuilds the same stopword list as the STEP 2 cell — consider defining it once
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Project-specific stopwords added to the NLTK list
more_stopwords = ['schneider electric','se','schneider','schneiders','also','xxx','xxxx','xxxxx','ok','okk','okay','ras','na','nil','none','mr','mrs','monsieur','thank','esther','ester','paulo','paolo','sadao','carlos','pereira','ken','benoit','sergio','catalina','cesar','rufo','moraleda','ferrer','guido','smekens','castelli','muiz','roberto','matteo','guerriera','mike','elena','isabel','jurrie','javier','anna','fernandez','reyes','cichinelli','inicio','incio','jos','fabio','canedo','mituo','eduardo','roberto','santos','inicio','silva','arnaldo','sgueglia','squeglia','sandrine','laroche','lavinia','salerno','fahler','rodriguez','perez','prieto','heleni','henri','henrique','henrik','sammy','gregoire','denis','thomas','divani','flavio','rosetti','fabbri','danilo','evandro','sahil','kundli','maggico','cindy','martin','gabrielsson','edoardo','martha','ponte','aponte','pinkowitz','cortese','nicole','gahner','maulady','ahmad','heidi','okino','wang','jason','james','rhandzi','cecil','went','goes','thanks','thank','alles tip','alles tip top','66666666666666','000000000000000','666666666','eng','particular','particularly','alles']
stopwords.extend(more_stopwords)


def run_bertopic(docs, model_name="all-MiniLM-L6-v2", **kwargs):
    """
    Run BERTopic on a list of documents.

    The function loads a SentenceTransformer model (on the GPU when one is available, on the CPU
    otherwise), computes the document embeddings or reuses the ones cached for the same
    documents and model name, then fits a BERTopic model on the documents and embeddings.

    :param docs: A list of input documents, where each document is a string.
    :param model_name: An optional string specifying the name of the SentenceTransformer model to use. Defaults to "all-MiniLM-L6-v2".
    :param kwargs: Additional keyword arguments to be passed to the BERTopic constructor.
    :return: A tuple containing three elements: the topics and the probabilities returned by `fit_transform`, and the fitted BERTopic model.
    """

    # The model is needed in both cases: to compute the embeddings on a cache miss, and as the
    # `embedding_model` of BERTopic in every case
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sentence_model = SentenceTransformer(model_name, device=device)

    # Reuse the cached embeddings when available, otherwise compute and cache them
    cache_key = (hash(str(docs)), hash(model_name))
    if cache_key in embeddings_cache:
        embeddings = embeddings_cache[cache_key]
    else:
        embeddings = sentence_model.encode(docs, show_progress_bar=True)
        embeddings_cache[cache_key] = embeddings

    # Run BERTopic
    topic_model = BERTopic(embedding_model=sentence_model, **kwargs)
    topics, probs = topic_model.fit_transform(docs, embeddings)

    return topics, probs, topic_model

# Prepare sub-models
# Vectorizer restricted to the custom vocabulary; unigrams and bigrams are counted
custom_vectorizer_model = CountVectorizer(vocabulary=vocabulary, stop_words=stopwords, lowercase=True, ngram_range=(1, 2))
# custom_vectorizer_model.fit(docs)
# Representation models
# NOTE(review): `representation_model` and `representation_model_MMR` are not referenced in the
# visible code below; only `representation_models` is passed to run_bertopic
representation_model = KeyBERTInspired()
representation_model_MMR = MaximalMarginalRelevance(diversity=0.5) # Diversify topic representation
representation_models = [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)] 
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# Our set of predefined labels, one list of seed words per expected topic (passed as `seed_topic_list`)
# NOTE(review): 'sensor' appears twice in the first list, and 'control cabinet' is rewritten to
# 'control cabinets' by `replacements` — confirm the seed words match the preprocessed text
labels = [['product','features','performance','software','framework','touch screen','sensor','drive','servo motors','license','manual','cad','sensor','eocr','pro face','proface','atv','converter','inverter','frequency converters','tcp','modbus','vijeo','gp','control cabinet','cupboard'],
['pricing','price','offers','orders','payment','invoice','quote','quotation'],
['quality','reliability','warranty','repair','maintenance','incident','fault','complication','issue'],
['delivery','logistic','delay','stock','shipping','lead time','deadline','package','ddt'],
['customer support','feedback','communication','technical support','technician','customer service','assistance','support','chat','email','telephone','ticket','attendance','interlocutor','operator','response','resolution','intervention','help','guidance','query','inquiry','answer','solve','experience','uncomfy','inconvenient','agility','competence','professionalism','explanation','remark','improvement'],
['business relation','factories','supplier','cooperation','partnership','oem','business','commercial','siemens'],
['reputation','public perception','brand image','gratitude','reliability']]

# Fit BERTopic on the documents; every keyword argument after `docs` is forwarded to the BERTopic constructor
topics, probs, topic_model = run_bertopic(docs, vectorizer_model=custom_vectorizer_model, ctfidf_model=ctfidf_model, representation_model=representation_models, calculate_probabilities=False, top_n_words=20, min_topic_size=50, seed_topic_list=labels)

In [None]:
# Display the first 30 rows of the topic frequency table
topic_model.get_topic_freq().head(30)

In [9]:
# Read the Excel file containing the reviews
# NOTE(review): this cell repeats the configuration of the STEP 1 cell (same file, same lists)
# and extends `replacements` with PLC / UPS entries — consider keeping a single definition
df = pd.read_excel("/media/cattiaux/DATA/Wassati/team_data/schneider/OEM_Verbatims_base.xlsx", engine='openpyxl')

# Countries for which the original comment is copied into the translation column when the
# translation is missing
countries_to_update = ['Australia', 'Canada', 'Egypt','India','Ireland','Kuwait','Malaysia','New Zealand','Oman','Pakistan','Qatar','Saudi Arabia','Singapore','South Africa','USA','Taiwan','United Arab Emirates','United Kingdom']
# Translation columns that are concatenated into one comment per row
columns_to_join = ['Translation_Customer_Comments', 'Translation_Overall_Additional_Comments', 'Translation_Anything_Else_Comment', 'Translation_Reason_for_Score_Comment']
# Words that make a comment uninformative when the comment consists only of them
words_to_filter = ['no','not','none','nil','ok','okay','ras','done','nothing','hi','all','thanks','well','everything','right','very','no problem','great','good','bad','excellent','particularly']

# Substring replacements applied to the lowercased comments, in insertion order
# (' plcs' is listed before ' plc' so that the longer form is handled first)
replacements = {' se ': ' schneider electric ',
                'delivery dates': 'delivery date',
                'time delivery': 'delivery time',
                'delivered time': 'delivery time',
                'solved problem': 'solve problem',
                'solve problems': 'solve problem',
                'tech support': 'technical support',
                'circuit breaker': 'circuit breakers',
                'called back': 'call back',
                'lead times': 'lead time',
                'leadtime': 'lead time',
                'support service': 'service support',
                'sales representatives': 'sales representative',
                'service sales': 'sales service',
                'account managers': 'account manager',
                'power supplies': 'power supply',
                'touch panels': 'touch panel',
                'touch screens': 'touch screen',
                'frequency inverter': 'frequency inverters',
                'frequency converter': 'frequency converters',
                'servo motor': 'servo motors',
                'servomotor': 'servo motors',
                'servomotors': 'servo motors',
                'control system': 'control systems',
                'electrical component': 'electrical components',
                'control cabinet': 'control cabinets',
                'speed drive': 'speed drives',
                'push button': 'push buttons',
                'touchscreen': 'touch screen',
                'spare parts': 'spare part',
                ' plcs': ' programmable logic controller',
                ' plc': ' programmable logic controller',
                # ' apc': 'american power conversion',
                ' upss': ' uninterruptible power supply',
                'becuase': 'because',
                'xxxx': ''
                }

# Multi-word expressions (n-grams) to keep together as single vocabulary terms
# NOTE(review): 'tehnical assistance' looks like a typo for 'technical assistance' —
# confirm whether it deliberately mirrors a spelling found in the data
ngrams_list = ['schneider electric','supply chain','solve problem','lead time','price performance ratio','price quality ratio','pro face','spare part','programmable logic controller', 'uninterruptible power supply', #'american power conversion',
           'product range','product line','user friendly','data sheets','allen bradley',
           'technical support','tehnical assistance','technical team','technical service',
           'delivery time','long time','delivery date','response time','quick response',
           'customer service','customer center','customer support','contact person','service support','call center','call back',
           'sales service','sales team','account manager','pre sales','sales representative',
           'circuit breakers','frequency converters','low voltage','touch panel','frequency inverters','push buttons','electrical components','control cabinets','touch screen','control systems','servo motors','power supply','speed drives',
           'alles tip top','alles tip','sinan chalabi']

# Download (if needed) and load the NLTK English stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# Project-specific stopwords added to the NLTK list
more_stopwords = ['schneider electric','se','schneider','schneiders','also','xxx','xxxx','xxxxx','ok','okk','okay','ras','na','nil','none','mr','mrs','monsieur','thank','esther','ester','paulo','paolo','sadao','carlos','pereira','ken','benoit','sergio','catalina','cesar','rufo','moraleda','ferrer','guido','smekens','castelli','muiz','roberto','matteo','guerriera','mike','elena','isabel','jurrie','javier','anna','fernandez','reyes','cichinelli','inicio','incio','jos','fabio','canedo','mituo','eduardo','roberto','santos','inicio','silva','arnaldo','sgueglia','squeglia','sandrine','laroche','lavinia','salerno','fahler','rodriguez','perez','prieto','heleni','henri','henrique','henrik','sammy','gregoire','denis','thomas','divani','flavio','rosetti','fabbri','danilo','evandro','sahil','kundli','maggico','cindy','martin','gabrielsson','edoardo','martha','ponte','aponte','pinkowitz','cortese','nicole','gahner','maulady','ahmad','heidi','okino','wang','jason','james','rhandzi','cecil','went','goes','thanks','thank','alles tip','alles tip top','66666666666666','000000000000000','666666666','eng','particular','particularly','alles']
stopwords.extend(more_stopwords)

# def preprocess_data(df, countries_to_update):
#     """
#     Preprocess a DataFrame of data.

#     This function takes a DataFrame of data and a list of countries to update as input. It adds additional columns to the DataFrame, renames some columns for readability, updates the translation columns for specific rows, and concatenates all comment columns with English content. The modified DataFrame is then returned.

#     :param df: A DataFrame of input data.
#     :param countries_to_update: A list of countries for which to update the translation columns.
#     :return: A modified DataFrame with additional columns, renamed columns, updated translation columns, and concatenated comment columns.
#     """

#     # add some additional info as columns
#     df['year'] = pd.DatetimeIndex(df["Creation Date"]).year
#     df['id'] = range(len(df))
    
#     # Rename columns for readability
#     df = df.rename(columns={
#         'Translation to English for: Customer Comments (edited)': 'Translation_Customer_Comments',
#         'Customer Comments (edited)': 'Customer_Comments',
#         'Translation to English for: Overall Additional Comments (edited)': 'Translation_Overall_Additional_Comments',
#         'Overall Additional Comments (edited)': 'Overall_Additional_Comments',
#         'Translation to English for: Anything else comment': 'Translation_Anything_Else_Comment',
#         'Anything else comment': 'Anything_Else_Comment',
#         'Translation to English for: Reason for score comment': 'Translation_Reason_for_Score_Comment',
#         'Reason for score comment': 'Reason_for_Score_Comment'
#     })

#     # Create a mask to filter the rows where the 'Account Countries' column is in the list of countries to update
#     mask1 = df['Account Country'].isin(countries_to_update)

#     # Update the translation columns only for the rows where the mask1 is True
#     mask2 = ((df['Customer_Comments'] != '') & df['Customer_Comments'].notnull()) & df['Translation_Customer_Comments'].isnull()
#     mask = mask1 & mask2
#     df.loc[mask, 'Translation_Customer_Comments'] = df.loc[mask, 'Customer_Comments']

#     mask2 = ((df['Overall_Additional_Comments'] != '') & df['Overall_Additional_Comments'].notnull()) & df['Translation_Overall_Additional_Comments'].isnull()
#     mask = mask1 & mask2
#     df.loc[mask, 'Translation_Overall_Additional_Comments'] = df.loc[mask, 'Overall_Additional_Comments']

#     mask2 = ((df['Anything_Else_Comment'] != '') & df['Anything_Else_Comment'].notnull()) & df['Translation_Anything_Else_Comment'].isnull()
#     mask = mask1 & mask2
#     df.loc[mask, 'Translation_Anything_Else_Comment'] = df.loc[mask, 'Anything_Else_Comment']

#     mask2 = ((df['Reason_for_Score_Comment'] != '') & df['Reason_for_Score_Comment'].notnull()) & df['Translation_Reason_for_Score_Comment'].isnull()
#     mask = mask1 & mask2
#     df.loc[mask, 'Translation_Reason_for_Score_Comment'] = df.loc[mask, 'Reason_for_Score_Comment']

#     # Concatenate all comment column with english content
#     df['allComment'] = df['Translation_Customer_Comments'].fillna('').astype(str) + ". " + df['Translation_Overall_Additional_Comments'].fillna('').astype(str) + ". " + df['Translation_Anything_Else_Comment'].fillna('').astype(str) + ". " + df['Translation_Reason_for_Score_Comment'].fillna('').astype(str)

#     # Remove punctuation added to join the comment column in order to filter empty comment
#     df = df[df['allComment'].str.strip('. ').str.len() > 0]

#     # Filter out non-ASCII characters for comments that have both original language and translation in the same spot
#     df['allComment'] = df['allComment'].str.encode('ascii', 'ignore').str.decode('ascii') 
#     df = df[df['allComment'].str.len() > 0]

#     return df

# def replace_words(docs, replacements):
#     """
#     Replace words in a list of documents.

#     This function takes a list of documents and a dictionary of word replacements as input. It iterates over the input documents, lowercases them, and replaces the old words with the new words specified in the replacements dictionary. The modified documents are then returned as a new list.

#     :param docs: A list of input documents, where each document is a string.
#     :param replacements: A dictionary of word replacements, where the keys are the old words to be replaced and the values are the new words to replace them with.
#     :return: A list of modified documents, where the old words have been replaced with the new words.
#     """
    
#     # Initialize an empty list to store the modified documents
#     new_docs = []

#     # Iterate over the input documents
#     for doc in docs:
#         # Lowercase the document
#         doc = doc.lower()
#         # Iterate over the words replacements
#         for old_word, new_word in replacements.items():
#             # Replace the old word with the new word in the document
#             doc = doc.replace(old_word, new_word)

#         # Add the modified document to the list of new documents
#         new_docs.append(doc)

#     return new_docs

def filter_and_join_columns(df, columns_to_join, join_column_name, words_to_filter, sep):
    """
    Drop low-information rows, then join the selected text columns into one new column.

    A row is removed when ANY of ``columns_to_join`` holds a non-null value that, once its
    non-ASCII characters are stripped, contains only whitespace/punctuation or only
    occurrences of ``words_to_filter`` (case-insensitive, possibly mixed with punctuation).
    For the surviving rows, the non-empty values of ``columns_to_join`` are concatenated in
    column order: ``sep`` is inserted when the text accumulated so far does not already end
    with a punctuation mark, a single space otherwise. Rows whose joined text is empty
    (for example when every column is null) are removed as well.

    The input DataFrame is never modified; a filtered copy is returned.

    :param df: A DataFrame to filter and join
    :type df: pandas.DataFrame
    :param columns_to_join: A list of column names to join
    :type columns_to_join: list of str
    :param join_column_name: Name of the column that receives the joined text
    :type join_column_name: str
    :param words_to_filter: A list of words to filter
    :type words_to_filter: list of str
    :param sep: A separator string to use between values when joining columns
    :type sep: str
    :return: A DataFrame containing the filtered rows plus the ``join_column_name`` column
    :rtype: pandas.DataFrame
    """
    # Compiled once instead of once per cell. First alternative: only punctuation/whitespace
    # (also matches the empty string). Second alternative: only filler words, possibly
    # surrounded by punctuation.
    filler_regex = re.compile(
        r'^\s*[\W\s]*\s*$|^\s*(\W*\b(' + '|'.join(re.escape(word) for word in words_to_filter) + r')\b\W*)+\s*$',
        re.IGNORECASE,
    )

    def filter_rows(row):
        """Return False when any joined column of this row holds only filler content."""
        for col in columns_to_join:
            value = row[col]
            if pd.notnull(value):
                # str() keeps numeric cells from crashing on .encode(); non-ASCII characters
                # are ignored for the test only (the stored value is left untouched).
                ascii_text = str(value).encode('ascii', 'ignore').decode('ascii')
                if filler_regex.search(ascii_text):
                    return False
        return True

    def join_columns(row):
        """Concatenate the non-empty values of this row, adding ``sep`` where needed."""
        joined = ''
        for col in columns_to_join:
            value = row[col]
            if pd.isnull(value):
                continue
            # Strip each piece so trailing whitespace cannot hide a final punctuation mark
            # (which used to yield "text. . next") or produce "text . next".
            text = str(value).strip()
            if not text:
                continue
            if joined:
                joined += ' ' if joined[-1] in string.punctuation else sep
            joined += text
        return joined

    # On an empty frame, DataFrame.apply(axis=1) does not return a boolean Series, so skip it.
    if df.empty:
        filtered_df = df.copy()
    else:
        # .copy() makes the result independent of `df`, so adding a column below neither
        # raises SettingWithCopyWarning nor touches the caller's data.
        filtered_df = df[df.apply(filter_rows, axis=1)].copy()

    if filtered_df.empty:
        # Nothing left to join: still expose the joined column so callers can rely on it.
        filtered_df[join_column_name] = ""
        return filtered_df

    filtered_df[join_column_name] = filtered_df.apply(join_columns, axis=1)

    # Return only the rows that ended up with some joined text
    return filtered_df[filtered_df[join_column_name].str.len() > 0]

def preprocess_data(df, countries_to_update, columns_to_join, join_column_name, words_to_filter, replacements):
    """
    Preprocess the raw review DataFrame into one cleaned comment column.

    Steps, in order:
      1. Rename the comment columns to short identifiers and add the ``year`` (taken from
         "Creation Date") and ``id`` (row position) columns.
      2. For rows whose "Account Country" is in ``countries_to_update``, copy the original
         comment into the matching translation column when the original is non-empty and the
         translation is missing.
      3. Filter and join ``columns_to_join`` into ``join_column_name`` through
         ``filter_and_join_columns`` (separator ``'. '``).
      4. Lowercase the joined text and apply the literal ``replacements``.

    The caller's DataFrame is left untouched: all changes are made on the frame returned by
    ``rename``.

    :param df: A DataFrame of input data.
    :param countries_to_update: A list of countries for which to update the translation columns.
    :param columns_to_join: A list of columns to join.
    :param join_column_name: Name of the column that receives the joined, cleaned comment.
    :param words_to_filter: A list of words to filter.
    :param replacements: A dictionary mapping substrings to replace (keys) to their replacement (values); applied literally, after lowercasing.
    :return: A modified DataFrame with additional columns, renamed columns, updated translation columns, and concatenated comment columns.
    """

    # Rename columns for readability. rename() returns a new DataFrame, so doing it first
    # means the columns added below do not leak into the caller's frame.
    df = df.rename(columns={
        'Translation to English for: Customer Comments (edited)': 'Translation_Customer_Comments',
        'Customer Comments (edited)': 'Customer_Comments',
        'Translation to English for: Overall Additional Comments (edited)': 'Translation_Overall_Additional_Comments',
        'Overall Additional Comments (edited)': 'Overall_Additional_Comments',
        'Translation to English for: Anything else comment': 'Translation_Anything_Else_Comment',
        'Anything else comment': 'Anything_Else_Comment',
        'Translation to English for: Reason for score comment': 'Translation_Reason_for_Score_Comment',
        'Reason for score comment': 'Reason_for_Score_Comment'
    })

    # Add some additional info as columns
    df['year'] = pd.DatetimeIndex(df["Creation Date"]).year
    df['id'] = range(len(df))

    # (original column, translation column) pairs that receive the same treatment
    comment_column_pairs = (
        ('Customer_Comments', 'Translation_Customer_Comments'),
        ('Overall_Additional_Comments', 'Translation_Overall_Additional_Comments'),
        ('Anything_Else_Comment', 'Translation_Anything_Else_Comment'),
        ('Reason_for_Score_Comment', 'Translation_Reason_for_Score_Comment'),
    )

    # Rows whose 'Account Country' is in the list of countries to update
    in_selected_country = df['Account Country'].isin(countries_to_update)

    for original_col, translation_col in comment_column_pairs:
        # Copy the original comment only where it is non-empty and no translation exists yet
        has_original = (df[original_col] != '') & df[original_col].notnull()
        needs_copy = in_selected_country & has_original & df[translation_col].isnull()
        df.loc[needs_copy, translation_col] = df.loc[needs_copy, original_col]

    # Filter out low-information rows and concatenate all comment columns with English content
    df = filter_and_join_columns(df, columns_to_join, join_column_name, words_to_filter, '. ')

    # Lowercase the joined comments, then apply every replacement as a literal substring swap
    comments = df[join_column_name].str.lower()
    for old_word, new_word in replacements.items():
        comments = comments.str.replace(old_word, new_word, regex=False)

    # assign() builds a new frame instead of writing into a filtered slice
    return df.assign(**{join_column_name: comments})

def create_custom_vocabulary(docs, ngrams_list, model_name="all-MiniLM-L6-v2", **kwargs):
    """
    Create a custom vocabulary from a list of documents using KeyBERT.

    Every custom n-gram found in a document is first rewritten as a single token (spaces
    replaced by underscores). KeyBERT then extracts keywords from the rewritten documents,
    the keywords are de-duplicated, and tokens that correspond to one of the custom n-grams
    are mapped back to their spaced form. Results are stored in the module-level
    ``vocabulary_cache``; the cache is consulted BEFORE any model is loaded, so a cache hit
    costs nothing.

    :param docs: A list of input documents, where each document is a string.
    :param ngrams_list: A list of custom n-grams to be replaced with single tokens during preprocessing.
    :param model_name: An optional string specifying the name of the SentenceTransformer model to use. Defaults to "all-MiniLM-L6-v2".
    :param kwargs: Additional keyword arguments to be passed to the KeyBERT `extract_keywords` method.
    :return: A list of unique strings (sorted before n-grams are restored, so the order is stable across runs) representing the custom vocabulary.
    """

    # Preprocess documents by replacing custom n-grams with single tokens with underscore "_"
    preprocessed_docs = []
    for doc in docs:
        for ngram in ngrams_list:
            if ngram in doc:
                doc = doc.replace(ngram, ngram.replace(" ", "_"))
        preprocessed_docs.append(doc)

    # Use the vocabulary in cache if it exists, before paying for a model load
    cache_key = (hash(str(preprocessed_docs)), hash(str(ngrams_list)), hash(model_name), hash(str(kwargs)))
    if cache_key in vocabulary_cache:
        return vocabulary_cache[cache_key]

    # Use the GPU when one is available, otherwise fall back to the CPU
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = SentenceTransformer(model_name, device=device)

    # Extract keywords
    kw_model = KeyBERT(model=model)
    keywords = kw_model.extract_keywords(preprocessed_docs, **kwargs)

    # NOTE(review): KeyBERT appears to return a flat list of (keyword, score) tuples when
    # only one document is given; normalise to one list per document. Confirm against the
    # installed KeyBERT version.
    if keywords and isinstance(keywords[0], tuple):
        keywords = [keywords]

    # Flatten and de-duplicate. Sorting makes the vocabulary order deterministic
    # (plain set iteration order of strings changes between interpreter runs).
    unique_keywords = sorted({word for doc_keywords in keywords for word, _score in doc_keywords})

    # Postprocess extracted keywords by replacing single tokens with original n-grams
    restored = []
    for keyword in unique_keywords:
        space_keyword = keyword.replace("_", " ")
        restored.append(space_keyword if space_keyword in ngrams_list else keyword)

    # Restoring n-grams can create duplicates, which CountVectorizer rejects in a
    # user-supplied vocabulary; drop them while keeping the order.
    postprocessed_vocab = list(dict.fromkeys(restored))

    vocabulary_cache[cache_key] = postprocessed_vocab
    return postprocessed_vocab

def run_bertopic(docs, model_name="all-MiniLM-L6-v2", **kwargs):
    """
    Run BERTopic on a list of documents.

    Document embeddings are computed with a SentenceTransformer model and stored in the
    module-level ``embeddings_cache`` (keyed on the documents and the model name), so a
    second call with the same inputs skips the encoding step. A new BERTopic model is then
    fitted on the documents and their embeddings.

    :param docs: A list of input documents, where each document is a string.
    :param model_name: An optional string specifying the name of the SentenceTransformer model to use. Defaults to "all-MiniLM-L6-v2".
    :param kwargs: Additional keyword arguments to be passed to the BERTopic constructor.
    :return: A tuple containing four elements: the topics and the probabilities returned by ``fit_transform``, the BERTopic model used and the embeddings.
    """

    # The sentence model is needed in every case (BERTopic receives it as embedding model),
    # so build it once. Use the GPU when available, otherwise fall back to the CPU.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    sentence_model = SentenceTransformer(model_name, device=device)

    # Extract embeddings, reusing the cached ones when the same docs/model were seen before
    cache_key = (hash(str(docs)), hash(model_name))
    if cache_key in embeddings_cache:
        embeddings = embeddings_cache[cache_key]
    else:
        embeddings = sentence_model.encode(docs, show_progress_bar=True)
        # store embeddings in cache
        embeddings_cache[cache_key] = embeddings

    # Run BERTopic
    topic_model = BERTopic(embedding_model=sentence_model, **kwargs)
    topics, probs = topic_model.fit_transform(docs, embeddings)

    return topics, probs, topic_model, embeddings

def bertopic_pipeline(df, countries_to_update, replacements, ngrams_list, stopwords,
            use_custom_vocab=True, keybert_kwargs=None, countvectorizer_kwargs=None, bertopic_kwargs=None):
    """
    Run a BERTopic pipeline on a DataFrame of data.

    The data is preprocessed with ``preprocess_data`` (translation columns completed for the
    given countries, comment columns filtered and joined into ``allComment``, text lowercased
    and ``replacements`` applied). If `use_custom_vocab` is `True`, a custom vocabulary is
    built with KeyBERT and handed to the `CountVectorizer` used by BERTopic; if it is `False`,
    the `CountVectorizer` is built from `countvectorizer_kwargs` alone. BERTopic is then run on
    the preprocessed documents.

    NOTE: ``columns_to_join`` and ``words_to_filter`` are read from module-level variables,
    they are not parameters of this function. The ``stopwords`` parameter is not used by the
    body; stop words reach KeyBERT / CountVectorizer only through the ``*_kwargs`` dictionaries.

    :param df: A DataFrame of input data.
    :param countries_to_update: A list of countries for which to update the translation columns.
    :param replacements: A dictionary of word replacements, where the keys are the old words to be replaced and the values are the new words to replace them with.
    :param ngrams_list: A list of custom n-grams to be used when creating the custom vocabulary.
    :param stopwords: A list of stopwords (currently unused inside the function, see note above).
    :param use_custom_vocab: A boolean value indicating whether to use a custom vocabulary when running BERTopic. Defaults to `True`.
    :param keybert_kwargs: A dictionary of keyword arguments to be passed to the `create_custom_vocabulary` function. `None` means no extra arguments.
    :param countvectorizer_kwargs: A dictionary of keyword arguments to be passed to the `CountVectorizer` constructor. `None` means no extra arguments.
    :param bertopic_kwargs: A dictionary of keyword arguments to be passed to the BERTopic constructor. `None` means no extra arguments.
    :return: A tuple containing five elements: a list of topics assigned to each input document, a matrix of topic probabilities for each input document, the BERTopic model used, the embeddings and the documents used.
    """

    # None defaults instead of shared mutable `{}` defaults; copies keep the caller's
    # dictionaries untouched.
    keybert_kwargs = dict(keybert_kwargs or {})
    vectorizer_kwargs = dict(countvectorizer_kwargs or {})
    bertopic_kwargs = dict(bertopic_kwargs or {})

    ### Step 1: Preprocess docs
    # Complete the translation columns for the selected (English-speaking) countries, then
    # filter, join and clean the comment columns into "allComment"
    df = preprocess_data(df, countries_to_update, columns_to_join, "allComment", words_to_filter, replacements)

    # Put the data in the correct format for bertopic
    docs = df["allComment"].astype(str).tolist()

    if use_custom_vocab:
        ### Step 2: Create vocabulary with KeyBERT using custom ngrams list
        vectorizer_kwargs["vocabulary"] = create_custom_vocabulary(docs, ngrams_list, **keybert_kwargs)

    ### Step 3: Run BERTopic
    # Both modes share the same call; only the vectorizer's vocabulary differs
    custom_vectorizer_model = CountVectorizer(**vectorizer_kwargs)
    topics, probs, topic_model, embeddings = run_bertopic(docs, vectorizer_model=custom_vectorizer_model, **bertopic_kwargs)

    return topics, probs, topic_model, embeddings, docs

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cattiaux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Vocabulary cache, so the vocabulary is not recomputed on every run (running this cell resets it)
vocabulary_cache = {}
# Embeddings cache, so the embeddings are not recomputed on every run (running this cell resets it)
embeddings_cache = {}

In [3]:
# Our set of predefined labels: one list of seed words per theme, passed to BERTopic below as `seed_topic_list`.
# The first word of each list names the theme (product, pricing, quality, delivery, customer support,
# technical support, response time, business relation).
# NOTE(review): 'sensor' appears twice in the first list.
labels = [['product','features','performance','software','framework','touch screen','touch panels','sensor','drive','servo motors','license','manual','cad','sensor','eocr','pro face','proface','atv','converter','inverter','frequency converters','tcp','modbus','vijeo','gp','control cabinet','cupboard','programmable logic controller','power supply'],
['pricing','price','offers','orders','payment','invoice','quote','quotation'],
['quality','reliability','complaint','issue','warranty','repair','maintenance','incident','fault','complication','rate','rating','inconvenient','remark','improvement','feedback'],
['delivery','delivery date','delivery time','delays','stock','shipping','lead time','deadline','package','ddt','schedule'],
['customer support','communication','customer service','assistance','support','chat','email','telephone','ticket','attendance','interlocutor','operator','help','experience','uncomfy','agility','professionalism'],
['technical support','technical','technician','guidance','intervention','competence','explanation','solve','query','inquiry','resolution','troubleshooting'],
['response time','correspondence','response','quick','answer'],
['business relation','factories','supplier','cooperation','partnership','oem','business','commercial','siemens']
# ['reputation','public perception','brand image','gratitude','reliability']
]

# Run the whole pipeline (preprocessing, KeyBERT vocabulary, BERTopic) with the custom vocabulary enabled.
# `df`, `countries_to_update`, `replacements`, `ngrams_list` and `stopwords` must already exist in the kernel.
topics, probs, topic_model, embeddings, docs = bertopic_pipeline(
    df,
    countries_to_update,
    replacements,
    ngrams_list,
    stopwords,
    use_custom_vocab=True,
    # Forwarded to KeyBERT's `extract_keywords` through `create_custom_vocabulary`
    keybert_kwargs={
        "top_n": 4,
        "use_mmr": True,
        "stop_words": stopwords,
        "keyphrase_ngram_range": (1, 1),
        "nr_candidates": 12
    },
    # Forwarded to the CountVectorizer constructor (the KeyBERT vocabulary is added by the pipeline)
    countvectorizer_kwargs={
        "stop_words": stopwords,
        "lowercase": True,
        "ngram_range": (1, 3)
    },
    # Disabled: `bertopic_pipeline` has no `umap_kwargs` parameter
    # umap_kwargs={
    #     "n_neighbors":50,
    #     "n_components":5,
    #     "min_dist":0.0,
    #     "metric":'cosine',
    #     "random_state":100
    # },
    # Forwarded to the BERTopic constructor
    bertopic_kwargs={
        "seed_topic_list":labels, # Create a new BERTopic model using our predefined labels as topics
        # "nr_topics":"auto",
        "ctfidf_model": ClassTfidfTransformer(reduce_frequent_words=True),
        "representation_model": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)], # possible values : KeyBERTInspired() or MaximalMarginalRelevance(diversity=0.5)
        "top_n_words":10,
        "min_topic_size":70,
        "calculate_probabilities": True
    }
)

Batches: 100%|██████████| 1119/1119 [01:03<00:00, 17.55it/s] 


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  idf = np.log((avg_nr_samples / df)+1)


In [7]:
len(docs)

35804

In [10]:
def save_bertopic_model(topic_model, topics, probs, embeddings, docs, filename):
    """
    Persist a BERTopic model together with the data produced when fitting it.

    The model is written to ``filename`` through its own ``save`` method; the remaining
    objects are pickled as a single ``(topics, probs, embeddings, docs)`` tuple into
    ``filename + '_data.pkl'``.

    :param topic_model: The BERTopic model to save.
    :param topics: The topics variable to save.
    :param probs: The probs variable to save.
    :param embeddings: The embeddings variable to save.
    :param docs: The docs variable to save.
    :param filename: The name of the file to save the model to (also the prefix of the data file).
    """
    # Model first, through BERTopic's own serialisation
    topic_model.save(filename)

    # Then the companion data, bundled in one tuple so it can be unpacked in the same order
    companion_data = (topics, probs, embeddings, docs)
    data_path = filename + '_data.pkl'
    with open(data_path, 'wb') as handle:
        pickle.dump(companion_data, handle)

def load_bertopic_model(filename):
    """
    Restore a BERTopic model and the companion data stored next to it.

    The model is read from ``filename`` with ``BERTopic.load``; the companion tuple is
    unpickled from ``filename + '_data.pkl'``.

    SECURITY: ``pickle.load`` can execute arbitrary code, so only load files you created
    yourself or otherwise trust.

    :param filename: The name of the file to load the model from (also the prefix of the data file).
    :return: A tuple containing the loaded BERTopic model, topics, probs, embeddings and docs variables.
    """
    # Model first, through BERTopic's own deserialisation
    restored_model = BERTopic.load(filename)

    # Then the companion data, stored as one (topics, probs, embeddings, docs) tuple
    data_path = filename + '_data.pkl'
    with open(data_path, 'rb') as handle:
        companion_data = pickle.load(handle)
    topics, probs, embeddings, docs = companion_data

    return restored_model, topics, probs, embeddings, docs

In [None]:
# Save the BERTopic model and associated data
save_bertopic_model(topic_model, topics, probs, embeddings, docs, 'raw_keybert_bertopic_model____')

In [11]:
# Load the BERTopic model and associated data
topic_model, topics, probs, embeddings, docs = load_bertopic_model('raw_keybert_bertopic_model')

In [None]:
# Visualize Term Score Decline
topic_model.visualize_term_rank() # log_scale=True

In [227]:
topic_model.get_topic_freq().head(30)

Unnamed: 0,Topic,Count
3,-1,15590
5,0,2214
15,1,1673
4,2,1333
13,3,1306
19,4,978
6,5,948
1,6,929
2,7,858
24,8,479


In [None]:
topic_model.visualize_topics()

In [None]:
# NOTE(review): `topic_model_merged` is only created in commented-out code further down this notebook
# (`copy.deepcopy(topic_model)` followed by `merge_topics`), so this cell fails on a fresh kernel
# unless those lines are restored and run first.
topic_model_merged.visualize_barchart(top_n_topics=65)

In [6]:
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 59/59 [00:21<00:00,  2.79it/s]


In [None]:
tree = topic_model.get_topic_tree(hierarchical_topics)
# print(tree)

In [12]:
# Groups of topic ids to merge. Each group is followed by the theme it represents; the colour
# in parentheses appears to be the branch colour of the group in the hierarchy plot.
topics_to_merge = [
    [42, 3, 0, 13],  # Delivery Deadlines: challenges and strategies involved in managing delivery deadlines in logistics operations (green)
    [20, 50, 27],  # Quotation and Pricing Strategies (green, bottom)
    [35, 32],  # Touch Panels and Screens (red, top)
    [40, 36],  # Frequency Converters: frequency converters used in industrial applications and the technical support provided by manufacturers and suppliers (red, continued)
    [37, 21, 6, 12, 9, 4, 1, 14, 16, 31, 19],  # "Automation Components": hardware and software components used in industrial automation systems (red, centre)
    [33, 46, 8],  # Product Evaluation: evaluate the quality, affordability and reliability of products and services (red, end)
    [44, 51, 23, 41, 49, 57, 22],  # Customer Support: Reliability and Quality in Customer Service and Support (sky blue)
    [58, 59],  # Quick Customer Service (brown)
    [38, 10, 26, 52, 39, 43],  # Problem Solving and Communication: focus on the importance of being efficient and precise when solving problems (yellow)
    [45, 47, 55, 53, 54],  # Assistance and Guidance (black)
    [29, 30, 11, 24],  # Power Supply Issues (2nd green, top)
    [7, 5, 2, 25, 15, 34, 18, 28, 17],  # Technical Support (2nd green, bottom)
    [48, 56],  # None: positive feedback (2nd red)
]

# Display name of each group, in the same order as `topics_to_merge`
names = [
    "Delivery Deadlines",
    "Pricing",  # Quotation and Pricing Strategies
    "Touch Screens",  # Touch Panels and Screens
    "Frequency Converters",
    "Automation Components",
    "Product Evaluation",
    "Customer Support",  # Reliability and Quality in Customer Service and Support
    "Quick Customer Service",
    "Problem Solving & Comm",
    "Assistance",  # Assistance and Guidance
    "Power Supply Issues",
    "Technical Support",
    "positive feedback",
]

# Map every topic id to the display name of its group; -1 is labelled separately
topic_labels_dict = {-1: "Outliers"}
for group_name, group_topics in zip(names, topics_to_merge):
    topic_labels_dict.update(dict.fromkeys(group_topics, group_name))

# topic_model_merged = copy.deepcopy(topic_model)
# topic_model_merged.set_topic_labels(topic_labels_dict)
# topic_model_merged.merge_topics(docs, topics_to_merge)

# topic_model_merged.visualize_barchart(top_n_topics=50, custom_labels=True)

In [159]:
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics, custom_labels=True)

100%|██████████| 59/59 [00:08<00:00,  6.85it/s]


In [226]:
# [0,13] :  management of delivery times, shipments, and deadlines, as well as the logistics of delivering orders from suppliers.


# Merge the keywords of several topics while keeping their order of importance:
# all rank-1 keywords first, then all rank-2 keywords, and so on.
# A dedicated name is used for the topic ids so the notebook-wide `topics` variable
# (the per-document topic assignments used by later cells) is not overwritten.
topics_to_compare = [0,13,3]
n = 10
topic_keywords = {}
for topic in topics_to_compare:
    topic_words = [word for word, _ in topic_model.get_topic(topic)[:n]]
    topic_keywords[topic] = topic_words

# zip() interleaves by rank and stops at the shortest keyword list, so a topic with fewer
# than `n` keywords no longer raises an IndexError.
merged_keywords = [
    word
    for same_rank_words in zip(*(topic_keywords[topic] for topic in topics_to_compare))
    for word in same_rank_words
]
", ".join(merged_keywords)

'delivery time, deliveries, delivery date, delivery date, delivery time, delivery time, deliveries, delays, delivery, delivery, delivering, deliveries, delays, delivery, delayed, deliver, purchasing, delays, shipments, deliver, deliver, deadlines, orders, postponed, arrive, logistics, deadlines, shipment, supplier, deadline'

In [None]:
topic_labels = topic_model.generate_topic_labels(nr_words=3, 
                                                 topic_prefix=False,
                                                 word_length=10,
                                                 separator=", ")
topic_model.set_topic_labels(topic_labels)


In [None]:
# Optimizing Label with 0shot
# Score the keywords of one topic against a list of candidate labels with a zero-shot classifier.

# NOTE(review): `pipeline` is already imported from transformers in the first cell of the notebook.
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# A selected topic representation (the keywords of topic 14, joined into one string)
sequence_to_classify =  ", ".join([word for word, _ in topic_model.get_topic(14)])

# Our set of potential topic labels
# These detailed label strings are only referenced by the commented-out `candidate_labels` below.
product = 'product quality : features and performance, license, software and framework, warranty and repairs, maintenance'
price = 'pricing and offers : price, orders, payment, invoice, quote'
quality = 'quality, warranty and repairs : complaint, issue, maintenance'
delivery = 'delivery : shipping, lead time, deadline, ddt, schedule, delay'
customer = 'customer support : customer service, communication, assistance, professionalism, response time'
tech = 'technical support : technician, intervention, guidance, competence, explanation, solve, query, inquiry, resolution, troubleshooting'
response = "response time : correspondence, response, quick, answer"
business = "business relation : collaboration, partnership, company, factories, supplier, cooperation, partnership, oem, business, commercial, siemens"
reputation = 'reputation : gratitude, public perception, brand image'

# Short labels actually passed to the classifier
candidate_labels = ['product, parts and components', 'pricing and offers', 'quality, warranty and repairs','delivery','customer support','business relations']
# candidate_labels = [product, price, delivery, customer, business, reputation]

classifier(sequence_to_classify, candidate_labels)

In [None]:
# Visualize the topic representation of major topics per class.
# NOTE(review): neither `model` nor a `Liked` column is defined in the visible part of this notebook
# (the fitted model is called `topic_model`) — confirm the intended names before running this cell.
topics_per_class = model.topics_per_class(docs, classes=df.Liked.to_list())
model.visualize_topics_per_class(topics_per_class, top_n_topics=14)

In [199]:
# Create a dictionary where the keys are the topics and the values are the probabilities
# proba_dict = []
# for proba in probs:
#     proba_dict.append(dict(zip(topic_model.get_topics().keys(), proba)))

# Get a list of the topics in the correct order, excluding topic -1
topic_order = [topic for topic in topic_model.get_topics().keys() if topic != -1]

# Create a dictionary where the keys are the topic labels and the values are the probabilities
proba_dict = []
for proba in probs:
    proba_dict.append(dict(zip(topic_order, proba)))
proba_dict[0]


{0: 0.006422966174473185,
 1: 0.0030636787080041564,
 2: 0.0016778322661592353,
 3: 0.0013679151056926239,
 4: 0.0027962226268213202,
 5: 0.0037184175416787916,
 6: 0.0019099200177096953,
 7: 0.004046900614063682,
 8: 0.0063088903410571376,
 9: 0.003265074051261276,
 10: 0.009237731021548958,
 11: 0.0024937312603352775,
 12: 0.0036314962643877284,
 13: 0.003955375042104501,
 14: 0.003204122371745636,
 15: 0.014746595146322117,
 16: 0.0030439382085932242,
 17: 0.004613907603554464,
 18: 0.010559578738295102,
 19: 0.006045203541753921,
 20: 0.0018021515961719272,
 21: 0.0031188798661248773,
 22: 0.0788126682704439,
 23: 0.008208460197756494,
 24: 0.003955768477742252,
 25: 0.006684707513072302,
 26: 0.012296267393307445,
 27: 0.007762271089481098,
 28: 0.010488798585732085,
 29: 0.007457265970370559,
 30: 0.006787875751524238,
 31: 0.0037245132137788467,
 32: 0.00469340609503844,
 33: 0.003920331056633325,
 34: 0.008189835807416938,
 35: 0.0015797374562505663,
 36: 0.0031085878232085685,

In [13]:
# Rebuild the preprocessed DataFrame from the raw Excel file, then attach to every row its topic,
# the topic keywords, the custom label and the per-topic probabilities.
# NOTE(review): `topics` and `probs` must hold one entry per row of the preprocessed frame, in the
# same order — this presumably holds when they come from a run on the same file; confirm.
df = pd.read_excel("/media/cattiaux/DATA/Wassati/team_data/schneider/OEM_Verbatims_base.xlsx", engine='openpyxl')
df = preprocess_data(df, countries_to_update, columns_to_join, "allComment", words_to_filter, replacements)

# Add the proba column to the dataframe
# df['proba'] = probs.tolist()

# Get the topic and keywords
topic_keywords = {}
n = 10  # Number of keywords to include
for topic in topic_model.get_topics().keys():
    topic_words = [word for word, _ in topic_model.get_topic(topic)[:n]]
    topic_keywords[topic] = ", ".join(topic_words)

df['topic'] = topics
df['keywords'] = df['topic'].map(topic_keywords)

# Add the label and keywords columns to the dataframe
# The label of topic -1 is overwritten here with 'Outlier'
topic_labels_dict[-1] = 'Outlier'
df['label'] = df['topic'].map(topic_labels_dict)

# Get a list of the topics in the correct order, excluding topic -1
topic_order = [topic for topic in topic_model.get_topics().keys() if topic != -1]

# Create one dictionary per document mapping each topic to its probability,
# sorted from the most probable topic to the least probable one
proba_dict = []
for proba in probs:
    proba_dict.append(dict(sorted(zip(topic_order, proba), key=lambda item: item[1], reverse=True)))
    
# Add the proba_dict column to the dataframe
df['proba_dict'] = proba_dict

# df.to_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_labelled.csv",header=True)
# Only the last expression of a cell is displayed, so `df.head(5)` produces no output here
df.head(5)
len(df)

35804

In [16]:
# Order the rows by the highest probability found in each row's `proba_dict`, largest first.
# Ties are broken by row position (later rows first) because the (probability, position)
# tuples are sorted in reverse.
res = df.iloc[[x[1] for x in sorted([(max(v.values()), i) for i, v in enumerate(df['proba_dict'])], reverse=True)]]
# Export only the rows assigned to topic -1, keeping that order
res[res["topic"] == -1].to_csv("outliers_15K.csv", header=True)

In [7]:
model_name="all-MiniLM-L6-v2"

outliers_df = df[df["topic"] == -1]
outliers_docs = outliers_df["allComment"].astype(str).tolist()

vocabulary = create_custom_vocabulary(outliers_docs, ngrams_list, top_n=4, use_mmr=True, stop_words=stopwords, keyphrase_ngram_range=(1,1), nr_candidates=12)
# Prepare sub-models
custom_vectorizer_model = CountVectorizer(vocabulary=vocabulary, stop_words=stopwords, lowercase=True, ngram_range=(1, 3))
representation_models = [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)] 
ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True)

# compute embeddings using GPU
sentence_model = SentenceTransformer(model_name, device="cuda")
embeddings = sentence_model.encode(outliers_docs, show_progress_bar=True)

Batches: 100%|██████████| 488/488 [00:31<00:00, 15.29it/s]


In [8]:
# compute embeddings using GPU
# NOTE(review): these two statements repeat the end of the previous cell; running both cells
# encodes `outliers_docs` twice.
sentence_model = SentenceTransformer(model_name, device="cuda")
embeddings = sentence_model.encode(outliers_docs, show_progress_bar=True)

Batches: 100%|██████████| 488/488 [00:32<00:00, 15.13it/s] 


In [None]:
    # Run BERTopic
# Fit a second BERTopic model on the outlier documents only, with the same settings as the main run.
# NOTE: this rebinds `topic_model`, `topics` and `probs`, replacing the objects of the main model.
topic_model = BERTopic(embedding_model=sentence_model,
                        vectorizer_model=custom_vectorizer_model,
                        seed_topic_list=labels,
                        ctfidf_model = ClassTfidfTransformer(reduce_frequent_words=True),
                        representation_model= [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)],
                        top_n_words=10,
                        min_topic_size=70,
                        calculate_probabilities=True)

topics, probs = topic_model.fit_transform(outliers_docs, embeddings)

In [18]:
print("outliers: ",len(outliers_df))
# All comments of the labelled DataFrame (not only the outliers)
txt = df["allComment"].astype(str).tolist()

# NOTE(review): `txt` covers every document of `df`, whereas the preceding cell of this notebook
# fits `topic_model` / `topics` on `outliers_docs` only. `reduce_outliers` presumably needs
# documents and topics of the same length and order — confirm which model and topics are
# meant to be in scope when this cell runs.
# Use the "c-TF-IDF" strategy with a threshold
new_topics = topic_model.reduce_outliers(txt, topics, strategy="c-tf-idf", threshold=0.1)
# Recompute the topic representations from the reassigned topics
topic_model.update_topics(txt, vectorizer_model=custom_vectorizer_model, topics=new_topics)
topic_model.get_topic_freq().head()

outliers:  15590


  idf = np.log((avg_nr_samples / df)+1)


Unnamed: 0,Topic,Count
6,-1,4705
5,0,2331
21,1,1971
4,2,1502
30,4,1457


In [17]:
hierarchical_topics_reduced = topic_model.hierarchical_topics(txt)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics_reduced, custom_labels=True)

100%|██████████| 59/59 [00:00<00:00, 59.88it/s]
