In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import pickle
import string, re, copy
from tqdm import tqdm
from wordcloud import WordCloud

from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, DataCollatorWithPadding
import torch
from bertopic import BERTopic
from bertopic.vectorizers import ClassTfidfTransformer
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from umap import UMAP
from sentence_transformers import SentenceTransformer
from keybert import KeyBERT

torch.cuda.is_available()

True

In [75]:
df = pd.read_csv("../data/csv_files/full_schneider_schwartz_multilabels.csv")
df_preprocessed = df[df['non_empty_rows']]
# Extract documents from DataFrame
docs = df_preprocessed["processed_data"].astype(str).tolist()


Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.



In [92]:
# Define a dictionary mapping old values to new values for the "Zone" column
zone_values = {
    "China": "China & HK",
    "East Asia": "East Asia Japan"
}

# Define a dictionary mapping old values to new values for the "Clusters" column
cluster_values = {
    "Cluster KSAPYB": "Saudia Arabia & Yemen",
    "North Andean": "Andean Cluster",
    "Chile": "Andean Cluster",
    "South Andean": "Andean Cluster",
    "Turkey Central Asia": "Turkey Central Asia & Pakistan"
}

# Replace the values in the "Zone" column
df["Zone"] = df["Zone"].replace(zone_values)

# Replace the values in the "Clusters" column
df["Clusters"] = df["Clusters"].replace(cluster_values)

Unnamed: 0,Zone,Clusters,Account Country


In [3]:
from utils.schneider import countries_to_update, text_data_column, words_to_filter, replacements, ngrams_list, keybert_kwargs, more_stopwords, labels
from vocabulary.vocabulary import VocabularyCreator
from clustering.clustering import ClusteringMethod
from sklearn.feature_extraction.text import CountVectorizer
import nltk

model_name = "all-MiniLM-L6-v2"

keybert_kwargs={
    "top_n": 4,
    "use_mmr": True,
    "stop_words": "english",
    "keyphrase_ngram_range": (1, 1),
    "nr_candidates": 12
}

vocabulary_creator = VocabularyCreator(model_name,ngrams_list,**keybert_kwargs)
vocabulary_list = vocabulary_creator.keybert_vocabulary(df_preprocessed)
print(len(vocabulary_list))

11077


In [21]:
############################# STEP 2 - COMPUTE CUSTOM VOCABULARY ###############################

# vocabulary stored in cache to avoid compute it every time 
vocabulary_cache = {}
# embeddings stored in cache to avoid compute it every time 
embeddings_cache = {}

# list of the ngrams wanted for our data
ngrams_list = ['schneider electric','supply chain','solve problem','lead time','price performance ratio','price quality ratio','pro face','spare part',
           'product range','product line','user friendly','data sheets','allen bradley',
           'technical support','tehnical assistance','technical team','technical service',
           'delivery time','long time','delivery date','response time','quick response',
           'customer service','customer center','customer support','contact person','service support','call center','call back',
           'sales service','sales team','account manager','pre sales','sales representative',
           'circuit breakers','frequency converters','low voltage','touch panel','frequency inverters','push buttons','electrical components','control cabinets','touch screen','control systems','servo motors','power supply','speed drives',
           'alles tip top','alles tip','sinan chalabi']

def create_custom_vocabulary(docs, ngrams_list, model_name="all-MiniLM-L6-v2", **kwargs):
    """
    Create a custom vocabulary from a list of documents using KeyBERT.

    This function takes a list of documents, a list of custom n-grams, and an optional model name as input. It preprocesses the documents by replacing the custom n-grams with single tokens containing underscores. Then, it initializes a GPU-enabled SentenceTransformer model and uses KeyBERT to extract keywords from the preprocessed documents. The extracted keywords are postprocessed by replacing single tokens with the original n-grams and removing duplicates to create the custom vocabulary. The vocabulary is cached for future use and returned as a list of strings.

    :param docs: A list of input documents, where each document is a string.
    :param ngrams_list: A list of custom n-grams to be replaced with single tokens during preprocessing.
    :param model_name: An optional string specifying the name of the SentenceTransformer model to use. Defaults to "all-MiniLM-L6-v2".
    :param kwargs: Additional keyword arguments to be passed to the KeyBERT `extract_keywords` method.
    :return: A list of strings representing the custom vocabulary created from the input documents.
    """

    # Preprocess documents by replacing custom n-grams with single tokens with underscore "_"
    preprocessed_docs = []
    for doc in docs:
        for ngram in ngrams_list:
            if ngram in doc:
                ngram_underscore = ngram.replace(" ","_")
                doc = doc.replace(ngram, ngram_underscore)
        preprocessed_docs.append(doc)

    # Initialize a GPU-enabled SentenceTransformer model
    model = SentenceTransformer(model_name, device="cuda")

    # Create custom vocabulary with KeyBERT
    # Use the vocabulary in cache if it exists otherwise compute it and store it in cache
    cache_key = (hash(str(preprocessed_docs)), hash(str(ngrams_list)), hash(model_name), hash(str(kwargs)))
    if cache_key in vocabulary_cache:
        vocabulary = vocabulary_cache[cache_key]
        return vocabulary
    else:
        # Extract keywords
        kw_model = KeyBERT(model=model)
        keywords = kw_model.extract_keywords(preprocessed_docs, **kwargs)
        
        # Flatten the list of lists and remove duplicates to create the vocabulary
        vocabulary = list(set([word for sublist in keywords for word, score in sublist]))

        # Postprocess extracted keywords by replacing single tokens with original n-grams
        postprocessed_vocab = []
        for keyword in vocabulary:
            space_keyword = keyword.replace("_"," ")
            if space_keyword in ngrams_list:
                keyword = space_keyword
            postprocessed_vocab.append(keyword)

        vocabulary_cache[cache_key] = postprocessed_vocab
        return postprocessed_vocab

# NLTK English stopwords
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
# add stopwords to the list from nltk
more_stopwords = ['schneider electric','se','schneider','schneiders','also','xxx','xxxx','xxxxx','ok','okk','okay','ras','na','nil','none','mr','mrs','monsieur','thank','esther','ester','paulo','paolo','sadao','carlos','pereira','ken','benoit','sergio','catalina','cesar','rufo','moraleda','ferrer','guido','smekens','castelli','muiz','roberto','matteo','guerriera','mike','elena','isabel','jurrie','javier','anna','fernandez','reyes','cichinelli','inicio','incio','jos','fabio','canedo','mituo','eduardo','roberto','santos','inicio','silva','arnaldo','sgueglia','squeglia','sandrine','laroche','lavinia','salerno','fahler','rodriguez','perez','prieto','heleni','henri','henrique','henrik','sammy','gregoire','denis','thomas','divani','flavio','rosetti','fabbri','danilo','evandro','sahil','kundli','maggico','cindy','martin','gabrielsson','edoardo','martha','ponte','aponte','pinkowitz','cortese','nicole','gahner','maulady','ahmad','heidi','okino','wang','jason','james','rhandzi','cecil','went','goes','thanks','thank','alles tip','alles tip top','66666666666666','000000000000000','666666666','eng','particular','particularly','alles']
stopwords.extend(more_stopwords)

# Create vocabulary with KeyBERT using custom ngrams list
vocabulary = create_custom_vocabulary(docs, ngrams_list, top_n=4, use_mmr=True, stop_words=stopwords, keyphrase_ngram_range=(1,1), nr_candidates=12)
len(vocabulary)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cattiaux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


11464

In [5]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('english')
stopwords.extend(more_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/cattiaux/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [22]:
clustering = ClusteringMethod(model_name)

bertopic_kwargs={
    # "seed_topic_list": labels, # Create a new BERTopic model using our predefined labels as topics
    "nr_topics":"auto",
    "ctfidf_model": ClassTfidfTransformer(reduce_frequent_words=True),
    "representation_model": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.4)], # possible values : KeyBERTInspired() or MaximalMarginalRelevance(diversity=0.5)
    "top_n_words":10,
    "min_topic_size":70,
    "calculate_probabilities": True
}

bertopic_kwargs['vectorizer_model'] = CountVectorizer(
                    vocabulary=vocabulary, 
                    stop_words=stopwords, 
                    lowercase=True, 
                    ngram_range=(1, 3)
                )


topics, probs, topic_model, embeddings = clustering.run_bertopic(
    df= df_preprocessed,
    **bertopic_kwargs
    )

Batches:   0%|          | 0/1229 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av


divide by zero encountered in divide


divide by zero encountered in divide



In [54]:
topic_model.get_topic_freq().head(30)

# Get the top 10 keywords for each topic
for i in range(33):
    keywords = topic_model.get_topic(i)[:10]
    # print(f"Topic {i}: {keywords}")
    # # Only print the keywords, not the scores
    # print(f"Topic {i}: {', '.join([word for word, score in keywords])}")
    # Print the keywords and the scores formatted to two decimal places
    print(f"Topic {i}: {', '.join([f'{word} ({score:.2f})' for word, score in keywords])}")

Topic 0: sales (0.37), prices (0.36), products (0.35), delays (0.34), customers (0.33), deliveries (0.32), delivery time (0.31), brand (0.31), product (0.30), customer (0.30)
Topic 1: technical support (0.47), reliability (0.42), quality (0.40), customer service (0.39), reliable (0.38), sales service (0.37), products (0.36), customers (0.35), product (0.32), rating (0.32)
Topic 2: quick response (0.50), answered (0.44), answers (0.42), responses (0.42), precise (0.42), response (0.41), accurately (0.40), answer (0.39), easy (0.38), question (0.38)
Topic 3: technician (0.80), technicians (0.78), competent (0.65), tech (0.64), competence (0.59), techs (0.58), technical (0.56), technical support (0.53), skills (0.52), expertise (0.51)
Topic 4: quality (0.37), complaints (0.36), ten (0.33), issues (0.33), products (0.32), rate (0.32), brands (0.32), rating (0.29), product (0.28), nine (0.28)
Topic 5: troubleshooting (0.49), resolved (0.36), issue (0.32), workaround (0.32), defective (0.29)

In [25]:
topic_model.visualize_topics()

In [26]:
hierarchical_topics = topic_model.hierarchical_topics(docs)
topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)

100%|██████████| 32/32 [00:06<00:00,  5.13it/s]


In [45]:
# topic_model.visualize_barchart(top_n_topics=65)

In [44]:
def save_bertopic_model(topic_model, topics, probs, embeddings, docs, filename):
    """
    Save a BERTopic model and associated data to a file.
    
    :param topic_model: The BERTopic model to save.
    :param topics: The topics variable to save.
    :param probs: The probs variable to save.
    :param docs: The docs variable to save.
    :param filename: The name of the file to save the data to.
    """
    # Save the BERTopic model
    topic_model.save(filename)
    
    # Save the topics, probs, and docs variables
    with open(filename + '_data.pkl', 'wb') as f:
        pickle.dump((topics, probs, embeddings, docs), f)

def load_bertopic_model(filename):
    """
    Load a BERTopic model and associated data from a file.
    
    :param filename: The name of the file to load the data from.
    :return: A tuple containing the loaded BERTopic model, topics, probs, and docs variables.
    """
    # Load the BERTopic model
    topic_model = BERTopic.load(filename)
    
    # Load the topics, probs, and docs variables
    with open(filename + '_data.pkl', 'rb') as f:
        topics, probs, embeddings, docs = pickle.load(f)
    
    return topic_model, topics, probs, embeddings, docs

# Save the BERTopic model and associated data
# save_bertopic_model(topic_model, topics, probs, embeddings, docs, '../models/schneider_bertopic_model_2023')

# Load the BERTopic model and associated data
topic_model, topics, probs, embeddings, docs = load_bertopic_model('../models/schneider_bertopic_model_2023')

In [55]:
topics_to_merge = [
                    [0,1,4,8,11,20], # Product Quality and Sales
                    [9,14], # Inverters and Drives Support : about providing support and service for inverters and drives
                    [23,5,12,6,7], # Technical Support and Maintenance for UPS Systems and Touch Panels
                    [13,15], # Quotations and Offers Management
                    [29,27,16,22], # Customer Service and Delivery Scheduling
                    [2,3,10,17,25,21], # Technical Support and Problem Resolution
                    [24,18,30,31,19,32,28,26] # Communication, Guidance and Feedback
]

names = ["Product Quality & Sales",
    "Inverters & Drives Support",
    "UPS & Touch Panels", #Technical Support and Maintenance for UPS Systems and Touch Panels
    "Quotations & Offers", # Quotations and Offers Management
    "Customer Service and Delivery Scheduling",
    "Technical Support & Problem Resolution",
    "Communication, Guidance & Feedback"]

# Create a dictionary where the keys are the topics and the values are the custom labels
topic_labels_dict = {}
topic_labels_dict[-1]="Outliers"
for i in range(len(topics_to_merge)):
    for topic in topics_to_merge[i]:
        topic_labels_dict[topic] = names[i]

In [56]:
topic_model_merged = copy.deepcopy(topic_model)
topic_model_merged.set_topic_labels(topic_labels_dict)
topic_model_merged.merge_topics(docs, topics_to_merge)

topic_model_merged.visualize_barchart(top_n_topics=50, custom_labels=True)

RuntimeError: CUDA error: unknown error
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
# Define a dictionary mapping old column names to new column names
column_names = {
    "topic": "topic_old",
    "keywords": "keywords_old",
    "label": "label_old",
}

# Rename the columns
df.rename(columns=column_names, inplace=True)

In [None]:
# Get the topic and keywords
topic_keywords = {}
n = 10  # Number of keywords to include
for topic in topic_model.get_topics().keys():
    topic_words = [word for word, _ in topic_model.get_topic(topic)[:n]]
    topic_keywords[topic] = ", ".join(topic_words)

df['topic'] = topics
df['keywords'] = df['topic'].map(topic_keywords)

# Add the label and keywords columns to the dataframe
topic_labels_dict[-1] = 'Outlier'
df['label'] = df['topic'].map(topic_labels_dict)

# Get a list of the topics in the correct order, excluding topic -1
topic_order = [topic for topic in topic_model.get_topics().keys() if topic != -1]

# Create a dictionary where the keys are the topic labels and the values are the probabilities
proba_dict = []
for proba in probs:
    proba_dict.append(dict(sorted(zip(topic_order, proba), key=lambda item: item[1], reverse=True)))
    
# Add the proba_dict column to th