In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pickle, ast

from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, pipeline, DataCollatorWithPadding
import torch
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic

torch.cuda.is_available()

True

In [35]:
# Read the excel review file
df = pd.read_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_labelled.csv")
df2 = pd.read_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_sentiment_labelled.csv")
df3 = pd.read_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_emotions_labelled.csv")

# Put the data in the correct format for bertopic
docs = df3["allComment"].astype(str).tolist()


Columns (6,7,11,15,16,17,19,30,31,38,39,40,41) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (7,8,12,16,17,18,20,31,32,39,40,41,42) have mixed types. Specify dtype option on import or set low_memory=False.


Columns (8,9,13,17,18,19,21,32,33,40,41,42,43) have mixed types. Specify dtype option on import or set low_memory=False.



In [None]:
model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

classifier = pipeline('text-classification', model=model, tokenizer=tokenizer, max_length=512, truncation=True)

predictions = []

for review in [[x] for x in df.allComment.tolist()]:
    prediction = classifier(review)
    predictions.append(prediction)

res = pd.DataFrame([item for sublist in predictions for item in sublist])
df_pred = pd.concat([df, res], axis=1)

In [2]:
df2 = df_pred.copy()
# Rename the second 'label' column to 'sentiment_label'
cols = df2.columns.tolist()
cols[len(cols) - 1 - cols[::-1].index('label')] = 'sentiment_label'
df2.columns = cols
df2.columns
# df2.to_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_sentiment_labelled.csv")

  df2 = pd.read_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_sentiment_labelled.csv")


In [14]:
# Check if a CUDA-enabled GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_name = "SamLowe/roberta-base-go_emotions"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, problem_type="multi_label_classification", max_length=512)

# Move the model to the GPU
model = model.to(device)

# Set the batch size
batch_size = 2

# Create a list of label names
label_emotions = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring', 'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval', 'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief', 'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization', 'relief', 'remorse', 'sadness', 'surprise','neutral']

# Initialize lists to store the predicted labels and scores
predicted_labels = []
predicted_scores = []

df3 = df2
# Iterate over the rows of the DataFrame in batches
for i in range(0, len(df3), batch_size):
    batch = df3[i:i+batch_size]
    texts = batch['allComment'].tolist()
    inputs = tokenizer(texts, return_tensors="pt", padding=True, truncation=True)
    
    # Move the inputs to the GPU
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    outputs = model(**inputs)
    probs = outputs.logits.sigmoid().detach().cpu().numpy()
    
    # Apply a threshold to the probabilities to get the predicted labels
    threshold = 0.5
    labels = [[label_emotions[i] for i, prob in enumerate(prob_row) if prob > threshold] for prob_row in probs]
    
    # Store the predicted labels and scores
    predicted_labels.extend(labels)
    scores = [{label_emotions[i]: prob for i, prob in enumerate(prob_row)} for prob_row in probs]
    # predicted_scores.extend(probs.tolist())
    predicted_scores.extend(scores)

# Add the predicted labels and scores as new columns in the DataFrame
df3['predicted_labels'] = predicted_labels
df3['predicted_scores'] = predicted_scores

# Convert the string values in the 'predicted_labels' column into lists
df3['predicted_labels'] = df3['predicted_labels'].apply(lambda x: ast.literal_eval(x))
# Convert the string values in the 'predicted_scores' column into dictionaries
df3['predicted_scores'] = df3['predicted_scores'].apply(lambda x: ast.literal_eval(x))

# df3.to_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_emotions_labelled.csv")

In [23]:
df3.single_emotion_label.unique()

array(['admiration', 'disappointment', 'neutral', 'approval',
       'disapproval', 'gratitude', 'caring', 'realization', 'desire',
       'annoyance', 'confusion', 'surprise', 'joy', 'optimism',
       'nervousness', 'love', 'fear', 'curiosity', 'excitement',
       'amusement', 'relief', 'sadness', 'remorse', 'disgust',
       'embarrassment', 'anger'], dtype=object)

In [36]:
positive_emotions = ['admiration','approval','gratitude','caring','realization','joy','optimism','love','excitement','amusement','relief']
negative_emotions = ['disappointment','disapproval','annoyance','confusion','nervousness','fear','sadness','remorse','disgust','embarrassment','anger']
neutral_emotions = ['neutral','desire','surprise','curiosity']

# Convert the string values in the 'predicted_labels' column into lists
df3['predicted_labels'] = df3['predicted_labels'].apply(lambda x: ast.literal_eval(x))
# Convert the string values in the 'predicted_scores' column into dictionaries
df3['predicted_scores'] = df3['predicted_scores'].apply(lambda x: ast.literal_eval(x))

# Create a new column 'single_emotion_label' that contains the label with the highest score
df3['single_emotion_label'] = df3['predicted_scores'].apply(lambda x: max(x, key=x.get))
# Create a new column 'sentiment' that contains the sentiment of the emotion in the 'single_emotion_label' column
df3['single_sentiment_label'] = df3['single_emotion_label'].apply(lambda x: 'positive' if x in positive_emotions else ('negative' if x in negative_emotions else ('neutral' if x in neutral_emotions else 'unknown')))

# Use the explode() method to transform each element of the list into a row
new_df3 = df3.explode('predicted_labels')
new_df3 = new_df3.rename(columns={'predicted_labels': 'emotion_label'})

# Pour retourner au df initial avec une liste de label
# Group the rows by the original index and aggregate the 'emotion_label' column into a list
# original_df = new_df.groupby(new_df.index).agg({'emotion_label': list})
new_df3.tail()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Account Country,Front Office Country,Clusters,Zone,Operation,Primary Coverage Model,Account owner role,...,topic,keywords,label,proba_dict,sentiment_label,score,emotion_label,predicted_scores,single_emotion_label,single_sentiment_label
35800,35800,35800,55368,USA,USA,USA,US,North America Operations,,,...,-1,"customer service, delivery time, supplier, sal...",Outlier,"{51: 0.08053342011888666, 15: 0.07930872579322...",positive,0.654282,,"{'admiration': 0.023237899, 'amusement': 0.000...",neutral,neutral
35801,35801,35801,55369,Finland,Finland,Finland & Baltics,Nordic & Baltics,Europe Operations,,,...,0,"delivery time, delivery date, deliveries, deli...",Delivery Deadlines,"{0: 0.07563492692888851, 42: 0.036687726791776...",positive,0.958176,admiration,"{'admiration': 0.6631243, 'amusement': 0.00047...",admiration,positive
35802,35802,35802,55370,Finland,Finland,Finland & Baltics,Nordic & Baltics,Europe Operations,,,...,-1,"customer service, delivery time, supplier, sal...",Outlier,"{51: 0.109790295057112, 8: 0.04224633837231928...",positive,0.91531,admiration,"{'admiration': 0.50303435, 'amusement': 0.0004...",approval,positive
35802,35802,35802,55370,Finland,Finland,Finland & Baltics,Nordic & Baltics,Europe Operations,,,...,-1,"customer service, delivery time, supplier, sal...",Outlier,"{51: 0.109790295057112, 8: 0.04224633837231928...",positive,0.91531,approval,"{'admiration': 0.50303435, 'amusement': 0.0004...",approval,positive
35803,35803,35803,55371,China,China,China,China & HK,China & East Asia Operations,,,...,-1,"customer service, delivery time, supplier, sal...",Outlier,"{16: 0.04066095582105752, 14: 0.02977564802662...",neutral,0.694542,neutral,"{'admiration': 0.052684933, 'amusement': 0.001...",neutral,neutral


In [37]:
# Calculate the total number of rows for each sentiment in the 'sentiment_label' column
total_positive = len(df3[df3['sentiment_label'] == 'positive'])
total_negative = len(df3[df3['sentiment_label'] == 'negative'])
total_neutral = len(df3[df3['sentiment_label'] == 'neutral'])

# Calculate the number of matches for each sentiment
positive_matches = len(df3[(df3['sentiment_label'] == 'positive') & (df3['single_sentiment_label'] == 'positive')])
negative_matches = len(df3[(df3['sentiment_label'] == 'negative') & (df3['single_sentiment_label'] == 'negative')])
neutral_matches = len(df3[(df3['sentiment_label'] == 'neutral') & (df3['single_sentiment_label'] == 'neutral')])

# Calculate the percentage of matches for each sentiment
positive_match_percent = positive_matches / total_positive * 100 if total_positive > 0 else 0
negative_match_percent = negative_matches / total_negative * 100 if total_negative > 0 else 0
neutral_match_percent = neutral_matches / total_neutral * 100 if total_neutral > 0 else 0

# Calculate the number of differences for each sentiment
positive_differences = total_positive - positive_matches
negative_differences = total_negative - negative_matches
neutral_differences = total_neutral - neutral_matches

# Calculate the percentage of differences for each sentiment
positive_difference_percent = positive_differences / total_positive * 100 if total_positive > 0 else 0
negative_difference_percent = negative_differences / total_negative * 100 if total_negative > 0 else 0
neutral_difference_percent = neutral_differences / total_neutral * 100 if total_neutral > 0 else 0

# Calculate the distribution of differences for each sentiment
positive_to_negative = len(df3[(df3['sentiment_label'] == 'positive') & (df3['single_sentiment_label'] == 'negative')])
positive_to_neutral = len(df3[(df3['sentiment_label'] == 'positive') & (df3['single_sentiment_label'] == 'neutral')])
negative_to_positive = len(df3[(df3['sentiment_label'] == 'negative') & (df3['single_sentiment_label'] == 'positive')])
negative_to_neutral = len(df3[(df3['sentiment_label'] == 'negative') & (df3['single_sentiment_label'] == 'neutral')])
neutral_to_positive = len(df3[(df3['sentiment_label'] == 'neutral') & (df3['single_sentiment_label'] == 'positive')])
neutral_to_negative = len(df3[(df3['sentiment_label'] == 'neutral') & (df3['single_sentiment_label'] == 'negative')])

# Print the results
print(f"Positive matches: {positive_match_percent:.2f}%")
print(f"Negative matches: {negative_match_percent:.2f}%")
print(f"Neutral matches: {neutral_match_percent:.2f}%")
print(f"Positive differences: {positive_difference_percent:.2f}%")
print(f"Negative differences: {negative_difference_percent:.2f}%")
print(f"Neutral differences: {neutral_difference_percent:.2f}%")
print(f"Positive to Negative: {positive_to_negative / total_positive * 100:.2f}%" if total_positive > 0 else "Positive to Negative: N/A")
print(f"Positive to Neutral: {positive_to_neutral / total_positive * 100:.2f}%" if total_positive > 0 else "Positive to Neutral: N/A")
print(f"Negative to Positive: {negative_to_positive / total_negative * 100:.2f}%" if total_negative > 0 else "Negative to Positive: N/A")
print(f"Negative to Neutral: {negative_to_neutral / total_negative * 100:.2f}%" if total_negative > 0 else "Negative to Neutral: N/A")
print(f"Neutral to Positive: {neutral_to_positive / total_neutral * 100:.2f}%" if total_neutral > 0 else "Neutral to Positive: N/A")
print(f"Neutral to Negative: {neutral_to_negative / total_neutral * 100:.2f}%" if total_neutral > 0 else "Neutral to Negative: N/A")

# Filter the DataFrame to only include rows where the 'sentiment_label' is 'negative' and the 'single_sentiment_label' is 'neutral'
negative_to_neutral = df3[(df3['sentiment_label'] == 'negative') & (df3['single_sentiment_label'] == 'neutral')]

# Calculate the distribution of emotions in the 'single_emotion_label' column
emotion_distribution = negative_to_neutral['single_emotion_label'].value_counts(normalize=True) * 100

# Print the results
emotion_distribution

Positive matches: 74.55%
Negative matches: 33.00%
Neutral matches: 76.09%
Positive differences: 25.45%
Negative differences: 67.00%
Neutral differences: 23.91%
Positive to Negative: 2.13%
Positive to Neutral: 23.32%
Negative to Positive: 9.39%
Negative to Neutral: 57.61%
Neutral to Positive: 19.39%
Neutral to Negative: 4.52%


single_emotion_label
neutral      97.918474
desire        1.105811
curiosity     0.607112
surprise      0.368604
Name: proportion, dtype: float64

In [38]:
def load_bertopic_model(filename):
    """
    Load a BERTopic model and associated data from a file.
    
    :param filename: The name of the file to load the data from.
    :return: A tuple containing the loaded BERTopic model, topics, probs, and docs variables.
    """
    # Load the BERTopic model
    topic_model = BERTopic.load(filename)
    
    # Load the topics, probs, and docs variables
    with open(filename + '_data.pkl', 'rb') as f:
        topics, probs, embeddings, docs = pickle.load(f)
    
    return topic_model, topics, probs, embeddings, docs

topic_model, topics, probs, embeddings, docs = load_bertopic_model('raw_keybert_bertopic_model')

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [39]:
import copy

topics_to_merge = [ [42,3,0,13], #Delivery Deadlines : challenges and strategies involved in managing delivery deadlines in logistics operations. (vert)
                    [20,50,27], #Quotation and Pricing Strategies (vert bas)
                    [35,32], #Touch Panels and Screens (rouge, haut)
                    [40,36], #Frequency Converters : frequency converters used in industrial applications and the technical support provided by manufacturers and suppliers (rouge, suite)
                    [37,21,6,12,9,4,1,14,16,31,19], #“Automation Components” : hardware and software components used in industrial automation systems. (rouge centre)
                    [33,46,8], #Product Evaluation : evaluate the quality, affordability and reliability of products and services (rouge, fin)
                    [44,51,23,41,49,57,22], #Customer Support : Reliability and Quality in Customer Service and Support (bleu ciel)
                    [58,59], #Quick Customer Service (marron)
                    [38,10,26,52,39,43], #Problem Solving and Communication (focus on the importance of being efficient and precise when solving problems) (jaune)
                    [45,47,55,53,54], #Assistance and Guidance (noir)
                    [29,30,11,24], #Power Supply Issues (2e vert, haut)
                    [7,5,2,25,15,34,18,28,17], #Technical Support (2e vert, bas)
                    [48,56] #None : positive feedback (2e rouge)
]

# names = ["Delivery Deadlines",
#     "Pricing", #Quotation and Pricing Strategies
#     "Touch Screens", #Touch Panels and Screens
#     "Frequency Converters",
#     "Automation Components",
#     "Product Evaluation",
#     "Customer Support", #Reliability and Quality in Customer Service and Support
#     "Quick Customer Service",
#     "Problem Solving & Comm",
#     "Assistance", #Assistance and Guidance
#     "Power Supply Issues",
#     "Technical Support",
#     "positive feedback"]

# # Create a dictionary where the keys are the topics and the values are the custom labels
# topic_labels_dict = {}
# topic_labels_dict[-1]="Outliers"
# for i in range(len(topics_to_merge)):
#     for topic in topics_to_merge[i]:
#         topic_labels_dict[topic] = names[i]

topic_model_merged = copy.deepcopy(topic_model)
topic_model_merged.merge_topics(docs, topics_to_merge)
# topic_model_merged.set_topic_labels(topic_labels_dict)

# topic_model_merged.visualize_barchart(top_n_topics=50, custom_labels=True)

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [40]:
label_names = [
    "Outliers",
    "Automation Components",
    "Technical Support",
    "Delivery Deadlines",
    "Problem Solving & Comm",
    "Power Supply Issues",
    "Customer Support", #Reliability and Quality in Customer Service and Support
    "Product Evaluation",
    "Pricing", #Quotation and Pricing Strategies
    "Assistance", #Assistance and Guidance
    "Touch Screens", #Touch Panels and Screens
    "Frequency Converters",
    "Positive feedback",
    "Quick Customer Service"
    ]
mergedtopic_labels_dict = {i-1: item for i, item in enumerate(label_names)}

topic_model_merged.set_topic_labels(mergedtopic_labels_dict)

topic_model_merged.get_topic_info()

Unnamed: 0,Topic,Count,Name,CustomName,Representation,Representative_Docs
0,-1,15590,-1_customer service_delivery time_sales_techni...,Outliers,"[customer service, delivery time, sales, techn...",[it's impossible for me to recommend the last ...
1,0,6150,0_technical support_programmable logic control...,Automation Components,"[technical support, programmable logic control...","[in general, the products that we buy from you..."
2,1,4786,1_technical support_response time_customer ser...,Technical Support,"[technical support, response time, customer se...","[the other day, i received support for the fir..."
3,2,4044,2_delivery time_delays_delivery date_deliveries,Delivery Deadlines,"[delivery time, delays, delivery date, deliver...",[at schneider electric there is a lack of comm...
4,3,1079,3_answered_answers_precise_answer,Problem Solving & Comm,"[answered, answers, precise, answer, accuratel...",[because there is an accurate answer to the qu...
5,4,985,4_ups_uninterruptible power supply_technical s...,Power Supply Issues,"[ups, uninterruptible power supply, technical ...",[we are facing a problem and we have got it co...
6,5,825,5_technical support_reliability_quality_techni...,Customer Support,"[technical support, reliability, quality, tech...","[quality products, perfect service and technic..."
7,6,707,6_rating_quality_performance_reliability,Product Evaluation,"[rating, quality, performance, reliability, co...",[the reason for the rating is good quality and...
8,7,519,7_pricing_prices_discount_quotation,Pricing,"[pricing, prices, discount, quotation, orders,...",[i would not recommend it because i requested ...
9,8,352,8_responsiveness_responsively_responsive_feedback,Assistance,"[responsiveness, responsively, responsive, fee...","[responsiveness and advice, good advice, respo..."


In [41]:
# visualize the topic representation of major topics per class:
topics_per_sentiment = topic_model_merged.topics_per_class(df3["allComment"].astype(str).tolist(), classes=df3.sentiment_label.to_list())
topic_model_merged.visualize_topics_per_class(topics_per_sentiment, custom_labels=True)

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [21]:
# visualize the topic representation of major topics per class:
topics_per_emotion = topic_model_merged.topics_per_class(df3["allComment"].astype(str).tolist(), classes=df3.single_emotion_label.to_list())
topic_model_merged.visualize_topics_per_class(topics_per_emotion, top_n_topics=30, custom_labels=True)

'1.23.5'