In [12]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

# df = pd.read_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_all_labelled.csv", dtype={'year': str})

  df = pd.read_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_all_labelled.csv", dtype={'year': str})


In [21]:
# Load the full labelled export; `year` is forced to text so it is not parsed as a number.
df_full = pd.read_csv("../data/csv_files/schneider_all_processed_labelled_full.csv", dtype={'year': str})
# Keep only the rows selected by the `non_empty_rows` column
# (assumes that column is a boolean mask — TODO confirm).
# The explicit copy makes `df` independent of `df_full`, so columns added to `df`
# later on never write into (or warn about) a slice of the full frame.
df = df_full[df_full['non_empty_rows']].copy()

  df_full = pd.read_csv("../data/csv_files/schneider_all_processed_labelled_full.csv", dtype={'year': str})


In [16]:
# Display the column labels of the working frame `df`.
df.columns

Index(['Unnamed: 0.4', 'Unnamed: 0.3', 'Unnamed: 0.2', 'Unnamed: 0.1',
       'Unnamed: 0', 'Account Country', 'Front Office Country', 'Clusters',
       'Zone', 'Operation', 'Primary Coverage Model', 'Account owner role',
       'Market Segment', 'Leading BU', 'Personas', 'State/Province',
       'Account ID Text', 'Classification Level 1 Name',
       'Classification Level 2', 'bFO Account ID', 'Job Title', 'Job Role',
       'Contact ID', 'bFO Contact ID', 'Survey ID', 'Unique ID',
       'Creation Date', 'Local Response Date', 'Response Date',
       'Accepted for reporting on date', 'Touchpoint', 'Interaction', 'Brand',
       'Survey language', 'Quarantine Rule Exception',
       'What is the Customer Feedback?', 'Likelihood to Recommend (SE)',
       'Customer_Comments', 'Translation_Customer_Comments',
       'Overall_Additional_Comments',
       'Translation_Overall_Additional_Comments', 'Overall Satisfaction',
       'Reason_for_Score_Comment', 'Translation_Reason_for_Score_C

In [17]:
# Display the column labels of the unfiltered frame `df_full`.
df_full.columns

Index(['Account Country', 'Front Office Country', 'Clusters', 'Zone',
       'Operation', 'Primary Coverage Model', 'Account owner role',
       'Market Segment', 'Leading BU', 'Personas', 'State/Province',
       'Account ID Text', 'Classification Level 1 Name',
       'Classification Level 2', 'bFO Account ID', 'Job Title', 'Job Role',
       'Contact ID', 'bFO Contact ID', 'Survey ID', 'Unique ID',
       'Creation Date', 'Local Response Date', 'Response Date',
       'Accepted for reporting on date', 'Touchpoint', 'Interaction', 'Brand',
       'Survey language', 'Quarantine Rule Exception',
       'What is the Customer Feedback?', 'Likelihood to Recommend (SE)',
       'Customer_Comments', 'Translation_Customer_Comments',
       'Overall_Additional_Comments',
       'Translation_Overall_Additional_Comments', 'Overall Satisfaction',
       'Reason_for_Score_Comment', 'Translation_Reason_for_Score_Comment',
       'Anything_Else_Comment', 'Translation_Anything_Else_Comment', 'year',

In [6]:
def load_model_huggingface(model_name, task, problem_type=None, **kwargs):
    """
    Load a sequence-classification model and its tokenizer, and wrap them in a pipeline.

    Args:
        model_name (str): The name (or path) of the model to load.
        task (str): The type of task to perform with the pipeline.
        problem_type (str, optional): The type of problem to solve ("multi_label_classification" for
            multi-label tasks). When left as None, no override is sent and the checkpoint's own
            configuration is used as-is.
        **kwargs: Additional arguments passed to ``pipeline`` only (they are not forwarded to the
            model or tokenizer loaders).

    Returns:
        pipeline: A pipeline configured to perform the specified task with the loaded model and tokenizer.
    """
    # Only forward problem_type when the caller actually chose one: sending an explicit None
    # would replace whatever problem_type the checkpoint's config declares.
    model_overrides = {}
    if problem_type is not None:
        model_overrides["problem_type"] = problem_type

    model = AutoModelForSequenceClassification.from_pretrained(model_name, **model_overrides)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    classifier = pipeline(task, model=model, tokenizer=tokenizer, **kwargs)
    return classifier

def add_single_label_predictions(df, predictions, predicted_column_name):
    """
    Return a copy of ``df`` with the single-label predictions appended as extra columns.

    Predictions are matched to rows by position (first prediction -> first row), regardless of
    the labels in ``df.index``. This matters when ``df`` is a filtered frame whose index is no
    longer 0..n-1.

    Args:
        df (pd.DataFrame): The original DataFrame. It is not modified.
        predictions (list): The list of predictions, one per row of ``df``. Each prediction is a
            dictionary containing a 'label' and a 'score'.
        predicted_column_name (str): The name given to the column holding the predicted label.

    Returns:
        pd.DataFrame: A new DataFrame with the columns of ``df`` followed by the prediction
        columns ('label' renamed to ``predicted_column_name``; other keys such as 'score' keep
        their name).

    Raises:
        ValueError: If the number of predictions differs from the number of rows.
    """
    predictions = list(predictions)
    if len(predictions) != len(df):
        raise ValueError(
            f"Got {len(predictions)} predictions for {len(df)} rows; they must match one-to-one."
        )

    # Build the prediction frame directly on df's index so the column-wise concat below pairs
    # rows positionally instead of aligning a fresh RangeIndex against df's labels.
    prediction_results = pd.DataFrame(predictions, index=df.index).rename(
        columns={'label': predicted_column_name}
    )
    return pd.concat([df, prediction_results], axis=1)

def _label_scores_as_dict(prediction):
    """Normalize one document's prediction to a ``{label: score}`` dictionary (None stays None)."""
    if prediction is None or isinstance(prediction, dict):
        return prediction
    # Pipeline-style output: a list of {'label': ..., 'score': ...} dictionaries.
    return {entry['label']: entry['score'] for entry in prediction}


def add_multi_label_predictions(df, predictions, predicted_column_name):
    """
    Return a copy of ``df`` with multi-label predictions plus the best label and its score.

    Args:
        df (pd.DataFrame): The original DataFrame. It is not modified.
        predictions (list): One prediction per row of ``df``. Each prediction is either a list of
            dictionaries containing a 'label' and a 'score', or an already-built
            ``{label: score}`` dictionary.
        predicted_column_name (str): The name of the column to be added to the DataFrame.

    Returns:
        pd.DataFrame: A new DataFrame with three added columns:
            ``predicted_column_name`` (a ``{label: score}`` dictionary per row),
            ``best_<predicted_column_name>`` (the highest-scoring label, missing if there is none) and
            ``best_<predicted_column_name>_score`` (that label's score, missing if there is none).

    Raises:
        ValueError: If the number of predictions differs from the number of rows.
    """
    label_scores = [_label_scores_as_dict(prediction) for prediction in predictions]
    if len(label_scores) != len(df):
        raise ValueError(
            f"Got {len(label_scores)} predictions for {len(df)} rows; they must match one-to-one."
        )

    # For every row pick the (label, score) pair with the highest score; rows without any
    # label get (None, None).
    best_pairs = [
        max(scores.items(), key=lambda pair: pair[1]) if scores else (None, None)
        for scores in label_scores
    ]

    # Work on a copy so the caller's frame (possibly a slice of another frame) is left untouched.
    predicted_df = df.copy()
    # An explicit object Series keeps each dictionary as a single cell value.
    predicted_df[predicted_column_name] = pd.Series(label_scores, index=predicted_df.index, dtype=object)
    predicted_df[f'best_{predicted_column_name}'] = [label for label, _ in best_pairs]
    predicted_df[f'best_{predicted_column_name}_score'] = [score for _, score in best_pairs]
    return predicted_df

def make_predictions_df(classifier, df, predicted_column_name):
    """
    Run a classifier over the 'processed_data' column and return the frame with predictions added.

    The shape of the classifier output decides how predictions are attached:
    a list of dictionaries is treated as single-label output and handed to
    ``add_single_label_predictions``; a list of lists is treated as multi-label output and handed
    to ``add_multi_label_predictions``.

    Args:
        classifier (pipeline): The Hugging Face pipeline object (or any callable taking a list of
            texts) used for making predictions.
        df (pd.DataFrame): The DataFrame containing the documents to make predictions on. It must
            have a 'processed_data' column with the preprocessed text of each document.
        predicted_column_name (str): The name of the column to be added to the DataFrame for the predictions.

    Returns:
        pd.DataFrame: The DataFrame with added columns for the predictions. When ``df`` has no
        rows, a copy of ``df`` is returned unchanged and the classifier is not called.

    Raises:
        ValueError: If the classifier returns no predictions for a non-empty input.
        TypeError: If the classifier output is neither a list of dictionaries nor a list of lists.
    """
    # Get the list of documents from the DataFrame
    docs = df["processed_data"].tolist()
    if not docs:
        # Nothing to classify: skip the model call instead of failing on predictions[0] below.
        return df.copy()

    predictions = classifier(docs)
    if not isinstance(predictions, list):
        raise TypeError(
            f"Expected the classifier to return a list, got {type(predictions).__name__}."
        )
    if not predictions:
        raise ValueError(f"The classifier returned no predictions for {len(docs)} documents.")

    # The first element is used as a sample to tell the two output layouts apart.
    sample = predictions[0]
    if isinstance(sample, dict):
        # Single-label case: one {'label', 'score'} dictionary per document.
        return add_single_label_predictions(df, predictions, predicted_column_name)
    if isinstance(sample, list):
        # Multi-label case: one list of {'label', 'score'} dictionaries per document.
        return add_multi_label_predictions(df, predictions, predicted_column_name)

    raise TypeError(
        f"Unsupported prediction type {type(sample).__name__}; expected dict or list."
    )

In [None]:
# classifier = load_model_huggingface("cardiffnlp/twitter-roberta-base-sentiment-latest", "text-classification", max_length=512, truncation=True)
# predicted_df = make_predictions_df(classifier, df, 'sentiment_label')

In [9]:
# Build a text-classification pipeline for the Schwartz human-values checkpoint.
# Note: load_model_huggingface passes the extra keyword arguments (max_length, truncation,
# trust_remote_code) to pipeline() only; the model itself is loaded without them.
classifier = load_model_huggingface("tum-nlp/Deberta_Human_Value_Detector", "text-classification", max_length=512, truncation=True, trust_remote_code=True)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [1]:
# tokenizer =  AutoTokenizer.from_pretrained("tum-nlp/Deberta_Human_Value_Detector")
# trained_model = AutoModelForSequenceClassification.from_pretrained("tum-nlp/Deberta_Human_Value_Detector", trust_remote_code=True)

# example_text ='We should ban whaling because whales are a species at the risk of distinction'

# encoding = tokenizer.encode_plus(
#         example_text,
#         add_special_tokens=True,
#         max_length=512,
#         return_token_type_ids=False,
#         padding="max_length",
#         return_attention_mask=True,
#         return_tensors='pt',
#     )

# with torch.no_grad():
#         test_prediction = trained_model(encoding["input_ids"], encoding["attention_mask"])
#         test_prediction = test_prediction["output"].flatten().numpy()

In [6]:
# THRESHOLD = 0.25
# LABEL_COLUMNS = ['Self-direction: thought','Self-direction: action','Stimulation','Hedonism','Achievement','Power: dominance','Power: resources','Face','Security: personal','Security: societal','Tradition','Conformity: rules','Conformity: interpersonal','Humility','Benevolence: caring','Benevolence: dependability','Universalism: concern','Universalism: nature','Universalism: tolerance','Universalism: objectivity']
# print(f"Predictions:")
# for label, prediction in zip(LABEL_COLUMNS, test_prediction):
#     if prediction < THRESHOLD:
#         continue
#     print(f"{label}: {prediction}")

Predictions:
Universalism: nature: 0.9919975399971008


In [19]:
# Minimum score a value label must reach to be reported for a text.
THRESHOLD = 0.25

# Label names, in the order they are paired with the model's output scores.
LABEL_COLUMNS = [
    'Self-direction: thought',
    'Self-direction: action',
    'Stimulation',
    'Hedonism',
    'Achievement',
    'Power: dominance',
    'Power: resources',
    'Face',
    'Security: personal',
    'Security: societal',
    'Tradition',
    'Conformity: rules',
    'Conformity: interpersonal',
    'Humility',
    'Benevolence: caring',
    'Benevolence: dependability',
    'Universalism: concern',
    'Universalism: nature',
    'Universalism: tolerance',
    'Universalism: objectivity',
]

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tokenizer and model come from the same checkpoint; the model is loaded with
# trust_remote_code=True and then placed on the selected device.
tokenizer = AutoTokenizer.from_pretrained("tum-nlp/Deberta_Human_Value_Detector")
model = AutoModelForSequenceClassification.from_pretrained(
    "tum-nlp/Deberta_Human_Value_Detector",
    trust_remote_code=True,
)
model.to(device)

def predict_values(row):
    """
    Score one row's text against every label in LABEL_COLUMNS and keep the confident ones.

    Args:
        row (pd.Series or mapping): A row exposing the text under the 'processed_data' key.

    Returns:
        list: ``(label, score)`` tuples, in LABEL_COLUMNS order, for every label whose score is
        at least THRESHOLD. The list is empty when no label reaches the threshold.
    """
    text = row['processed_data']
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        # max_length alone does not shorten the input; truncation must be requested explicitly,
        # otherwise texts longer than 512 tokens are encoded at full length.
        truncation=True,
        return_token_type_ids=False,
        padding="max_length",
        return_attention_mask=True,
        return_tensors='pt',
    )

    # Move the tensors to the same device as the model
    encoding = {key: tensor.to(device) for key, tensor in encoding.items()}

    # Inference only: no gradients are needed
    with torch.no_grad():
        predictions = model(encoding["input_ids"], encoding["attention_mask"])
        # The model returns a mapping; its "output" entry holds the per-label scores
        predictions = predictions["output"].flatten()

    # Move the predictions back to CPU and convert to numpy
    predictions = predictions.cpu().numpy()

    # zip pairs scores with label names positionally, so LABEL_COLUMNS order must match the model's
    labels_scores = [(label, prediction) for label, prediction in zip(LABEL_COLUMNS, predictions) if prediction >= THRESHOLD]

    return labels_scores

In [22]:
# Work on a copy so the new column is not added to `df` itself.
df2 = df.copy()

# Call predict_values once per row; each cell of 'predictions' receives that row's list of
# (label, score) tuples. This is one model forward pass per row, so it is slow on large frames
# (the recorded run of this cell ended with a KeyboardInterrupt).
df2['predictions'] = df2.apply(predict_values, axis=1)

# # Explode the predictions into separate rows
# df2 = df2.explode('predictions')

# # Split the predictions into two separate columns
# df2[['schwartz_label', 'schwartz_score']] = pd.DataFrame(df2['predictions'].tolist(), index=df2.index)

# # Drop the predictions column
# df2 = df2.drop(columns='predictions')

KeyboardInterrupt: 

In [8]:
# Count how many rows carry each Schwartz value label.
# NOTE(review): no active code visible in this notebook creates 'schwartz_label' (the only
# assignment is commented out in the prediction cell) — confirm where df2 gets this column
# before relying on a fresh Restart & Run All.
df2['schwartz_label'].value_counts()
# retrieve the texts with the highest scores for each label
# df.loc[df.groupby('schwartz_label')['schwartz_score'].idxmax()]

Achievement                   33824
Universalism: objectivity     26631
Benevolence: caring           25461
Self-direction: action        14192
Security: personal            11726
Benevolence: dependability     9745
Power: resources               2448
Conformity: rules              2339
Face                           2280
Self-direction: thought        2104
Conformity: interpersonal      1929
Power: dominance               1524
Security: societal              609
Universalism: concern           473
Hedonism                        247
Universalism: tolerance         172
Universalism: nature            157
Tradition                       111
Humility                        110
Stimulation                     109
Name: schwartz_label, dtype: int64

In [6]:
# Persist the labelled frame to disk.
# NOTE(review): the index is written as well (to_csv default), which presumably explains the
# 'Unnamed: 0.x' columns seen when such files are read back — consider index=False if the
# index is not needed. The absolute path is machine-specific; confirm before running elsewhere.
df2.to_csv("/media/cattiaux/DATA/Wassati/team_data/schneider/df_schwartz_labelled.csv")

In [15]:
# Number of rows in df2.
len(df2)

136274

In [14]:
# Row count of df2 again, this time read from its shape (same value as len(df2)).
df2.shape[0]

136274

In [14]:
def search_terms_in_df(df, column, terms):
    """
    Search for multiple terms in a specific column in a DataFrame and add a new column with the found terms.

    Matching is a case-insensitive substring test on the string form of each cell. Missing cells
    (None, NaN, pd.NA) never match, so terms such as 'na' or 'one' are not found inside the
    textual form of a missing value.

    Parameters:
    df (pandas.DataFrame): The DataFrame to search. It is not modified.
    column (str): The column in which to search for the terms.
    terms (list): The list of terms to search for.

    Returns:
    pandas.DataFrame: A DataFrame containing only the rows where any of the terms were found, with an
    additional column 'Found Terms' listing the matching terms (in the order given in `terms`).
    """
    # Lower-case every term once instead of once per row.
    needles = [(term, term.lower()) for term in terms]

    def _is_missing(value):
        # NaN is the only float that is not equal to itself.
        return value is None or value is pd.NA or (isinstance(value, float) and value != value)

    def _matching_terms(value):
        if _is_missing(value):
            return []
        haystack = str(value).lower()
        return [term for term, needle in needles if needle in haystack]

    df_res = df.copy()
    df_res['Found Terms'] = df[column].apply(_matching_terms)
    # Explicit bool dtype keeps the mask a valid row filter even when the frame is empty.
    has_match = df_res['Found Terms'].map(bool).astype(bool)
    return df_res[has_match]

# Energy / climate vocabulary to look for in the comments (matched as substrings).
terms = [
    'carbon', 'energy', 'energy saving', 'renewable energy', 'solar power',
    'wind energy', 'hydroelectric power', 'geothermal energy', 'bioenergy',
    'energy efficiency', 'carbon footprint', 'greenhouse gas emissions',
    'climate change', 'sustainability', 'conservation',
]

# Subset of columns kept in the search result.
useful_col = [
    "Account Country",
    "Clusters",
    "Zone",
    "Market Segment",
    "Unique ID",
    "Creation Date",
    "year",
    "year_month",
    "Likelihood to Recommend (SE)",
    "Overall Satisfaction",
    "allComment",
    "processed_data",
    "label",
    "sentiment_label",
    "single_emotion_label",
    "single_sentiment_from_emotion",
    "score",
    "predicted_labels",
    "predicted_scores",
]

# Show the rows whose processed text mentions at least one of the terms.
search_terms_in_df(df[useful_col], 'processed_data', terms)

Unnamed: 0,Account Country,Clusters,Zone,Market Segment,Unique ID,Creation Date,year,year_month,Likelihood to Recommend (SE),Overall Satisfaction,allComment,processed_data,label,sentiment_label,single_emotion_label,single_sentiment_from_emotion,score,predicted_labels,predicted_scores,Found Terms
499,Israel,Israel,CEEI,Semiconductor,zoop_823001254,2023-05-11 09:10:06,2023,2023-05-01,8.0,8.0,"excellent equipment, high responsiveness and s...","excellent equipment, high responsiveness and s...",Outlier,negative,disappointment,negative,0.898911,['disappointment'],"{'admiration': 0.026408667, 'amusement': 0.001...",[energy]
501,Switzerland,Switzerland,DACH,Automotive & E-Mobility,zoop_823001147,2023-05-09 17:15:05,2023,2023-05-01,8.0,8.0,schneider electric offers good advice to solve...,schneider electric offers good advice to solve...,Outlier,positive,approval,positive,0.798206,['approval'],"{'admiration': 0.08126737, 'amusement': 0.0004...",[energy]
1191,Australia,Australia,Pacific,Power & Grid,OP-230207-12877737_Q,2023-02-16 23:32:30,2023,2023-02-01,,1.0,"product was required urgently, sales/support t...","product was required urgently, sales/support t...",Outlier,negative,neutral,neutral,0.522348,['neutral'],"{'admiration': 0.0011441872, 'amusement': 0.00...",[energy]
1864,Finland,Finland & Baltics,Nordic & Baltics,MMM,OP-221020-12570971_Q,2022-12-07 23:33:01,2022,2022-12-01,,8.0,energy efficiency and energy saving have a hig...,energy efficiency and energy saving have a hig...,Outlier,positive,approval,positive,0.587052,[],"{'admiration': 0.08122513, 'amusement': 0.0005...","[energy, energy saving, energy efficiency]"
2042,Turkey,Turkey Central Asia and Pakistan,Middle East and Africa,Machinery,5008V00001Tj3vSQAR,2022-11-17 23:31:20,2022,2022-11-01,,10.0,"thanks to mehmet ali̇ for their support, and t...","thanks to mehmet ali̇ for their support, and t...",Automation Components,positive,gratitude,positive,0.954546,['gratitude'],"{'admiration': 0.01158437, 'amusement': 0.0023...",[energy]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31326,France,France,France,Unknown,zoop_422200637,2018-06-30 00:19:37,2018,2018-05-01,9.0,9.0,the expertise provided to me is good for my fi...,the expertise provided to me is good for my fi...,Automation Components,positive,approval,positive,0.891485,[],"{'admiration': 0.31609055, 'amusement': 0.0005...",[energy]
31484,Canada,Canada,Canada,Unknown,zoop_421969920,2018-06-30 00:19:37,2018,2018-05-01,4.0,4.0,"the service is very, very slow and the product...","the service is very, very slow and the product...",Automation Components,negative,neutral,neutral,0.936458,['neutral'],"{'admiration': 0.0014525086, 'amusement': 0.00...",[energy]
31754,China,China,China & HK,Unknown,zoop_420078798,2018-06-30 00:19:37,2018,2018-04-01,10.0,10.0,we have purchased more than 10 to 20 million's...,we have purchased more than 10 to 20 million's...,Outlier,positive,neutral,neutral,0.704671,['neutral'],"{'admiration': 0.008560948, 'amusement': 0.000...",[energy]
33070,Egypt,North East Africa and Levant,Middle East and Africa,Unknown,zoop_355802772,2018-06-30 00:39:48,2018,2017-12-01,10.0,10.0,احنا المفروض ان احنا مصنعين لمنتجات شنايدر لوح...,احنا المفروض ان احنا مصنعين لمنتجات شنايدر لوح...,Power Supply Issues,neutral,neutral,neutral,0.595841,['neutral'],"{'admiration': 0.0013201903, 'amusement': 0.00...",[solar power]
