In [17]:
import pandas as pd

# Read the CSV file into a DataFrame. The path is relative, so it is resolved
# against the kernel's current working directory.
# Later cells read the 'sentence' and 'attitude_root' columns from this frame,
# so the file is expected to provide them — TODO confirm against the CSV.
df = pd.read_csv('sentences_author.csv')

In [18]:
from transformers import BertForSequenceClassification, BertTokenizer
import torch
from torch.nn import Sigmoid

# Directory holding the saved classifier weights and tokenizer files.
# Other sources noted during development:
#   checkpoint path: /home/nana/DASP_report_template/model_training/nlp/review_to_theme/results/checkpoint-1810
#   bert-base-uncased
MODEL_DIR = "/home/nana/DASP_report_template/model_training/nlp/review_to_theme/results/final_model"

# Tokenizer and model are created once here as module-level globals;
# predict() below reads both of them.
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertForSequenceClassification.from_pretrained(
    MODEL_DIR,
    num_labels=11,
    problem_type="multi_label_classification",
)

# Put the model in evaluation mode before running inference.
model.eval()

def predict(input_text, threshold=0.5):
    """Return the 0/1 label flags the global ``model`` predicts for ``input_text``.

    Args:
        input_text: Text to classify; passed unchanged to the global
            ``tokenizer`` (truncated / padded to 128 tokens).
        threshold: A label is flagged 1 only when its sigmoid probability is
            strictly greater than this value.

    Returns:
        The thresholded predictions after ``squeeze().tolist()``. For a single
        input row this is a flat list with one int per model output.
    """
    encoded = tokenizer(
        input_text,
        truncation=True,
        padding="max_length",
        max_length=128,
        return_tensors="pt",
    )

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        logits = model(**encoded).logits

    # Multi-label setup: an independent sigmoid per output, then a strict cut.
    flags = (torch.sigmoid(logits) > threshold).int()
    return flags.squeeze().tolist()

# Example usage: classify one sample sentence and print the raw 0/1 flag list.
# `predicted_labels` is read again by the next cell, which maps the flags to
# label names.
input_text = "Lack of limitations."
predicted_labels = predict(input_text)

print("Predicted Labels:", predicted_labels)


In [19]:
# Label names used to decode the flag list; assumed to be in the same order as
# the model's outputs — TODO confirm against the training configuration.
labels = ['ANA', 'BIB', 'DAT', 'EXP', 'INT', 'MET', 'OAL', 'PDI', 'RES', 'RWK', 'TNF']

# Positions whose flag is set, then the label names found at those positions.
indices = [position for position, flag in enumerate(predicted_labels) if flag == 1]
true_labels = [labels[position] for position in indices]

In [20]:
# Display the label names selected for the example sentence.
true_labels

In [21]:
# Saved model directory and the label names used to decode its outputs.
_THEME_MODEL_DIR = "/home/nana/DASP_report_template/model_training/nlp/review_to_theme/results/final_model"
_THEME_LABELS = ['ANA', 'BIB', 'DAT', 'EXP', 'INT', 'MET', 'OAL', 'PDI', 'RES', 'RWK', 'TNF']

# Filled on first use so the weights are read from disk once, not once per
# classified sentence. Re-running this cell resets it and forces a reload.
_theme_model_cache = {}


def _get_theme_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first call."""
    if "model" not in _theme_model_cache:
        model = BertForSequenceClassification.from_pretrained(
            _THEME_MODEL_DIR,
            num_labels=len(_THEME_LABELS),
            problem_type="multi_label_classification",
        )
        model.eval()
        # Store the tokenizer first: the "model" key is only set once both
        # objects loaded successfully.
        _theme_model_cache["tokenizer"] = BertTokenizer.from_pretrained(_THEME_MODEL_DIR)
        _theme_model_cache["model"] = model
    return _theme_model_cache["model"], _theme_model_cache["tokenizer"]


def predict_theme_category(text):
    """Return the theme label names predicted for ``text``.

    The model and tokenizer are loaded from disk on the first call only and
    reused afterwards, so applying this function row by row over a DataFrame
    no longer reloads the weights for every row.

    Args:
        text: Text to classify; passed unchanged to the tokenizer (truncated /
            padded to 128 tokens).

    Returns:
        list[str]: Names from the label list whose sigmoid probability is
        strictly greater than 0.5, in label-list order. Empty when no label
        passes the threshold.
    """
    model, tokenizer = _get_theme_model()
    threshold = 0.5

    inputs = tokenizer(text, truncation=True, padding="max_length", max_length=128, return_tensors="pt")

    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Multi-label setup: an independent sigmoid per output, then a strict cut.
    probabilities = torch.sigmoid(logits)
    predictions = (probabilities > threshold).int().squeeze().tolist()

    return [label for label, flag in zip(_THEME_LABELS, predictions) if flag == 1]

In [22]:
# Classify every sentence and store the resulting list of theme label names in
# a new 'attitude_themes' column. This adds the column to `df` in place, and
# predict_theme_category is invoked once per row.
df['attitude_themes'] = df['sentence'].apply(predict_theme_category)

In [23]:
# Display the frame, which now carries the 'attitude_themes' column.
df

In [24]:
def create_clusters(row):
    """Build ``"root(theme)"`` cluster strings for one row.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series`` or a ``dict``) with an
            ``"attitude_root"`` entry and a themes entry. Themes are read from
            ``"attitude_themes"`` — the column written by the theme-prediction
            cell — falling back to the older ``"attitude_theme"`` key when
            that is the only one present.

    Returns:
        list[str]: One ``f"{root}({theme})"`` string per theme, in order;
        empty when the row has no themes.

    Raises:
        KeyError: If ``"attitude_root"`` or both theme keys are missing.
    """
    root = row["attitude_root"]

    # The prediction cell stores its output under the plural name; keep the
    # singular key working for rows that still use it.
    themes_key = "attitude_themes" if "attitude_themes" in row else "attitude_theme"
    themes = row[themes_key]

    # A bare string would otherwise be iterated character by character.
    if isinstance(themes, str):
        themes = [themes]

    return [f"{root}({theme})" for theme in themes]

In [25]:
# Attach the per-row cluster lists. Note that this writes the 'clusters'
# column onto `df` itself; no copy of the frame is made.
df["clusters"] = df.apply(create_clusters, axis=1)

# Explode to one row per cluster string, renumbering the index from 0.
df_cluster = df.explode("clusters", ignore_index=True)
print(df_cluster)