In [None]:
import pandas as pd

# Read the labelled comments and keep only rows where both the text and its
# label are present.
data = pd.read_csv('labeled_comments_cleaned.csv')
data = data.loc[:, ['comment_full_text', 'level_0']].dropna()

# Cut the frame into consecutive row slices of at most `batch_size` rows.
batch_size = 50  # Lower this if memory is tight
batch_starts = range(0, len(data), batch_size)
batches = [data[start:start + batch_size] for start in batch_starts]

# Peek at the first slice to confirm the expected columns came through.
first_batch = batches[0]
print(first_batch.head())

# Skeleton loop: pull the texts and labels out of every batch as plain lists.
for batch in batches:
    texts, labels = batch['comment_full_text'].tolist(), batch['level_0'].tolist()
    # Placeholder: per-batch processing (e.g. zero-shot or n-shot predictions) goes here.


                                   comment_full_text               level_0
0                                                Ok?  INFORMATION EXCHANGE
1  This has been discussed in the Executive Summa...          MODIFICATION
2  This has been discussed in the Executive Summa...  SOCIAL COMMUNICATION
3  This has been discussed in the Executive Summa...          MODIFICATION
4    CODING\n\nCode qualitative data for WAVGUAGE03A  INFORMATION EXCHANGE


In [None]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Load the labelled comments and keep only rows where both the text and its
# label are present.
data = pd.read_csv('labeled_comments_cleaned.csv')  # Use your actual file path
data = data[['comment_full_text', 'level_0']].dropna()

# Split the data into batches
batch_size = 100  # Adjust this based on your memory capacity
batches = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]

# Initialize the zero-shot classifier
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define candidate labels (this can be based on your specific use case)
candidate_labels = ["INFORMATION EXCHANGE", "MODIFICATION", "SOCIAL COMMUNICATION"]

# The classifier can only ever answer with one of `candidate_labels`, so any
# other label in the data is guaranteed to be scored as an error. Surface that
# up front instead of letting it silently depress the metrics.
uncovered_labels = sorted(set(data['level_0']) - set(candidate_labels), key=str)
if uncovered_labels:
    print(f"Warning: labels present in the data but missing from candidate_labels: {uncovered_labels}")

# Lists to hold true labels and predicted labels
all_true_labels = []
all_predicted_labels = []

# Process each batch
for batch in batches:
    texts = batch['comment_full_text'].tolist()
    true_labels = batch['level_0'].tolist()

    # Hand the whole batch to the pipeline in one call rather than one call
    # per text; the pipeline returns one result dict per input sequence.
    results = zero_shot_classifier(texts, candidate_labels=candidate_labels)
    if isinstance(results, dict):
        # Some pipeline versions unwrap a single-sequence result into a bare dict.
        results = [results]

    # Each result lists the labels sorted by score, best first.
    batch_predicted_labels = [result['labels'][0] for result in results]

    # Fail early if predictions and labels would become misaligned.
    if len(batch_predicted_labels) != len(true_labels):
        raise ValueError(
            f"Expected {len(true_labels)} predictions for this batch, got {len(batch_predicted_labels)}"
        )

    all_true_labels.extend(true_labels)
    all_predicted_labels.extend(batch_predicted_labels)

# Calculate performance metrics.
# zero_division=0 scores a class with no predicted (or no true) samples as 0,
# which is the value the default setting uses as well, but without emitting an
# undefined-metric warning for every call.
accuracy = accuracy_score(all_true_labels, all_predicted_labels)
precision = precision_score(all_true_labels, all_predicted_labels, average='weighted', zero_division=0)
recall = recall_score(all_true_labels, all_predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(all_true_labels, all_predicted_labels, average='weighted', zero_division=0)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")


Accuracy: 0.47325185333600484
Precision: 0.49669816947247025
Recall: 0.47325185333600484
F1 Score: 0.4208836468269039


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, precision_recall_fscore_support  # <-- Add this import

# Load your dataset
data = pd.read_csv('labeled_comments_cleaned.csv')  # Use your actual file path

# Keep only rows where both the text and its label are present. The explicit
# copy makes `data` an independent frame, so adding the `labels` column below
# cannot trigger a chained-assignment warning.
data = data[['comment_full_text', 'level_0']].dropna().copy()

# Map labels to integers. Sorting the label names makes the label -> id
# assignment independent of the order in which rows appear in the CSV.
label_list = sorted(data['level_0'].unique().tolist())
label_to_id = {label: i for i, label in enumerate(label_list)}
data['labels'] = data['level_0'].map(label_to_id)

# Split the data into training and validation sets. Stratify on the label so
# both splits keep the class proportions of the full data; stratification
# needs at least two rows per class, so fall back to a plain random split
# when some class is rarer than that.
label_counts = data['labels'].value_counts()
stratify_labels = data['labels'] if label_counts.min() >= 2 else None
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# Load the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the text
def tokenize(batch):
    """Tokenize the `comment_full_text` entries of a batch of examples.

    Args:
        batch: Mapping with a 'comment_full_text' entry holding the texts to
            encode (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for those texts, with every sequence truncated
        and padded to the same fixed length.
    """
    # padding=True would pad only to the longest text inside the current map
    # chunk, so different chunks could end up with different sequence lengths
    # and could no longer be stacked into one training batch. Padding to the
    # maximum length gives every example the same shape.
    return tokenizer(batch['comment_full_text'], padding='max_length', truncation=True)

# Wrap both pandas splits as Hugging Face `Dataset` objects
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Columns that are dropped once tokenized, and columns exposed as tensors
raw_columns = ['comment_full_text', 'level_0']
tensor_columns = ['input_ids', 'attention_mask', 'labels']

# Run the tokenizer over each split in batches, discarding the raw columns
train_dataset = train_dataset.map(tokenize, batched=True, remove_columns=list(raw_columns))
val_dataset = val_dataset.map(tokenize, batched=True, remove_columns=list(raw_columns))

# Switch both splits to the Torch format (set_format works in place)
for split_dataset in (train_dataset, val_dataset):
    split_dataset.set_format(type='torch', columns=list(tensor_columns))

# Load the model with a classification head sized to the number of labels.
# Passing both label mappings stores the class names in the model config, so
# predictions and saved checkpoints report the original label names instead
# of generic LABEL_0, LABEL_1, ... placeholders.
id_to_label = {label_id: label for label, label_id in label_to_id.items()}
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Define training arguments.
# - warmup_ratio replaces a fixed warmup_steps=500: with roughly 4k training
#   rows and a per-device batch size of 16, one epoch is only about 250
#   optimizer steps on a single device (fewer on several), so a 500-step
#   warmup covered most or all of the 3-epoch run and the learning rate
#   barely reached its target. A ratio scales with the real number of steps.
# - logging_strategy='epoch' records the training loss once per epoch; the
#   default of logging every 500 steps left the Training Loss column empty.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_strategy='epoch',
    evaluation_strategy='epoch',
    save_strategy='epoch',
)

# Define a function to compute metrics
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: Object exposing `label_ids` (the true class ids) and
            `predictions` (per-class scores, one row per example).

    Returns:
        Dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    # The predicted class is the index of the highest score in each row.
    preds = np.argmax(pred.predictions, axis=1)
    # zero_division=0 scores a class that is never predicted as 0, the same
    # value the default setting uses, but without emitting an undefined-metric
    # warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Wire the model, arguments, datasets and metric function into a Trainer
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
trainer = Trainer(**trainer_kwargs)

# Fine-tune on the training split
trainer.train()

# Score the fine-tuned model on the validation split and report the metrics
eval_results = trainer.evaluate()

print(
    f"Validation Accuracy: {eval_results['eval_accuracy']}",
    f"Validation F1 Score: {eval_results['eval_f1']}",
    f"Validation Precision: {eval_results['eval_precision']}",
    f"Validation Recall: {eval_results['eval_recall']}",
    sep="\n",
)


Map:   0%|          | 0/3992 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.320747,0.515516,0.364303,0.433476,0.515516
2,No log,1.071949,0.595596,0.510645,0.577812,0.595596
3,No log,0.820963,0.690691,0.633566,0.595691,0.690691


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6906906906906907
Validation F1 Score: 0.6335664302872711
Validation Precision: 0.5956913793797542
Validation Recall: 0.6906906906906907


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the labelled comments and keep only rows where both the text and its
# label are present.
data = pd.read_csv('labeled_comments_cleaned.csv')
data = data[['comment_full_text', 'level_0']].dropna()

# Split the data into batches
batch_size = 100  # Adjust this based on your memory capacity
batches = [data[i:i + batch_size] for i in range(0, len(data), batch_size)]

# Initialize the zero-shot classifier
zero_shot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

# Define candidate labels
candidate_labels = ["INFORMATION EXCHANGE", "MODIFICATION", "SOCIAL COMMUNICATION"]

# The classifier can only ever answer with one of `candidate_labels`, and the
# confusion matrix below is restricted to them as well, so rows carrying any
# other label count as errors in the metrics and are left out of the plot.
# Report such labels instead of letting them distort the results silently.
uncovered_labels = sorted(set(data['level_0']) - set(candidate_labels), key=str)
if uncovered_labels:
    print(f"Warning: labels present in the data but missing from candidate_labels: {uncovered_labels}")

# Lists to hold true labels and predicted labels
all_true_labels = []
all_predicted_labels = []

# Process each batch
for i, batch in enumerate(batches):
    texts = batch['comment_full_text'].tolist()
    true_labels = batch['level_0'].tolist()

    # Hand the whole batch to the pipeline in one call rather than one call
    # per text; the pipeline returns one result dict per input sequence.
    results = zero_shot_classifier(texts, candidate_labels=candidate_labels)
    if isinstance(results, dict):
        # Some pipeline versions unwrap a single-sequence result into a bare dict.
        results = [results]

    # Each result lists the labels sorted by score, best first.
    batch_predicted_labels = [result['labels'][0] for result in results]

    # Stop right away if predictions and labels would become misaligned;
    # metrics computed on misaligned lists would be meaningless.
    if len(batch_predicted_labels) != len(true_labels):
        raise ValueError(
            f"Batch {i} has mismatched lengths: {len(true_labels)} true labels vs {len(batch_predicted_labels)} predicted labels"
        )

    all_true_labels.extend(true_labels)
    all_predicted_labels.extend(batch_predicted_labels)

# Calculate performance metrics.
# zero_division=0 scores a class with no predicted (or no true) samples as 0,
# which is the value the default setting uses as well, but without emitting an
# undefined-metric warning for every call.
accuracy = accuracy_score(all_true_labels, all_predicted_labels)
precision = precision_score(all_true_labels, all_predicted_labels, average='weighted', zero_division=0)
recall = recall_score(all_true_labels, all_predicted_labels, average='weighted', zero_division=0)
f1 = f1_score(all_true_labels, all_predicted_labels, average='weighted', zero_division=0)

print(f"Zero-shot Accuracy: {accuracy}")
print(f"Zero-shot Precision: {precision}")
print(f"Zero-shot Recall: {recall}")
print(f"Zero-shot F1 Score: {f1}")

# Generate confusion matrix (rows are true labels, columns are predictions,
# both in `candidate_labels` order)
conf_matrix = confusion_matrix(all_true_labels, all_predicted_labels, labels=candidate_labels)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=candidate_labels, yticklabels=candidate_labels)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Zero-shot Learning Confusion Matrix')
plt.show()
