### Testing the zero-shot classification pipeline

In [3]:
from transformers import pipeline
import json

# Smoke test: classify a single sentence with two zero-shot pipelines.
# NOTE(review): the recorded run warns that both checkpoints get newly
# initialized classification heads and that no 'entailment' label id could be
# determined, so the scores below should be read with that in mind.
ZERO_SHOT_TASK = "zero-shot-classification"

bert_classifier = pipeline(ZERO_SHOT_TASK, model="bert-base-uncased")
roberta_classifier = pipeline(ZERO_SHOT_TASK, model="roberta-base")

# One example utterance and the intents it may be assigned to.
text = "I'm having trouble with my internet connection."
candidate_labels = [
    "Technical Support",
    "Billing Inquiry",
    "Service Complaint",
    "General Inquiry",
]

# Score the sentence against every candidate label with each pipeline.
bert_predictions = bert_classifier(text, candidate_labels)
roberta_predictions = roberta_classifier(text, candidate_labels)

# Dump the raw pipeline outputs as indented JSON, BERT first.
for heading, raw_output in (
    ("BERT Predictions:", bert_predictions),
    ("\nRoBERTa Predictions:", roberta_predictions),
):
    print(heading)
    print(json.dumps(raw_output, indent=4))

# Pick the (label, score) pair with the highest score; on ties the first
# pair wins, which matches looking up the index of max(scores).
best_bert_label, best_bert_prediction = max(
    zip(bert_predictions["labels"], bert_predictions["scores"]),
    key=lambda pair: pair[1],
)
best_roberta_label, best_roberta_prediction = max(
    zip(roberta_predictions["labels"], roberta_predictions["scores"]),
    key=lambda pair: pair[1],
)

# Report the winning label of each model.
print("Best BERT Prediction:", best_bert_label)
print("Best RoBERTa Prediction:", best_roberta_label)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model co

BERT Predictions:
{
    "sequence": "I'm having trouble with my internet connection.",
    "labels": [
        "General Inquiry",
        "Billing Inquiry",
        "Service Complaint",
        "Technical Support"
    ],
    "scores": [
        0.33063021302223206,
        0.23186515271663666,
        0.22003042697906494,
        0.21747422218322754
    ]
}

RoBERTa Predictions:
{
    "sequence": "I'm having trouble with my internet connection.",
    "labels": [
        "Technical Support",
        "Service Complaint",
        "Billing Inquiry",
        "General Inquiry"
    ],
    "scores": [
        0.2516486644744873,
        0.2500995695590973,
        0.24931930005550385,
        0.24893245100975037
    ]
}
Best BERT Prediction: General Inquiry
Best RoBERTa Prediction: Technical Support


### Test with 100 data points and only 3 models, for a quick overview

In [5]:
import time
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


import warnings

# Suppress the specific UserWarning
warnings.filterwarnings("ignore", message="Length of IterableDataset")

# Load the dataset from CSV
dataset = pd.read_csv("customer_support_tickets.csv")

# Select a reproducible subset of 100 samples
subset = dataset.sample(n=100, random_state=42)

# Define the models (display name -> checkpoint id)
models = {
    "BERT": "bert-base-uncased",
    "RoBERTa": "roberta-base",
    "SBERT": "sentence-transformers/paraphrase-MiniLM-L6-v2"
}

# The inputs are the same for every model, so extract them once.
texts = subset["Ticket Description"].tolist()
true_labels = subset["Ticket Type"].tolist()

# Candidate labels (intents) - assuming unique ticket types are the labels
candidate_labels = subset["Ticket Type"].unique().tolist()

# One record per model. DataFrame.append was removed in pandas 2.0, so the
# rows are collected in a list and the frame is built once after the loop.
metric_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-score", "Time taken"]
metric_rows = []

# Iterate over models
for model_name, model in models.items():
    # No training happens here; the message text is kept as originally written.
    print(f"Training and evaluating {model_name}...")

    # Initialize zero-shot classification pipeline (not included in the timing)
    classifier = pipeline("zero-shot-classification", model=model)

    # Time only the classification call, using a monotonic clock
    start_time = time.perf_counter()
    predictions = classifier(texts, candidate_labels=candidate_labels, multi_label=False)
    duration = time.perf_counter() - start_time

    # The first entry of each result's "labels" list is taken as the prediction
    predicted_labels = [prediction["labels"][0] for prediction in predictions]

    # Weighted metrics. zero_division=0 yields the same value as the default
    # for labels that are never predicted, but without emitting a warning.
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)

    metric_rows.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
        "Time taken": duration
    })

# Build the metrics DataFrame in one step and print it
metrics_df = pd.DataFrame(metric_rows, columns=metric_columns)
print(metrics_df)


Training and evaluating BERT...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
  _warn_prf(average, modifier, msg_start, len(result))


Training and evaluating RoBERTa...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
  _warn_prf(average, modifier, msg_start, len(result))


Training and evaluating SBERT...


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


     Model  Accuracy  Precision  Recall  F1-score  Time taken
0     BERT      0.17   0.193575    0.17  0.087747   59.506135
1  RoBERTa      0.21   0.086684    0.21  0.119684   63.319368
2    SBERT      0.24   0.243208    0.24  0.207631    8.992744


  _warn_prf(average, modifier, msg_start, len(result))


### Test with 100 data points and 9 models, just for a quick overview

In [8]:
import time
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the dataset from CSV
dataset = pd.read_csv("customer_support_tickets.csv")

# Select a reproducible subset of 100 samples
subset = dataset.sample(n=100, random_state=42)

# Define the models for comparison (display name -> checkpoint id)
models = {
    "BERT": "bert-base-uncased",
    "RoBERTa": "roberta-base",
    "SBERT": "sentence-transformers/paraphrase-MiniLM-L6-v2",
    "XLNet": "xlnet-base-cased",
    "DistilBERT": "distilbert-base-uncased",
    "ALBERT": "albert-base-v2",
    "Electra": "google/electra-base-discriminator",
    "GPT-2": "gpt2",
    "T5": "t5-base",
}

# The inputs are the same for every model, so extract them once.
texts = subset["Ticket Description"].tolist()
true_labels = subset["Ticket Type"].tolist()

# Candidate labels (intents) - assuming unique ticket types are the labels
candidate_labels = subset["Ticket Type"].unique().tolist()

# One record per model. DataFrame.append was removed in pandas 2.0, so the
# rows are collected in a list and the frame is built once after the loop.
metric_columns = ["Model", "Accuracy", "Precision", "Recall", "F1-score", "Time taken"]
metric_rows = []

# Iterate over models
for model_name, model in models.items():
    # No training happens here; the message text is kept as originally written.
    print(f"Training and evaluating {model_name}...")

    # Initialize zero-shot classification pipeline (not included in the timing)
    classifier = pipeline("zero-shot-classification", model=model)

    # Time only the classification call, using a monotonic clock
    start_time = time.perf_counter()
    predictions = classifier(texts, candidate_labels=candidate_labels, multi_label=False)
    duration = time.perf_counter() - start_time

    # The first entry of each result's "labels" list is taken as the prediction
    predicted_labels = [prediction["labels"][0] for prediction in predictions]

    # Weighted metrics. zero_division=0 yields the same value as the default
    # for labels that are never predicted, but without emitting a warning.
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)

    metric_rows.append({
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
        "Time taken": duration
    })

# Build the metrics DataFrame in one step and print it
metrics_df = pd.DataFrame(metric_rows, columns=metric_columns)
print(metrics_df)


Training and evaluating BERT...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
  _warn_prf(average, modifier, msg_start, len(result))


Training and evaluating RoBERTa...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
  _warn_prf(average, modifier, msg_start, len(result))


Training and evaluating SBERT...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
  _warn_prf(average, modifier, msg_start, len(result))


Training and evaluating XLNet...


config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.bias', 'logits_proj.weight', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Training and evaluating DistilBERT...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
  _warn_prf(average, modifier, msg_start, len(result))


Training and evaluating ALBERT...


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


Training and evaluating Electra...


config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
  _warn_prf(average, modifier, msg_start, len(result))


Training and evaluating GPT-2...


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Tokenizer was not supporting padding necessary for zero-shot, attempting to use  `pad_token=eos_token`
  _warn_prf(average, modifier, msg_start, len(result))


Training and evaluating T5...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.


        Model  Accuracy  Precision  Recall  F1-score  Time taken
0        BERT      0.20   0.233477    0.20  0.134553   47.856331
1     RoBERTa      0.25   0.105000    0.25  0.130286   49.809778
2       SBERT      0.14   0.050000    0.14  0.063846   10.026513
3       XLNet      0.21   0.116282    0.21  0.127258   83.101644
4  DistilBERT      0.21   0.087683    0.21  0.114476   17.317916
5      ALBERT      0.18   0.178392    0.18  0.170265   30.024693
6     Electra      0.21   0.083936    0.21  0.094188   25.235507
7       GPT-2      0.14   0.136278    0.14  0.090994   26.970650
8          T5      0.20   0.387683    0.20  0.118937  118.375080


### Test with the full dataset (~8,000 data points) and 9 models, for a complete view

In [1]:
import time
import pandas as pd
from transformers import pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from concurrent.futures import ThreadPoolExecutor


import warnings

# Ignore warnings whose message starts with "Length of IterableDataset".
# No category is passed, so the filter is not limited to UserWarning.
warnings.filterwarnings(action="ignore", message="Length of IterableDataset")


# Define the function to evaluate a single model
def evaluate_model(model_name, model, subset):
    """Run zero-shot classification with one checkpoint and score the result.

    Args:
        model_name: Display name used in the log output and in the returned
            record.
        model: Model identifier passed to ``pipeline(..., model=model)``.
        subset: DataFrame with a "Ticket Description" column (texts to
            classify) and a "Ticket Type" column (ground-truth labels; its
            unique values are used as the candidate labels).

    Returns:
        dict with the keys "Model", "Accuracy", "Precision", "Recall",
        "F1-score" (precision/recall/F1 are weighted averages) and
        "Time taken" (seconds spent in the classification call only;
        building the pipeline is not included).
    """
    print(f"Evaluating {model_name}...\n")

    # Initialize zero-shot classification pipeline (outside the timed section)
    classifier = pipeline("zero-shot-classification", model=model)

    # Extract ticket descriptions and true labels from the subset
    texts = subset["Ticket Description"].tolist()
    true_labels = subset["Ticket Type"].tolist()

    # Candidate labels (intents) - assuming unique ticket types are the labels
    candidate_labels = subset["Ticket Type"].unique().tolist()

    # Time only the classification call, using a monotonic clock
    start_time = time.perf_counter()
    predictions = classifier(texts, candidate_labels=candidate_labels, multi_label=False)
    duration = time.perf_counter() - start_time

    # The first entry of each result's "labels" list is taken as the prediction
    predicted_labels = [prediction["labels"][0] for prediction in predictions]

    # Weighted metrics. zero_division=0 yields the same value as the default
    # for labels that are never predicted, but without emitting a warning.
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    recall = recall_score(true_labels, predicted_labels, average="weighted", zero_division=0)
    f1 = f1_score(true_labels, predicted_labels, average="weighted", zero_division=0)

    # This function is submitted to a thread pool by the caller, so the whole
    # report goes out in a single print call; that makes it less likely that
    # lines from different models end up interleaved.
    report = "\n".join([
        f"\nMetrics for {model_name}:",
        f"Accuracy: {accuracy}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        f"F1-score: {f1}",
        f"Time taken: {duration} seconds",
    ])
    print(report)

    # Return evaluation metrics
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-score": f1,
        "Time taken": duration
    }

# Read the complete ticket dataset; no sampling is applied in this cell, the
# name `subset` is only kept because evaluate_model's parameter is called that.
subset = pd.read_csv("customer_support_tickets.csv")

# Checkpoints to compare, keyed by the display name used in the report.
models = dict([
    ("BERT", "bert-base-uncased"),
    ("RoBERTa", "roberta-base"),
    ("SBERT", "sentence-transformers/paraphrase-MiniLM-L6-v2"),
    ("XLNet", "xlnet-base-cased"),
    ("DistilBERT", "distilbert-base-uncased"),
    ("ALBERT", "albert-base-v2"),
    ("Electra", "google/electra-base-discriminator"),
    ("GPT-2", "gpt2"),
    ("T5", "t5-base"),
])

# Metric records, one per model, in the order the models are listed above.
results = []

# NOTE(review): every evaluation is submitted up front, so the models may run
# concurrently and each "Time taken" is measured while the others are active;
# confirm whether the timings are meant to be compared across models.
with ThreadPoolExecutor() as executor:
    futures = {}
    for model_name, model in models.items():
        futures[model_name] = executor.submit(evaluate_model, model_name, model, subset)

    # Wait for each future in submission order (not completion order).
    for future in futures.values():
        results.append(future.result())

# Assemble the per-model records into a single table.
metrics_df = pd.DataFrame(results)

# Show the table under its heading.
print("\nFinal Metrics DataFrame:", metrics_df, sep="\n")


2024-03-24 13:26:08.450114: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Evaluating BERT...

Evaluating RoBERTa...
Evaluating SBERT...

Evaluating XLNet...


Evaluating DistilBERT...

Evaluating ALBERT...

Evaluating Electra...

Evaluating GPT-2...
Evaluating T5...




Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sentence-transformers/paraphrase-MiniLM-L6-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
Some weig


Metrics for SBERT:
Accuracy: 0.18845200141693233
Precision: 0.23351400361525787
Recall: 0.18845200141693233
F1-score: 0.11970813721613355
Time taken: 6498.826976060867 seconds

Metrics for DistilBERT:
Accuracy: 0.206163655685441
Precision: 0.2661173578462602
Recall: 0.206163655685441
F1-score: 0.08052266740891438
Time taken: 9972.261229991913 seconds

Metrics for Electra:
Accuracy: 0.2038021017829732
Precision: 0.2074110853064702
Recall: 0.2038021017829732
F1-score: 0.1556103342464754
Time taken: 17547.84075808525 seconds

Metrics for BERT:
Accuracy: 0.1970716731609399
Precision: 0.16653531401767593
Recall: 0.1970716731609399
F1-score: 0.1511419893092956
Time taken: 17538.676689863205 seconds


  _warn_prf(average, modifier, msg_start, len(result))



Metrics for RoBERTa:
Accuracy: 0.19553666312433582
Precision: 0.10958946076281297
Recall: 0.19553666312433582
F1-score: 0.1209602798150657
Time taken: 17611.67304778099 seconds


  _warn_prf(average, modifier, msg_start, len(result))



Metrics for GPT-2:
Accuracy: 0.1996693824536545
Precision: 0.1635266707660979
Recall: 0.1996693824536545
F1-score: 0.1453089680337354
Time taken: 18830.012189149857 seconds

Metrics for ALBERT:
Accuracy: 0.20663596646593457
Precision: 0.20044928233279574
Recall: 0.20663596646593457
F1-score: 0.1761370229096668
Time taken: 19803.98435997963 seconds

Metrics for XLNet:
Accuracy: 0.19813437241705043
Precision: 0.16141832203625053
Recall: 0.19813437241705043
F1-score: 0.1526886537036815
Time taken: 20270.039855241776 seconds

Metrics for T5:
Accuracy: 0.20002361553902467
Precision: 0.22927261620020847
Recall: 0.20002361553902467
F1-score: 0.15541347461183408
Time taken: 22553.640382766724 seconds

Final Metrics DataFrame:
        Model  Accuracy  Precision    Recall  F1-score    Time taken
0        BERT  0.197072   0.166535  0.197072  0.151142  17538.676690
1     RoBERTa  0.195537   0.109589  0.195537  0.120960  17611.673048
2       SBERT  0.188452   0.233514  0.188452  0.119708   6498.82