## Data Loading

In [159]:
import logging

# Create a logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
console_handler = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
console_handler.setFormatter(formatter)
logger.addHandler(console_handler)

# Log some messages

logger.info('This is an info message')



2024-02-27 19:26:32,787 - __main__ - INFO - This is an info message
2024-02-27 19:26:32,787 - __main__ - INFO - This is an info message
2024-02-27 19:26:32,787 - __main__ - INFO - This is an info message
2024-02-27 19:26:32,787 - __main__ - INFO - This is an info message
2024-02-27 19:26:32,787 - __main__ - INFO - This is an info message
2024-02-27 19:26:32,787 - __main__ - INFO - This is an info message


02/27/2024 19:26:32 - INFO - __main__ - This is an info message


In [160]:
# !pip install tensorboard

In [1]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
import torch
import warnings
from torch.utils.data import DataLoader
import gc
import numpy as np
# Filter out all warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

from loguru import logger

## Data Loading and Analysis

In [2]:
km_data = pd.read_csv("/Users/603642/Downloads/crossencoder_train_new.csv")

In [163]:
# km_data['final'] = km_data['question_x']+','+ km_data['question_y']

In [3]:
km_data.columns = ["sentence1","sentence2","labels"]

In [165]:
# master_vrb = pd.read_csv("vrb_master.csv")

In [166]:
# master_vrb.columns

In [167]:
# master_vrb[["response_id","_id_x","_id_y"]]

In [168]:
# id_question = pd.read_csv("id_questions.csv")

In [169]:
# id_question

In [170]:
# master_vrb["response"]

In [4]:
km_data.head()

Unnamed: 0,sentence1,sentence2,labels
0,what is the unaccompanied minor fee,how much is a UMNR fee from jfk to acc,1.0
1,what is the unaccompanied minor fee,how much is a UMNR fee from jfk to acc,1.0
2,what is the unaccompanied minor fee,what is the UMNR fee,1.0
3,what is the unaccompanied minor fee,what is the term entry to unhook a ticket,0.0
4,what is the unaccompanied minor fee,how to unhook a tkt in term,0.0


In [5]:
km_data.labels.value_counts()

labels
0.0    1287154
1.0       6202
Name: count, dtype: int64

In [6]:
km_data.isna().sum()

sentence1    0
sentence2    0
labels       0
dtype: int64

In [174]:
# km_data.columns = ["sentence1","sentence2","labels"]

In [7]:
km_data[km_data.labels==0.0]

Unnamed: 0,sentence1,sentence2,labels
3,what is the unaccompanied minor fee,what is the term entry to unhook a ticket,0.0
4,what is the unaccompanied minor fee,how to unhook a tkt in term,0.0
5,what is the unaccompanied minor fee,entry to unhook ticket in term,0.0
6,what is the unaccompanied minor fee,how to unhook a ticket in term,0.0
7,what is the unaccompanied minor fee,how do you unhook a ticket in DL TERM,0.0
...,...,...,...
1293349,award travel on kenya airways,ecredit use on other person,0.0
1293350,award travel on kenya airways,Can ecredits be transferred to other passenger...,0.0
1293351,award travel on kenya airways,ecredit use on other person,0.0
1293352,award travel on kenya airways,latam carry on,0.0


## Balance the Data

In [8]:
negative_df = km_data[km_data.labels==0.0]
positive_df = km_data[km_data.labels==1.0]

In [9]:
positive_df.shape

(6202, 3)

In [10]:
balance_km = pd.concat([negative_df.iloc[:6202],positive_df])

In [11]:
balance_km.labels.value_counts()

labels
0.0    6202
1.0    6202
Name: count, dtype: int64

In [12]:
balance_km.columns

Index(['sentence1', 'sentence2', 'labels'], dtype='object')

In [181]:
# required_df = balance_km[["pair","labels"]]

## Train,Test,Validation Data Split

In [182]:
from sklearn.model_selection import train_test_split

In [183]:
train,test = train_test_split(balance_km,test_size=0.2,random_state=42)
test,validation = train_test_split(test,test_size=0.5,random_state=42)

In [184]:
print(train.shape)
print(test.shape)
print(validation.shape)

(9923, 3)
(1240, 3)
(1241, 3)


In [185]:
train = train[:2000]
test = test[:400]
validation = validation[:400]

## Convert Data to HuggingFace Dataset

In [186]:
from datasets import Dataset

In [187]:
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)
val_dataset = Dataset.from_pandas(validation)

In [188]:
validation

Unnamed: 0,sentence1,sentence2,labels
722316,GUC and RUC,How to reapply reg certs?,1.0
5906,how to unhook a tkt in term,what is the nearest airport to jfk,0.0
3108,how much is a UMNR fee from jfk to acc,What is wheelchair,0.0
686,what is the unaccompanied minor fee,what are transportation rules for human organs,0.0
4420,what is the UMNR fee,what are the rules of TakeOff 15,0.0
...,...,...,...
5631,what is the term entry to unhook a ticket,HOW TO DOCUMENT MISSING MILES,0.0
287779,who can go through sky priority,how do I get sky priority,1.0
1010119,can bassinets BE reserved on delta flight,can bassinets BE reserved on delta flighta,1.0
378767,does delta accept pets in cabin,bring dog on plane,1.0


In [189]:
from datasets import Dataset, DatasetDict

# Replace this with your actual data loading code
raw_datasets = DatasetDict({
    'train': Dataset.from_pandas(train),
    'validation': Dataset.from_pandas(validation),
    # 'test':Dataset.from_pandas(test_dataset)
    
})

# # Define your preprocess function
# def preprocess_function(examples):
#     return tokenizer(examples['text1'], examples['text2'], truncation=True)

# # Apply the preprocess function
# tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [190]:
# dataset = load_dataset( {'train': train_datset, 
#                                           'validation': val_datset,
#                                           'test': test_datset})

## Load Pre-trained Model

In [191]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)  # Use num_labels=1 for regression tasks like STS-B

In [192]:
def encode(examples):
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True, padding='max_length',max_length=128)

# def encode(examples):
#     return tokenizer(examples['pair'], truncation=True, padding='max_length',max_length=128)
# encoded_dataset = dataset.map(encode, batched=True)

In [193]:
train_encoded = train_dataset.map(encode, batched=True)
val_encoded = val_dataset.map(encode, batched=True)
test_encoded = test_dataset.map(encode, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/400 [00:00<?, ? examples/s]

In [194]:
# train_encoded.data

In [195]:
# import os
# total_batch_size = 84
# num_cpu_cores = os.cpu_count()
# num_cpu_cores
# per_device_train_batch_size = total_batch_size // num_cpu_cores

In [196]:
# per_device_train_batch_size

## Model Finetuning : Training arguments

In [197]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir='./crossencoder-marco-MiniLM-L-6-v2-refine',          # Output directory for the model checkpoints
    num_train_epochs=3,              # Number of training epochs
    per_device_train_batch_size=16,   # Batch size for training
    per_device_eval_batch_size=16,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Weight decay if we apply some
    logging_dir='./logs',            # Directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # Evaluate after each epoch
    lr_scheduler_type='cosine',
    learning_rate=5e-5
    
    
)

## Metrics

In [198]:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     accuracy = np.mean(predictions == labels)
#     return {"accuracy": accuracy}


# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
    
#     # Calculate metrics
#     accuracy = accuracy_score(labels, predictions)
#     precision = precision_score(labels, predictions, average='binary')  # Use 'micro', 'macro', or 'weighted' for multiclass
#     recall = recall_score(labels, predictions, average='binary')  # Use 'micro', 'macro', or 'weighted' for multiclass
    
#     return {
#         "accuracy": accuracy,
#         "precision": precision,
#         "recall": recall
#     }


def compute_metrics(eval_pred, threshold=0.75):
    logits, labels = eval_pred
    # Convert logits to class predictions based on the threshold
    predictions = (logits >= threshold).astype(int)
    # logger.info(logits)
    # Calculate metrics
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions)
    recall = recall_score(labels, predictions)
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }


# logits = torch.tensor([[0.1, 1.2], [2.2, 3.1], [4.9, 5.2]])
# predictions = torch.argmax(logits,dim=-1)
# predictions

In [199]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=encoded_dataset['train'],
#     eval_dataset=encoded_dataset['validation'],
#     tokenizer=tokenizer
# )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded,
    tokenizer=tokenizer,
    compute_metrics = compute_metrics
)


In [200]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall
1,0.7375,0.428392,0.8625,0.978417,0.723404
2,0.1311,0.096566,0.99,1.0,0.978723
3,0.0661,0.020387,0.985,1.0,0.968085


TrainOutput(global_step=375, training_loss=16.94432011508942, metrics={'train_runtime': 97.3382, 'train_samples_per_second': 61.641, 'train_steps_per_second': 3.853, 'total_flos': 49746940416000.0, 'train_loss': 16.94432011508942, 'epoch': 3.0})

In [201]:
# trainer.evaluate(encoded_dataset['test'])
trainer.evaluate(test_encoded)

{'eval_loss': 0.016239233314990997,
 'eval_accuracy': 0.995,
 'eval_precision': 1.0,
 'eval_recall': 0.9901960784313726,
 'eval_runtime': 0.8736,
 'eval_samples_per_second': 457.894,
 'eval_steps_per_second': 28.618,
 'epoch': 3.0}

## Save the model to Hub & Local

In [34]:
model.save_pretrained('./MiniLM-L-6-v2-refine')
tokenizer.save_pretrained('./MiniLM-L-6-v2-refine')

('./MiniLM-L-6-v2-refine/tokenizer_config.json',
 './MiniLM-L-6-v2-refine/special_tokens_map.json',
 './MiniLM-L-6-v2-refine/vocab.txt',
 './MiniLM-L-6-v2-refine/added_tokens.json',
 './MiniLM-L-6-v2-refine/tokenizer.json')

In [102]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.66k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/srmishra/crossencoder-marco-MiniLM-L-6-v2-refine/commit/e4f0add2f5def32c7d4047423553ca9bb4991b7a', commit_message='End of training', commit_description='', oid='e4f0add2f5def32c7d4047423553ca9bb4991b7a', pr_url=None, pr_revision=None, pr_num=None)

## Tensorboard Training Graphs

In [202]:
!tensorboard --logdir=./logs

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.16.2 at http://localhost:6006/ (Press CTRL+C to quit)
^C


## Load the Finetunes Model IF Instance is not available

In [166]:
tokenizer = AutoTokenizer.from_pretrained('srmishra/crossencoder-airline-refine')
model = AutoModelForSequenceClassification.from_pretrained('srmishra/crossencoder-airline-refine')

In [103]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("pjbhaumik/crossencoder-km1")
model = AutoModelForSequenceClassification.from_pretrained("pjbhaumik/crossencoder-km1")

In [140]:
def encode_sentences(sentence1, sentence2):
    return tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, padding='max_length')

# # Example pairs of sentences
# sentence_pairs = [
#     ("dog a pet animal.", "cat is pet animal."),
#     ("We can take pet in main cabin.", "Pet in main cabin requires vaccine certificate")
# ]

# encoded_inputs = [encode_sentences(pair[0], pair[1]) for pair in sentence_pairs]

# def encode_sentences(examples):
#     return tokenizer(examples['pair'], truncation=True, padding='max_length',max_length=128)

In [None]:
# # Example pairs of sentences
# sentence_pairs = [
#     ("dog a pet animal.", "cat is pet animal."),
#     ("We can take pet in main cabin.", "Pet in main cabin requires vaccine certificate")
# ]

# encoded_inputs = [encode_sentences(pair[0], pair[1]) for pair in sentence_pairs]

In [157]:
test_df = test_dataset.to_pandas()[["sentence1","sentence2"]]

In [142]:
test_df

Unnamed: 0,sentence1,sentence2
0,miles cacelation of basic economy,basic economy cancel and redeposit
1,how much is a UMNR fee from jfk to acc,is the customer care line open
2,which seats occupy bassinets,does my flight from ATL to LHR have a bassinet
3,how much is a UMNR fee from jfk to acc,how long does it take to reopen a global upgra...
4,what is the unaccompanied minor fee,SYMILES ACCOUNT UNDER AUDIT
...,...,...
395,how much is a UMNR fee from jfk to acc,can i travel internationally without a passport
396,what is the UMNR fee,changing flights rules day of
397,how to earn status,GOLD MEDALLION QUALIFICATIONS FOR 2024
398,what is the minimum connection time for an int...,what is the minimum connection time in msp


In [158]:
sentence_pairs = list(zip(test_df["sentence1"],test_df["sentence2"]))
encoded_inputs = [encode_sentences(pair[0], pair[1]) for pair in sentence_pairs]

In [74]:
tokenizer("miles cacelation of basic economy")

{'input_ids': [101, 2661, 6187, 29109, 3370, 1997, 3937, 4610, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [77]:
encoded_inputs[0].input_ids.shape

torch.Size([1, 512])

## Predictions

In [67]:
# torch.sigmoid([1.05])

In [82]:
device

device(type='cpu')

In [48]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, e

In [132]:
# test_loader = DataLoader(encoded_inputs, batch_size=8)

In [137]:
# encoded_inputs

In [None]:
# # Put the model in evaluation mode
# model.cuda()
# model.eval()
# torch.no_grad()
# # Predictions list
# predictions = []

# for i, batch in enumerate(batches):
#     encoded_inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding='max_length').to("cuda")
#     outputs = model(**encoded_inputs)
#     scores = [score.item() for score in outputs.logits.squeeze()]
#     predictions += scores
#     torch.cuda.empty_cache()

# len(predictions) == len(test_pairs)

In [36]:
# model.eval()
# with torch.no_grad():
#     for batch in test_loader:
#         print(batch)
#         # Move batch to device
#         batch = {k: v.to(device) for k, v in batch.items()}
        
#         # Forward pass
#         outputs = model(**batch)
#         logits = outputs.logits
#         print(logits)
        

## Predict Individual Records

In [144]:
def get_crossencoder_output(tokenizer,model,encoded_inputs):
    # Put the model in evaluation mode
    model.eval()
    
    # Predictions list
    predictions = []
    
    # Disable gradient calculation for inference
    # encoded_inputs = encoded_inputs[:10]
    with torch.no_grad():
        for encoded_input in tqdm(encoded_inputs):
            # Move the tensors to the same device as the model
            encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
            
            # Get the model outputs
            outputs = model(**encoded_input)
            # print(f"RAW OUTPUT: {outputs}")
            # The outputs are logits, get the score by applying the appropriate activation function
            # For regression tasks (like STS-B), you can directly use the output value as the score
            # For classification tasks, apply a softmax function to get probabilities
            score = outputs.logits.squeeze().item()
            # This will give sigmoid output like Sentence-Transformer
            # score = torch.sigmoid(outputs.logits)[:, 0]
            predictions.append(score)
    
    # Now `predictions` holds the similarity scores for each pair of sentences
    # print(predictions)
    return predictions


from transformers import AutoTokenizer, AutoModelForSequenceClassification

# #Load from Local
# tokenizer_local = AutoTokenizer.from_pretrained('./crossencoder_km_tinybert-model')
# model_local = AutoModelForSequenceClassification.from_pretrained('./crossencoder_km_tinybert-model')

# # Load from Hub
# tokenizer_hub = AutoTokenizer.from_pretrained("srmishra/crossencoder-tynybert-km1")
# model_hub = AutoModelForSequenceClassification.from_pretrained("srmishra/crossencoder-tynybert-km1")

# def encode_sentences(sentence1, sentence2,tokenizer):
#     return tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, padding='max_length')


# sentence_pairs = [
#     ("What is the PETC eligibility", "can i bring a kitten on my flight from ATL to SLC"),
#     ("what is the UMNR fee", "how many pets can i have")
# ]

# encoded_inputs_local = [encode_sentences(pair[0], pair[1],tokenizer_local) for pair in sentence_pairs]
# encoded_inputs_hub = [encode_sentences(pair[0], pair[1],tokenizer_hub) for pair in sentence_pairs]



## Prediction through Batches

In [45]:
test_pairs = test_dataset.to_pandas()[["sentence1", "sentence2"]].apply(lambda x: (x["sentence1"], x["sentence1"]), axis = 1).to_list()

torch.cuda.empty_cache()

gc.collect()

# del test_encoded

gc.collect()



0

In [46]:
batch_size = 10
num_batches = (len(test_pairs) // batch_size)
if len(test_pairs) % batch_size:
    num_batches += 1

num_batches

40

In [47]:
batches = [test_pairs[x*batch_size:(x+1)*batch_size] for x in range(num_batches)]

In [55]:
# Put the model in evaluation mode
# model.cuda()
model.eval()
torch.no_grad()
# Predictions list
predictions = []

for i, batch in enumerate(tqdm(batches)):
    encoded_inputs = tokenizer(batch, return_tensors='pt', truncation=True, padding='max_length').to(device)
    outputs = model(**encoded_inputs)
    # print(outputs)
    if outputs.logits.squeeze().shape:
        # print(outputs.logits.squeeze().shape)
        predictions += [score.item() for score in outputs.logits.squeeze()]
        # print(predictions)
    else:
        predictions += [outputs.logits.squeeze().item()]
    
    del encoded_inputs, outputs
    # torch.cuda.empty_cache()
        
    gc.collect()
  
   
len(predictions) == len(test_pairs)

  0%|                                                    | 0/40 [00:00<?, ?it/s]


False

In [171]:
predicted_df = test_dataset.to_pandas()[["sentence1","sentence2","labels"]]
predicted_df["similar_score"] = predictions

In [172]:
predicted_df['similar_score'].describe()

count    400.000000
mean       3.545358
std        0.048562
min        3.291201
25%        3.523601
50%        3.560787
75%        3.568765
max        3.653357
Name: similar_score, dtype: float64

## Measure the Performance : Precision, Recall, F1 Score, Accuracy

In [179]:
predicted_df['binarized'] = predicted_df['similar_score'].apply(lambda x: 1.0 if x > 3.5 else 0)
base_precision = precision_score(predicted_df['labels'], predicted_df['binarized'])
base_recall = recall_score(predicted_df['labels'], predicted_df['binarized'])
base_accuracy = accuracy_score(predicted_df['labels'], predicted_df['binarized'])
base_f1 = f1_score(predicted_df['labels'], predicted_df['binarized'])
print(f'baseline precision: {base_precision:.4f} \nbaseline recall: {base_recall:.4f} \nbaseline accuracy: {base_accuracy:.4f} \nbaseline f1: {base_f1:.4f}')

baseline precision: 0.4479 
baseline recall: 0.7794 
baseline accuracy: 0.3975 
baseline f1: 0.5689


In [None]:
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Suppose we have the following two questions
question1 = "How do I cook a perfect boiled egg?"
question2 = "What's the best way to boil an egg?"

# Convert questions into embeddings
q1_vector = model.encode(question1)
q2_vector = model.encode(question2)

# Create a FAISS index (for this example, we'll use a flat L2 index)
dimension = q1_vector.shape[0]  # Dimension of the vectors
index = faiss.IndexFlatL2(dimension)

# Since we are only comparing two questions, we'll add both to the index
index.add(np.array([q1_vector, q2_vector]))

# To find the similarity, search the index
# Here, k is the number of nearest neighbors to find; k=2 in this case
# because we are only comparing two vectors
k = 2
D, I = index.search(np.array([q1_vector]), k)

# D contains the distances, and I contains the indices of the nearest vectors
print(f"Distances: {D.flatten()}")
print(f"Indices: {I.flatten()}")

# The distance of q1_vector to itself will be 0, the distance to q2_vector will indicate their similarity

In [87]:
# get_crossencoder_output(tokenizer_local,model_local,encoded_inputs_local)

100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 12.05it/s]


[1.052358865737915, -0.009607994928956032]

In [88]:
# get_crossencoder_output(tokenizer_hub,model_hub,encoded_inputs_hub)

100%|█████████████████████████████████████████████| 2/2 [00:00<00:00, 12.76it/s]


[1.052358865737915, -0.009607994928956032]

In [47]:
predictions = get_crossencoder_output(tokenizer,model,encoded_inputs)

100%|█████████████████████████████████████████| 400/400 [00:51<00:00,  7.72it/s]


In [56]:
predicted_df = test_dataset.to_pandas()[["sentence1","sentence2","labels"]]

In [57]:
predicted_df["predicted_score"] = predictions

In [58]:
predicted_df

Unnamed: 0,sentence1,sentence2,labels,predicted_score
0,miles cacelation of basic economy,basic economy cancel and redeposit,1.0,1.104069
1,how much is a UMNR fee from jfk to acc,is the customer care line open,0.0,0.021331
2,which seats occupy bassinets,does my flight from ATL to LHR have a bassinet,1.0,1.062014
3,how much is a UMNR fee from jfk to acc,how long does it take to reopen a global upgra...,0.0,-0.063898
4,what is the unaccompanied minor fee,SYMILES ACCOUNT UNDER AUDIT,0.0,-0.102140
...,...,...,...,...
395,how much is a UMNR fee from jfk to acc,can i travel internationally without a passport,0.0,-0.158105
396,what is the UMNR fee,changing flights rules day of,0.0,-0.231820
397,how to earn status,GOLD MEDALLION QUALIFICATIONS FOR 2024,1.0,0.957206
398,what is the minimum connection time for an int...,what is the minimum connection time in msp,1.0,1.362267


In [59]:
predicted_df['predicted_score'].describe()

count    400.000000
mean       0.520870
std        0.612219
min       -0.384372
25%       -0.052355
50%        0.555799
75%        1.084717
max        2.355689
Name: predicted_score, dtype: float64

In [64]:
predicted_df['binarized'] = predicted_df['predicted_score'].apply(lambda x: 1.0 if x > 0.56 else 0)
base_precision = precision_score(predicted_df['labels'], predicted_df['binarized'])
base_recall = recall_score(predicted_df['labels'], predicted_df['binarized'])
base_accuracy = accuracy_score(predicted_df['labels'], predicted_df['binarized'])
base_f1 = f1_score(predicted_df['labels'], predicted_df['binarized'])
print(f'baseline precision: {base_precision:.4f} \nbaseline recall: {base_recall:.4f} \nbaseline accuracy: {base_accuracy:.4f} \nbaseline f1: {base_f1:.4f}')

baseline precision: 1.0000 
baseline recall: 0.9804 
baseline accuracy: 0.9900 
baseline f1: 0.9901


In [56]:
# from transformers import AutoTokenizer, AutoModelForSequenceClassification

# tokenizer = AutoTokenizer.from_pretrained("srmishra/crossencoder-tynybert-km1")
# model = AutoModelForSequenceClassification.from_pretrained("srmishra/crossencoder-tynybert-km1")

In [61]:
# Put the model in evaluation mode
model.eval()

# Predictions list
predictions = []

# Disable gradient calculation for inference
# encoded_inputs = encoded_inputs[:10]
with torch.no_grad():
    for encoded_input in tqdm(encoded_inputs):
        # Move the tensors to the same device as the model
        encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
        
        # Get the model outputs
        outputs = model(**encoded_input)
        print(outputs)
        # The outputs are logits, get the score by applying the appropriate activation function
        # For regression tasks (like STS-B), you can directly use the output value as the score
        # For classification tasks, apply a softmax function to get probabilities
        score = outputs.logits.squeeze().item()
        
        # print(score)
        predictions.append(score)

# Now `predictions` holds the similarity scores for each pair of sentences
print(predictions)

AttributeError: 'CrossEncoder' object has no attribute 'eval'

In [71]:
len(predictions)

1485

In [72]:
predicted_df = test_dataset.to_pandas()[["sentence1","sentence2","labels"]]

In [73]:
predicted_df["similar_score"] = predictions

In [74]:
predicted_df.head(40)

Unnamed: 0,sentence1,sentence2,labels,similar_score
0,how to unhook a ticket in term,Can anyone unboard this passenger so I can cha...,0.0,-0.035986
1,how to unhook a ticket in term,What are the closest airports to SFO,0.0,-0.033835
2,do you need an e visa to transit through Brazil,Korea travel requirements,1.0,1.034878
3,what time does msp airport open,what are the hours of the ticket counter at Si...,1.0,1.052111
4,is visa required for connecting flights,Korea travel requirements,1.0,1.062162
5,can i assign seats on air france,can I assign seats on Air France,1.0,1.055483
6,what is the unaccompanied minor fee,does recognize PM Platinum Medallion status fo...,0.0,-0.048264
7,entry to unhook ticket in term,what is the age limit for first class?,0.0,0.001806
8,WHAT TIME DOES MIDWAY OPEN,what time does lagos airport open,1.0,1.032597
9,which terminal does use in lga,airport phone number for Atlanta,1.0,1.037165


In [36]:
predicted_df.iloc[23]

sentence1                      will pax need transit visa to dehli
sentence2        do you have to book a return flight if your vi...
labels                                                         1.0
similar_score                                             1.072025
Name: 23, dtype: object

## Calculate Model Performance

In [75]:
predicted_df['binarized'] = predicted_df['similar_score'].apply(lambda x: 1.0 if x > 0.5 else 0)

In [76]:
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [77]:
precision_score(predicted_df['labels'], predicted_df['binarized'])

1.0

In [78]:
recall_score(predicted_df['labels'], predicted_df['binarized'])

0.9986824769433466

In [79]:
accuracy_score(predicted_df['labels'], predicted_df['binarized'])

0.9993265993265993

In [80]:
f1_score(predicted_df['labels'], predicted_df['binarized'])

0.999340804218853

## Cross Encoder with Sentence-transformer

In [47]:
# from sentence_transformers import CrossEncoder

# # Initialize the cross-encoder model
# cross_encoder = CrossEncoder('cross-encoder/stsb-roberta-large')

# # Step 1: Retrieve candidate documents using Elasticsearch or another IR system
# candidate_documents = retrieve_candidates(query)

# # Step 2: Re-rank candidates using the cross encoder
# pair_list = [(query, doc) for doc in candidate_documents]
# scores = cross_encoder.predict(pair_list)

# # Sort the documents by their score in descending order
# ranked_documents = sorted(zip(candidate_documents, scores), key=lambda x: x[1], reverse=True)

# # Return the top N results
# top_results = ranked_documents[:N]

Up-sampling the Minority Class: This involves randomly duplicating examples in the minority class (label 1 in your case) to achieve a balance between the two classes. This can be done manually or using libraries like imblearn in Python.
*****************
```python
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X, y)

```
****************

Where X is your feature matrix and y is your array of labels.

Down-sampling the Majority Class: This method involves randomly removing examples from the majority class (label 0) to balance the dataset. Again, this can be done manually or using imblearn.

****************
```python
from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rus.fit_resample(X, y)

```
****************


Synthetic Data Generation: Techniques such as SMOTE (Synthetic Minority Over-sampling Technique) can generate synthetic examples for the minority class.

****************
```python
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

```
****************

Combining Over and Under Sampling: You can combine both over-sampling the minority class and under-sampling the majority class to achieve a balance.

****************
```python
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=42)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)

```
****************

Adjusting Class Weights: Some algorithms allow you to adjust the weights of classes to handle imbalanced data. For instance, many implementations of classifiers in scikit-learn have a class_weight parameter.

****************
```python
    
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(class_weight='balanced')
clf.fit(X, y)

```

****************

Custom Loss Functions: If you are using deep learning, you can design custom loss functions that penalize the misclassification of the minority class more than the majority class.

## Calculate Performance

In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, accuracy_score


# Example DataFrame
# data = {'sentence': ['This is sentence 1', 'Another sentence', 'Yet another sentence'],
#         'label': [1, 0, 1],
#         'predicted_label': [1, 1, 0]}
df = pd.DataFrame(data)

# Calculate precision, recall, and accuracy
precision = precision_score(df['label'], df['predicted_label'])
recall = recall_score(df['label'], df['predicted_label'])
accuracy = accuracy_score(df['label'], df['predicted_label'])

print("Precision:", precision)
print("Recall:", recall)
print("Accuracy:", accuracy)


## Comparion of model After finetuning vs Loading same version from Huggingface hub

In [128]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("pjbhaumik/crossencoder-km1")
model = AutoModelForSequenceClassification.from_pretrained("pjbhaumik/crossencoder-km1")

In [134]:
tokenizer = AutoTokenizer.from_pretrained('./crossencoder_km_mini2')
model = AutoModelForSequenceClassification.from_pretrained('./crossencoder_km_mini2')

In [146]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("./joecrossencoder/crossencoder_model_v4")
model = AutoModelForSequenceClassification.from_pretrained("./joecrossencoder/crossencoder_model_v4")

In [147]:
from tqdm import tqdm
import torch

In [148]:
def encode_sentences(sentence1, sentence2):
    return tokenizer(sentence1, sentence2, return_tensors='pt', truncation=True, padding='max_length')


sentence_pairs = [
    ('what is the UMNR fee', 'how many pets can i have')
    
]

encoded_inputs = [encode_sentences(pair[0], pair[1]) for pair in sentence_pairs]

In [149]:
model.eval()

# Predictions list
predictions = []

# Disable gradient calculation for inference
# encoded_inputs = encoded_inputs[:10]
with torch.no_grad():
    for encoded_input in tqdm(encoded_inputs):
        # Move the tensors to the same device as the model
        encoded_input = {k: v.to(model.device) for k, v in encoded_input.items()}
        
        # Get the model outputs
        outputs = model(**encoded_input)
        print(outputs)
        # The outputs are logits, get the score by applying the appropriate activation function
        # For regression tasks (like STS-B), you can directly use the output value as the score
        # For classification tasks, apply a softmax function to get probabilities
        score = outputs.logits.squeeze().item()
        
        # print(score)
        predictions.append(score)

# Now `predictions` holds the similarity scores for each pair of sentences
print(predictions)

100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.61it/s]

SequenceClassifierOutput(loss=None, logits=tensor([[-0.1989]]), hidden_states=None, attentions=None)
[-0.19885557889938354]





In [140]:
!git lfs install
!git clone https://huggingface.co/pjbhaumik/crossencoder-km1

git: 'lfs' is not a git command. See 'git --help'.

The most similar command is
	log
fatal: destination path 'crossencoder-km1' already exists and is not an empty directory.


In [141]:
mkdir joecrossencoder

In [142]:
cd joecrossencoder

/Users/603642/Library/CloudStorage/OneDrive-DeltaAirLines/Desktop/Data/RES_KM_EXP/joecrossencoder


In [143]:
!git lfs install
!git clone https://huggingface.co/pjbhaumik/crossencoder-km1

git: 'lfs' is not a git command. See 'git --help'.

The most similar command is
	log
Cloning into 'crossencoder-km1'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 24 (delta 4), reused 0 (delta 0), pack-reused 3[K
Unpacking objects: 100% (24/24), 319.20 KiB | 645.00 KiB/s, done.
git-lfs filter-process: git-lfs: command not found
fatal: the remote end hung up unexpectedly
You can inspect what was checked out with 'git status'
and retry with 'git restore --source=HEAD :/'



In [145]:
cd ..

/Users/603642/Library/CloudStorage/OneDrive-DeltaAirLines/Desktop/Data/RES_KM_EXP


In [3]:
sentence_pairs = [
    ("What is the PETC eligibility", "can i bring a kitten on my flight from ATL to SLC"),
    ("what is the UMNR fee", "how many pets can i have")
]

In [84]:
from sentence_transformers import CrossEncoder,SentenceTransformer
model = CrossEncoder("srmishra/crossencoder-tynybert-km1",num_labels=1)
scores = model.predict(sentence_pairs)

In [85]:
scores

array([0.7412276 , 0.49759802], dtype=float32)

In [31]:
from sentence_transformers import CrossEncoder
from transformers import AutoTokenizer

# Load the CrossEncoder model
model = CrossEncoder("srmishra/crossencoder-tynybert-km1")

# Load the tokenizer that matches the model
tokenizer = AutoTokenizer.from_pretrained("srmishra/crossencoder-tynybert-km1")

# Example sentence pairs
sentence_pairs = [
    ("What is the PETC eligibility", "can i bring a kitten on my flight from ATL to SLC"),
    ("what is the UMNR fee", "how many pets can i have")
]

# Tokenize the sentence pairs
tokenized_pairs = [tokenizer(pair[0], pair[1], truncation=True, return_tensors="pt") for pair in sentence_pairs]

# # Predict the scores for the tokenized sentence pairs
# scores = []
# for tokenized_input in tokenized_pairs:
#     score = model.predict([(tokenized_input['input_ids'].squeeze().tolist(),
#                             tokenized_input['attention_mask'].squeeze().tolist())], convert_to_tensor=True)
#     scores.append(score)

# # Flatten the scores list (since it's a list of lists with one element each)
# scores = [score[0] for score in scores]

# # Output the scores
# print(scores)

In [71]:
from sentence_transformers import CrossEncoder

# Initialize the model with the model name and additional arguments
model = CrossEncoder("srmishra/crossencoder-tynybert-km1" )
# Predict the scores for the sentence pairs
scores = model.predict(sentence_pairs)

# Output the scores
print(scores)

[0.7412276  0.49759802]


In [54]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the tokenizer and model from Hugging Face Transformers
tokenizer = AutoTokenizer.from_pretrained("srmishra/crossencoder-tynybert-km1")
model = AutoModelForSequenceClassification.from_pretrained("srmishra/crossencoder-tynybert-km1")

# Example sentence pairs
sentence_pairs = [
    ("What is the PETC eligibility", "can i bring a kitten on my flight from ATL to SLC"),
    ("what is the UMNR fee", "how many pets can i have")
]

# Tokenize and encode the sentence pairs for the model
encoded_input = tokenizer(sentence_pairs[0][0], sentence_pairs[0][1], return_tensors='pt', truncation=True)
with torch.no_grad():
    # Get model predictions (logits)
    logits = model(**encoded_input).logits
    print(f"LOGITS: {logits}")

# Check the number of output classes
num_labels = model.config.num_labels
# print(num_labels)
if num_labels == 1:
    # For binary classification with a single output neuron
    scores = torch.sigmoid(logits)[:, 0]  # Apply sigmoid to get scores between 0 and 1


# Output the scores
print(f"FINAL SCORES: {scores}")

LOGITS: tensor([[1.0524]])
FINAL SCORES: tensor([0.7412])


In [76]:
torch.sigmoid(torch.tensor([1.05]))

tensor([0.7408])

In [77]:
torch.sigmoid(torch.tensor([-0.009]))

tensor([0.4978])

In [2]:
output = pd.read_csv("/Users/603642/Downloads/Output.csv")

In [3]:
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 761 entries, 0 to 760
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   embedding           761 non-null    object 
 1   question            761 non-null    object 
 2   response_x          761 non-null    object 
 3   response_id         761 non-null    int64  
 4   airline             761 non-null    object 
 5   tags                761 non-null    object 
 6   best                232 non-null    object 
 7   scores              761 non-null    object 
 8   similar_question    232 non-null    object 
 9   rerank_score        224 non-null    float64
 10  response_id_rerank  224 non-null    float64
 11  _id                 224 non-null    float64
 12  response_y          224 non-null    object 
dtypes: float64(3), int64(1), object(9)
memory usage: 77.4+ KB


In [4]:
output.shape

(761, 13)

In [29]:
(output['response_id'] == output['best']).sum() / (output.shape[0])

0.0

In [27]:
output["similar_question"].isna().sum()

529

In [161]:
output[["question","similar_question","response_x","response_y","scores","best","response_id","_id"]]

Unnamed: 0,question,similar_question,response_x,response_y,scores,best,response_id,_id
0,how much is a UMNR fee from jfk to acc,,"<p class=""ng-tns-c32-1"">150 USD/CAD/EUR for ea...",,[],,3,
1,what is the unaccompanied minor fee,what is the unaccompanied minor fee,"<p class=""ng-tns-c32-1"">150 USD/CAD/EUR for ea...","<p class=""ng-tns-c32-1"">150 USD/CAD/EUR for ea...","['what is the unaccompanied minor fee', 3.5607...",3,3,3.0
2,what is the UMNR fee,,"<p class=""ng-tns-c32-1"">150 USD/CAD/EUR for ea...",,[],,3,
3,what is the term entry to unhook a ticket,,<p>&gt;T/X(pax #)</p>\n<p>Unhook Passenger 1.1...,,[],,4,
4,how to unhook a ticket in term,how to unhook a tkt in term,<p>&gt;T/X(pax #)</p>\n<p>Unhook Passenger 1.1...,<p>&gt;T/X(pax #)</p>\n<p>Unhook Passenger 1.1...,"['how to unhook a tkt in term', 0.865902602672...",4,4,4.0
...,...,...,...,...,...,...,...,...
756,can a passenger use ecredits to purchase baggage,can a passenger prepay for luggage,<p>eCredits or paper vouchers cannot be applie...,"<p>No, customers cannot pre-pay for baggage in...","['can a passenger prepay for luggage', 0.98735...",325,1225,325.0
757,ecredit use on other person,,<p>eCredit transferability varies based on eCr...,,[],,1234,
758,latam carry on bag,latam carry on,"<p>Customers with Promo, Light, Plus, or Top f...","<p>Customers with Promo, Light, Plus, or Top f...","['latam carry on', 2.616466999053955, 1255]",1255,1255,1255.0
759,booking on Kenya airways,,<p>Direct Access must be used to book award tr...,,[],,1282,


In [None]:
output[""]

In [25]:
positive_examples = output[output['response_id'] == output['best']]
positive_examples[['question', 'response_x', 'response_y', 'similar_question', 'rerank_score']]

Unnamed: 0,question,response_x,response_y,similar_question,rerank_score


In [9]:
output[output.scores=="[]"].shape

(529, 13)

In [180]:
import torch
from torch import nn
from transformers import AutoModelForSequenceClassification, AutoTokenizer



In [182]:
# Load the pretrained model and tokenizer
model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Freeze all the parameters in the model
for param in model.parameters():
    param.requires_grad = False

# Unfreeze the last layer (classifier)
for param in model.classifier.parameters():
    param.requires_grad = True


In [185]:
for param in model.classifier.parameters():
    print(param)

Parameter containing:
tensor([[-0.0309, -0.0254, -0.0326,  0.0325, -0.0445, -0.0274, -0.0233, -0.0275,
         -0.0285, -0.0410,  0.0389, -0.0270,  0.0247, -0.0473, -0.0297,  0.0295,
          0.0263,  0.0360,  0.0242,  0.0256, -0.0207, -0.0246, -0.0422,  0.0272,
         -0.0272,  0.0253,  0.0228, -0.0336,  0.0226,  0.0253,  0.0354, -0.0330,
          0.0339, -0.0499, -0.0318, -0.0349, -0.0218,  0.0247, -0.0237, -0.0456,
         -0.0485, -0.0240,  0.0488, -0.0251, -0.0337,  0.0256,  0.0434,  0.0260,
          0.0267, -0.0232,  0.0307, -0.0345, -0.0290,  0.0346, -0.0276,  0.0360,
          0.0301,  0.0507, -0.0235, -0.0239,  0.0264,  0.0415, -0.0319, -0.0235,
          0.0225, -0.0236, -0.0284,  0.0344, -0.0451,  0.0247,  0.0324,  0.0218,
         -0.0245, -0.0250,  0.0238,  0.0456,  0.0247,  0.0361, -0.0267, -0.0299,
         -0.0465, -0.0388, -0.0366, -0.0306, -0.0304, -0.0351, -0.0269, -0.0295,
         -0.0525,  0.0319,  0.0250, -0.0205,  0.0323, -0.0508,  0.0267,  0.0536,
      

In [None]:

# Define a new Bi-LSTM layer to be added on top of the MiniLM model
class BiLSTMHead(nn.Module):
    def __init__(self, hidden_size, num_labels):
        super().__init__()
        self.bi_lstm = nn.LSTM(hidden_size, hidden_size // 2, num_layers=1, bidirectional=True, batch_first=True)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, hidden_states):
        lstm_output, _ = self.bi_lstm(hidden_states)
        # We just want the last hidden state from the sequence for classification
        lstm_output = lstm_output[:, -1, :]
        logits = self.classifier(lstm_output)
        return logits

# Assuming the hidden size is 384 for MiniLM-L-6-v2 and we have a binary classification problem
hidden_size = 384
num_labels = 2

# Replace the classifier with the new Bi-LSTM head
model.classifier = BiLSTMHead(hidden_size, num_labels)

# Now you can fine-tune the model with your data and preferred optimization setup
# ...

# Remember to unfreeze layers gradually if you want to fine-tune more than the Bi-LSTM and last layer

## Imabalance Data Training

In [53]:
imb_df = pd.concat([positive_df.iloc[:500],negative_df.iloc[:1500]])
imb_df["review"] = imb_df["sentence1"]+" "+imb_df["sentence1"]
imb_df.labels.value_counts()

labels
0.0    1500
1.0     500
Name: count, dtype: int64

In [90]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import torch
from datasets import Dataset

# Assuming you have a DataFrame `df` with columns 'review' and 'label' (0 for negative, 1 for positive)
# Prepare the data
train,validation = train_test_split(imb_df, stratify=imb_df['labels'])

# Calculate class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
weights = torch.tensor(class_weights, dtype=torch.float)

In [58]:
weights

tensor([0.6667, 2.0000])

In [91]:
# Initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

def encode(examples):
    return tokenizer(examples['review'], truncation=True, padding='max_length',max_length=128)



train_dataset = Dataset.from_pandas(train)
# test_dataset = Dataset.from_pandas(test)
val_dataset = Dataset.from_pandas(validation)

train_encoded = train_dataset.map(encode, batched=True)
val_encoded = val_dataset.map(encode, batched=True)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [97]:
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy='epoch'
)

- Use the weights in compute loss

In [98]:


class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        # forward pass
        # print(inputs)
        outputs = model(**inputs)
        print(outputs)
        logits = outputs.get("logits")
        # compute custom loss (suppose one has 3 labels with different weights)
        
        loss_fct = nn.CrossEntropyLoss(weight=weights)
        logger.info("loss function")
        loss = loss_fct(logits.view(-1, self.model.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [99]:
# loss_fct = nn.CrossEntropyLoss(weight=weights)

In [100]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_encoded,
    eval_dataset=val_encoded
)

In [101]:
trainer.train()

ValueError: Target size (torch.Size([8])) must be the same as input size (torch.Size([8, 2]))

In [22]:
imb_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3000 entries, 0 to 2004
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   sentence1  3000 non-null   object 
 1   sentence2  3000 non-null   object 
 2   labels     3000 non-null   float64
dtypes: float64(1), object(2)
memory usage: 93.8+ KB


In [17]:
imb_df["review"] = imb_df["sentence1"]+' '+imb_df["sentence1"]

In [None]:
# n_samples / (n_classes * np.bincount(y)

In [42]:
>>> import numpy as np
>>> from sklearn.utils.class_weight import compute_class_weight
>>> y = [1, 1, 1, 1, 0, 0]
>>> compute_class_weight(class_weight="balanced", classes=np.unique(y), y=y)


array([1.5 , 0.75])

In [43]:
np.bincount(y)

array([2, 4])