<a href="https://colab.research.google.com/github/sara-37002/ready-group/blob/main/ready_project_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [49]:
# Install the peft library
!pip install peft==0.10.0



In [50]:
# Install and upgrade necessary libraries
!pip install --upgrade --no-cache-dir \
  transformers==4.39.3 \
  accelerate==0.28.0 \
  datasets==2.18.0 \
  scikit-learn==1.6.1 \
  numpy==1.26.4



In [51]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split

from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import Dataset

from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


import torch

In [52]:
# Labelled queries file (in Colab, upload it through the Files panel first)
csv_path = "queries-fix.csv"
df = pd.read_csv(csv_path)

# Hold out 20% of the rows for testing. The split is stratified on the
# "label" column and uses a fixed random_state so it is repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# Show the training split
train_df

Unnamed: 0,user_query,label
788,אני מתעניין בתכנון ערים מההיבט ההנדסי,הנדסה
866,מעניין אותי כימיה אורגנית והמבנה של תרכובות טב...,מדעים מדויקים
1819,רוצה להבין את גבולות החוק במדינה דמוקרטית,משפטים
1693,האם מה התנאים לעבודה כעורך דין,משפטים
1617,מה זה תעודת הוראה,חינוך
...,...,...
1932,רוצה לעבוד בפרקליטות המדינה,משפטים
1581,איך לומדים הוראה בבתי ספר?,חינוך
2708,רוצה להילחם בשחיתות דרך מערכת החוק,משפטים
1013,רוצה לעסוק ב־Brain-Computer Interface,מדעי המוח


In [53]:
# Build the label <-> integer id mappings from the full dataset.
# Sorting the unique labels keeps the id assignment stable between runs.
labels = sorted(df["label"].unique())
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for label, i in label2id.items()}

# Add the integer "label_id" column to both splits.
# assign() returns a new DataFrame instead of writing into the frames that
# train_test_split produced, which avoids pandas' SettingWithCopyWarning and
# keeps this cell safe to re-run.
train_df = train_df.assign(label_id=train_df["label"].map(label2id))
test_df = test_df.assign(label_id=test_df["label"].map(label2id))

# Display the head of the testing DataFrame
test_df.head()

Unnamed: 0,user_query,label,label_id
2305,רוצה לשלב יצירתיות בהוראה,חינוך,2
1113,רוצה להבין איך לאמן מחשב לזהות תמונות,מדעי הנתונים,6
1142,איך משתמשים בגרפים כדי להמחיש מגמות?,מדעי הנתונים,6
2445,רוצה לייצג נאשמים בבית משפט,משפטים,9
2645,רוצה ללמד ילדים עם צרכים מיוחדים,חינוך,2


In [54]:
# Checkpoint that both the tokenizer and the classifier are loaded from
model_name = "bert-base-multilingual-cased"

# Run on the GPU when one is visible to torch, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model with one output per label; the two mappings
# are passed along so the model config carries the label names.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

# Move the model to the chosen device (the returned module is displayed)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [55]:
# Tokenizer wrapper applied to every row by Dataset.map
def tokenize(example):
    """Tokenize the "user_query" field of one example.

    Input is truncated and padded with padding="max_length", so every
    example comes out with the same sequence length.
    """
    return tokenizer(example["user_query"], truncation=True, padding="max_length")


# DataFrame -> Hugging Face Dataset -> tokenized rows; the integer
# "label_id" column is renamed to "labels" for compatibility with the model.
train_dataset = (
    Dataset.from_pandas(train_df)
    .map(tokenize)
    .rename_column("label_id", "labels")
)
test_dataset = (
    Dataset.from_pandas(test_df)
    .map(tokenize)
    .rename_column("label_id", "labels")
)

# Return PyTorch tensors for the listed columns only
train_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)
test_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Display the training dataset
train_dataset

Map:   0%|          | 0/2179 [00:00<?, ? examples/s]

Map:   0%|          | 0/545 [00:00<?, ? examples/s]

Dataset({
    features: ['user_query', 'label', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 2179
})

In [56]:
# Classify a single text query with the notebook-level BERT model
def testBert(text):
    """Return the label string the model predicts for `text`.

    Uses the notebook-level `model`, `tokenizer`, `device` and `id2label`.
    The prediction is read with `.item()`, so the encoded input must
    contain exactly one sequence.
    """
    # Evaluation mode
    model.eval()

    # Encode the query as PyTorch tensors and move them to the model's device
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    encoded = encoded.to(device)

    # Forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**encoded).logits

    # Index of the largest logit -> label string
    best_class = torch.argmax(logits, dim=1).item()
    return id2label[best_class]



In [57]:
# Try testBert on one hand-written sample query
query = "רוצה תואר שמתאים למי שמתעניין בעבודה עם אנשים"
baseline_label = testBert(query)
# Print the predicted faculty
print("הפקולטה החזויה:", baseline_label)

הפקולטה החזויה: מדעים מדויקים


In [58]:
# Accuracy of testBert over the whole test split
# (in notebook order this cell comes before trainer.train())
from sklearn.metrics import accuracy_score

# Ground-truth label strings, decoded from the integer ids in the dataset
true_labels = [id2label[label_id.item()] for label_id in test_dataset["labels"]]

# One testBert call per test query
predicted_labels = list(map(testBert, test_dataset["user_query"]))

# Fraction of queries whose predicted label matches the true label
accuracy = accuracy_score(true_labels, predicted_labels)

print(f"Accuracy of testBert function on the test dataset: {accuracy}")

Accuracy of testBert function on the test dataset: 0.03302752293577982


In [59]:
# Metric callback handed to the Trainer
def compute_metrics(eval_pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        eval_pred: an object that unpacks into ``(logits, labels)``.

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # Predicted class = position of the largest logit along the last axis
    predicted_ids = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "f1": f1_score(labels, predicted_ids, average="weighted"),
    }

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./bert-intent",  # Output directory for model checkpoints and predictions
    # eval_steps is only honoured when the evaluation strategy is "steps".
    # The default strategy is "no", which skips evaluation (and therefore
    # compute_metrics) during training entirely.
    evaluation_strategy="steps",
    eval_steps=500,  # Evaluate every 500 steps (adjust as needed)
    per_device_train_batch_size=8,  # Batch size per device during training
    per_device_eval_batch_size=8,   # Batch size per device during evaluation
    num_train_epochs=3,  # Number of training epochs
    logging_dir="./logs",  # Directory for storing logs
    report_to=[],  # Disable reporting to external services like wandb
)

# Create a Trainer instance
trainer = Trainer(
    model=model,  # The model to train (fine-tuned in place)
    args=training_args,  # The training arguments
    train_dataset=train_dataset,  # The training dataset
    eval_dataset=test_dataset,  # Dataset used for the periodic evaluations
    compute_metrics=compute_metrics,  # Returns accuracy and weighted F1
)

# Start training the model
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


In [None]:
# Predict the label of a single text query with the notebook-level model
def predict(text):
    """Return the predicted label string for `text`.

    Reads the notebook-level `model`, `tokenizer`, `device` and `id2label`.
    The predicted id is extracted with `.item()`, so `text` must encode to
    a single sequence.
    """
    model.eval()  # evaluation mode

    # Tokenize into PyTorch tensors placed on the same device as the model
    batch = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)

    # No gradients are needed for inference
    with torch.no_grad():
        scores = model(**batch).logits
        winner = scores.argmax(dim=1)

    # Translate the winning class id back to its label string
    return id2label[winner.item()]


In [None]:

# Run predict on the hand-written sample query
query = "רוצה תואר שמתאים למי שמתעניין בעבודה עם אנשים"
predicted_label = predict(query)
# Print the predicted faculty
print("הפקולטה החזויה:", predicted_label)

# Evaluation on a small test sample

In [None]:
# Quick check: run the Trainer's evaluation on a small slice of the test set
from sklearn.metrics import precision_score, recall_score

# First 10 examples of the test set
small_test_dataset = test_dataset.select(range(10))

# Metrics dictionary returned by the Trainer for that slice
eval_results = trainer.evaluate(eval_dataset=small_test_dataset)

# Heading and results, one per line
print("Evaluation results on a test set:", eval_results, sep="\n")