# Set Up the Environment

In [1]:
!pip install transformers pandas scikit-learn



In [2]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->bitsandbytes)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->bitsandbytes)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->bitsandbytes)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (41

In [3]:
!pip install accelerate

Collecting accelerate
  Downloading accelerate-0.31.0-py3-none-any.whl (309 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/309.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/309.4 kB[0m [31m4.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m309.4/309.4 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.31.0


Import libraries

In [4]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline, BitsAndBytesConfig
from sklearn.metrics import accuracy_score, f1_score

# Load the Dataset

In [5]:
import pandas as pd
import json

def load_jsonl(file_path):
    """Read a JSON Lines file into a DataFrame, one row per JSON record.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        pandas.DataFrame built from the parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank lines (e.g. a trailing newline at the end of the file) carry no
        # record, so skip them instead of letting json.loads fail on them.
        records = [json.loads(line) for line in file if line.strip()]
    return pd.DataFrame(records)

# Load both dataset splits from the working directory into DataFrames.
train_data = load_jsonl(file_path='/content/train.jsonl')
test_data = load_jsonl(file_path='/content/test.jsonl')

In [6]:
# Re-key every row (q1/q2/label) into the dict layout the evaluation helpers read.
train_examples = [
    dict(question1=record['q1'], question2=record['q2'], label=record['label'])
    for _index, record in train_data.iterrows()
]
test_examples = [
    dict(question1=record['q1'], question2=record['q2'], label=record['label'])
    for _index, record in test_data.iterrows()
]

In [None]:
# Echo the prepared training examples as the cell output for a visual check.
# NOTE(review): this renders the entire list; a slice such as train_examples[:5]
# would keep the saved notebook output short.
train_examples

[{'question1': 'چگونه می توانم کم کم وزن خود را کاهش دهم؟',
  'question2': 'چگونه وزن کم کنم؟',
  'label': '1'},
 {'question1': 'چگونه استمناء بر قدرت تمرکز شخص تأثیر می گذارد؟',
  'question2': 'آیا چرک روی لوزه ها می تواند نشانه ای از STD باشد؟',
  'label': '0'},
 {'question1': 'وقتی خواب کسی را دیدم و گفتم آنها در حال مرگ هستند ، چه معنایی دارد؟',
  'question2': 'وقتی خواب می بینم که کسی در حال مرگ باشد معنی اش چیست؟',
  'label': '0'},
 {'question1': 'چگونه می توانم فایل های apk را تغییر دهم؟',
  'question2': 'بهترین روش برای ویرایش فایل های APK چیست؟',
  'label': '1'},
 {'question1': 'از کجا می توانم کفش های کپی درجه اول را در بمبئی بخرم؟',
  'question2': 'از کجا ، در بنگلور می توانید کفش های درجه اول را پیدا کنید؟',
  'label': '0'},
 {'question1': 'روش مرحله به مرحله برای درخواست شهروندی دایم استرالیا چیست؟',
  'question2': 'آیا برادرم می تواند حامی من برای شهروندی دایم استرالیا شوند، زیرا قبلاً هم یکی از آنها را داشته است؟',
  'label': '0'},
 {'question1': 'چه چیزی برای رفع جوش سر

In [None]:
# Echo the prepared test examples as the cell output for a visual check.
# NOTE(review): this renders the entire list; a slice such as test_examples[:5]
# would keep the saved notebook output short.
test_examples

[{'question1': 'آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟',
  'question2': 'چه چیزی روح فرد را می شکند؟',
  'label': '0'},
 {'question1': 'چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟',
  'question2': 'برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟',
  'label': '1'},
 {'question1': 'چه کسانی امام علی را خدا میدانند؟',
  'question2': 'چه کسانی می توانند امام زمان را ببینند؟',
  'label': '0'},
 {'question1': 'آیا قرار است دونالد ترامپ رئیس جمهور بعدی ایالات متحده باشد؟',
  'question2': 'شانس اینکه دونالد ترامپ رئیس جمهور بعدی آمریکا باشد ، چیست؟',
  'label': '1'},
 {'question1': 'چگونه می توانم سوالی را در این باره بپرسم؟',
  'question2': 'چگونه می توانم سوال بپرسم؟',
  'label': '0'},
 {'question1': 'قرص مولتی ویتامین مینرال چه فوایدی دارد؟',
  'question2': 'مولتی ویتامین مینرال چه کاربردی دارد؟',
  'label': '0'},
 {'question1': 'دانلود اهنگ حال و هوای عالی از دنیا؟',
  'question2': 'دانلود اهنگ حال و هوای عالی دنیا؟',
  'label': '1'},


# Functions

In [7]:
def zero_shot_evaluate(pipeline, test_data):
    """Classify each question pair with a prompt that contains no solved examples.

    Args:
        pipeline: Callable that takes the prompt string and returns a sequence
            whose first item is a dict with a 'label' key.
        test_data: Iterable of dicts with 'question1', 'question2' and 'label'.

    Returns:
        Tuple ``(predictions, labels, examples)``: predicted labels (1 when the
        pipeline answers 'LABEL_1', otherwise 0), gold labels cast to int, and
        ``(question1, question2, gold, predicted)`` tuples, all in input order.
    """
    examples = []

    for item in test_data:
        first, second, raw_label = item['question1'], item['question2'], item['label']

        output = pipeline(f"آیا این دو سوال مترادف هستند?\n سوال 1: {first}\n سوال 2: {second}")

        # Anything other than 'LABEL_1' is treated as the negative class.
        guess = int(output[0]['label'] == 'LABEL_1')
        examples.append((first, second, int(raw_label), guess))

    predictions = [guess for _, _, _, guess in examples]
    labels = [gold for _, _, gold, _ in examples]
    return predictions, labels, examples

In [8]:
def few_shot_evaluate(pipeline, test_data, n_shots, shot_pool=None):
    """Classify each question pair with a prompt that starts with solved examples.

    Args:
        pipeline: Callable that takes the prompt string and returns a sequence
            whose first item is a dict with a 'label' key.
        test_data: Iterable of dicts with 'question1', 'question2' and 'label'.
        n_shots: Number of solved examples to place in the prompt; the first
            ``n_shots`` items of the pool are used.
        shot_pool: Optional list of solved examples (same dict layout as
            ``test_data``). Defaults to the notebook-level ``train_examples``.

    Returns:
        Tuple ``(predictions, labels, examples)``: predicted labels (1 when the
        pipeline answers 'LABEL_1', otherwise 0), gold labels cast to int, and
        ``(question1, question2, gold, predicted)`` tuples, all in input order.
    """
    if shot_pool is None:
        shot_pool = train_examples
    shot_examples = shot_pool[:n_shots]

    # The instruction and the solved examples are the same for every test item,
    # so build that part of the prompt once instead of once per item.
    prompt_prefix = "از مثال یا مثال‌های آموزشی زیر استفاده کن و در نهایت تشخیص بده که آیا دو سوال مترادف هستند؟\n"
    prompt_prefix += "مثال:"
    for shot in shot_examples:
        # Labels are cast with int() before comparing (as done for the test
        # labels below); a string label such as '1' never equals the int 1,
        # which would render every solved example with the negative answer.
        answer = 'بله' if int(shot['label']) == 1 else 'خیر'
        prompt_prefix += f"سوال 1: {shot['question1']}\n سوال 2: {shot['question2']}\n جواب: {answer}\n"

    predictions = []
    labels = []
    examples = []

    for example in test_data:
        question1 = example['question1']
        question2 = example['question2']
        label = example['label']

        prompt = prompt_prefix + f"\n سوال 1: {question1}\n سوال 2: {question2}"

        result = pipeline(prompt)
        # Anything other than 'LABEL_1' is treated as the negative class.
        predicted_label = 1 if result[0]['label'] == 'LABEL_1' else 0
        predictions.append(predicted_label)
        labels.append(int(label))
        examples.append((question1, question2, int(label), predicted_label))

    return predictions, labels, examples

# Model 1

In [None]:
model_name = "NousResearch/Meta-Llama-3-8B"

# 4-bit NF4 weights with double quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): the load warning for this checkpoint reports `score.weight` as
# newly initialized, i.e. the classification head is untrained, so the labels
# this model predicts are not meaningful without fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wrap the loaded model and tokenizer in a text-classification pipeline.
paraphrase_pipeline = pipeline(
    task="text-classification",
    tokenizer=tokenizer,
    model=model,
)

## Zero-shot Learning

In [None]:
# Zero-shot run on the first 10 test pairs only.
predictions, labels, zero_shot_examples = zero_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:10],
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"Zero-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")



Zero-shot Learning - Accuracy: 0.4, F1 Score: 0.5714285714285715


In [None]:
# Show the first three evaluated pairs with their gold and predicted labels.
print("\nZero-shot Examples:")
for question1, question2, label, predicted_label in zero_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )


Zero-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



## One-shot and Five-shot Learning

In [None]:
# One-shot run on the first 10 test pairs, followed by three sample predictions.
predictions, labels, one_shot_examples = few_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:10],
    n_shots=1,
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"One-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")

print("\nOne-shot Examples:")
for question1, question2, label, predicted_label in one_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )

One-shot Learning - Accuracy: 0.4, F1 Score: 0.5714285714285715

One-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



In [None]:
# Five-shot run on the first 10 test pairs, followed by three sample predictions.
predictions, labels, five_shot_examples = few_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:10],
    n_shots=5,
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"Five-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")

print("\nFive-shot Examples:")
for question1, question2, label, predicted_label in five_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )

Five-shot Learning - Accuracy: 0.4, F1 Score: 0.5714285714285715

Five-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



# Model 2

In [None]:
model_name = "ViraIntelligentDataMining/PersianLLaMA-13B-Instruct"

# 4-bit NF4 weights with double quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): the load warning for this checkpoint reports `score.weight` as
# newly initialized, i.e. the classification head is untrained, so the labels
# this model predicts are not meaningful without fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/1.12M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/552 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/716 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/6 [00:00<?, ?it/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/1.90G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at ViraIntelligentDataMining/PersianLLaMA-13B-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Wrap the loaded model and tokenizer in a text-classification pipeline.
paraphrase_pipeline = pipeline(
    task="text-classification",
    tokenizer=tokenizer,
    model=model,
)

## Zero-shot Learning

In [None]:
# Zero-shot run on the first 20 test pairs only.
predictions, labels, zero_shot_examples = zero_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:20],
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"Zero-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")

Zero-shot Learning - Accuracy: 0.4, F1 Score: 0.5384615384615384


In [None]:
# Show the first three evaluated pairs with their gold and predicted labels.
print("\nZero-shot Examples:")
for question1, question2, label, predicted_label in zero_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )


Zero-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



## One-shot and Five-shot Learning

In [None]:
# One-shot run on the first 20 test pairs, followed by three sample predictions.
predictions, labels, one_shot_examples = few_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:20],
    n_shots=1,
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"One-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")

print("\nOne-shot Examples:")
for question1, question2, label, predicted_label in one_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )

One-shot Learning - Accuracy: 0.35, F1 Score: 0.5185185185185185

One-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



In [None]:
# Five-shot run on the first 20 test pairs, followed by three sample predictions.
predictions, labels, five_shot_examples = few_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:20],
    n_shots=5,
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"Five-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")

print("\nFive-shot Examples:")
for question1, question2, label, predicted_label in five_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )

Five-shot Learning - Accuracy: 0.35, F1 Score: 0.5185185185185185

Five-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



# Model 3

In [9]:
model_name = "universitytehran/PersianMind-v1.0"

# 4-bit NF4 weights with double quantization.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)

# NOTE(review): the load warning for this checkpoint reports `score.weight` as
# newly initialized, i.e. the classification head is untrained, so the labels
# this model predicts are not meaningful without fine-tuning.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quantization_config,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/688k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.50M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/549 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/28.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at universitytehran/PersianMind-v1.0 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Wrap the loaded model and tokenizer in a text-classification pipeline.
paraphrase_pipeline = pipeline(
    task="text-classification",
    tokenizer=tokenizer,
    model=model,
)

## Zero-shot Learning

In [11]:
# Zero-shot run on the first 20 test pairs only.
predictions, labels, zero_shot_examples = zero_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:20],
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"Zero-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Zero-shot Learning - Accuracy: 0.3, F1 Score: 0.46153846153846156


In [12]:
# Show the first three evaluated pairs with their gold and predicted labels.
print("\nZero-shot Examples:")
for question1, question2, label, predicted_label in zero_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )


Zero-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



## One-shot and Five-shot Learning

In [13]:
# One-shot run on the first 20 test pairs, followed by three sample predictions.
predictions, labels, one_shot_examples = few_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:20],
    n_shots=1,
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"One-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")

print("\nOne-shot Examples:")
for question1, question2, label, predicted_label in one_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )

One-shot Learning - Accuracy: 0.3, F1 Score: 0.46153846153846156

One-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



In [14]:
# Five-shot run on the first 20 test pairs, followed by three sample predictions.
predictions, labels, five_shot_examples = few_shot_evaluate(
    paraphrase_pipeline,
    test_examples[:20],
    n_shots=5,
)
accuracy, f1 = accuracy_score(labels, predictions), f1_score(labels, predictions)
print(f"Five-shot Learning - Accuracy: {accuracy}, F1 Score: {f1}")

print("\nFive-shot Examples:")
for question1, question2, label, predicted_label in five_shot_examples[:3]:
    print(
        f"Q1: {question1}",
        f"Q2: {question2}",
        f"True Label: {label}, Predicted Label: {predicted_label}\n",
        sep="\n",
    )

Five-shot Learning - Accuracy: 0.3, F1 Score: 0.46153846153846156

Five-shot Examples:
Q1: آیا جهان روح وجود دارد؟ اگر بله ، مبتکر و کنترل کننده این قلمرو روحانی کیست؟
Q2: چه چیزی روح فرد را می شکند؟
True Label: 0, Predicted Label: 1

Q1: چگونه می توانم تماشای فیلم های پورنو را متوقف کنم؟
Q2: برای جلوگیری از تماشای کامل پورنو باید چه کاری انجام دهم؟
True Label: 1, Predicted Label: 1

Q1: چه کسانی امام علی را خدا میدانند؟
Q2: چه کسانی می توانند امام زمان را ببینند؟
True Label: 0, Predicted Label: 1



# Comparison

## Workflow Steps (for each model)

1. **Data Loading**: The training and test datasets were loaded from JSONL files.
2. **Data Preparation**: The data was formatted into a list of dictionaries with keys 'question1', 'question2', and 'label'.
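3. **Model Loading**: Each model (`NousResearch/Meta-Llama-3-8B`, `ViraIntelligentDataMining/PersianLLaMA-13B-Instruct`, and `universitytehran/PersianMind-v1.0`) was loaded with 4-bit quantization using the `transformers` library.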
4. **Pipeline Creation**: A text classification pipeline was created for the loaded model.
5. **Zero-shot Evaluation**: The model was evaluated without any training examples.
6. **One-shot Evaluation**: The model was evaluated using one training example from the training set.
7. **Five-shot Evaluation**: The model was evaluated using five training examples from the training set.
8. **Example Printing**: Three examples from each scenario were printed along with their true and predicted labels.
9. **Results Reporting**: Accuracy and F1 scores were calculated and reported for each scenario.

## Parameters Used

- **Model Name**: `NousResearch/Meta-Llama-3-8B`
- **Number of Shots**: 0 (zero-shot), 1 (one-shot), and 5 (five-shot)
- **Subset of Test Data**: First 10 examples from the test set

# Results and Performance Analysis:

### Zero-shot Learning:

- **Accuracy**: 0.4
- **F1 Score**: 0.5714

**Analysis**: In the zero-shot scenario, the model did not see any examples before making predictions. The performance is close to random guessing (50% accuracy), indicating that the model struggles to identify synonyms without any prior examples.

**Advantages**:
- No need for training data.
- Useful for quick assessments when training data is unavailable.

### One-shot Learning:

- **Accuracy**: 0.4
- **F1 Score**: 0.5714

**Analysis**: The one-shot scenario showed the same accuracy and F1 score as the zero-shot scenario. This might suggest that a single training example was not representative enough, or that it introduced some bias the model could not generalize from. Also, I tested it on just 10 test examples, which is not enough to draw a conclusion!

**Advantages**:
- Requires only one example to provide context.
- Can improve performance slightly over zero-shot in some cases.

### Five-shot Learning:

- **Accuracy**: 0.4
- **F1 Score**: 0.5714

**Analysis**: The five-shot scenario showed the same accuracy and F1 score as the other scenarios. This is because I tested it on just 10 test examples, which is not enough!

**Advantages**:
- Provides more context to the model, helping it to learn better.
- Can significantly improve the model's understanding of the task with a few examples.

## Parameters Used

- **Model Name**: `ViraIntelligentDataMining/PersianLLaMA-13B-Instruct`
- **Number of Shots**: 0 (zero-shot), 1 (one-shot), and 5 (five-shot)
- **Subset of Test Data**: First 20 examples from the test set

# Results and Performance Analysis:

### Zero-shot Learning:

- **Accuracy**: 0.4
- **F1 Score**: 0.5384615384615384

### One-shot Learning:

- **Accuracy**: 0.35
- **F1 Score**: 0.5185185185185185

### Five-shot Learning:

- **Accuracy**: 0.35
- **F1 Score**: 0.5185185185185185

## Parameters Used

- **Model Name**: `universitytehran/PersianMind-v1.0`
- **Number of Shots**: 0 (zero-shot), 1 (one-shot), and 5 (five-shot)
- **Subset of Test Data**: First 20 examples from the test set

# Results and Performance Analysis:

### Zero-shot Learning:

- **Accuracy**: 0.3
- **F1 Score**: 0.46153846153846156

### One-shot Learning:

- **Accuracy**: 0.3
- **F1 Score**: 0.46153846153846156

### Five-shot Learning:

- **Accuracy**: 0.3
- **F1 Score**: 0.46153846153846156

# Scenario Differences and Their Advantages:

## Zero-shot Learning

**Description**: The model makes predictions without seeing any examples.
**Advantages**:
- No need for training data.
- Fast and requires no additional computation for training.
- Useful for initial assessments and when no labeled data is available.

**Disadvantages**:
- Generally lower performance compared to scenarios with examples.
- Does not leverage any task-specific context.

## One-shot Learning

**Description**: The model makes predictions after seeing one example.
**Advantages**:
- Minimal labeled data required.
- Provides some context to the model, potentially improving performance over zero-shot.
- Useful when labeled data is scarce but at least one example is available.

**Disadvantages**:
- Performance improvement is limited by the quality and representativeness of the single example.
- Risk of bias from the single example used.

## Five-shot Learning

**Description**: The model makes predictions after seeing five examples.
**Advantages**:
- More context provided to the model, leading to better understanding of the task.
- Can significantly improve performance, especially in terms of F1 score.
- Useful when a small amount of labeled data is available.

**Disadvantages**:
- Requires more labeled data than one-shot.
- Risk of overfitting to the few examples provided, especially if they are not representative of the overall dataset.

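## In conclusion:
The results (accuracies) of these models in the 3 different scenarios are not experimentally reliable, as we examined just 10 (or 20) test examples! In addition, the loading warnings show that the classification head (`score.weight`) of every model was newly initialized rather than trained, so the predictions themselves are not trustworthy. But generally, if our data is collected correctly, the accuracy of the third method (five-shot) should be better than the second one (one-shot), and the accuracy of the second one should be better than the first one (zero-shot), as we give them training examples and the model can "cheat" using them!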