In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

**Load Model Qwen 3 14B Base**

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 2x faster
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Mistral-Small-Instruct-2409",     # Mistral 22b 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!

    "unsloth/Llama-3.2-1B-bnb-4bit",           # NEW! Llama 3.2 models
    "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    "unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
] # More models at https://huggingface.co/unsloth

qwen_models = [
    "unsloth/Qwen3-14B-unsloth-bnb-4bit",      # Qwen 14B 2x faster
    "unsloth/Qwen3-8B-unsloth-bnb-4bit",
    "unsloth/Qwen3-0.6B-unsloth-bnb-4bit",
    "unsloth/Qwen3-1.7B-unsloth-bnb-4bit",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen3-14B-Base-unsloth-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-07-02 20:26:00.104390: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751487960.309908      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751487960.366082      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.6.12: Fast Qwen3 patching. Transformers: 4.51.3.
   \\   /|    Tesla P100-PCIE-16GB. Num GPUs = 1. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 6.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.59G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/1.56G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/166 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.6.12 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [4]:
import pandas as pd
import re

**Test file modification**

In [5]:
df= pd.read_csv('/kaggle/input/test-set/test.csv')
df

Unnamed: 0,id,question,options
0,1,কোনো বস্তুর চৌম্বকত্ব ধারকত্ব পরিমাপ করা হয়-,['চুম্বকনকারি বলা হয় ' 'সম্পৃক্ত দ্বারা ' 'আবি...
1,2,একটি বুলেট লক্ষ বস্তুর 3 cm ভেতরে প্রবেশ করতে ...,['1cm' '1m' '2 cm' '2 m']
2,3,"একটি সরল দোলকের দোলনকাল T, দৈর্ঘ্য দ্বিগুণ হলে...",['2T' '`T/sqrt2`' '`T/2`' '`sqrt2T`']
3,4,একটি রকেট ঊর্ধমুখী যাত্রায় প্রথম 2 সেকেন্ড এর ...,['36 ms-2' '20.2 ms-2' '15.2 ms-2' '30 ms-2']
4,5,50 পাকের একটি বৃত্তাকার কুন্ডলীর ব্যাস 30 cm। ...,['4.7 × 10-5 A' '0.48 A' '47 A' '94 A']
...,...,...,...
195,196,একটি বালের গায়ে 100W-200V লেখা। এর রোধ কত?,['200Ω' '300Ω' '400Ω' '500Ω']
196,197,যে বৈশিষ্ট্য দ্বারা একটি শব্দ অন্য একটি শব্দ হ...,['তীব্রতা' 'স্বরকম্প' 'স্বরগ্রাম' 'শব্দোচ্চতা']
197,198,16 বিবর্ধন বিশিষ্ট নভো-দূরবীক্ষন যন্তের লেন্সে...,"['100 cm, 10 cm' '80 cm, 5 cm' '200 cm, 2 cm' ..."
198,199,একটি তারের প্রস্থচ্ছেদ এর ক্ষেত্রফল 0.003 m2 ।...,['102kg' '102 নিউটন ' '9.8 ×102kg' '9.8×102 নি...


**Option Labeling**

In [6]:
def clean_and_format_options(option_str):
    try:
        # Step 1: Ensure it's a string
        text = str(option_str)
        # Step 2: Add missing commas between options (assumes they are in quotes or separated by space)
        text = re.sub(r"'\s*'", "', '", text)  # Add comma between single-quoted strings
        # Step 3: Parse as list
        options = eval(text)  # Use eval carefully here, or switch to safer tokenizer
        # Step 4: Add A-D formatting
        labels = ['A', 'B', 'C', 'D']
        return " ".join([f"{label}. {opt.strip()}" for label, opt in zip(labels, options)])
    except Exception as e:
        return f"Error: {e}"

# Apply function
df["formatted_options"] = df["options"].apply(clean_and_format_options)

In [7]:
df['options'] =df['formatted_options']

In [8]:
df=df.drop(['formatted_options'], axis=1)

In [9]:
df

Unnamed: 0,id,question,options
0,1,কোনো বস্তুর চৌম্বকত্ব ধারকত্ব পরিমাপ করা হয়-,A. চুম্বকনকারি বলা হয় B. সম্পৃক্ত দ্বারা C. আব...
1,2,একটি বুলেট লক্ষ বস্তুর 3 cm ভেতরে প্রবেশ করতে ...,A. 1cm B. 1m C. 2 cm D. 2 m
2,3,"একটি সরল দোলকের দোলনকাল T, দৈর্ঘ্য দ্বিগুণ হলে...",A. 2T B. `T/sqrt2` C. `T/2` D. `sqrt2T`
3,4,একটি রকেট ঊর্ধমুখী যাত্রায় প্রথম 2 সেকেন্ড এর ...,A. 36 ms-2 B. 20.2 ms-2 C. 15.2 ms-2 D. 30 ms-2
4,5,50 পাকের একটি বৃত্তাকার কুন্ডলীর ব্যাস 30 cm। ...,A. 4.7 × 10-5 A B. 0.48 A C. 47 A D. 94 A
...,...,...,...
195,196,একটি বালের গায়ে 100W-200V লেখা। এর রোধ কত?,A. 200Ω B. 300Ω C. 400Ω D. 500Ω
196,197,যে বৈশিষ্ট্য দ্বারা একটি শব্দ অন্য একটি শব্দ হ...,A. তীব্রতা B. স্বরকম্প C. স্বরগ্রাম D. শব্দোচ্চতা
197,198,16 বিবর্ধন বিশিষ্ট নভো-দূরবীক্ষন যন্তের লেন্সে...,"A. 100 cm, 10 cm B. 80 cm, 5 cm C. 200 cm, 2 c..."
198,199,একটি তারের প্রস্থচ্ছেদ এর ক্ষেত্রফল 0.003 m2 ।...,A. 102kg B. 102 নিউটন C. 9.8 ×102kg D. 9.8×102...


**Add comma between options**

In [10]:
def add_commas(options):
    # Split by space followed by a letter and dot (e.g., " A.")
    parts = options.split(' ')
    processed = []
    for part in parts:
        if part and part[0].isupper() and part.endswith('.'):
            if processed:  # Add comma before options (except first one)
                processed[-1] = processed[-1] + ','
        processed.append(part)
    return ' '.join(processed).replace(' ,', ', ')  # Clean up spaces

# Apply the function to the options column
df['options'] = df['options'].apply(add_commas)

# Save the transformed dataset
df.to_csv('test_final.csv', index=False)

# Display a sample to verify
print("Original format:")
print("A. সমবর্তন B. প্রতিফলন C. ব্যাতিচার D. প্রতিসরণ")
print("\nTransformed format:")
print(df['options'].iloc[0])

Original format:
A. সমবর্তন B. প্রতিফলন C. ব্যাতিচার D. প্রতিসরণ

Transformed format:
A. চুম্বকনকারি বলা হয়, B. সম্পৃক্ত দ্বারা, C. আবিষ্ট চুম্বকত্ব দ্বারা, D. উপরের কোনোটিই নয়


In [11]:
df

Unnamed: 0,id,question,options
0,1,কোনো বস্তুর চৌম্বকত্ব ধারকত্ব পরিমাপ করা হয়-,"A. চুম্বকনকারি বলা হয়, B. সম্পৃক্ত দ্বারা, C. ..."
1,2,একটি বুলেট লক্ষ বস্তুর 3 cm ভেতরে প্রবেশ করতে ...,"A. 1cm, B. 1m, C. 2 cm, D. 2 m"
2,3,"একটি সরল দোলকের দোলনকাল T, দৈর্ঘ্য দ্বিগুণ হলে...","A. 2T, B. `T/sqrt2`, C. `T/2`, D. `sqrt2T`"
3,4,একটি রকেট ঊর্ধমুখী যাত্রায় প্রথম 2 সেকেন্ড এর ...,"A. 36 ms-2, B. 20.2 ms-2, C. 15.2 ms-2, D. 30 ..."
4,5,50 পাকের একটি বৃত্তাকার কুন্ডলীর ব্যাস 30 cm। ...,"A. 4.7 × 10-5 A, B. 0.48 A, C. 47 A, D. 94 A"
...,...,...,...
195,196,একটি বালের গায়ে 100W-200V লেখা। এর রোধ কত?,"A. 200Ω, B. 300Ω, C. 400Ω, D. 500Ω"
196,197,যে বৈশিষ্ট্য দ্বারা একটি শব্দ অন্য একটি শব্দ হ...,"A. তীব্রতা, B. স্বরকম্প, C. স্বরগ্রাম, D. শব্দ..."
197,198,16 বিবর্ধন বিশিষ্ট নভো-দূরবীক্ষন যন্তের লেন্সে...,"A. 100 cm, 10 cm, B. 80 cm, 5 cm, C. 200 cm, 2..."
198,199,একটি তারের প্রস্থচ্ছেদ এর ক্ষেত্রফল 0.003 m2 ।...,"A. 102kg, B. 102 নিউটন, C. 9.8 ×102kg, D. 9.8×..."


**Split questions: Theory & Math**

In [12]:
def classify_question(text):
    return "math" if any(char.isdigit() for char in str(text)) else "theory"

# Create classification column
df['type'] = df['question'].apply(classify_question)

# Split dataset
theory_df = df[df['type'] == 'theory'].drop(columns='type')
math_df = df[df['type'] == 'math'].drop(columns='type')

# Save to new CSV files
theory_df.to_csv('theory_questions.csv', index=False)
math_df.to_csv('math_questions.csv', index=False)

# Show confirmation
print(f"Total questions: {len(df)}")
print(f"Theory questions: {len(theory_df)}")
print(f"Math questions: {len(math_df)}")

Total questions: 200
Theory questions: 115
Math questions: 85


In [13]:
theory_df

Unnamed: 0,id,question,options
0,1,কোনো বস্তুর চৌম্বকত্ব ধারকত্ব পরিমাপ করা হয়-,"A. চুম্বকনকারি বলা হয়, B. সম্পৃক্ত দ্বারা, C. ..."
2,3,"একটি সরল দোলকের দোলনকাল T, দৈর্ঘ্য দ্বিগুণ হলে...","A. 2T, B. `T/sqrt2`, C. `T/2`, D. `sqrt2T`"
7,8,শব্দের উপরিপাতন নীতির উপর ভিত্তি করে নিচের কোন...,"A. মুক্ত কম্পন, B. পরবশ কম্পন, C. অনুনাদ, D. স..."
9,10,একটি পরিবাহীর ভেতর দিয়ে I তড়িৎ প্রবাহের জন্য প...,"A. B ∝ I2, B. B ∝ I, C. B ∝ 1/I, D. B ∝ 1/ I2"
11,12,একটি ঘড়ির সেকেন্ডের কাঁটার কৌণিক বেগ কত?,"A. π rads−1, B. `pi/30 rads^-1`, C. `pi/60 rad..."
...,...,...,...
192,193,অসম্পৃক্ত বাষ্পচাপকে f এবং সম্পৃক্ত বাষ্পচাপকে...,"A. f>F, B. f≥F, C. f≤F, D. f<F"
193,194,যে সব বস্তু হতে প্রযুক্ত বল অপসারণ করলে এদের ব...,"A. পূর্ণদৃঢ় বস্তু, B. সমদিক ধর্মি বস্তু, C. অ..."
194,195,তাপের যান্ত্রিক সমতা J এর এসআই একক কি?,"A. জুল, B. ওয়াট, C. ক্যালরি, D. একক নেই"
196,197,যে বৈশিষ্ট্য দ্বারা একটি শব্দ অন্য একটি শব্দ হ...,"A. তীব্রতা, B. স্বরকম্প, C. স্বরগ্রাম, D. শব্দ..."


In [14]:
theoryQuestion_prompt="""You are a subject‑matter expert in physics, fluent in both Bengali and English. You will be given a multiple‑choice question (MCQ) in Bengali along with its answer options.

Your task:
1. If the question or any option is unclear to you in Bengali, translate the question and options into English.
2. Think step by step (chain‑of‑thought) to analyze the physics concept behind the question.
3. Examine every option critically and compare them.
4. Choose the most scientifically accurate option. If calculation needed use python code.

(You may use up to 2,048 tokens for your reasoning. Please take your time to be as accurate as possible.)

### Question:
{question}

### Options:
{options}

### Response:

"""

In [15]:
theory1=theory_df[:60]

In [16]:
theory2=theory_df[60:]

In [17]:
theory1

Unnamed: 0,id,question,options
0,1,কোনো বস্তুর চৌম্বকত্ব ধারকত্ব পরিমাপ করা হয়-,"A. চুম্বকনকারি বলা হয়, B. সম্পৃক্ত দ্বারা, C. ..."
2,3,"একটি সরল দোলকের দোলনকাল T, দৈর্ঘ্য দ্বিগুণ হলে...","A. 2T, B. `T/sqrt2`, C. `T/2`, D. `sqrt2T`"
7,8,শব্দের উপরিপাতন নীতির উপর ভিত্তি করে নিচের কোন...,"A. মুক্ত কম্পন, B. পরবশ কম্পন, C. অনুনাদ, D. স..."
9,10,একটি পরিবাহীর ভেতর দিয়ে I তড়িৎ প্রবাহের জন্য প...,"A. B ∝ I2, B. B ∝ I, C. B ∝ 1/I, D. B ∝ 1/ I2"
11,12,একটি ঘড়ির সেকেন্ডের কাঁটার কৌণিক বেগ কত?,"A. π rads−1, B. `pi/30 rads^-1`, C. `pi/60 rad..."
13,14,অসম বেগ কিন্তু সমত্বরণের ক্ষেত্রে বেগ বনাম সময়...,"A. দূরত্ব, B. সরণ, C. বেগ, D. ত্বরণ"
18,19,উৎসের কম্পাংক নিম্নের কত হলে আমরা শব্দ শুনতে পাই?,"A. 20 Hz এর নিচে, B. 20,000 Hz এর উপরে, C. 20-..."
19,20,বক্রপথে রেলগাড়ির গমনের জন্য-,A. বহিঃস্থ ও অন্তঃস্থ রেলের উচ্চতা সমান হতে হব...
20,21,জার্মেনিয়ামের সাথে নিচের কোনটি যুক্ত থাকলে n ট...,"A. অ্যালুমিনিয়াম, B. গ্যালিয়াম, C. আর্সেনিক, D..."
22,23,সর্বাপেক্ষা স্থিতিস্থাপক বস্তু কোনটি?,"A. তামা, B. লোহা, C. কোয়ার্টজ, D. কাঠ"


In [18]:
theory2

Unnamed: 0,id,question,options
110,111,মধ্যাকর্ষী ধ্রুবকের মাত্রা সমীকরণ কোনটি?,"A. [ML3T-2], B. [M-1T-2L3], C. [M-1T-2L-3], D...."
112,113,হুইটস্টোন ব্রীজে কার সূত্র ব্যবহার করা হয়েছে?,"A. ওমের, B. ওয়েরস্টেডের, C. ফ্যারাডের, D. কির্..."
113,114,ক্ষমতা ও কাজ সংক্রান্ত নিম্নের কোন সমীকরণটি সঠ...,"A. W=pt, B. p=W/t, C. t=W/p, D. Wp=t"
114,115,যে যন্ত্রের সাহায্য যান্ত্রিক শক্তিকে তড়িৎ শক্...,"A. ডায়নামো, B. গ্যালভানোমিটার, C. পোটেনশিওমিটা..."
117,118,নিচের কোনটি অসংরক্ষণশীল বলের উদাহরণ?,"A. সান্দ্রবল, B. অভিকর্ষীয় বল, C. মহাকর্ষ বল,..."
119,120,সময় ব্যবধান শূণ্যের কাছাকাছি সময়ের সাথে বস্তুর...,"A. অসম বেগ, B. গড়বেগ, C. ধ্রুব বেগ, D. তাৎক্ষণ..."
120,121,কোন ভেক্টরের শীর্ষবিন্দু ও পাদবিন্দু একই হলে ভ...,"A. ব্যাসার্ধ ভেক্টর, B. সদৃশ ভেক্টর, C. নাল ভে..."
125,126,একটি মোটর গাড়ি চলার সময় এর চাকা কী জাতীয় গতি...,"A. পর্যায়গতি, B. চলনগতি, C. ঘূর্ণনগতি, D. চলন-..."
126,127,"শব্দ তরঙ্গকে বাযুতে সমবর্তন করা যায় না, কারণ এ...","A. চলমান, B. স্থির, C. অনুপ্রস্থ, D. অনুদৈর্ঘ্য"
130,131,ত্বরণের বৈশিষ্ট্য কোনটি?,"A. বস্তুর ত্বরণ বলের সমানুপাতিক, a∝F, B. ত্বরণ..."


In [None]:
import pandas as pd
from transformers import TextStreamer
from unsloth import FastLanguageModel

# Set EOS and PAD tokens if needed
if tokenizer.eos_token is None:
    tokenizer.eos_token = "<|endoftext|>"
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Enable fast inference
FastLanguageModel.for_inference(model)

streamer = TextStreamer(tokenizer)

# Initialize results list
results = []

# Loop through test dataset
for idx, row in theory1.iterrows():
    prompt_text = theoryQuestion_prompt.format(
        question=row["question"],
        options=row["options"]
    )

    # Tokenize and send to GPU
    inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

    # Generate answer
    output = model.generate(
        **inputs,
        max_new_tokens=850,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer
    )
    
    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Store results
    results.append({
        'id': row['id'],
        'response': response
    })

# Create DataFrame from results
response_df1 = pd.DataFrame(results)

# Save to CSV
response_df1.to_csv('model_responses_theorymain1.csv', index=False)



You are a subject‑matter expert in physics, fluent in both Bengali and English. You will be given a multiple‑choice question (MCQ) in Bengali along with its answer options.

Your task:
1. If the question or any option is unclear to you in Bengali, translate the question and options into English.
2. Think step by step (chain‑of‑thought) to analyze the physics concept behind the question.
3. Examine every option critically and compare them.
4. Choose the most scientifically accurate option. If calculation needed use python code.

(You may use up to 2,048 tokens for your reasoning. Please take your time to be as accurate as possible.)

### Question:
কোনো বস্তুর চৌম্বকত্ব ধারকত্ব পরিমাপ করা হয়-

### Options:
A. চুম্বকনকারি বলা হয়, B. সম্পৃক্ত দ্বারা, C. আবিষ্ট চুম্বকত্ব দ্বারা, D. উপরের কোনোটিই নয়

### Response:

1. **Translation**:
   - Question: How is the magnetic permeability of a substance measured?
   - Options:
     A. By magnetizing force
     B. By saturation
     C. By induced

**Extract 30 words before "Human:" text(<|endoftext|/>)**

In [None]:
def extract_preceding_words(response, num_words=30):
    # Find "Human:" and capture all text before it
    match = re.search(r"(.*?)(Human:)", response, re.DOTALL)
    if not match:
        return ""
    
    # Get the text before "Human:"
    preceding_text = match.group(1)
    # Split into words, handling multiple spaces, newlines, etc.
    words = re.findall(r'\S+', preceding_text)
    # Take the last 20 words (or fewer if not enough)
    return " ".join(words[-num_words:])

# Read the CSV file
df2 = pd.read_csv("model_responses_theorymain1.csv")

# Apply the extraction function
df2['extracted'] = df2['response'].apply(extract_preceding_words)

In [None]:
new_df = df2[['id', 'extracted']]
new_df.to_csv("extracted_responses_theorymain1.csv", index=False)

In [None]:
new_df['extracted']

**Prompt for extracting option label A/B/C/D from text**

In [None]:
new_prompt="""read the text and find correct answer option from the text.Just (A/B/C/D), no extra text.
{extracted}
#Answer:
"""

In [None]:
results = []

# 5. Loop through every question and generate (no streamer)
for idx, row in new_df.iterrows():
    prompt_text = new_prompt.format(
        extracted = row['extracted'],
        
    )
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    
    # generate without streaming so we can capture the output tokens
    output_ids = model.generate(
        **inputs,
        temperature=0.1,
            top_p=0.95,
            top_k=50,
        max_new_tokens = 1,
        eos_token_id   = tokenizer.eos_token_id,
    )
    
    # strip off the prompt tokens to get only the generated answer
    gen_tokens = output_ids[0][ inputs["input_ids"].shape[-1] : ]
    answer = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # print the model's raw output
    print(f"ID: {row['id']} | Answer: {answer}")
    
    results.append({
        "id":     row["id"],
        "answer": answer
    })

# build DataFrame & write CSV
sub_theorymain1 = pd.DataFrame(results)

In [None]:
sub_theorymain1.to_csv('sub_theorymain1.csv', index=False)

**Theory question part2**

In [None]:
import pandas as pd
from transformers import TextStreamer
from unsloth import FastLanguageModel

# Set EOS and PAD tokens if needed
if tokenizer.eos_token is None:
    tokenizer.eos_token = "<|endoftext|>"
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Enable fast inference
FastLanguageModel.for_inference(model)

streamer = TextStreamer(tokenizer)

# Initialize results list
results = []

# Loop through test dataset
for idx, row in theory2.iterrows():
    prompt_text = theoryQuestion_prompt.format(
        question=row["question"],
        options=row["options"]
    )

    # Tokenize and send to GPU
    inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

    # Generate answer
    output = model.generate(
        **inputs,
        max_new_tokens=850,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer
    )
    
    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Store results
    results.append({
        'id': row['id'],
        'response': response
    })

# Create DataFrame from results
response_df2 = pd.DataFrame(results)

# Save to CSV
response_df2.to_csv('model_responses_theorymain2.csv', index=False)

# Display the first few responses
print(response_df2.head())

In [None]:
def extract_preceding_words(response, num_words=30):
    # Find "Human:" and capture all text before it
    match = re.search(r"(.*?)(Human:)", response, re.DOTALL)
    if not match:
        return ""
    
    # Get the text before "Human:"
    preceding_text = match.group(1)
    # Split into words, handling multiple spaces, newlines, etc.
    words = re.findall(r'\S+', preceding_text)
    # Take the last 20 words (or fewer if not enough)
    return " ".join(words[-num_words:])

# Read the CSV file
df2 = pd.read_csv("model_responses_theorymain2.csv")

# Apply the extraction function
df2['extracted'] = df2['response'].apply(extract_preceding_words)

In [None]:
new_df = df2[['id', 'extracted']]
new_df.to_csv("extracted_responses_theorymain2.csv", index=False)

In [None]:
new_df['extracted']

In [None]:
results = []

# 5. Loop through every question and generate (no streamer)
for idx, row in new_df.iterrows():
    prompt_text = new_prompt.format(
        extracted = row['extracted'],
        
    )
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    
    # generate without streaming so we can capture the output tokens
    output_ids = model.generate(
        **inputs,
        temperature=0.1,
            top_p=0.95,
            top_k=50,
        max_new_tokens = 1,
        eos_token_id   = tokenizer.eos_token_id,
    )
    
    # strip off the prompt tokens to get only the generated answer
    gen_tokens = output_ids[0][ inputs["input_ids"].shape[-1] : ]
    answer = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # print the model's raw output
    print(f"ID: {row['id']} | Answer: {answer}")
    
    results.append({
        "id":     row["id"],
        "answer": answer
    })

# build DataFrame & write CSV
sub_theorymain2 = pd.DataFrame(results)

In [None]:
sub_theorymain2.to_csv('sub_theorymain2.csv', index=False)

**Math Question Part**

In [None]:
math_df

**Incomplete question splitting for another prompting**

In [None]:
incomplete_math_df = math_df[~math_df['question'].str.strip().str.endswith(('-',':','ঃ', '?',')',']','।'))]

incomplete_math_df

In [None]:
math_df_cleaned = math_df[~math_df['id'].isin(incomplete_math_df['id'])]
math_df= math_df_cleaned

In [None]:
mathQuestion_prompt = """You are a physics and mathematics expert fluent in Bengali and English. You will be given a math-based physics MCQ in Bengali with options.

Your task:
1. First translate the question and options into English.
2. Identify the required formula(s) from physics or mathematics.
3. Write down the known values and solve step by step.
4. Use basic Python-style code if necessary to verify your calculation.
5. Compare your result with the options provided and select the most accurate answer.
6. Carefully check units (e.g., cm vs. m, g vs. kg).
7. If two options are close, pick the one that best matches significant digits and unit conversion.

(You may use up to 2,048 tokens for your reasoning. Please take your time to be as accurate as possible.)

### Question:
{question}

### Options:
{options}

### Response:
"""

In [None]:
import pandas as pd
from transformers import TextStreamer
from unsloth import FastLanguageModel

# Set EOS and PAD tokens if needed
if tokenizer.eos_token is None:
    tokenizer.eos_token = "<|endoftext|>"
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Enable fast inference
FastLanguageModel.for_inference(model)

streamer = TextStreamer(tokenizer)

# Initialize results list
results = []

# Loop through test dataset
for idx, row in math_df.iterrows():
    prompt_text = mathQuestion_prompt.format(
        question=row["question"],
        options=row["options"]
    )

    # Tokenize and send to GPU
    inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

    # Generate answer
    output = model.generate(
        **inputs,
        max_new_tokens=1200,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer
    )
    
    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Store results
    results.append({
        'id': row['id'],
        'response': response
    })

# Create DataFrame from results
response_df = pd.DataFrame(results)

# Save to CSV
response_df.to_csv('model_responses_mathmain.csv', index=False)

# Display the first few responses
print(response_df.head())

In [None]:
response_df["response"]

**30 word extracting from response**

In [None]:
def extract_preceding_words(response, num_words=30):
    # Find "Human:" and capture all text before it
    match = re.search(r"(.*?)(Human:)", response, re.DOTALL)
    if not match:
        return ""
    
    # Get the text before "Human:"
    preceding_text = match.group(1)
    # Split into words, handling multiple spaces, newlines, etc.
    words = re.findall(r'\S+', preceding_text)
    # Take the last 20 words (or fewer if not enough)
    return " ".join(words[-num_words:])

# Read the CSV file
df2 = pd.read_csv("model_responses_mathmain.csv")

# Apply the extraction function
df2['extracted'] = df2['response'].apply(extract_preceding_words)

In [None]:
new_df = df2[['id', 'extracted']]

In [None]:
new_df.to_csv("extracted_responses_mathmain.csv", index=False)

In [None]:
new_df['extracted']

In [None]:
new_prompt="""read the text and find correct answer option from the text.Just (A/B/C/D), no extra text.
{extracted}
#Answer:
"""

In [None]:
results = []

# 5. Loop through every question and generate (no streamer)
for idx, row in new_df.iterrows():
    prompt_text = new_prompt.format(
        extracted = row['extracted'],
        
    )
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    
    # generate without streaming so we can capture the output tokens
    output_ids = model.generate(
        **inputs,
        temperature=0.1,
            top_p=0.95,
            top_k=50,
        max_new_tokens = 1,
        eos_token_id   = tokenizer.eos_token_id,
    )
    
    # strip off the prompt tokens to get only the generated answer
    gen_tokens = output_ids[0][ inputs["input_ids"].shape[-1] : ]
    answer = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # print the model's raw output
    print(f"ID: {row['id']} | Answer: {answer}")
    
    results.append({
        "id":     row["id"],
        "answer": answer
    })

# build DataFrame & write CSV
sub_mathmain = pd.DataFrame(results)

In [None]:
sub_mathmain.to_csv('sub_mathmain.csv', index=False)

**Incomplete Question Part**

**Incomplete + (#,None,Blank) Answered questions reprocessing**

In [None]:
answer_df = sub_mathmain

incomplete_ans_ids = answer_df[~answer_df['answer'].isin(['A', 'B', 'C', 'D'])]['id']

new_incomplete_df = math_df[math_df['id'].isin(incomplete_ans_ids)]

combined_df = pd.concat([incomplete_math_df, new_incomplete_df])

combined_df = combined_df.drop_duplicates(subset='id')

incomplete_math_df= combined_df

In [None]:
IncompleteMathQuestion_prompt = """You are a highly intelligent assistant specialized in solving Bangla-medium Physics MCQs, especially math-based problems. You are given an *incomplete or unclear MCQ question*, where the main question might be missing or poorly written, but the answer options are provided.
Your task:
1. Based on the answer options, *reconstruct a plausible full Physics MCQ question* in Bangla that best fits the context of the options.
2. Translate your reconstructed question into English to help you reason.
3. Identify which domain it belongs to (e.g. kinematics, energy, current, etc.)
4. If the question is mathematical, solve it *step-by-step, using formulas and applying **Python-style code* where helpful.
5. Analyze and compare each option based on your computed or reasoned result.
6. If no option is exactly correct, choose the *closest* correct answer.
7. Output your final answer in the following strict format:

Answer will be this format(A/B/C/D:option).

(You may use up to 2,048 tokens for your reasoning. Please take your time to be as accurate as possible.)

### Instruction:
{question}

### Input:
{options}

### Response:
"""

In [None]:
import pandas as pd
from transformers import TextStreamer
from unsloth import FastLanguageModel

# Set EOS and PAD tokens if needed
if tokenizer.eos_token is None:
    tokenizer.eos_token = "<|endoftext|>"
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Enable fast inference
FastLanguageModel.for_inference(model)

streamer = TextStreamer(tokenizer)

# Initialize results list
results = []

# Loop through test dataset
for idx, row in incomplete_math_df.iterrows():
    prompt_text = IncompleteMathQuestion_prompt.format(
        question=row["question"],
        options=row["options"]
    )

    # Tokenize and send to GPU
    inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

    # Generate answer
    output = model.generate(
        **inputs,
        max_new_tokens=1400,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        streamer=streamer
    )
    
    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Store results
    results.append({
        'id': row['id'],
        'response': response
    })

# Create DataFrame from results
response_df = pd.DataFrame(results)

# Save to CSV
response_df.to_csv('model_responses_incompletemath.csv', index=False)

# Display the first few responses
print(response_df.head())

In [None]:
import pandas as pd
import re

def extract_preceding_words(response, num_words=30):
    # Find "Human:" and capture all text before it
    match = re.search(r"(.*?)(Human:)", response, re.DOTALL)
    if not match:
        return ""
    
    # Get the text before "Human:"
    preceding_text = match.group(1)
    # Split into words, handling multiple spaces, newlines, etc.
    words = re.findall(r'\S+', preceding_text)
    # Take the last 20 words (or fewer if not enough)
    return " ".join(words[-num_words:])

# Read the CSV file
df_math = pd.read_csv("model_responses_incompletemath.csv")

# Apply the extraction function
df_math['extracted'] = df_math['response'].apply(extract_preceding_words)



In [None]:
new_df = df_math[['id', 'extracted']]

In [None]:
new_df

In [None]:
import pandas as pd

results = []

# 5. Loop through every question and generate (no streamer)
for idx, row in new_df.iterrows():
    prompt_text = new_prompt.format(
        extracted = row['extracted'],
        
    )
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    
    # generate without streaming so we can capture the output tokens
    output_ids = model.generate(
        **inputs,
        temperature=0.1,
            top_p=0.95,
            top_k=50,
        max_new_tokens = 1,
        eos_token_id   = tokenizer.eos_token_id,
    )
    
    # strip off the prompt tokens to get only the generated answer
    gen_tokens = output_ids[0][ inputs["input_ids"].shape[-1] : ]
    answer = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # print the model's raw output
    print(f"ID: {row['id']} | Answer: {answer}")
    
    results.append({
        "id":     row["id"],
        "answer": answer
    })

# build DataFrame & write CSV
last_df_math = pd.DataFrame(results)
last_df_math.to_csv("sub_incompletemath.csv", index=False)

**Replace & concat incomplete math part to main math submission file**** **

In [None]:
t2=last_df_math
t1=sub_mathmain

In [None]:
common_ids = t2[t2['id'].isin(t1['id'])]['id'].tolist()

# Step 2: Drop rows from t1 that have IDs present in t2 (to avoid duplicates)
t1_updated = t1[~t1['id'].isin(common_ids)]

# Step 3: Concatenate t1 (filtered) and t2 (all rows)
final_df_math = pd.concat([t1_updated, t2], ignore_index=True)

# Step 4: Save the result
final_df_math.to_csv('updated_mathsub.csv', index=False)

print("Merge completed. Existing IDs replaced, new IDs appended.")

In [None]:
final_df_math.shape

**Merge all submission files(theory1,theory2,math)**

In [None]:


# List of CSV files to merge (modify these paths)
file_paths = [
    '/kaggle/working/sub_theorymain2.csv',
    '/kaggle/working/sub_theorymain2.csv',
    '/kaggle/working/updated_mathsub.csv'
]

# Read and merge the files
dfs = [pd.read_csv(file) for file in file_paths]
merged_df = pd.concat(dfs, ignore_index=True)

# Display the merged DataFrame
print(merged_df.head())

**Sorting the submission file**

In [None]:
sorted_df2 = merged_df.sort_values(by="id")

# Step 4: Save to a new CSV file
sorted_df2.to_csv("merged_sorted_submission_f.csv", index=False)

In [None]:
sorted_df2.shape

**Checking sorted submission file's answer if it have any (#,blank,None) answer**

In [None]:


# Load your datasets
main_df = pd.read_csv('/kaggle/working/test_final.csv')  # Contains id and question columns
answer_df = sorted_df2  # Contains id and answer columns
def find_invalid_ids(df):
    invalid = []
    for _, row in df.iterrows():
        answer = str(row['answer']).strip().upper() if pd.notna(row['answer']) else None
        if answer not in ['A', 'B', 'C', 'D']:
            invalid.append(row['id'])
    return invalid

# Get invalid IDs automatically
invalid_ids = find_invalid_ids(answer_df)

# Extract corresponding questions AND options
result_df = main_df[main_df['id'].isin(invalid_ids)][['id', 'question', 'options']] \
    .merge(answer_df[answer_df['id'].isin(invalid_ids)],
           on='id',
           how='left')

# Rename columns for clarity
result_df.columns = ['id', 'question', 'options', 'current_invalid_answer']

# Save and display results
print(f"Found {len(invalid_ids)} invalid IDs: {invalid_ids}")
print("\nQuestions needing re-prompting with options:")
print(result_df)

result_df.to_csv('questions_for_reprompting_with_options.csv', index=False)
print("\nSaved results to 'questions_for_reprompting_with_options.csv'")

In [None]:
result_df=result_df.drop('current_invalid_answer',axis=1)

In [None]:
result_df

**Re prompting the invalid answered(#,blank,None) questions**

In [None]:
invalid_prompt="""Below is an instruction that describes bengali physics mcq, paired with an input that provides the options of mcq. Write a response that appropriately completes the request.
Instructions: 
1.translate the question and options in English and answer the question accurately step by step.
2.analyze every options before you make your final decision and compare your response.

### Instruction:
{question}

### Input:
{options}

### Response:

"""

In [None]:
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
from unsloth import FastLanguageModel

# Set EOS and PAD tokens if needed
if tokenizer.eos_token is None:
    tokenizer.eos_token = "<|endoftext|>"
    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Enable fast inference
FastLanguageModel.for_inference(model)

# Initialize results list
results = []

# Loop through test dataset
for idx, row in result_df.iterrows():
    prompt_text = invalid_prompt.format(
        question=row["question"],
        options=row["options"]
    )

    # Tokenize and send to GPU
    inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

    # Generate answer
    output = model.generate(
        **inputs,
        max_new_tokens=700,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        streamer = TextStreamer(tokenizer)
    )
    
    # Decode the output
    response = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Store results
    results.append({
        'id': row['id'],
        'response': response
    })

# Create DataFrame from results
response_df_invalid = pd.DataFrame(results)

# Save to CSV
response_df_invalid.to_csv('model_responses_invalid.csv', index=False)

# Display the first few responses
print(response_df_invalid.head())

In [None]:
# Read the CSV file
df3 = pd.read_csv("model_responses_invalid.csv")

# Apply the extraction function
df3['extracted'] = df3['response'].apply(extract_preceding_words)

In [None]:
new_df = df3[['id', 'extracted']]
new_df.to_csv("extracted_responses_invalid.csv", index=False)

In [None]:
new_df

In [None]:
results = []

# 5. Loop through every question and generate (no streamer)
for idx, row in new_df.iterrows():
    prompt_text = new_prompt.format(
        extracted = row['extracted'],
        
    )
    inputs = tokenizer(prompt_text, return_tensors="pt").to("cuda")
    
    # generate without streaming so we can capture the output tokens
    output_ids = model.generate(
        **inputs,
        temperature=0.1,
            top_p=0.95,
            top_k=50,
        max_new_tokens = 1,
        eos_token_id   = tokenizer.eos_token_id,
    )
    
    # strip off the prompt tokens to get only the generated answer
    gen_tokens = output_ids[0][ inputs["input_ids"].shape[-1] : ]
    answer = tokenizer.decode(gen_tokens, skip_special_tokens=True).strip()

    # print the model's raw output
    print(f"ID: {row['id']} | Answer: {answer}")
    
    results.append({
        "id":     row["id"],
        "answer": answer
    })

# build DataFrame & write CSV
sub_theoryinvalid = pd.DataFrame(results)

**Update the invalid answers to the sorted dataframe**

In [None]:
test2=sub_theoryinvalid
test1=sorted_df2

In [None]:

# Create a dictionary of test2 data for quick lookup
test2_dict = test2.set_index('id')['answer'].to_dict()

# Replace matching rows in test1
test1['answer'] = test1['id'].map(test2_dict).fillna(test1['answer'])

# Save the updated test1
test1.to_csv('final_sub.csv', index=False)

print("Replacement complete. Saved as updated_test1.csv")

In [None]:
final_sub=test1

**This is final submission file**

In [None]:
final_sub