In [1]:
import pandas as pd

In [3]:
# Load the train / test / dev splits (in that order) and stack them into one
# frame with a fresh 0..N-1 index so every row can be measured together.
split_paths = (
    "/home/shivraj-pg/DEPNECT/conllu-style-csv/without-context-coarse-train.csv",
    "/home/shivraj-pg/DEPNECT/conllu-style-csv/without-context-coarse-test.csv",
    "/home/shivraj-pg/DEPNECT/conllu-style-csv/without-context-coarse-dev.csv",
)
df1, df2, df3 = (pd.read_csv(split_path) for split_path in split_paths)

merged = pd.concat([df1, df2, df3], ignore_index=True)

In [4]:
# Sanity check: the merged row count should equal the sum of the three splits.
df1.shape, df2.shape, df3.shape, merged.shape

((11000, 2), (2940, 2), (2000, 2), (15940, 2))

In [5]:
# Keep only the two text columns. The explicit .copy() makes `merged` an
# independent frame, so the token columns added later are written to a real
# DataFrame rather than to a possible slice of the concatenated frame.
merged = merged[['sentence', 'gold']].copy()

In [6]:
merged.columns

Index(['sentence', 'gold'], dtype='object')

In [7]:
# Re-select the two text columns (idempotent) and display them. The .copy()
# guarantees an independent frame before new columns are added to `merged`.
merged = merged[['sentence', 'gold']].copy()
merged.columns

Index(['sentence', 'gold'], dtype='object')

In [12]:
from transformers import AutoTokenizer
from tqdm import tqdm
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-27b-it")

# Column names: raw text, token ids and token count for each of the two fields.
gold_col, gold_tokens_col, gold_length_col = "gold", "gold-tokens", "gold-token-len"
sent_col, sent_tokens_col, sent_length_col = "sentence", "sent-tokens", "sent-token-len"

# Pre-create any output column that is missing. Token-id columns start as
# object-dtype None (they will hold lists); length columns start at 0.
column_defaults = {
    gold_tokens_col: None,
    gold_length_col: 0,
    sent_tokens_col: None,
    sent_length_col: 0,
}
for col, default in column_defaults.items():
    if col in merged.columns:
        continue
    if default is None:
        merged[col] = pd.Series([None] * len(merged), dtype=object)
    else:
        merged[col] = default

# Helper function
def tokenize_text(text):
    """Encode ``text`` with the module-level ``tokenizer``.

    The text is stripped of surrounding whitespace and encoded without
    special tokens. Non-string or blank input is not sent to the tokenizer.

    Returns:
        A ``(token_ids, count)`` tuple; ``([], 0)`` for non-string or
        whitespace-only input.
    """
    cleaned = text.strip() if isinstance(text, str) else ""
    if not cleaned:
        return [], 0
    token_ids = tokenizer.encode(cleaned, add_special_tokens=False)
    return token_ids, len(token_ids)

# Tokenization loop
# Fill the token-id and token-count columns one row at a time. `.at` is used
# because each token-id cell holds a whole Python list.
text_pairs = zip(merged.index, merged[gold_col], merged[sent_col])
for idx, gold_text, sent_text in tqdm(text_pairs, total=len(merged), desc="Tokenizing"):
    gold_tokens, gold_len = tokenize_text(gold_text)
    merged.at[idx, gold_tokens_col] = gold_tokens
    merged.at[idx, gold_length_col] = gold_len

    sent_tokens, sent_len = tokenize_text(sent_text)
    merged.at[idx, sent_tokens_col] = sent_tokens
    merged.at[idx, sent_length_col] = sent_len


Tokenizing: 100%|██████████| 15940/15940 [00:08<00:00, 1795.82it/s]


In [13]:
merged.columns

Index(['sentence', 'gold', 'gold-tokens', 'gold-token-len', 'sent-tokens',
       'sent-token-len'],
      dtype='object')

In [14]:
# Summary statistics of token counts: input sentences vs. gold outputs.
merged[['sent-token-len', 'gold-token-len']].describe()

Unnamed: 0,sent-token-len,gold-token-len
count,15940.0,15940.0
mean,27.938645,478.811104
std,12.602983,223.11483
min,6.0,118.0
25%,19.0,305.0
50%,25.0,432.0
75%,34.0,599.0
max,90.0,1554.0


In [15]:
SYSTEM = """
You are an expert in Sanskrit grammar, who identifies and classifies compounds in the given Sanskrit sentence. You will be given the original sentence. First break the sentence in compounds.
Follow these rules strictly:
1. Only use the following 4 compound types. Do not invent or include other types:
    - Tatpurusha: An endocentric compound where the first element (the attributive) determines the second.
    - Avyayibhava: An adverbial compound made of an indeclinable element and a noun, expressing an adverbial meaning.
    - Dvandva: A copulative compound where two or more noun stems are joined by 'and'.
    - Bahuvrihi: An exocentric compound that describes something by referring to its parts.
2. The sentence may contain nested compounds or non-compounded words — handle appropriately.
3. Maintain strict formatting and provide only the answer line. Do not include explanations.
4. The start or end indexes must not exceed the number of words in the sentence.
5. Answer in the devnagri script only, there shouldn't be any latin in the answer

Text:
{INPUT}

Return strictly in JSON with keys:
{
  "tokens": [...],
  "compounds": [
    {
      "span": [start_token_index, end_token_index],
      "label": "<Samasa_type>"
    }
  ]
}

Rules:
- Tokenize by meaningful Sanskrit units.
- span = inclusive of start index, exclusive of end index.
- If multiple nested samāsa exist, include all.
- If none, return empty lists for compounds.
- Do not output anything outside JSON.


Example 1:
Input: ससर्षपंतुम्बुरुधान्यवन्यंचण्डांचचूर्णानिसमानिकुर्यात्DUMMY
Output:
{'tokens': ['स', 'सर्षपं', 'तुम्बुरु', 'धान्य', 'वन्यं', 'चण्डां', 'च', 'चूर्णानि', 'समानि', 'कुर्यात्', 'DUMMY'], 
'compounds': [
    {'span': ['1', '2'], 'label': 'Bahuvrihi'}, 
    {'span': ['2', '11'], 'label': 'Comp_root'}, 
    {'span': ['3', '5'], 'label': 'Dvandva'}, 
    {'span': ['4', '5'], 'label': 'Dvandva'}, 
    {'span': ['5', '11'], 'label': 'Comp_root'}, 
    {'span': ['6', '11'], 'label': 'No_rel'}, 
    {'span': ['7', '11'], 'label': 'No_rel'}, 
    {'span': ['8', '11'], 'label': 'No_rel'}, 
    {'span': ['9', '11'], 'label': 'No_rel'}, 
    {'span': ['10', '11'], 'label': 'No_rel'}, 
    {'span': ['11', '0'], 'label': 'root'}]}
   
Example 2:
Input: आपाततसामान्याइवप्रतीयमानाएतेयदिसूक्ष्मम्निरीक्ष्येरन्तर्हिएतेषाम्हृत्अन्तस्थसंकटबोधDUMMY
Output:
{'tokens': ['आपातत', 'सामान्या', 'इव', 'प्रतीयमाना', 'एते', 'यदि', 'सूक्ष्मम्', 'निरीक्ष्येरन्', 'तर्हि', 'एतेषाम्', 'हृत्', 'अन्त', 'स्थ', 'संकट', 'बोध', 'DUMMY'], 
'compounds': [
    {'span': ['1', '16'], 'label': 'No_rel'}, 
    {'span': ['2', '16'], 'label': 'No_rel'}, 
    {'span': ['3', '16'], 'label': 'No_rel'}, 
    {'span': ['4', '16'], 'label': 'No_rel'}, 
    {'span': ['5', '16'], 'label': 'No_rel'}, 
    {'span': ['6', '16'], 'label': 'No_rel'}, 
    {'span': ['7', '16'], 'label': 'No_rel'}, 
    {'span': ['8', '16'], 'label': 'No_rel'}, 
    {'span': ['9', '16'], 'label': 'No_rel'}, 
    {'span': ['10', '16'], 'label': 'No_rel'}, 
    {'span': ['11', '12'], 'label': 'Tatpurusha'}, 
    {'span': ['12', '13'], 'label': 'Tatpurusha'}, 
    {'span': ['13', '16'], 'label': 'Comp_root'}, 
    {'span': ['14', '15'], 'label': 'Tatpurusha'}, 
    {'span': ['15', '16'], 'label': 'Comp_root'}, 
    {'span': ['16', '0'], 'label': 'root'}]}
    
Input: सःचनविशिष्टवैशिष्ट्यअवगाहीDUMMY
Output:
{'tokens': ['सः', 'च', 'न', 'विशिष्ट', 'वैशिष्ट्य', 'अवगाही', 'DUMMY'], 
'compounds': [
    {'span': ['1', '7'], 'label': 'No_rel'}, 
    {'span': ['2', '7'], 'label': 'No_rel'}, 
    {'span': ['3', '7'], 'label': 'No_rel'}, 
    {'span': ['4', '5'], 'label': 'Tatpurusha'}, 
    {'span': ['5', '6'], 'label': 'Tatpurusha'}, 
    {'span': ['6', '7'], 'label': 'Comp_root'}, 
    {'span': ['7', '0'], 'label': 'root'}]}
"""

In [16]:
# Token ids of the raw SYSTEM prompt, used to gauge the fixed prompt overhead.
# The `{INPUT}` placeholder is still unfilled here, and encode() runs with its
# default arguments (tokenize_text above passes add_special_tokens=False).
tokenized_prompt = tokenizer.encode(SYSTEM)

In [17]:
len(tokenized_prompt)

1370

In [None]:
import pandas as pd

In [None]:
# Load a separate CSV with the same 'sentence'/'gold' columns; its first rows
# are inspected in the following cells. Note: this rebinds the name `df`.
df = pd.read_csv("/home/shivraj-pg/DEPNECT/conllu-style-csv/asthangrudyam.csv")

df.columns

Index(['sentence', 'gold'], dtype='object')

In [None]:
df['sentence'][0], df['gold'][0]

('रागआदिरोगान् सततअनुषक्तान् अशेषकायप्रसृतान् अशेषान् औत्सुक्यमोहअरतिदान् जघान (यः) .',
 "{'tokens': ['राग', 'आदि', 'रोगान्', 'सतत', 'अनुषक्तान्', 'अ', 'शेष', 'काय', 'प्रसृतान्', 'अ', 'शेषान्', 'औत्सुक्य', 'मोह', 'अ', 'रति', 'दान्', 'जघान', '(यः)', '.'], 'compounds': '1\\tराग-\\tराग-आदि-रोगान्\\t2\\tComp3\\t_\\tबहुव्रीहिः\\n2\\tआदि-\\t--\\t3\\tComp3\\t_\\tकर्मधारयः\\n3\\tरोगान्\\t--\\t20\\tComp3\\t_\\tComp_root\\n4\\tसतत-\\tसतत-अनुषक्तान्\\t5\\tComp2\\t_\\tकर्मधारयः\\n5\\tअनुषक्तान्\\t--\\t20\\tComp2\\t_\\tComp_root\\n6\\tअ-\\tअ-शेष-काय-प्रसृतान्\\t7\\tComp4\\t_\\tनञ्-तत्पुरुषः\\n7\\tशेष-\\t--\\t8\\tComp4\\t_\\tकर्मधारयः\\n8\\tकाय-\\t--\\t9\\tComp4\\t_\\tसप्तमी-तत्पुरुषः\\n9\\tप्रसृतान्\\t--\\t20\\tComp4\\t_\\tComp_root\\n10\\tअ-\\tअ-शेषान्\\t11\\tComp2\\t_\\tअस्त्यर्थ-मध्यमपदलोपी(नञ्)-बहुव्रीहिः\\n11\\tशेषान्\\t--\\t20\\tComp2\\t_\\tComp_root\\n12\\tऔत्सुक्य-\\tऔत्सुक्य-मोह-अरति-दान्\\t13\\tComp4\\t_\\tइतरेतर-द्वन्द्वः\\n13\\tमोह-\\t--\\t14\\tComp4\\t_\\tइतरेतर-द्वन्द्वः\\n14\\tअ-\\t-

In [None]:
df['sentence'][1], df['gold'][1]

('अपूर्ववैद्याय नमः अस्तु तस्मै .',
 "{'tokens': ['अ', 'पूर्व', 'वैद्याय', 'नमः', 'अस्तु', 'तस्मै', '.'], 'compounds': '1\\tअ-\\tअ-पूर्व-वैद्याय\\t2\\tComp3\\t_\\tअस्त्यर्थ-मध्यमपदलोपी(नञ्)-बहुव्रीहिः\\n2\\tपूर्व-\\t--\\t3\\tComp3\\t_\\tकर्मधारयः\\n3\\tवैद्याय\\t--\\t8\\tComp3\\t_\\tComp_root\\n4\\tनमः\\tनमः\\t8\\tCompNo\\t_\\tNo_rel\\n5\\tअस्तु\\tअस्तु\\t8\\tCompNo\\t_\\tNo_rel\\n6\\tतस्मै\\tतस्मै\\t8\\tCompNo\\t_\\tNo_rel\\n7\\t.\\t.\\t8\\tCompNo\\t_\\tNo_rel\\n8\\tDUMMY\\t_\\t0\\tCompNo\\t_\\troot'}")

In [None]:
df['sentence'][2], df['gold'][2]

('अथ अतः आयुष्कामीयम् अध्यायम् व्याख्यास्यामः .',
 "{'tokens': ['अथ', 'अतः', 'आयुष्कामीयम्', 'अध्यायम्', 'व्याख्यास्यामः', '.'], 'compounds': '1\\tअथ\\tअथ\\t7\\tCompNo\\t_\\tNo_rel\\n2\\tअतः\\tअतः\\t7\\tCompNo\\t_\\tNo_rel\\n3\\tआयुष्कामीयम्\\tआयुष्कामीयम्\\t7\\tCompNo\\t_\\tNo_rel\\n4\\tअध्यायम्\\tअध्यायम्\\t7\\tCompNo\\t_\\tNo_rel\\n5\\tव्याख्यास्यामः\\tव्याख्यास्यामः\\t7\\tCompNo\\t_\\tNo_rel\\n6\\t.\\t.\\t7\\tCompNo\\t_\\tNo_rel\\n7\\tDUMMY\\t_\\t0\\tCompNo\\t_\\troot'}")

In [19]:
SYSTEM = """
You are an expert in Sanskrit grammar, who identifies and classifies compounds in the given Sanskrit sentence. You will be given the original sentence. First break the sentence in compounds.
Follow these rules strictly:
1. Only use the following 4 compound types. Do not invent or include other types:
    - Tatpurusha: An endocentric compound where the first element (the attributive) determines the second.
    - Avyayibhava: An adverbial compound made of an indeclinable element and a noun, expressing an adverbial meaning.
    - Dvandva: A copulative compound where two or more noun stems are joined by 'and'.
    - Bahuvrihi: An exocentric compound that describes something by referring to its parts.
2. The sentence may contain nested compounds or non-compounded words — handle appropriately.
3. Maintain strict formatting and provide only the answer line. Do not include explanations.
4. The start or end indexes must not exceed the number of words in the sentence.
5. Answer in the devnagri script only, there shouldn't be any latin in the answer

Text:
{INPUT}

Return strictly in JSON with keys:
{
  "tokens": [...] ,
  "compounds":""
}

Rules:
- Tokenize by meaningful Sanskrit units.
- span = inclusive of start index, exclusive of end index.
- If multiple nested samāsa exist, include all.
- If none, return empty lists for compounds.
- Do not output anything outside JSON.
- The compound data is in tab separated conllu format as given in example.


Example 1:
Input: रागआदिरोगान् सततअनुषक्तान् अशेषकायप्रसृतान् अशेषान् औत्सुक्यमोहअरतिदान् जघान (यः) .
Output:
 {'tokens': ['राग', 'आदि', 'रोगान्', 'सतत', 'अनुषक्तान्', 'अ', 'शेष', 'काय', 'प्रसृतान्', 'अ', 'शेषान्', 'औत्सुक्य', 'मोह', 'अ', 'रति', 'दान्', 'जघान', '(यः)', '.'], 'compounds': '1\\tराग-\\tराग-आदि-रोगान्\\t2\\tComp3\\t_\\tबहुव्रीहिः\\n2\\tआदि-\\t--\\t3\\tComp3\\t_\\tकर्मधारयः\\n3\\tरोगान्\\t--\\t20\\tComp3\\t_\\tComp_root\\n4\\tसतत-\\tसतत-अनुषक्तान्\\t5\\tComp2\\t_\\tकर्मधारयः\\n5\\tअनुषक्तान्\\t--\\t20\\tComp2\\t_\\tComp_root\\n6\\tअ-\\tअ-शेष-काय-प्रसृतान्\\t7\\tComp4\\t_\\tनञ्-तत्पुरुषः\\n7\\tशेष-\\t--\\t8\\tComp4\\t_\\tकर्मधारयः\\n8\\tकाय-\\t--\\t9\\tComp4\\t_\\tसप्तमी-तत्पुरुषः\\n9\\tप्रसृतान्\\t--\\t20\\tComp4\\t_\\tComp_root\\n10\\tअ-\\tअ-शेषान्\\t11\\tComp2\\t_\\tअस्त्यर्थ-मध्यमपदलोपी(नञ्)-बहुव्रीहिः\\n11\\tशेषान्\\t--\\t20\\tComp2\\t_\\tComp_root\\n12\\tऔत्सुक्य-\\tऔत्सुक्य-मोह-अरति-दान्\\t13\\tComp4\\t_\\tइतरेतर-द्वन्द्वः\\n13\\tमोह-\\t--\\t14\\tComp4\\t_\\tइतरेतर-द्वन्द्वः\\n14\\tअ-\\t--\\t15\\tComp4\\t_\\tनञ्-तत्पुरुषः\\n15\\tरति-\\t--\\t16\\tComp4\\t_\\tComp_root\\n16\\tदान्\\t--\\t20\\tComp4\\t_\\tविशेषणम्\\n17\\tजघान\\tजघान\\t20\\tCompNo\\t_\\tNo_rel\\n18\\t(यः)\\t(यः)\\t20\\tCompNo\\t_\\tNo_rel\\n19\\t.\\t.\\t20\\tCompNo\\t_\\tNo_rel\\n20\\tDUMMY\\t_\\t0\\tCompNo\\t_\\troot '}
   
Example 2:
Input: 'अपूर्ववैद्याय नमः अस्तु तस्मै .
Output:
 {'tokens': ['अ', 'पूर्व', 'वैद्याय', 'नमः', 'अस्तु', 'तस्मै', '.'], 
 'compounds': '1\\tअ-\\tअ-पूर्व-वैद्याय\\t2\\tComp3\\t_\\tअस्त्यर्थ-मध्यमपदलोपी(नञ्)-बहुव्रीहिः\\n2\\tपूर्व-\\t--\\t3\\tComp3\\t_\\tकर्मधारयः\\n3\\tवैद्याय\\t--\\t8\\tComp3\\t_\\tComp_root\\n4\\tनमः\\tनमः\\t8\\tCompNo\\t_\\tNo_rel\\n5\\tअस्तु\\tअस्तु\\t8\\tCompNo\\t_\\tNo_rel\\n6\\tतस्मै\\tतस्मै\\t8\\tCompNo\\t_\\tNo_rel\\n7\\t.\\t.\\t8\\tCompNo\\t_\\tNo_rel\\n8\\tDUMMY\\t_\\t0\\tCompNo\\t_\\troot'}
    
Input: अथ अतः आयुष्कामीयम् अध्यायम् व्याख्यास्यामः .
Output:
 {'tokens': ['अथ', 'अतः', 'आयुष्कामीयम्', 'अध्यायम्', 'व्याख्यास्यामः', '.'], 
 'compounds': '1\\tअथ\\tअथ\\t7\\tCompNo\\t_\\tNo_rel\\n2\\tअतः\\tअतः\\t7\\tCompNo\\t_\\tNo_rel\\n3\\tआयुष्कामीयम्\\tआयुष्कामीयम्\\t7\\tCompNo\\t_\\tNo_rel\\n4\\tअध्यायम्\\tअध्यायम्\\t7\\tCompNo\\t_\\tNo_rel\\n5\\tव्याख्यास्यामः\\tव्याख्यास्यामः\\t7\\tCompNo\\t_\\tNo_rel\\n6\\t.\\t.\\t7\\tCompNo\\t_\\tNo_rel\\n7\\tDUMMY\\t_\\t0\\tCompNo\\t_\\troot'}
"""

# Chat-style wrapper with three slots: {system}, {INPUT} (the sentence) and
# {OUTPUT} (the assistant answer, placed before the final <|eot_id|>).
# NOTE(review): these <|start_header_id|>/<|eot_id|> markers follow the Llama-3
# chat layout, while the tokenizer loaded in this cell is a Gemma-3 one —
# presumably they are tokenized as plain text rather than as special tokens;
# confirm this is intended.
TEMPLATE = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>{system}<|eot_id|>
<|start_header_id|>user<|end_header_id|>Now analyze:{INPUT}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>Answer: {OUTPUT}<|eot_id|>"""

from transformers import AutoTokenizer
import pandas as pd

tokenizer = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")

df = merged

# Measure the tokenized length of every complete example (prompt + gold answer)
# so a safe maximum sequence length can be chosen.
lens = []
for sentence, gold in zip(df["sentence"], df["gold"]):
    # Render the gold answer INSIDE the assistant turn. Formatting with
    # OUTPUT="" and concatenating the gold afterwards would place it after the
    # closing <|eot_id|>, which is not the layout TEMPLATE describes.
    # NOTE(review): SYSTEM still contains a literal "{INPUT}" placeholder;
    # str.format only fills TEMPLATE's own fields, so it stays unfilled.
    txt = TEMPLATE.format(system=SYSTEM, INPUT=sentence, OUTPUT=gold)
    tokens = tokenizer(txt, truncation=False, add_special_tokens=False)["input_ids"]
    lens.append(len(tokens))

print("max tokenized length:", max(lens))
print("mean:", sum(lens)/len(lens))


max tokenized length: 3300
mean: 2174.7497490589712


In [32]:
import unsloth
import torch
import pandas as pd
import gc
from tqdm import tqdm
from datasets import Dataset
from transformers import TrainingArguments
from unsloth import FastLanguageModel, FastModel
from trl import SFTTrainer
from transformers import EarlyStoppingCallback
import json
# ---------- configs ----------
# Early-stopping callback with a patience of 3 (presumably handed to the
# trainer further down — not visible in this cell).
early_stopping_callback = EarlyStoppingCallback(early_stopping_patience=3)

# Input data and output directory.
TRAIN_CSV = "/home/shivraj-pg/DEPNECT/conllu-style-csv/without-context-coarse-train.csv"
OUT_DIR = "/home/shivraj-pg/DEPNECT/OUT_gemma4B_conllu"

# Sequence-length budget in tokens.
MAX_SEQ = 3500

# Adapter hyper-parameters (presumably LoRA rank / alpha / dropout — confirm
# against the model-setup code).
R = 64
ALPHA = 128
DROPOUT = 0.1

# Optimisation schedule.
LR = 1e-5
BATCH = 8
GRAD_ACC = 16
EPOCHS = 50

# Checkpointing and logging cadence, in steps.
SAVE_STEPS = 100
LOG_STEPS = 10

# System prompt placed at the top of every training example by build_prompt().
# NOTE(review): several instructions disagree with the demo outputs defined
# later in this cell (ex1_output etc.) — confirm which side is intended:
#   * Rule 1 lists four Latin-script compound types, but the demo outputs use
#     finer-grained Devanagari labels.
#   * Rule 5 forbids Latin script, yet the demo outputs contain Latin tags
#     such as Comp3, CompNo, No_rel, Comp_root, DUMMY and root.
#   * The "span" rule refers to a key that the JSON schema below no longer has
#     ("compounds" is now a single tab-separated string).
SYSTEM = """
You are an expert in Sanskrit grammar, who identifies and classifies compounds in the given Sanskrit sentence. You will be given the original sentence. First break the sentence in compounds.
Follow these rules strictly:
1. Only use the following 4 compound types. Do not invent or include other types:
    - Tatpurusha: An endocentric compound where the first element (the attributive) determines the second.
    - Avyayibhava: An adverbial compound made of an indeclinable element and a noun, expressing an adverbial meaning.
    - Dvandva: A copulative compound where two or more noun stems are joined by 'and'.
    - Bahuvrihi: An exocentric compound that describes something by referring to its parts.
2. The sentence may contain nested compounds or non-compounded words — handle appropriately.
3. Maintain strict formatting and provide only the answer line. Do not include explanations.
4. The start or end indexes must not exceed the number of words in the sentence.
5. Answer in the devnagri script only, there shouldn't be any latin in the answer

Return strictly in JSON with keys:
{
  "tokens": [...] ,
  "compounds":""
}

Rules:
- Tokenize by meaningful Sanskrit units.
- span = inclusive of start index, exclusive of end index.
- If multiple nested samāsa exist, include all.
- If none, return empty lists for compounds.
- Do not output anything outside JSON.
- The compound data is in tab separated conllu format as given in example.
"""

ex1_input = "रागआदिरोगान् सततअनुषक्तान् अशेषकायप्रसृतान् अशेषान् औत्सुक्यमोहअरतिदान् जघान (यः) ."

ex1_output = {
    "tokens": ["राग", "आदि", "रोगान्", "सतत", "अनुषक्तान्", "अ", "शेष", "काय", "प्रसृतान्", "अ", "शेषान्", "औत्सुक्य", "मोह", "अ", "रति", "दान्", "जघान", "(यः)", "."],
    "compounds": "1\tराग-\tराग-आदि-रोगान्\t2\tComp3\t_\tबहुव्रीहिः\n2\tआदि-\t--\t3\tComp3\t_\tकर्मधारयः\n3\tरोगान्\t--\t20\tComp3\t_\tComp_root\n4\tसतत-\tसतत-अनुषक्तान्\t5\tComp2\t_\tकर्मधारयः\n5\tअनुषक्तान्\t--\t20\tComp2\t_\tComp_root\n6\tअ-\tअ-शेष-काय-प्रसृतान्\t7\tComp4\t_\tनञ्-तत्पुरुषः\n7\tशेष-\t--\t8\tComp4\t_\tकर्मधारयः\n8\tकाय-\t--\t9\tComp4\t_\tसप्तमी-तत्पुरुषः\n9\tप्रसृतान्\t--\t20\tComp4\t_\tComp_root\n10\tअ-\tअ-शेषान्\t11\tComp2\t_\tअस्त्यर्थ-मध्यमपदलोपी(नञ्)-बहुव्रीहिः\n11\tशेषान्\t--\t20\tComp2\t_\tComp_root\n12\tऔत्सुक्य-\tऔत्सुक्य-मोह-अरति-दान्\t13\tComp4\t_\tइतरेतर-द्वन्द्वः\n13\tमोह-\t--\t14\tComp4\t_\tइतरेतर-द्वन्द्वः\n14\tअ-\t--\t15\tComp4\t_\tनञ्-तत्पुरुषः\n15\tरति-\t--\t16\tComp4\t_\tComp_root\n16\tदान्\t--\t20\tComp4\t_\tविशेषणम्\n17\tजघान\tजघान\t20\tCompNo\t_\tNo_rel\n18\t(यः)\t(यः)\t20\tCompNo\t_\tNo_rel\n19\t.\t.\t20\tCompNo\t_\tNo_rel\n20\tDUMMY\t_\t0\tCompNo\t_\troot"
}
ex2_input = "अपूर्ववैद्याय नमः अस्तु तस्मै ."

ex2_output = {
    "tokens": ["अ", "पूर्व", "वैद्याय", "नमः", "अस्तु", "तस्मै", "."],
    "compounds": "1\tअ-\tअ-पूर्व-वैद्याय\t2\tComp3\t_\tअस्त्यर्थ-मध्यमपदलोपी(नञ्)-बहुव्रीहिः\n2\tपूर्व-\t--\t3\tComp3\t_\tकर्मधारयः\n3\tवैद्याय\t--\t8\tComp3\t_\tComp_root\n4\tनमः\tनमः\t8\tCompNo\t_\tNo_rel\n5\tअस्तु\tअस्तु\t8\tCompNo\t_\tNo_rel\n6\tतस्मै\tतस्मै\t8\tCompNo\t_\tNo_rel\n7\t.\t.\t8\tCompNo\t_\tNo_rel\n8\tDUMMY\t_\t0\tCompNo\t_\troot"
}
ex3_input = "अथ अतः आयुष्कामीयम् अध्यायम् व्याख्यास्यामः ."

ex3_output = {
    "tokens": ["अथ", "अतः", "आयुष्कामीयम्", "अध्यायम्", "व्याख्यास्यामः", "."],
    "compounds": "1\tअथ\tअथ\t7\tCompNo\t_\tNo_rel\n2\tअतः\tअतः\t7\tCompNo\t_\tNo_rel\n3\tआयुष्कामीयम्\tआयुष्कामीयम्\t7\tCompNo\t_\tNo_rel\n4\tअध्यायम्\tअध्यायम्\t7\tCompNo\t_\tNo_rel\n5\tव्याख्यास्यामः\tव्याख्यास्यामः\t7\tCompNo\t_\tNo_rel\n6\t.\t.\t7\tCompNo\t_\tNo_rel\n7\tDUMMY\t_\t0\tCompNo\t_\troot"
}


# Few-shot demonstrations as (input sentence, JSON-serialised output) pairs.
# ensure_ascii=False keeps the Devanagari text readable instead of \u escapes.
DEMO_EXAMPLES = [
    (demo_input, json.dumps(demo_output, ensure_ascii=False))
    for demo_input, demo_output in (
        (ex1_input, ex1_output),
        (ex2_input, ex2_output),
        (ex3_input, ex3_output),
    )
]

# ---------- dataset prep ----------


def create_demo_block(demos):
    """Render few-shot demonstrations as alternating user/assistant turns.

    Args:
        demos: iterable of ``(input_text, output_json)`` string pairs.

    Returns:
        One string with a numbered (1-based) user/assistant block per demo,
        blocks separated by a newline; ``""`` when ``demos`` is empty.

    NOTE(review): the turns carry header markers but no ``<|eot_id|>``
    terminator between them — confirm this is the intended layout.
    """
    def render(number, inp, out_json):
        # One demonstration: the user turn followed by the assistant turn.
        return (
            "<|start_header_id|>user<|end_header_id|>\n"
            f"Example {number} Input:\n{inp}\n"
            "<|start_header_id|>assistant<|end_header_id|>\n"
            f"Example {number} Output:\n{out_json}\n"
        )

    return "\n".join(
        render(number, inp, out_json)
        for number, (inp, out_json) in enumerate(demos, start=1)
    )


demo_block = create_demo_block(DEMO_EXAMPLES)


def build_prompt(system_text: str, demo_block: str, target_sentence: str) -> str:
    """Assemble the prompt for one sentence, without the gold answer.

    Layout: begin-of-text marker, system turn, the demonstration block, a user
    turn asking to analyse ``target_sentence``, then an open assistant header
    that the completion is appended to.

    NOTE(review): an ``<|eot_id|>`` follows the demo block even though the
    demo turns themselves are not individually terminated — confirm intended.
    """
    system_turn = (
        "<|start_header_id|>system<|end_header_id|>\n"
        f"{system_text}\n\n"
        "<|eot_id|>"
    )
    demo_turns = f"{demo_block}\n" + "<|eot_id|>"
    user_turn = (
        "<|start_header_id|>user<|end_header_id|>\n"
        "Now analyze this sentence:\n"
        f"{target_sentence}\n"
        "<|eot_id|>"
    )
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>\n"
    pieces = ["<|begin_of_text|>\n", system_turn, demo_turns, user_turn, assistant_header]
    return "".join(pieces)



def csv_to_ds(path):
    """Turn a CSV with 'sentence' and 'gold' columns into a text Dataset.

    Rows with a missing value in either column are dropped. Each remaining row
    becomes one training string: the prompt built for the stripped sentence,
    immediately followed by the stripped gold string as the assistant
    completion.

    Returns:
        A ``datasets.Dataset`` with a single ``"text"`` column.
    """
    frame = pd.read_csv(path)[["sentence", "gold"]].dropna()

    # The gold string is expected to be in its final serialised form already;
    # it is appended verbatim after the open assistant header.
    texts = [
        build_prompt(SYSTEM, demo_block, str(sentence).strip()) + str(gold).strip()
        for sentence, gold in zip(frame["sentence"], frame["gold"])
    ]

    return Dataset.from_dict({"text": texts})



In [33]:
from transformers import AutoTokenizer
import pandas as pd
from tqdm import tqdm

# ---------- paths ----------
TRAIN_CSV = "/home/shivraj-pg/DEPNECT/conllu-style-csv/without-context-coarse-train.csv"
DEV_CSV   = "/home/shivraj-pg/DEPNECT/conllu-style-csv/without-context-coarse-dev.csv"

# ---------- prerequisites ----------
# compute_lengths() below reads three names that this cell does not define and
# that must already exist in the kernel:
#   SYSTEM      - the system prompt string
#   demo_block  - the rendered few-shot demonstrations
#   build_prompt(system_text, demo_block, target_sentence)

# Tokenizer used only for counting prompt tokens.
tok = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")

def compute_lengths(csv_path, name):
    """Tokenize the prompt for every sentence in a CSV and print length stats.

    Only the prompt is measured (system text + demonstrations + target
    sentence); the gold completion is not included. Rows whose 'sentence' is
    missing are skipped. Relies on the module-level ``tok``, ``SYSTEM``,
    ``demo_block`` and ``build_prompt``.

    Args:
        csv_path: path to a CSV file containing a 'sentence' column.
        name: label used in the printed report (e.g. "train").

    Returns:
        List of prompt lengths in tokens, in file order. Empty if the CSV has
        no non-null sentences, in which case only the count is reported
        instead of failing on max()/min()/mean of an empty list.
    """
    print(f"\nProcessing {name}...")

    df = pd.read_csv(csv_path)[["sentence"]].dropna()
    lengths = []

    for s in tqdm(df["sentence"], total=len(df)):
        prompt = build_prompt(
            SYSTEM,
            demo_block,
            str(s).strip()
        )
        # No special tokens: the prompt string already carries its own markers.
        ids = tok(prompt, return_attention_mask=False, add_special_tokens=False)["input_ids"]
        lengths.append(len(ids))

    print(f"Stats for {name}:")
    print(f" Count: {len(lengths)}")
    if not lengths:
        print(" No non-null sentences found; skipping length statistics.")
        return lengths

    # Sort once and reuse the result for the extremes and both percentiles.
    ordered = sorted(lengths)
    print(f" Max length: {ordered[-1]}")
    print(f" Min length: {ordered[0]}")
    print(f" Mean length: {sum(lengths)/len(lengths):.2f}")
    print(f" 95th percentile: {ordered[int(0.95*len(ordered))]}")
    print(f" 99th percentile: {ordered[int(0.99*len(ordered))]}")
    return lengths


# Measure prompt lengths for both splits, then report the longest prompt seen.
train_lengths = compute_lengths(TRAIN_CSV, "train")
dev_lengths = compute_lengths(DEV_CSV, "dev")

longest_train = max(train_lengths)
longest_dev = max(dev_lengths)
print("\nGlobal max length:", max(longest_train, longest_dev))



Processing train...


100%|██████████| 11000/11000 [00:16<00:00, 669.51it/s]


Stats for train:
 Count: 11000
 Max length: 1864
 Min length: 1780
 Mean length: 1801.97
 95th percentile: 1828
 99th percentile: 1842

Processing dev...


100%|██████████| 2000/2000 [00:02<00:00, 703.95it/s]

Stats for dev:
 Count: 2000
 Max length: 1851
 Min length: 1781
 Mean length: 1802.14
 95th percentile: 1827
 99th percentile: 1841

Global max length: 1864





In [34]:
# Preview one fully rendered training example (prompt + gold completion).
sample_idx = 0

import pandas as pd

# Same loading and filtering as csv_to_ds(), so the index lines up with it.
df = pd.read_csv(TRAIN_CSV)[["sentence", "gold"]].dropna()
sample_row = df.iloc[sample_idx]
sentence = sample_row["sentence"]
gold = sample_row["gold"]

# Prompt first, then the gold answer appended as the assistant completion.
prompt_only = build_prompt(SYSTEM, demo_block, str(sentence).strip())
full_input = prompt_only + str(gold).strip()

for line in (
    "===== FULL PROMPT INPUT TO MODEL =====",
    full_input,
    "======================================",
):
    print(line)

# Token count of the full example; the tokenizer is called with its defaults.
from transformers import AutoTokenizer
tok = AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
token_ids = tok(full_input)["input_ids"]
print("Token count:", len(token_ids))


===== FULL PROMPT INPUT TO MODEL =====
<|begin_of_text|>
<|start_header_id|>system<|end_header_id|>

You are an expert in Sanskrit grammar, who identifies and classifies compounds in the given Sanskrit sentence. You will be given the original sentence. First break the sentence in compounds.
Follow these rules strictly:
1. Only use the following 4 compound types. Do not invent or include other types:
    - Tatpurusha: An endocentric compound where the first element (the attributive) determines the second.
    - Avyayibhava: An adverbial compound made of an indeclinable element and a noun, expressing an adverbial meaning.
    - Dvandva: A copulative compound where two or more noun stems are joined by 'and'.
    - Bahuvrihi: An exocentric compound that describes something by referring to its parts.
2. The sentence may contain nested compounds or non-compounded words — handle appropriately.
3. Maintain strict formatting and provide only the answer line. Do not include explanations.
4. The 