In [1]:
pip install transformers datasets torch scikit-learn pandas

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupt

In [2]:
pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
import pandas as pd
import torch
import random
import numpy as np
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
from transformers import DataCollatorWithPadding

# Set seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Load the newly generated dataset
df = pd.read_csv("/content/generated_text_with_labels.csv")  # Replace with the actual file path

# Check the initial contents of the DataFrame
print("Original DataFrame:")
print(df.head())  # Print the first few rows
print(df.info())  # Print information about the DataFrame

# Convert any non-numeric values to NaN, if any
df['Label'] = pd.to_numeric(df['Label'], errors='coerce')

# Check DataFrame after converting to numeric
print("\nDataFrame after converting 'label' to numeric:")
print(df.head())
print(df.info())

# Remove missing values
df = df.dropna()

# Check DataFrame after dropping NaNs
print("\nDataFrame after dropping NaNs:")
print(df.head())
print(df.info())

# Split data into train (80%) and test (20%)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Sentence"].tolist(), df["Label"].tolist(), test_size=0.2, random_state=42
)

# Load the mBERT tokenizer
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = [int(label) for label in labels]  # Convert float labels to int
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)  # Ensure int64
        return item

# Prepare train and validation datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

# Load pre-trained mBERT model
model = BertForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=5)  # 5 labels for 5 languages

# Define training arguments
training_args = TrainingArguments(
    output_dir="./mbert-language-classifier",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    report_to="none",  # Disable Weights & Biases logging
)

# Evaluation Metric
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Original DataFrame:
                                    Sentence  Label
0                     నేడు రాత్రి బాగా ఉంది.      3
1  She loved reading books in her free time.      4
2                   મને કટરાય પીરજના ભાઈ છે.      2
3      त्या दिनी मी कायम स른 सांगून होतuscam.      1
4          तुमच्या निश्चित इच्छा पूर्ण होते.      1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Sentence  1025 non-null   object
 1   Label     1025 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 16.1+ KB
None

DataFrame after converting 'label' to numeric:
                                    Sentence  Label
0                     నేడు రాత్రి బాగా ఉంది.      3
1  She loved reading books in her free time.      4
2                   મને કટરાય પીરજના ભાઈ છે.      2
3      त्या दिनी मी कायम स른 सांगून होतuscam.      1
4          तुमच्या निश्चित इच्छा पूर्ण होते.  

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1893,0.002733,1.0
2,0.0142,0.001515,1.0
3,0.0011,0.000949,1.0
4,0.0007,0.000694,1.0
5,0.0005,0.000557,1.0
6,0.0004,0.00047,1.0
7,0.0004,0.000414,1.0
8,0.0003,0.00038,1.0
9,0.0003,0.000361,1.0
10,0.0003,0.000354,1.0


TrainOutput(global_step=1030, training_loss=0.020756429093844682, metrics={'train_runtime': 416.9733, 'train_samples_per_second': 19.666, 'train_steps_per_second': 2.47, 'total_flos': 539392192051200.0, 'train_loss': 0.020756429093844682, 'epoch': 10.0})

In [4]:
# Save the fine-tuned model
model.save_pretrained("./mbert-language-classifier_1")
tokenizer.save_pretrained("./mbert-language-classifier_1")

print("Fine-tuning complete! Model saved to './mbert-language-classifier'")

Fine-tuning complete! Model saved to './mbert-language-classifier'


In [5]:
# prompt: make a decoder for this predicted class

# Load the fine-tuned model and tokenizer
model_path = "./mbert-language-classifier_1"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=5)

# Set the model to evaluation mode
model.eval()

# Example sentence to predict
sentence = '''आज सकाळी मी लवकर उठलो आणि ताज्या हवेचा आनंद घेतला. फिरायला जाण्याचा विचार केला, लेकिन मौसम थोड़ा ठंडा था, इसलिए मैंने पहले चाय पीने का फैसला किया. मी माझ्या आवडत्या कपमध्ये चहा घेतला आणि खिडकीतून बाहेरचं सुंदर दृश्य पाहत बसलो.

थोड़ी देर बाद, मैंने नाश्ता किया और अपनी किताबें लेकर पढ़ाई शुरू कर दी. अभ्यास करताना काही प्रश्न मला समजत नव्हते, म्हणून मी माझ्या मित्राला फोन केला. त्याने मला काही चांगल्या टिप्स दिल्या, जिससे मेरी समस्या हल हो गई. मग मी थोडा वेळ संगीत ऐकले आणि रिलॅक्स झालो.

दुपारी मला बाजारात जावे लागले, कारण घरी काही वस्तूंची गरज होती. बाजारात भरपूर गर्दी होती, पण मला माझ्या लिस्टमधील सर्व सामान पटकन मिळाले. मी काही ताज्या भाज्या घेतल्या, आणि एका दुकानात थांबून माझ्या आवडती पेस्ट्री घेतली. वापसी के दौरान, मैं एक पुराने दोस्त से मिला, आणि आम्ही थोडा वेळ बोललो.

संध्याकाळी मी माझ्या कुटुंबासोबत वेळ घालवला. आम्ही एकत्र बसून चहा घेतला आणि गप्पा मारल्या. मग मी माझ्या लहान भावंडांसोबत खेळलो, और हमें बहुत मज़ा आया. थोड़ी देर बाद, मी पुस्तक वाचायला घेतलं, पण वाचता-वाचता झोप येऊ लागली.

रात्रीच्या जेवणानंतर, मी थोडा वेळ बाहेर ताज्या हवेत फिरायला गेलो. तारे चमकत होते, आणि वातावरण शांत होतं. वापस आकर, मैंने अपने दिन के बारे में सोचा आणि मला वाटलं की आजचा दिवस खरोखरच छान गेला. मग मी झोपण्याची तयारी केली आणि काही क्षणांतच गाढ झोपलो'''

# Tokenize the sentence
inputs = tokenizer(sentence, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Make a prediction
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted label
predicted_class = torch.argmax(outputs.logits, dim=1).item()

# Decode the predicted class (assuming labels represent specific categories)
# Replace with your actual label mapping
label_mapping = {
    0: "Hindi",  # Example: "Negative sentiment"
    1: "Marathi",  # Example: "Neutral sentiment"
    2: "Gujarati",  # Example: "Positive sentiment"
    3: "Telugu",  # Example: "Mixed sentiment"
    4: "English"  # Example: "Mixed sentiment"
}

decoded_label = label_mapping.get(predicted_class, "Unknown label")

# Print the decoded label
print(f"Predicted class: {predicted_class}, Decoded Label: {decoded_label}")


Predicted class: 1, Decoded Label: Marathi


#### Loading Model for training on CODE MIXED Language

In [6]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import evaluate

# Load the previously trained model and tokenizer
model_path = "./mbert-language-classifier_1"  # Path to your previously trained model
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=5)

# Load the code-mixed dataset
df = pd.read_csv("/content/mixed_language_dataset.csv")  # Replace with your actual code-mixed dataset path

# Split data into train (80%) and test (20%)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["Sentence"].tolist(), df["Label"].tolist(), test_size=0.2, random_state=42
)

# Custom Dataset class for tokenization and encoding
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = [int(label) for label in labels]
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            padding="max_length",
            truncation=True,
            max_length=128,
            return_tensors="pt"
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Prepare datasets
train_dataset = CustomDataset(train_texts, train_labels, tokenizer)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer)

# Define evaluation metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    logits = torch.from_numpy(logits)  # Convert logits to PyTorch tensor
    predictions = torch.argmax(logits, dim=-1)
    return accuracy.compute(predictions=predictions, references=labels)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./mbert-language-classifier-finetuned",  # Directory to save the fine-tuned model
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Train the model
trainer.train()


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mkurliyedrk[0m ([33mkurliyedrk-centre-for-development-of-advanced-computing-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.0007,7.9e-05,1.0
2,0.0001,3.7e-05,1.0
3,0.0017,6e-05,1.0
4,0.0118,1.9e-05,1.0
5,0.0,1.4e-05,1.0
6,0.0,1.2e-05,1.0
7,0.0,1e-05,1.0
8,0.0,1e-05,1.0
9,0.0,9e-06,1.0
10,0.0,9e-06,1.0


TrainOutput(global_step=1280, training_loss=0.001431310699081223, metrics={'train_runtime': 3066.8118, 'train_samples_per_second': 3.326, 'train_steps_per_second': 0.417, 'total_flos': 670951263283200.0, 'train_loss': 0.001431310699081223, 'epoch': 10.0})

In [7]:
# Save the fine-tuned model
model.save_pretrained("./mbert-language-codeMixed-classifier")
tokenizer.save_pretrained("./mbert-language-codeMixed-classifier")

print("Fine-tuning complete! Model saved to './mbert-language-classifier'")

Fine-tuning complete! Model saved to './mbert-language-classifier'


In [8]:


# Load the fine-tuned model and tokenizer
model_path = "./mbert-language-codeMixed-classifier"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=5)

# Set the model to evaluation mode
model.eval()

# Example sentence to predict
sentence = '''आज तो बहुत hectic दिन था, work के बीच में, मैं सोच रहा था कि ज़रा आराम करूं, पर अचानक ऑफिस में बहोत urgent काम आ गया, and फिर నాపై చాలా ఒత్తిడి వచ్చింది. જ્યારે मीटिंग खतम होई, तेव्हा मी खुदाला बोललो की, it's time to chill, पण घर पोहोचताच, I realized there’s more to do.'''

# Tokenize the sentence
inputs = tokenizer(sentence, padding=True, truncation=True, max_length=128, return_tensors="pt")

# Make a prediction
with torch.no_grad():
    outputs = model(**inputs)

# Get the predicted label
predicted_class = torch.argmax(outputs.logits, dim=1).item()

# Decode the predicted class (assuming labels represent specific categories)
# Replace with your actual label mapping
label_mapping = {
    0: "Hindi",  # Example: "Negative sentiment"
    1: "Marathi",  # Example: "Neutral sentiment"
    2: "Gujarati",  # Example: "Positive sentiment"
    3: "Telugu",  # Example: "Mixed sentiment"
    4: "English"  # Example: "Mixed sentiment"
}

decoded_label = label_mapping.get(predicted_class, "Unknown label")

# Print the decoded label
print(f"Predicted class: {predicted_class}, Decoded Label: {decoded_label}")


Predicted class: 0, Decoded Label: Hindi


In [20]:
# prompt: i want to download this  models in my local disk

from google.colab import files
import os

# Define the paths to the models you want to download
model_paths = ["./mbert-language-classifier_1", "./mbert-language-codeMixed-classifier"]

# Function to zip a directory
def zip_directory(directory_path, zip_filename):
    os.system(f"zip -r {zip_filename} {directory_path}")
    return zip_filename


# Download each model
for model_path in model_paths:
  zip_file = zip_directory(model_path, f"{model_path}.zip")
  files.download(zip_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

### NLLB on Mixed Text

In [9]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [10]:

target_lang_code = None
if decoded_label == "Hindi":
  target_lang_code = "hin_Deva"
elif decoded_label == "Marathi":
  target_lang_code = "mar_Deva"
elif decoded_label == "Gujarati":
  target_lang_code = "guj_Gujr"
elif decoded_label == "Telugu":
  target_lang_code = "tel_Telu"
elif decoded_label== "English":
  target_lang_code = "eng_Latn"

inputs = tokenizer(sentence, return_tensors="pt")

translated_tokens = model.generate(
    **inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_lang_code), max_length=30
)
Translated_sent=tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
Translated_sent

'आज तो बहुत ही व्यस्त दिन था, काम के बीच में, मैं सोच रहा था कि थोड़ा आराम करूं, लेकिन अचानक ऑफिस में बहुत'

### IndicNER

In [11]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.72M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/667M [00:00<?, ?B/s]

In [12]:
def get_predictions( sentence, tokenizer, model ):
  # Let us first tokenize the sentence - split words into subwords
  tok_sentence = tokenizer(sentence, return_tensors='pt')

  with torch.no_grad():
    # we will send the tokenized sentence to the model to get predictions
    logits = model(**tok_sentence).logits.argmax(-1)

    # We will map the maximum predicted class id with the class label
    predicted_tokens_classes = [model.config.id2label[t.item()] for t in logits[0]]

    predicted_labels = []

    previous_token_id = 0
    # we need to assign the named entity label to the head word and not the following sub-words
    word_ids = tok_sentence.word_ids()
    for word_index in range(len(word_ids)):
        if word_ids[word_index] == None:
            previous_token_id = word_ids[word_index]
        elif word_ids[word_index] == previous_token_id:
            previous_token_id = word_ids[word_index]
        else:
            predicted_labels.append( predicted_tokens_classes[ word_index ] )
            previous_token_id = word_ids[word_index]

    return predicted_labels

In [13]:
# let us try with some example sentences here
sentence = Translated_sent

predicted_labels = get_predictions(sentence=sentence,
                                   tokenizer=tokenizer,
                                   model = model)

for index in range(len(sentence.split(' '))):
  print( sentence.split(' ')[index] + '\t' + predicted_labels[index] )

आज	O
तो	O
बहुत	O
ही	O
व्यस्त	O
दिन	O
था,	O
काम	O
के	O
बीच	O
में,	O
मैं	O
सोच	O
रहा	O
था	O
कि	O
थोड़ा	O
आराम	O
करूं,	O
लेकिन	O
अचानक	O
ऑफिस	O
में	O
बहुत	O


In [14]:
# import joblib

# # Save the fine-tuned model and tokenizer using joblib
# joblib.dump(model, "./mbert-language-codeMixed-classifier_joblib.pkl")
# joblib.dump(tokenizer, "./mbert-language-codeMixed-classifier_tokenizer_joblib.pkl")

# print("Fine-tuned model and tokenizer saved using joblib!")

In [15]:
# # prompt: give me gradio code where user can give input as code-mixed language sent and output as predominant language and translated language and name entities give gradio code for above

# import gradio as gr

# def predict_and_translate(text):
#     # Load the fine-tuned model and tokenizer for language identification
#     model_path = "./mbert-language-codeMixed-classifier"
#     tokenizer = BertTokenizer.from_pretrained(model_path)
#     model = BertForSequenceClassification.from_pretrained(model_path, num_labels=5)
#     model.eval()

#     # Tokenize the input text
#     inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors="pt")

#     # Make a prediction
#     with torch.no_grad():
#         outputs = model(**inputs)
#     predicted_class = torch.argmax(outputs.logits, dim=1).item()

#     label_mapping = {
#         0: "Hindi",
#         1: "Mararthi",
#         2: "Gujarati",
#         3: "Telugu",
#         4: "English"
#     }
#     decoded_label = label_mapping.get(predicted_class, "Unknown label")

#     # Translation using NLLB
#     tokenizer_nllb = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
#     model_nllb = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

#     target_lang_code = None
#     if decoded_label == "Hindi":
#         target_lang_code = "hin_Deva"
#     elif decoded_label == "Marathi":
#         target_lang_code = "mar_Deva"
#     elif decoded_label == "Gujarati":
#         target_lang_code = "guj_Gujr"
#     elif decoded_label == "Telugu":
#         target_lang_code = "tel_Telu"
#     elif decoded_label == "English":
#         target_lang_code = "eng_Latn"

#     if target_lang_code:
#         inputs_nllb = tokenizer_nllb(text, return_tensors="pt")
#         translated_tokens = model_nllb.generate(
#             **inputs_nllb, forced_bos_token_id=tokenizer_nllb.convert_tokens_to_ids(target_lang_code), max_length=30
#         )
#         Translated_sent = tokenizer_nllb.batch_decode(translated_tokens, skip_special_tokens=True)[0]
#     else:
#         Translated_sent = "Translation not available for this language"


#     # Named Entity Recognition using IndicNER
#     tokenizer_ner = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
#     model_ner = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")

#         # Load model directly
#     from transformers import AutoTokenizer, AutoModelForTokenClassification

#     tokenizer = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
#     model = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")


#     predicted_labels = get_predictions(sentence=Translated_sent,
#                                       tokenizer=tokenizer,
#                                       model=model
#                                       )
#     ner_results=''
#     for index in range(len(sentence.split(' '))):
#         ner_results+=sentence.split(' ')[index] + '\t' + predicted_labels[index]

#     return decoded_label, Translated_sent, ner_results


# iface = gr.Interface(
#     fn=predict_and_translate,
#     inputs=gr.Textbox(lines=2, placeholder="Enter code-mixed text here..."),
#     outputs=[
#         gr.Textbox(label="Predominant Language"),
#         gr.Textbox(label="Translated Text"),
#         gr.Textbox(label="Named Entities")
#     ],
#     title="Code-Mixed Text Analyzer",
#     description="Identify the predominant language, translate the text, and extract named entities."
# )

# iface.launch()


In [16]:
# text='''આજે office મા meeting છે, but મારા પાસે report હવે સુધી તૈયાર નથી. मैं सुबह से try कर रहा हूँ but internet slow છે. నిన్న రాత్రి late work చేశాను, so I am feeling very tired. હવે boss will ask for updates, અને મને explain કરવું પડશે. दोस्त लोग also warned me कि તું time પર report complete કર, but still I got late. Now, I just hope કે somehow everything goes fine.'''

In [17]:
#  # Named Entity Recognition using IndicNER
# tokenizer_ner = AutoTokenizer.from_pretrained("ai4bharat/IndicNER")
# model_ner = AutoModelForTokenClassification.from_pretrained("ai4bharat/IndicNER")



# predicted_labels = get_predictions(sentence=Translated_sent, tokenizer=tokenizer_ner, model=model_ner)
# ner_results = ""
# for index in range(len(Translated_sent.split(' '))):
#   ner_results += Translated_sent.split(' ')[index] + '\t' + predicted_labels[index] + '\n'

# print(decoded_label, Translated_sent, ner_results)