In [None]:
from transformers import BertTokenizerFast, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch  # Added missing import

# Step 1: Build a tiny in-memory Marathi corpus (demo only).
# Each record pairs a sentence with a class id; the prediction helper in this
# file reports class 0 as "Positive" and anything else as "Negative".
marathi_text = [
    {"text": "हे माझं घर आहे.", "label": 0},
    {"text": "आज बरीच वृष्टी झाली.", "label": 1},
    {"text": "तुम्ही काय करत आहात?", "label": 0},
    {"text": "तो खूप दुखी आहे.", "label": 1},
]

# Pivot the row-oriented records into the column-oriented mapping that
# Dataset.from_dict consumes: one list per column, in record order.
dataset_columns = {
    column: [record[column] for record in marathi_text]
    for column in ("text", "label")
}
marathi_dataset = Dataset.from_dict(dataset_columns)

# Step 2: Load the fast multilingual BERT tokenizer; it is shared by the
# preprocessing step and by inference.
checkpoint_name = 'bert-base-multilingual-cased'
tokenizer = BertTokenizerFast.from_pretrained(checkpoint_name)

def preprocess_function(examples):
    """Tokenize the ``text`` entries of a batch for BERT.

    Args:
        examples: Batch mapping handed over by ``Dataset.map``; only its
            ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch, with every sequence truncated
        and padded to exactly 128 tokens.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": 'max_length',
        "max_length": 128,
    }
    batch_texts = examples['text']
    return tokenizer(batch_texts, **tokenizer_options)

# Apply tokenization to the whole dataset in batches
tokenized_datasets = marathi_dataset.map(preprocess_function, batched=True)

# Step 3: Fine-tuning the BERT model for sequence classification.
# num_labels=2 attaches a freshly initialised two-class head on top of the
# pretrained encoder, so the head only becomes useful after training.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=2)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    # Log once per epoch: with the default step-based logging interval this
    # tiny run finishes before anything is logged and the training-loss
    # column only shows "No log".
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer setup.
# NOTE: the same four examples are used for training and evaluation, so the
# reported validation loss says nothing about generalisation (demo only).
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    # `tokenizer=` is deprecated in Trainer and slated for removal;
    # `processing_class=` is its replacement (needs a transformers release
    # that has it -- the same generation that accepts `eval_strategy`).
    processing_class=tokenizer,
)

# Step 4: Training the model
trainer.train()

# Step 5: Testing the model (classification)
def predict(text):
    """Classify a single piece of text with the fine-tuned model.

    Args:
        text: The sentence to classify.

    Returns:
        ``"Positive"`` when the highest-scoring class is 0, otherwise
        ``"Negative"``.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding='max_length', max_length=128)
    # Training may have moved the model to an accelerator; the freshly
    # tokenized tensors live on CPU, so move them to wherever the model is
    # to avoid a device-mismatch error.
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}
    # Make sure dropout is disabled so repeated calls give the same answer.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    return "Positive" if prediction == 0 else "Negative"

# Step 6: Implement continuous input function
def get_input_prediction():
    """Prompt for text in a loop and print the predicted sentiment.

    The loop ends when the user types ``exit`` (case-insensitive, surrounding
    whitespace ignored) or when standard input is closed. Blank lines are
    skipped instead of being sent to the model.
    """
    prompt = "Enter text for sentiment prediction (in Marathi) or type 'exit' to quit: "
    while True:
        try:
            # Strip so that " exit " or a trailing newline/space still matches.
            test_text = input(prompt).strip()
        except EOFError:
            # stdin was closed (e.g. piped input ran out): leave cleanly
            # rather than crash with a traceback.
            print("Exiting the program.")
            break
        if not test_text:
            # Nothing to classify; ask again.
            continue
        if test_text.lower() == 'exit':
            print("Exiting the program.")
            break
        prediction = predict(test_text)
        print(f"Prediction for '{test_text}': {prediction}")

# Start the interactive prompt only when this code is executed directly
# (script or notebook cell), so importing it never blocks waiting on input().
if __name__ == "__main__":
    get_input_prediction()


Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.601694
2,No log,0.556033
3,No log,0.52025


Enter text for sentiment prediction (in Marathi) or type 'exit' to quit:  ": "तुम्ही काय करत आहात?"


Prediction for '": "तुम्ही काय करत आहात?"': Positive


Enter text for sentiment prediction (in Marathi) or type 'exit' to quit:  आज बरीच वृष्टी झाली.",


Prediction for 'आज बरीच वृष्टी झाली.",': Positive


Enter text for sentiment prediction (in Marathi) or type 'exit' to quit:  "तो खूप दुखी आहे."


Prediction for '"तो खूप दुखी आहे."': Positive


Enter text for sentiment prediction (in Marathi) or type 'exit' to quit:  तो खूप दुखी आहे


Prediction for 'तो खूप दुखी आहे': Negative


Enter text for sentiment prediction (in Marathi) or type 'exit' to quit:  बरीच वृष्टी 


Prediction for 'बरीच वृष्टी ': Positive


Enter text for sentiment prediction (in Marathi) or type 'exit' to quit:   घर आहे


Prediction for ' घर आहे': Positive


Enter text for sentiment prediction (in Marathi) or type 'exit' to quit:  वृष्टी झाली.


Prediction for 'वृष्टी झाली.': Negative
