In [None]:


# Import required libraries
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import Dataset, DataLoader
import numpy as np

# Synthetic Marathi FAQ corpus, kept as (question, answer) pairs so each
# question sits right next to the answer it belongs to.
_FAQ_PAIRS = [
    (
        "तुमचे कार्यालय कुठे आहे?",
        "आमचे कार्यालय पुणे येथे आहे.",
    ),
    (
        "ऑनलाईन पेमेंट कसे करावे?",
        'भुगतान पृष्ठावर जा आणि "ऑनलाईन पेमेंट" पर्याय निवडा.',
    ),
    (
        "उत्पादनाची हमी किती काळ आहे?",
        "सर्व उत्पादनांना १ वर्षाची हमी आहे.",
    ),
    (
        "खाते कसे बनवायचे?",
        "'नवीन खाते तयार करा' वर क्लिक करून फॉर्म भरा.",
    ),
    (
        "ऑर्डर कॅन्सल कसे करायचे?",
        "माय अकाऊंटमध्ये जाऊन ऑर्डर कॅन्सल करू शकता.",
    ),
]

# Column-oriented view of the same data: two parallel lists keyed by column name.
faq_data = {
    "question": [question for question, _ in _FAQ_PAIRS],
    "answer": [answer for _, answer in _FAQ_PAIRS],
}

df = pd.DataFrame(faq_data)

# Multilingual sentence encoder used for retrieval-style question matching
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
sentence_model = SentenceTransformer(EMBEDDING_MODEL_NAME)

# Embed every FAQ question once up front; get_answer() compares user queries
# against these stored vectors.
faq_questions = df['question'].tolist()
question_embeddings = sentence_model.encode(faq_questions)

def get_answer(user_query):
    """Return the FAQ answer whose question is most similar to *user_query*.

    The query is embedded with ``sentence_model`` and compared against the
    pre-computed ``question_embeddings`` using cosine similarity; the answer
    stored in the row of ``df`` with the highest score is returned.

    Note that the best match is always returned, however low its score is;
    there is no "I don't know" fallback.

    Args:
        user_query: The user's question as a single string.

    Returns:
        The ``answer`` value of the best-matching row of ``df``.
    """
    query_embedding = sentence_model.encode(user_query)
    # Row 0 holds the scores of the (single) query against every FAQ question.
    similarities = util.pytorch_cos_sim(query_embedding, question_embeddings)[0]
    # Use the tensor's own argmax instead of np.argmax: the NumPy call depends
    # on an implicit tensor -> ndarray conversion, which only works for CPU
    # tensors.
    most_similar_idx = int(similarities.argmax())
    return df.iloc[most_similar_idx]['answer']

# Quick smoke test of the retrieval path with one sample query
user_input = "पैसे कसे भरायचे?"
print(f"User: {user_input}")
bot_reply = get_answer(user_input)
print(f"Bot: {bot_reply}")

# Intent classification: every distinct FAQ question is treated as its own
# intent class. Keep the fitted encoder so integer labels can be mapped back to
# the original question text if needed.
intent_encoder = LabelEncoder()
df['intent'] = intent_encoder.fit_transform(df['question'])

# NOTE: with one example per intent, any question placed in the test split
# belongs to a class the model never sees during training, so the test split
# here is only a smoke check, not a meaningful evaluation set.
# A fixed random_state keeps the split (and therefore the run) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['question'], df['intent'], test_size=0.2, random_state=42
)

# Load the pre-trained multilingual BERT model for intent classification.
# The size of the classification head follows the number of distinct intents
# rather than the number of rows, so duplicate questions do not inflate it.
INTENT_MODEL_NAME = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(INTENT_MODEL_NAME)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    INTENT_MODEL_NAME, num_labels=int(df['intent'].nunique())
)

# Tokenize data (padded to the longest sequence in each split, capped at 128 tokens)
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=128)

# Custom Dataset Class for Intent Classification
class IntentDataset(Dataset):
    """Torch dataset pairing tokenizer encodings with integer intent labels.

    Args:
        encodings: Mapping from feature name (e.g. ``input_ids``) to a
            sequence holding one entry per example.
        labels: Integer class labels, one per example, in the same order as
            the encodings. Any iterable of ints is accepted (pandas Series,
            list, NumPy array, ...).
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Normalise to a plain list of ints so that lookups are purely
        # positional and do not depend on the container type (the previous
        # ``.iloc`` access only worked for pandas objects).
        self.labels = [int(label) for label in labels]

    def __getitem__(self, idx):
        """Return the features and label of example *idx* as tensors."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # The classification loss expects int64 ("long") class indices.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Return the number of examples."""
        return len(self.labels)

# Wrap the tokenized training split and serve it in shuffled mini-batches
train_dataset = IntentDataset(train_encodings, y_train)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)

# Fine-tuning setup for intent classification: optimiser plus target device
optimizer = torch.optim.AdamW(bert_model.parameters(), lr=5e-5)
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
bert_model.to(device)

NUM_EPOCHS = 3

# Training mode for the whole fine-tuning run
bert_model.train()
for epoch in range(NUM_EPOCHS):
    for batch in train_loader:
        optimizer.zero_grad()
        # Passing labels makes the model return the loss together with the logits.
        outputs = bert_model(
            batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            labels=batch['labels'].to(device),
        )
        outputs.loss.backward()
        optimizer.step()
    print(f"Epoch {epoch + 1} completed")

# Predict intent function
def predict_intent(text):
    """Classify *text* with the fine-tuned BERT model and return its FAQ answer.

    The model is switched to evaluation mode for the forward pass and gradient
    tracking is disabled, so the prediction does not depend on training-only
    behaviour such as dropout. The model's previous train/eval mode is
    restored afterwards.

    Args:
        text: A single user question as a string.

    Returns:
        The ``answer`` of the first row in ``df`` whose ``intent`` equals the
        predicted class index.

    Raises:
        IndexError: If no row in ``df`` carries the predicted intent label.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    was_training = bert_model.training
    bert_model.eval()
    try:
        with torch.no_grad():
            logits = bert_model(**inputs).logits
    finally:
        # Leave the model in whatever mode the caller had it in.
        bert_model.train(was_training)

    # Take the arg-max over the class dimension of the first (only) example
    # rather than over the flattened logits tensor.
    predicted_label = int(logits[0].argmax(dim=-1))
    return df[df['intent'] == predicted_label]['answer'].values[0]

# Test the intent prediction
print(predict_intent("हमी किती काळ आहे?"))

# Chatbot for real-time interaction (this uses question-answer retrieval with SentenceTransformer)
while True:
    try:
        # Strip surrounding whitespace so that e.g. " बंद " still ends the chat.
        user_input = input("तुमचा प्रश्न टाइप करा (exit साठी 'बंद' टाइप करा): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Closed stdin or Ctrl-C: end the session quietly instead of a traceback.
        break
    if user_input.lower() == 'बंद':
        break
    if not user_input:
        # Nothing to look up; ask again rather than matching an empty query.
        continue
    print(f"Bot: {get_answer(user_input)}")



User: पैसे कसे भरायचे?
Bot: 'नवीन खाते तयार करा' वर क्लिक करून फॉर्म भरा.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 completed
Epoch 2 completed
Epoch 3 completed
'नवीन खाते तयार करा' वर क्लिक करून फॉर्म भरा.


तुमचा प्रश्न टाइप करा (exit साठी 'बंद' टाइप करा):   कसे करावे?


Bot: 'नवीन खाते तयार करा' वर क्लिक करून फॉर्म भरा.
