In [9]:

!pip install transformers sentence-transformers langdetect chromadb sentencepiece torch babel nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
import os
from typing import Optional, List, Dict, Any
from datetime import datetime
from dataclasses import dataclass

import nltk
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("punkt_tab") # Download punkt_tab for multilingual tokenization
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

from langdetect import detect, DetectorFactory
DetectorFactory.seed = 0

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from sentence_transformers import SentenceTransformer

# ---------- Configuration ----------
SUPPORTED_LANGUAGES = {
    "en": "English",
    "es": "Spanish",
    "hi": "Hindi",
    "zh": "Chinese (Simplified)"
}

INTERNAL_LANG = "en"
DEVICE = "cpu" # Changed device to CPU

# Translation models for multilingual support (Helsinki/Marian)
TRANSLATION_MODELS = {
    ("es", "en"): "Helsinki-NLP/opus-mt-es-en",
    ("en", "es"): "Helsinki-NLP/opus-mt-en-es",
    ("hi", "en"): "Helsinki-NLP/opus-mt-hi-en",
    ("en", "hi"): "Helsinki-NLP/opus-mt-en-hi",
    ("zh", "en"): "Helsinki-NLP/opus-mt-zh-en",
    ("en", "zh"): "Helsinki-NLP/opus-mt-en-zh",
}

# Cultural info
CULTURE_MAP = {
    "en": {"greeting": "Hello"},
    "es": {"greeting": "Hola"},
    "hi": {"greeting": "नमस्ते"},
    "zh": {"greeting": "你好"},
}

# ---------- Global caches ----------
_translation_pipelines = {}
_embedding_model = None
_sentiment_analyzer = None

# ---------- Helper Classes ----------
@dataclass
class TranslationPipeline:
    src: str
    tgt: str
    model_name: str
    tokenizer: Any
    model: Any
    pipeline: Any

# ---------- Functions ----------

# Language detection
def detect_language(text: str) -> str:
    try:
        lang = detect(text)
        if lang.startswith("zh"):
            return "zh"
        return lang if lang in SUPPORTED_LANGUAGES else "en"
    except:
        return "en"

# Translation
def load_translation_pipeline(src: str, tgt: str) -> TranslationPipeline:
    key = (src, tgt)
    if key in _translation_pipelines:
        return _translation_pipelines[key]
    model_name = TRANSLATION_MODELS.get(key)
    if not model_name:
        raise ValueError(f"No translation model for {src}->{tgt}")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    from transformers import pipeline as hf_pipeline
    # Explicitly set device to -1 for CPU
    pipe = hf_pipeline("translation", model=model, tokenizer=tokenizer, device=-1)
    tp = TranslationPipeline(src, tgt, model_name, tokenizer, model, pipe)
    _translation_pipelines[key] = tp
    return tp

def translate_text(text: str, src: str, tgt: str) -> str:
    if src == tgt:
        return text
    tp = load_translation_pipeline(src, tgt)
    sentences = sent_tokenize(text)
    out = []
    batch = []
    for s in sentences:
        batch.append(s)
        if len(batch) >= 20:
            res = tp.pipeline(" ".join(batch), max_length=1000)
            out.append(res[0]["translation_text"])
            batch = []
    if batch:
        res = tp.pipeline(" ".join(batch), max_length=1000)
        out.append(res[0]["translation_text"])
    return " ".join(out)

# Embedding model
def get_embedding_model():
    global _embedding_model
    if _embedding_model is None:
        # Explicitly set device to 'cpu'
        _embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2", device='cpu')
    return _embedding_model

# Sentiment analysis
def get_sentiment_analyzer():
    global _sentiment_analyzer
    if _sentiment_analyzer is None:
        # Explicitly set device to -1 for CPU
        _sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment", device=-1)
    return _sentiment_analyzer

def analyze_sentiment(text: str) -> str:
    analyzer = get_sentiment_analyzer()
    result = analyzer(text)[0]
    label = result["label"]
    # Map to simple positive/neutral/negative
    if label in ["1 star", "2 stars"]:
        return "negative"
    elif label == "3 stars":
        return "neutral"
    else:
        return "positive"

# Culturalization
def culturalize_reply(reply_en: str, user_lang: str, sentiment: Optional[str] = None) -> str:
    greeting = CULTURE_MAP.get(user_lang, CULTURE_MAP["en"])["greeting"]
    # Adjust response tone based on sentiment
    if sentiment == "negative":
        reply_en = f"I'm sorry to hear that. {reply_en}"
    elif sentiment == "positive":
        reply_en = f"Great! {reply_en}"
    return f"{greeting}! {reply_en}"

# ---------- RAG retrieval stub ----------
def query_knowledge_base_stub(query_en: str, top_k: int = 5) -> List[Dict]:
    return []

# ---------- LLM call stub ----------
def call_llm(prompt_en: str, context_chunks: List[Dict], user_lang: str) -> str:
    ctx = "\n".join([c.get("document","") for c in context_chunks])
    return f"Answering: {prompt_en}\nContext:\n{ctx}"

# ---------- Main multilingual + sentiment pipeline ----------
def process_user_message_with_sentiment(user_text: str,
                                        user_lang: Optional[str] = None,
                                        top_k: int = 5,
                                        retrieval_fn = None,
                                        llm_fn = None) -> Dict[str, Any]:

    lang = user_lang or detect_language(user_text)
    if lang not in SUPPORTED_LANGUAGES:
        lang = "en"

    # Translate input to internal language
    if lang != INTERNAL_LANG:
        user_text_en = translate_text(user_text, src=lang, tgt=INTERNAL_LANG)
    else:
        user_text_en = user_text

    # Sentiment detection
    sentiment = analyze_sentiment(user_text_en)

    # Retrieve context
    retrieval_fn = retrieval_fn or query_knowledge_base_stub
    context_chunks = retrieval_fn(user_text_en, top_k=top_k)

    # Call LLM
    llm_fn = llm_fn or call_llm
    reply_en = llm_fn(user_text_en, context_chunks, user_lang)

    # Culturalize reply + sentiment adaptation
    reply_cultur = culturalize_reply(reply_en, lang, sentiment)

    # Translate back if needed
    if lang != INTERNAL_LANG:
        try:
            reply_translated = translate_text(reply_cultur, src=INTERNAL_LANG, tgt=lang)
        except Exception:
            reply_translated = reply_cultur
    else:
        reply_translated = reply_cultur

    return {
        "detected_language": lang,
        "sentiment": sentiment,
        "translated_user_text": user_text_en,
        "reply_translated": reply_translated,
        "reply_internal": reply_en,
        "reply_localized": reply_cultur
    }

# ---------- Demo ----------
if __name__ == "__main__":
    examples = [
        "I love this product! It's amazing.",
        "I'm frustrated because the service is slow.",
        "The chatbot works okay, nothing special.",
        "Me encanta este servicio.",
        "यह सेवा बहुत खराब है।"
    ]

    for msg in examples:
        out = process_user_message_with_sentiment(msg)
        print("="*50)
        print(f"User: {msg}")
        print(f"Detected Language: {out['detected_language']}")
        print(f"Sentiment: {out['sentiment']}")
        print(f"Chatbot Reply:\n{out['reply_translated']}\n")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
Device set to use cpu


User: I love this product! It's amazing.
Detected Language: en
Sentiment: positive
Chatbot Reply:
Hello! Great! Answering: I love this product! It's amazing.
Context:


User: I'm frustrated because the service is slow.
Detected Language: en
Sentiment: negative
Chatbot Reply:
Hello! I'm sorry to hear that. Answering: I'm frustrated because the service is slow.
Context:


User: The chatbot works okay, nothing special.
Detected Language: en
Sentiment: neutral
Chatbot Reply:
Hello! Answering: The chatbot works okay, nothing special.
Context:




Device set to use cpu
Device set to use cpu


User: Me encanta este servicio.
Detected Language: es
Sentiment: positive
Chatbot Reply:
¡Hola! ¡Genial! Respuesta: Me encanta este servicio. Contexto:



Device set to use cpu
Device set to use cpu


User: यह सेवा बहुत खराब है।
Detected Language: hi
Sentiment: negative
Chatbot Reply:
SURARSARS! मुझे लगता है कि सुनने के लिए खेद है. उत्तर: यह सेवा बहुत बुरा है. कॉन्टेक्स्ट:

