In [None]:
# !pip install -q -U google-generativeai
# !pip install langdetect
# !pip install pycountry
# !pip install beautifulsoup4
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install nltk

In [None]:
# pip install -r requirements.txt

In [None]:
import requests
from bs4 import BeautifulSoup
import time
import random
import numpy as np
from langdetect import detect
import pycountry
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import google.generativeai as genai
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def translate_single_text(json_input):
    """Translate one piece of text into the requested language using Gemini.

    Args:
        json_input: Mapping with two keys:
            'text' - the string to translate;
            'dest_language' - ISO 639-1 code of the target language (e.g. 'vi').

    Returns:
        The input text unchanged when it is empty/whitespace-only or when
        language detection already reports the target language; otherwise
        the model's reply with leading/trailing whitespace stripped.

    Raises:
        ValueError: if 'dest_language' cannot be resolved to a language name.
    """
    # Local import keeps this cell self-contained; the notebook's import
    # cell does not bring `os` into scope.
    import os

    text = json_input['text']
    dest_language = json_input['dest_language']

    # Blank input has nothing to translate, and language detection cannot
    # work on text without any usable characters, so return it untouched.
    if not text.strip():
        return text

    # Skip the API call when the text is already in the target language.
    current_language = detect(text)
    if current_language == dest_language:
        return text

    # Turn the ISO 639-1 code (e.g. 'en') into a full name (e.g. 'English').
    # The lookup yields None for an unknown code, so fail with a clear message
    # instead of an AttributeError on `.name`.
    language = pycountry.languages.get(alpha_2=dest_language)
    if language is None:
        raise ValueError(f"Unknown ISO 639-1 language code: {dest_language!r}")
    dest_language_name = language.name

    # Prefer a key supplied through the environment; the literal is only a
    # placeholder and must be replaced if GOOGLE_API_KEY is not set.
    genai.configure(api_key=os.environ.get('GOOGLE_API_KEY', 'YOUR_API'))

    model = genai.GenerativeModel("gemini-1.5-flash")

    # The prompt (in Vietnamese) asks for the bare translation only, with no
    # extra information and no trailing period.
    prompt = f"Dịch '{text}' sang ngôn ngữ {dest_language_name} mà không thêm thông tin khác kể cả dấu chấm"
    response = model.generate_content(prompt)

    return response.text.strip()

In [None]:
# Hàm dịch danh sách văn bản
def translate_multiple_texts(json_input):
    """Translate every string in a list, preserving order.

    Args:
        json_input: Mapping with 'text' (an iterable of strings) and
            'dest_language' (the target language code, forwarded as-is).

    Returns:
        A list holding the result of translate_single_text for each item.
    """
    items, target = json_input['text'], json_input['dest_language']

    # Delegate each item to the single-text helper, one call per entry.
    return [
        translate_single_text({'text': item, 'dest_language': target})
        for item in items
    ]


In [None]:
# Sample inputs: a single-string request and a list request, both targeting Vietnamese
json_1 = {
    'text': 'Hello',
    'dest_language': 'vi',
}
json_2 = {
    'text': [
        'Hello',
        'I am John',
        'Tôi là sinh viên',
    ],
    'dest_language': 'vi',
}


In [None]:
# Check the result for the single-string input
translate_single_text(json_1)


'Xin chào'

In [None]:
# Check the result for the list input (one translation per entry, same order)
translate_multiple_texts(json_2)


['Xin chào', 'Tôi là John', 'Tôi là sinh viên']

In [None]:
import requests
import re
import time
import numpy as np
from bs4 import BeautifulSoup
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
from nltk.corpus import stopwords
import google.generativeai as genai


class Chatbot:
    """Question-answering bot over the text of a single crawled web page.

    On construction the page at `url` is downloaded and split into segments,
    each segment is embedded with a sentence-transformer model, and a Gemini
    model is configured. Questions are answered by retrieving the segments
    most similar to the question and passing them to Gemini as context.

    Instance attributes set by __init__:
        url: the page that was crawled.
        data: list of {"original": str, "cleaned": str} dicts, one per segment.
        sentence_transformer_model: the embedding model.
        embeddings: embeddings of every segment's "cleaned" text.
        gemini_model: the generative model used to phrase answers.
    """

    # Seconds to wait for the page; without a timeout requests.get can block
    # indefinitely on an unresponsive server.
    REQUEST_TIMEOUT = 30

    # Lazily filled cache of English stop words, populated on first clean().
    _stop_words = None

    def __init__(self, api_key, url):
        """Crawl `url`, embed its segments and configure Gemini with `api_key`."""
        self.url = url
        self.data = []

        # Collect the page content first; the embeddings are built from it.
        self.crawl_data()

        # Embedding model and the embeddings of every cleaned segment.
        self.sentence_transformer_model = SentenceTransformer('all-mpnet-base-v2')
        self.embeddings = self.sentence_transformer_model.encode(
            [item["cleaned"] for item in self.data]
        )

        # Gemini model used to generate the final answer.
        genai.configure(api_key=api_key)
        self.gemini_model = genai.GenerativeModel("gemini-1.5-flash")

    def crawl_data(self):
        """Download the page, split it into segments and append them to self.data.

        A new segment starts at every <h2> or <i> element; following <p> and
        <li> elements are appended to the current segment (list items get a
        trailing ';' in place of a final '.'). Each stored entry holds the
        joined segment text ("original") and its cleaned form ("cleaned").
        Every segment is also printed for debugging.

        Raises:
            requests.exceptions.RequestException: on timeout, connection
                failure or an HTTP error status.
            IndexError: if the page yields fewer segments than the hard-coded
                fix-ups below need.
        """
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        # Fail on 4xx/5xx instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, "html.parser")

        tags_to_extract = ["h2", "i", "p", "li"]
        segments = []
        current_segment = []

        for element in soup.find_all(tags_to_extract):
            if element.name in ["h2", "i"]:
                # A heading closes the running segment and opens a new one.
                if current_segment:
                    segments.append(current_segment)
                current_segment = [element.text.strip()]
            elif element.name == "li":
                current_segment.append(element.text.strip().rstrip(".") + ';')
            else:
                current_segment.append(element.text.strip())

        if current_segment:
            segments.append(current_segment)

        # The fix-ups below index up to segments[23]; report a layout change
        # clearly rather than with a bare "list index out of range".
        if len(segments) < 24:
            raise IndexError(
                f"Expected at least 24 segments from {self.url}, got {len(segments)}; "
                "the page layout may have changed."
            )

        # Page-specific fix-ups. The indices are hard-coded for the layout of
        # the page this was written against and are order-dependent:
        # headings are merged/prefixed first, then helper entries are deleted
        # from the highest index down so earlier indices stay valid.
        segments[0][0] = segments[1][0] + " - " + segments[2][0]
        segments[1][0], segments[2][0] = segments[2][1].split(". ")
        segments[5][0] = segments[4][0] + " - " + segments[5][0]
        segments[6][0] = segments[4][0] + " - " + segments[6][0]
        segments[10][0] = segments[9][0] + " - " + segments[10][0]
        segments[11][0] = segments[9][0] + " - " + segments[11][0]

        del segments[23][2:]
        del segments[9]
        del segments[4]
        del segments[2][1]
        del segments[0][1]

        # Store every segment.
        for idx, segment in enumerate(segments):
            if idx > 2:
                # From the fourth segment on, the first entry is rendered as
                # an upper-case heading followed by ':'.
                segments[idx][0] = segments[idx][0].upper() + ":"
            segments[idx] = " ".join(segment).strip()

            self.data.append({
                "original": segments[idx],
                "cleaned": self.clean(segments[idx])
            })

            # Debug output.
            print(f"Segment {idx}:")
            print("- " + "\n- ".join(segment))
            print("Original:", self.data[-1]["original"])
            print("Cleaned :", self.data[-1]["cleaned"])
            print("-" * 50)

    def clean(self, text):
        """Normalize text before embedding.

        Lower-cases the text, replaces e-mail-like tokens with the word
        'email', strips punctuation, drops English stop words and collapses
        whitespace.

        Args:
            text: the raw string.

        Returns:
            The cleaned string.
        """
        text = text.lower()
        text = re.sub(r'\S+@\S+\.\S+', 'email', text)
        text = re.sub(r'[^\w\s]', '', text)

        # Build the stop-word set once per instance; it is identical for
        # every call and clean() runs for every segment and every query.
        if self._stop_words is None:
            self._stop_words = frozenset(stopwords.words('english'))
        stop_words = self._stop_words

        text = ' '.join(word for word in text.split() if word not in stop_words)
        text = re.sub(r'\s+', ' ', text).strip()
        return text

    def search_query(self, query, top_k):
        """Find the stored segments most similar to the query.

        Args:
            query: the user's question.
            top_k: how many segments to return.

        Returns:
            (contexts, scores): the "original" text of the best-matching
            segments and their cosine similarities, best match first.
        """
        query_cleaned = self.clean(query)
        query_embedding = self.sentence_transformer_model.encode([query_cleaned])
        similarities = cosine_similarity(query_embedding, self.embeddings)
        # argsort is ascending, so reverse it before taking the first top_k.
        top_indices = np.argsort(similarities[0])[::-1][:top_k]
        contexts = [self.data[idx]["original"] for idx in top_indices]
        scores = [similarities[0][idx] for idx in top_indices]
        return contexts, scores

    def generate_answer_with_gemini(self, context, question):
        """Ask the Gemini model to answer `question` given `context`.

        Returns:
            The model's reply text, or the string "Error: <exception>" if the
            request fails (the error is reported, not raised).
        """
        prompt = (
            f"You are a highly knowledgeable and detail-oriented assistant. "
            f"Begin your answer by addressing the general concept or definition of the question directly. "
            f"Then, if necessary, use the provided context to elaborate or refine your answer. "
            f"Ensure your response is accurate, detailed, and concise. "
            f"Do not include irrelevant information or assumptions not supported by the context.\n\n"
            f"Question:\n{question}\n\n"
            f"Context (if relevant):\n{context}\n\n"
            f"Answer:"
        )
        try:
            response = self.gemini_model.generate_content(prompt)
            return response.text
        except Exception as e:
            # Deliberate best-effort: surface the failure as the answer text.
            return f"Error: {e}"

    def make_request(self, question):
        """Retrieve matching context, generate an answer and print both.

        Prints the retrieved context with scores, the answer and the elapsed
        time; returns nothing.
        """
        # perf_counter is monotonic, so the measured duration is not affected
        # by system clock adjustments.
        start = time.perf_counter()
        results, scores = self.search_query(question, 3)
        context = "\n".join([f"\t- (Score: {score:.4f}) {item}" for item, score in zip(results, scores)])
        answer = self.generate_answer_with_gemini(context, question)
        end = time.perf_counter()

        print(f"Context : \n{context}")
        print(f"Chatbot : {answer.rstrip()}")
        print(f"Running time: {end - start:.2f} seconds")


In [None]:
# Page to crawl and index
url = "https://www.presight.io/privacy-policy.html"
# Placeholder: replace with a real Gemini API key before running
api_key = 'your_api'
# Constructing the bot crawls the page, embeds its segments and configures Gemini
chatbot = Chatbot(api_key, url)


Segment 0:
- PRIVACY POLICY - Last updated 15 Sep 2023
Original: PRIVACY POLICY - Last updated 15 Sep 2023
Cleaned : privacy policy last updated 15 sep 2023
--------------------------------------------------
Segment 1:
- At Presight, we are committed to protecting the privacy of our customers and visitors to our website
Original: At Presight, we are committed to protecting the privacy of our customers and visitors to our website
Cleaned : presight committed protecting privacy customers visitors website
--------------------------------------------------
Segment 2:
- This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.
Original: This Privacy Policy explains how we collect, use, and disclose information about our customers and visitors.
Cleaned : privacy policy explains collect use disclose information customers visitors
--------------------------------------------------
Segment 3:
- INFORMATION COLLECTION AND USE:
- We collect sever

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Ask a sample question; make_request prints the retrieved context, the answer and the elapsed time
question_1 = 'what is Personal Data?'
chatbot.make_request(question_1)

Context : 
	- (Score: 0.6997) TYPES OF DATA COLLECTED - PERSONAL DATA: While using our Service, we may ask you to provide us with certain personally identifiable information that can be used to contact or identify you ("Personal Data"). Personally identifiable information may include, but is not limited to: Email address; First name and last name; Phone number; Address, State, Province, ZIP/Postal code, City; Cookies and Usage Data;
	- (Score: 0.5572) PURPOSEFUL USE ONLY: We commit to only use personal information for the purposes identified in the entity's privacy policy.
	- (Score: 0.5411) SHARING OF PERSONAL DATA: Your personal data will not be subject to sharing, transfer, rental or exchange for the benefit of third parties, including AI models.
Chatbot : Personal data is any information that can be used to identify or contact a specific individual.  Based on the provided context, this includes, but is not limited to: email address, first name and last name, phone number, address (