In [1]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader
from langchain_groq import ChatGroq
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm
import json

In [2]:
import os
from dotenv import load_dotenv

# Pull key/value pairs from a local .env file (if any) into the process environment.
load_dotenv()

# Environment variables this notebook expects to find.
_REQUIRED_ENV_VARS = ("HUGGING_FACE_API_KEY", "OPEN_AI_API", "GROQ_API_KEY")

# Fail early with a readable message instead of the opaque
# "TypeError: str expected, not NoneType" that assigning os.getenv(...) back
# into os.environ produces when a variable is absent.
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if os.getenv(name) is None]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the environment or in a .env file."
    )

# Keys handed explicitly to the Groq chat model and the OpenAI embedding client.
groq_api_key = os.environ["GROQ_API_KEY"]
open_ai_api_key = os.environ["OPEN_AI_API"]

In [3]:
# Folder (relative to the working directory) that is walked for source files.
# Built with os.path.join so the separator is correct on every platform and the
# literal no longer contains the invalid "\D" escape sequence.
root_directory = os.path.join(".", "Documents")

# Accumulates the documents loaded from every file under root_directory.
documents = []

In [4]:
# Walk the whole tree under root_directory and load every PDF into `documents`.
for folder, _, files in os.walk(root_directory):
    for file in files:
        # Only PDFs are meaningful to PyPDFLoader; skipping other files avoids
        # a spurious "Error loading ..." line for each of them.
        if not file.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder, file)
        try:
            loader = PyPDFLoader(file_path)
            docs = loader.load()
            documents.extend(docs)
        except Exception as e:
            # Best effort: report the unreadable file and carry on with the rest.
            print(f"Error loading {file_path}: {e}")

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
XRef object at 2945 can not be read, some object may be missing
XRef object at 2945 can not be read, some object may be missing
Ignoring wrong pointing object 2268 0 (offset 1064028)
Ignoring wrong pointing object 2269 0 (offset 1064028)


In [5]:
# Chunking configuration used to break the loaded documents into smaller pieces.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=8000,
)
# Filled by the splitting loop in the following cell.
chunked_documents = list()

In [6]:
# Split one source document per iteration so the progress bar counts documents.
for source_doc in tqdm(documents, desc="Splitting Documents"):
    chunked_documents.extend(text_splitter.split_documents([source_doc]))

Splitting Documents: 100%|██████████| 2702/2702 [00:00<00:00, 6380.89it/s]


In [7]:
# OpenAI embedding client used to vectorise the document chunks.
embedding_model = OpenAIEmbeddings(
    api_key=open_ai_api_key,
    model="text-embedding-3-large",
)

In [8]:
# Embed every chunk and build the FAISS index that the retriever searches.
vector_db = FAISS.from_documents(
    documents=chunked_documents,
    embedding=embedding_model,
)

In [26]:
# Groq chat model that answers the retrieval-augmented prompts.
llm = ChatGroq(
    groq_api_key=groq_api_key,
    model="llama-3.2-3b-preview",
)

In [27]:
# Conversation memory shared with the retrieval chain.
memory = ConversationBufferMemory(
    return_messages=True,       # Return the history as message objects rather than one string
    memory_key="chat_history",  # Key under which the conversation history is stored
)

In [28]:
# Conversational retrieval chain: fetches the 5 most similar chunks from the
# FAISS index for each question and passes them to the LLM together with the
# conversation memory.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    chain_type="stuff",
    memory=memory,
    retriever=vector_db.as_retriever(
        search_type="similarity",
        search_kwargs={"k": 5},
    ),
)

In [29]:
# # Functions to generate cultural norms and scenarios
# def generate_cultural_norms(country: str) -> dict:
#     prompt = f"Provide me 10 unique sentences highlighting the core values/important aspects of individuals living in {country}."
#     response = qa_chain.run(prompt)
#     norms = [
#         {"id": idx + 1, "text": norm.strip()}
#         for idx, norm in enumerate(response.split("\n"))
#     ]
#     return {
#         "gpt4-o prompt": prompt,
#         "country": country,
#         "norms": norms
#     }

In [30]:
def generate_cultural_norms(country: str) -> dict:
    """Ask the retrieval chain for cultural norms of ``country`` and parse them.

    The raw answer is printed, then split into lines. A line is kept as a norm
    only when it looks like a list item, i.e. it starts with a digit or ``-``.

    Args:
        country: Name of the country inserted into the prompt.

    Returns:
        A dict with the keys ``"gpt4-o prompt"`` (the prompt that was sent),
        ``"country"`` and ``"norms"``. ``"norms"`` is a list of
        ``{"id": int, "text": str}`` entries numbered from 1, and is empty when
        no line of the answer qualified.
    """
    prompt = f"Provide me 10 unique sentences highlighting the core values/important aspects of individuals living in {country}."
    response = qa_chain.run(prompt)
    print(response)

    norms = []
    for raw_line in response.split("\n"):
        line = raw_line.strip()
        if not line:
            continue

        # Skip refusals and list preambles. Prefix matching is deliberate: a
        # substring test for "here are" also matches "There are ..." and would
        # throw away genuine norms.
        if line.lower().startswith(("i don't know", "here are")):
            continue

        # Keep only list-like lines (numbered or dash-bulleted).
        if line[0].isdigit() or line.startswith("-"):
            norms.append({"id": len(norms) + 1, "text": line})

    if not norms:
        print(f"No valid norms generated for {country}.")

    return {
        "gpt4-o prompt": prompt,
        "country": country,
        "norms": norms
    }


In [31]:
# def generate_cultural_scenarios(country: str, norms: list) -> list:
#     scenarios = []
#     scenario_id = 1
#     for norm in norms:
#         prompt = (
#             f"I would like to generate some example scenarios showing this cultural norm in {country}. "
#             "Please generate 10 scenarios, detailing each scenario with up to 2 sentences. "
#             "Please refrain from stating the cultural norm in the scenario."
#         )
#         response = qa_chain.run(prompt)
#         generated_scenarios = [
#             {"id": scenario_id + idx, "norm-id": norm['id'], "text": scenario.strip()}
#             for idx, scenario in enumerate(response.split("\n"))
#         ]
#         scenarios.extend(generated_scenarios)
#         scenario_id += len(generated_scenarios)
#     return scenarios


In [32]:
def generate_cultural_scenarios(country: str, norms: list, batch_size: int = 3) -> list:
    """Generate example scenarios for each cultural norm of ``country``.

    One request is sent to the retrieval chain per norm. The answer is split
    into lines; blank lines and lines starting with ``"Here are"`` or ``"**"``
    are discarded, and at most 10 of the remaining lines are kept.

    Args:
        country: Name of the country inserted into the prompt.
        norms: List of dicts with at least the keys ``"id"`` and ``"text"``.
        batch_size: Retained for backward compatibility. Every norm is sent in
            its own request, so this value does not change the result.

    Returns:
        A list with one dict per successfully processed norm, each holding
        ``"gpt4-o prompt"``, ``"country"`` and ``"scenarios"``. Every scenario
        is ``{"id": int, "norm-id": ..., "text": str}``; ids are consecutive
        across all norms and start at 1. Norms whose request fails are reported
        on stdout and skipped.
    """
    max_scenarios_per_norm = 10
    scenarios_list = []
    next_scenario_id = 1

    for norm in norms:
        # The norm text has to be part of the prompt, otherwise the model is
        # never told which norm the scenarios should illustrate.
        prompt = (
            f"I would like to generate some example scenarios showing this cultural norm in {country}:\n"
            f"{norm['text']}\n"
            "\nPlease generate 10 scenarios for this norm, detailing each scenario with up to 2 sentences. "
            "Please refrain from stating the cultural norm in the scenario."
        )

        try:
            response = qa_chain.run(prompt)

            # Keep non-empty lines that are neither a preamble nor a bold heading.
            scenario_lines = []
            for raw_line in response.split("\n"):
                line = raw_line.strip()
                if line and not line.startswith(("Here are", "**")):
                    scenario_lines.append(line)
            scenario_lines = scenario_lines[:max_scenarios_per_norm]

            structured_scenarios = {
                "gpt4-o prompt": prompt,
                "country": country,
                "scenarios": [
                    {
                        "id": scenario_number,
                        "norm-id": norm['id'],
                        "text": scenario
                    }
                    for scenario_number, scenario in enumerate(scenario_lines, start=next_scenario_id)
                ]
            }

            scenarios_list.append(structured_scenarios)
            # Advance by the number actually produced so ids stay consecutive
            # even when the model returns fewer than 10 scenarios.
            next_scenario_id += len(scenario_lines)

        except Exception as e:
            # Best effort: report the failing norm and move on to the next one.
            print(f"Error generating scenarios for norm '{norm['text']}': {e}")
            continue

    return scenarios_list


In [33]:
# Save or append to JSON file
def save_to_json(data: dict, filename: str):
    """Write ``data`` to ``filename`` as JSON, merging with any existing content.

    When the file already exists its top-level object is loaded and ``data`` is
    merged into it key by key: if both the stored value and the new value are
    lists (or the new value is a tuple), the new items are appended; otherwise
    the new value is stored under the key. A file that exists but is empty is
    treated as holding no data.

    Args:
        data: Mapping of top-level keys to the values to store.
        filename: Path of the JSON file to create or update.

    Raises:
        ValueError: If the existing file's top-level JSON value is not an object.
        json.JSONDecodeError: If the existing file holds invalid JSON.
    """
    merged = {}
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as f:
            raw_text = f.read()
        # An empty file (e.g. left by an interrupted run) simply has no data yet.
        if raw_text.strip():
            merged = json.loads(raw_text)
        if not isinstance(merged, dict):
            raise ValueError(f"Expected a JSON object at the top level of {filename}")

    # Merge every key of `data`, not only a fixed set, so nothing is dropped
    # when the file already exists.
    for key, value in data.items():
        if isinstance(value, (list, tuple)) and isinstance(merged.get(key), list):
            merged[key].extend(value)
        else:
            merged[key] = value

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=4, ensure_ascii=False)

In [37]:
# Main function
if __name__ == "__main__":
    country = input("Enter the country for which to generate cultural norms and scenarios: ").strip()

    if not country:
        # Without a country the prompt is meaningless; skip the LLM call and
        # avoid appending an empty entry to the output file.
        print("No country entered; nothing was generated.")
    else:
        # Generate cultural norms
        cultural_norms = generate_cultural_norms(country)
        save_to_json({"cultural-norms": [cultural_norms]}, "rag_cultural_norms.json")

        # # Generate cultural scenarios
        # cultural_scenarios = generate_cultural_scenarios(country, cultural_norms['norms'])
        # save_to_json({"cultural_scenarios": cultural_scenarios}, "rag_cultural_scenarios.json")

        # Only norms are produced while the scenario step above stays disabled.
        print(f"Cultural norms for {country} saved successfully.")

Based on the provided context, the core values and important aspects of individuals living in Japan can be summarized as follows:

1. **Harmony (Wa)**: The concept of harmony is deeply ingrained in Japanese culture. It refers to the idea of living in balance and harmony with others, and with nature. This value is considered essential for maintaining social cohesion and stability.
2. **Individualism/Collectivism**: Japanese individuals prioritize both individuality and collectivism. They value personal autonomy and self-expression, but also prioritize the well-being of the group and the community.
3. **Confucianism**: Confucianism plays a significant role in shaping Japanese values and behavior. It emphasizes the importance of social hierarchy, respect for authority, and self-cultivation.
4. **Respect for tradition**: Japanese individuals place a high value on tradition and respect for the past. This is reflected in their cultural practices, customs, and social norms.
5. **Social cohesi

In [17]:
# def chatbot(user_input: str) -> str:
#     """
#     This function takes user input and returns the chatbot's response,
#     retaining conversation context.
#     """
#     response = qa_chain.run(user_input)
#     return response

In [18]:
# # Main execution loop for the chatbot
# if __name__ == "__main__":
#     print("Chatbot is running. Type 'exit' to end the conversation.")
#     while True:
#         user_input = input("You: ")
#         if user_input.lower() == "exit":
#             print("Chatbot: Goodbye!")
#             break
#         response = chatbot(user_input)
#         print(f"Chatbot: {response}")