In [1]:
import json
import os
import re
from typing import List, Dict

import openai
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.schema import BaseRetriever
from langchain.memory import ConversationBufferMemory
from langchain.schema import HumanMessage, AIMessage

In [2]:
# Read the OpenAI key from the environment rather than hard-coding it.
# The previous self-assignment into os.environ was a no-op when the variable was
# set and raised "TypeError: str expected, not NoneType" when it was missing.
open_ai_api_key = os.getenv("OPEN_AI_API")
if not open_ai_api_key:
    print("Warning: the OPEN_AI_API environment variable is not set; no API key will be passed to the OpenAI clients.")

In [3]:
# Build the path with os.path.join: the previous literal relied on the invalid
# escape sequence "\D" and only named a sub-folder on Windows.
root_directory = os.path.join(".", "Documents")
# Accumulates every page loaded from the PDFs found under root_directory.
documents = []

In [4]:
# OpenAI SDK client used for the chat-completion requests in this notebook.
client = openai.OpenAI(
    api_key=open_ai_api_key,
)

In [5]:
# Walk the document tree and load every PDF into `documents`.
# NOTE: this appends to `documents`, so re-running this cell without first
# re-running the cell that resets the list will load everything twice.
for folder, _, files in os.walk(root_directory):
    for file in files:
        # PyPDFLoader only understands PDFs; skip other files instead of
        # attempting to parse them and printing a spurious error.
        if not file.lower().endswith(".pdf"):
            continue
        file_path = os.path.join(folder, file)
        try:
            documents.extend(PyPDFLoader(file_path).load())
        except Exception as e:
            # Best effort: one unreadable PDF should not abort the whole load.
            print(f"Error loading {file_path}: {e}")

  from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4
XRef object at 2945 can not be read, some object may be missing
XRef object at 2945 can not be read, some object may be missing
Ignoring wrong pointing object 2268 0 (offset 1064028)
Ignoring wrong pointing object 2269 0 (offset 1064028)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing object 19 0 (offset 0)
Ignoring wrong pointing object 21 0 (offset 0)
Ignoring wrong pointing object 23 0 (offset 0)
Ignoring wrong pointing object 30 0 (offset 0)
Ignoring wrong pointing object 35 0 (offset 0)
Ignoring wrong pointing object 37 0 (offset 0)
Ignoring wrong pointing object 39 0 (offset 0)
Ignoring wrong pointing object 41 0 (offset 0)
Ignoring wrong pointing object 60 0 (offset 0)
Ignoring wrong pointing object 477 0 (offset 0)
Ignoring wrong pointing object 479 0 (offset 0)
Ignoring wrong pointing object 6 0 (offset 0)
Ignoring wrong pointing object 17 0 (offset 0)
Ignoring wrong pointing objec

In [6]:
# Split the loaded documents into overlapping chunks before embedding them.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=3000,
    chunk_overlap=100,
)
split_docs = text_splitter.split_documents(documents)

In [7]:
# Embedding model used to vectorise the document chunks.
embedding_model = OpenAIEmbeddings(
    model="text-embedding-3-large",
    api_key=open_ai_api_key,
)

In [8]:
# Embed every chunk and index the vectors in a FAISS store.
vector_db = FAISS.from_documents(
    split_docs,
    embedding_model,
)

In [9]:
# Similarity retriever over the FAISS index, configured with k=5.
retriever: BaseRetriever = vector_db.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=5),
)

In [10]:
# Conversation memory passed to query_gpt4 by the generator functions below.
memory = ConversationBufferMemory(
    return_messages=True,        # chat history is returned as message objects
    memory_key="chat_history",   # key under which the history is stored
)

In [11]:
def query_gpt4(memory: ConversationBufferMemory, retriever: BaseRetriever, user_query: str) -> str:
    """
    Answer a question with gpt-4o, grounded in retrieved documents and chat history.

    The retrieved page contents and every message already stored in ``memory``
    are folded into a single user prompt. After the model answers, both the
    query and the answer are appended to ``memory``.

    Args:
        memory: ConversationBufferMemory holding the running chat history.
        retriever: The retriever for querying the vector store.
        user_query: The user's current question.

    Returns:
        The model's answer text; an empty string if the model returned no content.
    """
    # `invoke` is the supported retriever entry point; `get_relevant_documents`
    # is deprecated in current LangChain releases.
    relevant_docs = retriever.invoke(user_query)
    context = "\n\n".join(doc.page_content for doc in relevant_docs)

    # Render the stored history as "User: ..." / "Assistant: ..." lines.
    history_lines = [
        f"{'User' if isinstance(message, HumanMessage) else 'Assistant'}: {message.content}"
        for message in memory.chat_memory.messages
    ]

    prompt = (
        "You are a helpful assistant. Use the following context to answer the question:\n\n"
        f"Context:\n{context}\n\n"
        "Conversation History:\n" +
        "\n".join(history_lines) +
        f"\n\nUser: {user_query}\nAssistant:"
    )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{"role": "user", "content": prompt}]
    )

    # The SDK types `content` as optional; normalise None to "" so the declared
    # `str` return type holds and the memory write below never receives None.
    answer = response.choices[0].message.content or ""

    # Record this turn so later calls see it in the conversation history.
    memory.chat_memory.add_user_message(user_query)
    memory.chat_memory.add_ai_message(answer)

    return answer


In [12]:
def save_to_json(data: dict, filename: str):
    """Save ``data`` to ``filename`` as JSON, merging with any existing content.

    If the file already exists, its content is loaded first (an empty file is
    treated as an empty object). Each key of ``data`` is then merged in:
    list values are appended to an existing list under the same key, any
    other value is stored under its key. If the file does not exist, ``data``
    is written as-is.

    Args:
        data: Mapping to persist, e.g. ``{"cultural-norms": [...]}``.
        filename: Path of the JSON file to create or update.

    Raises:
        json.JSONDecodeError: If the existing file holds invalid, non-empty JSON.
    """
    merged = {}
    if os.path.exists(filename):
        with open(filename, "r", encoding="utf-8") as f:
            raw = f.read()
        # A zero-length or whitespace-only file is not valid JSON; treat it as empty
        # rather than crashing. Genuinely corrupt content still raises.
        if raw.strip():
            merged = json.loads(raw)

    for key, value in data.items():
        if isinstance(value, list) and isinstance(merged.get(key), list):
            merged[key].extend(value)
        else:
            merged[key] = value

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(merged, f, indent=4, ensure_ascii=False)

In [13]:
def generate_cultural_norms(country: str) -> dict:
    """Ask the model for cultural norms of ``country`` and parse them into records.

    The raw response is printed, then split into lines. A line is kept when it
    is non-empty, does not start with "i don't know", does not contain
    "here are" (both case-insensitive), and begins with a digit or "-".

    Args:
        country: Country name interpolated into the prompt.

    Returns:
        A dict with the prompt ("gpt4-o prompt"), the country, and "norms":
        a list of ``{"id": <1-based position>, "text": <line>}`` records
        (empty when no line qualifies).
    """
    prompt = f"Provide me 10 unique sentences highlighting the core values/important aspects of individuals living in {country}."
    response = query_gpt4(memory, retriever, prompt)
    print(response)

    def _looks_like_norm(text: str) -> bool:
        # Reject blanks, refusals and introductory sentences; accept list items only.
        if not text:
            return False
        lowered = text.lower()
        if lowered.startswith("i don't know") or "here are" in lowered:
            return False
        return text[0].isdigit() or text.startswith("-")

    stripped_lines = (raw_line.strip() for raw_line in response.split("\n"))
    norms = [
        {"id": position, "text": text}
        for position, text in enumerate(filter(_looks_like_norm, stripped_lines), start=1)
    ]

    if not norms:
        print(f"No valid norms generated for {country}.")

    return {"gpt4-o prompt": prompt, "country": country, "norms": norms}


In [14]:
def read_json_file(file_path: str) -> Dict:
    """Load the UTF-8 encoded JSON file at ``file_path`` and return the parsed content."""
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)

def write_json_file(data: Dict, file_path: str):
    """Write ``data`` to ``file_path`` as indented UTF-8 JSON, replacing any existing file.

    The data is echoed to stdout before it is written. Non-ASCII characters
    are stored verbatim (``ensure_ascii=False``).
    """
    # The docstring must be the first statement; it previously followed this
    # print and was therefore just a discarded string expression.
    print("Data: \n\n\n", data)
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(data, file, indent=4, ensure_ascii=False)
        
def generate_scenarios_for_norm(norms: List[Dict], country: str) -> Dict:
    """Generate example scenarios for each cultural norm of ``country``.

    For every norm one prompt is sent through ``query_gpt4``; each response
    line that starts with a numbered-list marker ("1.", "2)", "10.", ...)
    is recorded as a scenario linked to the norm's id.

    Args:
        norms: Norm records; each needs an "id" and should carry its sentence
            under "text" (the whole record is interpolated if "text" is absent).
        country: Country name interpolated into the prompt.

    Returns:
        A dict with the generic "prompt", the "country", and "scenarios":
        a list of ``{"id", "norm-id", "text"}`` records numbered from 1
        across all norms.
    """
    # One or more digits followed by "." or ")". The previous check looked only at
    # the second character, which dropped item "10." and raised IndexError on a
    # one-character line (aborting the rest of that norm's response).
    numbered_item = re.compile(r"\d+[.)]")
    scenarios = []

    for norm in norms:
        # Put the norm's sentence in the prompt, not the repr of the whole record.
        norm_text = norm.get("text", norm)
        prompt = f"I would like to generate some example scenarios showing this cultural norm {norm_text} in {country}. Please generate 10 scenarios, detailing each scenario with up to 2 sentences. Please refrain from stating the cultural norm in the scenario."
        try:
            response = query_gpt4(memory, retriever, prompt)
            for line in response.split("\n"):  # one scenario is expected per line
                scenario = line.strip()
                if numbered_item.match(scenario):
                    scenarios.append({
                        "id": len(scenarios) + 1,
                        "norm-id": norm["id"],
                        "text": scenario
                    })
        except Exception as e:
            # Best effort: a failure for one norm must not stop the remaining norms.
            print(f"Error generating scenarios for norms: {e}")

    scenarios_dict = {
        "prompt": f"I would like to generate some example scenarios showing this cultural norm in {country}. Please generate 10 scenarios, detailing each scenario with up to 2 sentences. Please refrain from stating the cultural norm in the scenario.",
        "country": country,
        "scenarios": scenarios
    }

    print("\n\n\n Scenarios: ", scenarios)

    return scenarios_dict


In [15]:
def main(country: str = ""):
    """Generate cultural norms for a country and append them to the norms JSON file.

    Args:
        country: Country to generate norms for. When empty (the default), the
            name is read interactively from stdin; passing it explicitly lets
            the notebook run end to end without prompting.

    Raises:
        ValueError: If the country is blank after stripping whitespace.
    """
    if not country:
        country = input("Enter the country for which to generate cultural norms and scenarios: ")
    country = country.strip()
    if not country:
        # A blank name would produce a meaningless prompt and a junk record.
        raise ValueError("A non-empty country name is required.")

    norms_file = "openai_cultural_norms.json"

    cultural_norms = generate_cultural_norms(country)
    save_to_json({"cultural-norms": [cultural_norms]}, norms_file)

    # NOTE: scenario generation is currently disabled. The intended follow-up step
    # reads norms_file with read_json_file, calls generate_scenarios_for_norm for
    # each country's norms, and saves the result under "cultural-scenarios" in
    # "rag_cultural_scenarios.json" via write_json_file.

In [21]:
# Run the pipeline; main() reads the country name from stdin via input().
main()

1. South Africa prides itself on its cultural diversity, often celebrated through the concept of a "Rainbow Nation" that respects and integrates a multitude of ethnic and cultural identities.

2. Ubuntu, a Nguni Bantu term meaning "humanity towards others," embodies the South African value of interconnectedness, emphasizing community, compassion, and mutual care.

3. The legacy of apartheid has instilled a significant focus on equality and human rights in South African society, with efforts to address historical injustices and promote social justice.

4. Family and extended kin networks hold a central place in South African life, providing emotional and social support across generations and playing a vital role in individual identity.

5. The South African Constitution is one of the most progressive globally, enshrining a commitment to non-racialism, equality, and the protection of a broad spectrum of human rights, including cultural rights.

6. South Africans often exercise a strong s