# Preparation Of Vector Store Data

## Sin Yee

In [1]:
# General
import os
import json
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage


from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

import openai

In [2]:
# Load in environment variables
load_dotenv(override=True)

llm = AzureChatOpenAI(
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_key=os.environ['AZURE_OPENAI_APIKEY'],
    deployment_name=os.environ['AZURE_OPENAI_DEPLOYMENT_NAME'],
    model_name=os.environ['AZURE_OPENAI_MODEL_NAME'],
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
    temperature=0
)

In [5]:
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Sin Yee'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Sin Yee\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

57


In [7]:
# Embeddings
from langchain_openai.embeddings import AzureOpenAIEmbeddings
open_ai_embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

In [8]:
# Vector store
faiss_vectorstore_open_ai_SinYee = FAISS.from_documents(documents, open_ai_embeddings)
## Saving Vector Store
faiss_vectorstore_open_ai_SinYee.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Sin Yee\\faiss_vectorstore_open_ai_SinYee")

In [5]:
# Instruction prompt
persona = """You would be assisting in identifying topics from a snippet of conversation"""
task = """I would supply the conversation directly. Interpret the main topic of the conversation and return the main topic. Do not give multiple topics such as football/soccer. Only give one main topic."""
example = """For example if the conversation is: 
            {"Sin Yee": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Sin Yee! Yeah, it's always exciting to see how your team will perform."}

           football

            Example 2:
            {"Sin Yee": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Sin Yee": "Hey there! How are you doing?", "John": "Hey Sin Yee! I'm doing great, how about you?"}

            general"""


In [6]:
for doc in documents:
    instruction = f"{persona} {task} {example}"
    messages = [SystemMessage(content=instruction)]

    query = doc.page_content

    usermsg = HumanMessage(content=query)
    messages.append(usermsg)

    response = llm.invoke(messages)

    doc.metadata['topic'] = response.content


In [9]:
print(doc)

page_content='{"Xavier": "Me too, Sin Yee! Let's continue to inspire and motivate each other. I can't wait to see how far we'll both go in our crocheting journey."}' metadata={'label': 'Response 11', 'source': 'Xavier.json', 'topic': 'crocheting'}


In [11]:
import json

# Convert the documents list to a JSON serializable format
documents_json = [
    {
        "metadata": doc.metadata,
        "page_content": doc.page_content
    }
    for doc in documents
]

# Specify the file path for the JSON file
json_file_path = 'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Sin Yee\\documents.json'

# Save the documents list into a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(documents_json, json_file) 

In [7]:
position = []
count = 0

for doc in documents:
    if(len(doc.metadata['topic'].split()) > 1):
        print(doc.metadata['topic'])
        position.append(count)
    count += 1

print(position)

[]


In [10]:
count = 0

for doc in documents:
    if(len(doc.metadata['topic']) == 0):
        print(count)
    count += 1

In [12]:
temp = {}

for doc in documents:
    if(doc.metadata['topic'] not in temp):
        temp[doc.metadata['topic']] = 1

print(temp)

{'crochet': 1, 'art': 1, 'crafts': 1, 'crafting': 1, 'design': 1, 'gardening': 1, 'travel': 1, 'food': 1, 'fitness': 1, 'work': 1, 'healthcare': 1, 'health': 1, 'workload': 1, 'relaxation': 1, 'stress': 1, 'career': 1, 'nursing': 1, 'general': 1, 'crocheting': 1, 'knitting': 1, 'goal': 1, 'hobbies': 1, 'friendship': 1}


In [13]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

In [14]:
# Vector store
faiss_vectorstore_SinYee = FAISS.from_documents(documents, embeddings)

faiss_vectorstore_SinYee

<langchain_community.vectorstores.faiss.FAISS at 0x245d28a8b50>

In [15]:
## Saving Vector Store
faiss_vectorstore_SinYee.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Sin Yee\\faiss_vectorstore_SinYee")

## Lee Hang

In [1]:
# General
import os
import json
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage


from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

import openai

In [3]:
# Load in environment variables
load_dotenv(override=True)

llm = AzureChatOpenAI(
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_key=os.environ['AZURE_OPENAI_APIKEY'],
    deployment_name=os.environ['AZURE_OPENAI_DEPLOYMENT_NAME'],
    model_name=os.environ['AZURE_OPENAI_MODEL_NAME'],
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
    temperature=0
)

In [5]:
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Lee Hang'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Lee Hang\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

68


In [6]:
# Embeddings
from langchain_openai.embeddings import AzureOpenAIEmbeddings
open_ai_embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

In [7]:
# Vector store
faiss_vectorstore_open_ai_Lee_Hang = FAISS.from_documents(documents, open_ai_embeddings)
## Saving Vector Store
faiss_vectorstore_open_ai_Lee_Hang.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Lee Hang\\faiss_vectorstore_open_ai_Lee_Hang")

In [8]:
# Instruction prompt
meta_content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. 
Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Lee Hang": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Lee Hang! Yeah, it's 
            always exciting to see how your team will perform."}

            football

            Example 2:
            {"Lee Hang": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Lee Hang": "Hey there! How are you doing?", "John": "Hey Lee Hang! I'm doing great, how about you?"}

            general 
"""


In [9]:
def getTopic(meta_content, query):
    # Learning instructions
    instruction = {
        "role": "system",
        "content": meta_content,
    }

    #print("Query is: " + query)

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]

    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    topic = raw_response.choices[0].message.content

    return topic

In [12]:
for doc in documents:
    
    query = doc.page_content

    doc.metadata['topic'] = getTopic(meta_content, query)


In [31]:
import json

# Convert the documents list to a JSON serializable format
documents_json = [
    {
        "metadata": doc.metadata,
        "page_content": doc.page_content
    }
    for doc in documents
]

# Specify the file path for the JSON file
json_file_path = 'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Lee Hang\\documents.json'

# Save the documents list into a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(documents_json, json_file) 

In [18]:
position = []
count = 0

for doc in documents:
    if(len(doc.metadata['topic'].split()) > 1):
        print(doc.metadata['topic'])
        position.append(count)
    count += 1

print(position)

cooking techniques
culinary arts
cooking competition
cooking show
elderly care
pet training
pet training
pet relations
[6, 25, 27, 29, 55, 60, 61, 66]


In [29]:
documents[66].metadata['topic'] = "pet"

In [13]:
count = 0

for doc in documents:
    if(len(doc.metadata['topic']) == 0):
        print(count)
    count += 1

In [30]:
temp = {}

for doc in documents:
    if(doc.metadata['topic'] not in temp):
        temp[doc.metadata['topic']] = 1

print(temp)

{'cooking': 1, 'television': 1, 'travel': 1, 'language': 1, 'technology': 1, 'fintech': 1, 'hackathon': 1, 'business': 1, 'volunteering': 1, 'activities': 1, 'stories': 1, 'elderly': 1, 'pets': 1, 'pet': 1, 'pet_adjustment': 1}


In [16]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

In [32]:
# Vector store
faiss_vectorstore_LeeHang = FAISS.from_documents(documents, embeddings)

faiss_vectorstore_LeeHang

<langchain_community.vectorstores.faiss.FAISS at 0x16bf0d26bd0>

In [33]:
## Saving Vector Store
faiss_vectorstore_LeeHang.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Lee Hang\\faiss_vectorstore_LeeHang")

## Gregory

In [8]:
# General
import os
import json
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage


from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

import openai

In [9]:
# Load in environment variables
load_dotenv(override=True)

llm = AzureChatOpenAI(
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_key=os.environ['AZURE_OPENAI_APIKEY'],
    deployment_name=os.environ['AZURE_OPENAI_DEPLOYMENT_NAME'],
    model_name=os.environ['AZURE_OPENAI_MODEL_NAME'],
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
    temperature=0
)

In [27]:
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Gregory'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Gregory\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

55


In [28]:
# Embeddings
from langchain_openai.embeddings import AzureOpenAIEmbeddings
open_ai_embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

In [29]:
# Vector store
faiss_vectorstore_open_ai_Gregory = FAISS.from_documents(documents, open_ai_embeddings)
## Saving Vector Store
faiss_vectorstore_open_ai_Gregory.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Gregory\\faiss_vectorstore_open_ai_Gregory")

In [13]:
# Instruction prompt
meta_content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. 
Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Gregory": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Gregory! Yeah, it's 
            always exciting to see how your team will perform."}

            football

            Example 2:
            {"Gregory": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Gregory": "Hey there! How are you doing?", "John": "Hey Gregory! I'm doing great, how about you?"}

            general 
"""

In [14]:
def getTopic(meta_content, query):
    # Learning instructions
    instruction = {
        "role": "system",
        "content": meta_content,
    }

    #print("Query is: " + query)

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]

    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    topic = raw_response.choices[0].message.content

    return topic

In [15]:
for doc in documents:
    
    query = doc.page_content

    doc.metadata['topic'] = getTopic(meta_content, query)



In [21]:
import json

# Convert the documents list to a JSON serializable format
documents_json = [
    {
        "metadata": doc.metadata,
        "page_content": doc.page_content
    }
    for doc in documents
]

# Specify the file path for the JSON file
json_file_path = 'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Gregory\\documents.json'

# Save the documents list into a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(documents_json, json_file) 

In [16]:
position = []
count = 0

for doc in documents:
    if(len(doc.metadata['topic'].split()) > 1):
        print(doc.metadata['topic'])
        position.append(count)
    count += 1

print(position)

video games
puzzle games
video games
television drama
[26, 27, 28, 32]


In [24]:
count = 0

for doc in documents:
    if(len(doc.metadata['topic']) == 0):
        print(count)
    count += 1

In [23]:
temp = {}

for doc in documents:
    if(doc.metadata['topic'] not in temp):
        temp[doc.metadata['topic']] = 1

print(temp)

{'travel': 1, 'food': 1, 'gaming': 1, 'video games': 1, 'puzzle games': 1, 'kdramas': 1, 'movies': 1, 'television drama': 1, 'television': 1, 'entertainment': 1, 'football': 1, 'sports': 1}


In [19]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

In [22]:
# Vector store
faiss_vectorstore_Gregory = FAISS.from_documents(documents, embeddings)

faiss_vectorstore_Gregory

<langchain_community.vectorstores.faiss.FAISS at 0x1afff8de0d0>

In [25]:
## Saving Vector Store
faiss_vectorstore_Gregory.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Gregory\\faiss_vectorstore_Gregory")

## Cheryl

In [30]:
# General
import os
import json
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage


from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

import openai

In [31]:
# Load in environment variables
load_dotenv(override=True)

llm = AzureChatOpenAI(
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_key=os.environ['AZURE_OPENAI_APIKEY'],
    deployment_name=os.environ['AZURE_OPENAI_DEPLOYMENT_NAME'],
    model_name=os.environ['AZURE_OPENAI_MODEL_NAME'],
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
    temperature=0
)

In [32]:
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Cheryl'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Cheryl\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

60


In [33]:
# Embeddings
from langchain_openai.embeddings import AzureOpenAIEmbeddings
open_ai_embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

In [34]:
# Vector store
faiss_vectorstore_open_ai_Cheryl = FAISS.from_documents(documents, open_ai_embeddings)
## Saving Vector Store
faiss_vectorstore_open_ai_Cheryl.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Cheryl\\faiss_vectorstore_open_ai_Cheryl")

In [35]:
# Instruction prompt
meta_content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. 
Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Cheryl": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Cheryl! Yeah, it's 
            always exciting to see how your team will perform."}

            football

            Example 2:
            {"Cheryl": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Cheryl": "Hey there! How are you doing?", "John": "Hey Cheryl! I'm doing great, how about you?"}

            general 
"""

In [36]:
def getTopic(meta_content, query):
    # Learning instructions
    instruction = {
        "role": "system",
        "content": meta_content,
    }

    #print("Query is: " + query)

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]

    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    topic = raw_response.choices[0].message.content

    return topic

In [37]:
for doc in documents:
    
    query = doc.page_content

    doc.metadata['topic'] = getTopic(meta_content, query)



In [41]:
import json

# Convert the documents list to a JSON serializable format
documents_json = [
    {
        "metadata": doc.metadata,
        "page_content": doc.page_content
    }
    for doc in documents
]

# Specify the file path for the JSON file
json_file_path = 'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Cheryl\\documents.json'

# Save the documents list into a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(documents_json, json_file) 

In [38]:
position = []
count = 0

for doc in documents:
    if(len(doc.metadata['topic'].split()) > 1):
        print(doc.metadata['topic'])
        position.append(count)
    count += 1

print(position)

outdoor activities
Formula 1
event planning
marathon training
[17, 18, 19, 46]


In [39]:
count = 0

for doc in documents:
    if(len(doc.metadata['topic']) == 0):
        print(count)
    count += 1

In [40]:
temp = {}

for doc in documents:
    if(doc.metadata['topic'] not in temp):
        temp[doc.metadata['topic']] = 1

print(temp)

{'cooking': 1, 'decorating': 1, 'gifts': 1, 'travel': 1, 'hiking': 1, 'outdoor activities': 1, 'Formula 1': 1, 'event planning': 1, 'sports': 1, 'shopping': 1, 'racing': 1, 'promotions': 1, 'entertainment': 1, 'memories': 1, 'advice': 1, 'photography': 1, 'marathon': 1, 'fitness': 1, 'marathon training': 1, 'nutrition': 1, 'running': 1, 'exercise': 1, 'badminton': 1, 'sports ': 1}


In [None]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

In [42]:
# Vector store
faiss_vectorstore_Cheryl = FAISS.from_documents(documents, embeddings)

faiss_vectorstore_Cheryl

<langchain_community.vectorstores.faiss.FAISS at 0x1afe69fe150>

In [43]:
## Saving Vector Store
faiss_vectorstore_Cheryl.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Cheryl\\faiss_vectorstore_Cheryl")

## Yu Min

In [44]:
# General
import os
import json
from dotenv import load_dotenv
from langchain_openai import AzureChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage


from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document

# Retrievers
from langchain.retrievers import BM25Retriever, EnsembleRetriever

import openai

In [45]:
# Load in environment variables
load_dotenv(override=True)

llm = AzureChatOpenAI(
    azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'],
    api_key=os.environ['AZURE_OPENAI_APIKEY'],
    deployment_name=os.environ['AZURE_OPENAI_DEPLOYMENT_NAME'],
    model_name=os.environ['AZURE_OPENAI_MODEL_NAME'],
    api_version=os.environ['AZURE_OPENAI_API_VERSION'],
    temperature=0
)

In [46]:
documents = []
for filename in os.listdir('C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Yu Min'):
    if filename.endswith(".json"):
        with open(f'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Yu Min\\{filename}') as f:
            data = json.load(f)
            for response_label, conversation in data.items():
                doc_content = json.dumps(conversation)
                doc_metadata = {"label": response_label, "source": filename}
                documents.append(Document(page_content=doc_content, metadata=doc_metadata))

print(len(documents))

70


In [47]:
# Embeddings
from langchain_openai.embeddings import AzureOpenAIEmbeddings
open_ai_embeddings = AzureOpenAIEmbeddings(azure_endpoint=os.environ['AZURE_OPENAI_ENDPOINT'], 
                                   api_key=os.environ['AZURE_OPENAI_APIKEY'], 
                                   model=os.environ['TEXT_EMBEDDING_MODEL_NAME'],
                                   azure_deployment=os.environ['TEXT_EMBEDDING_DEPLOYMENT_NAME'])

In [48]:
# Vector store
faiss_vectorstore_open_ai_Yumin = FAISS.from_documents(documents, open_ai_embeddings)
## Saving Vector Store
faiss_vectorstore_open_ai_Yumin.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Yu Min\\faiss_vectorstore_open_ai_Yumin")

In [49]:
# Instruction prompt
meta_content = """
You would be assisting in identifying topics from a snippet of conversation. I would supply the conversation directly. 
Interpret the main topic of the conversation and return the main topic.

Do not give multiple topics such as football/soccer. Only give one main topic.

For example if the conversation is: 
            {"Yu Min": "Can't wait for the new football season to start, hoping for a great one for Arsenal!", "John": "Hey Yu Min! Yeah, it's 
            always exciting to see how your team will perform."}

            football

            Example 2:
            {"Yu Min": "I'm planning to go on a trip to Japan next year", "John": "That's awesome! Japan is such a beautiful country."}

            travel

            Example 3: If no main topic can be determined such as a greeting
            {"Yu Min": "Hey there! How are you doing?", "John": "Hey Yu Min! I'm doing great, how about you?"}

            general 
"""

In [50]:
def getTopic(meta_content, query):
    # Learning instructions
    instruction = {
        "role": "system",
        "content": meta_content,
    }

    #print("Query is: " + query)

    # Initialize messages
    messages = []

    # Add learn instruction to message array
    messages.append(instruction)

    user_message = {
            "role": "user",
            "content": query
    }

    messages.append(user_message)

    openai.api_type = 'openai'
    openai.api_key = os.environ["OPENAI_API_KEY"]
    openai.organisation= os.environ["OPEN_AI_ORG"]

    raw_response = openai.chat.completions.create(
            model="gpt-3.5-turbo",
            messages = messages,
    )
    topic = raw_response.choices[0].message.content

    return topic

In [51]:
for doc in documents:
    
    query = doc.page_content

    doc.metadata['topic'] = getTopic(meta_content, query)



In [63]:
import json

# Convert the documents list to a JSON serializable format
documents_json = [
    {
        "metadata": doc.metadata,
        "page_content": doc.page_content
    }
    for doc in documents
]

# Specify the file path for the JSON file
json_file_path = 'C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Yu Min\\documents.json'

# Save the documents list into a JSON file
with open(json_file_path, 'w') as json_file:
    json.dump(documents_json, json_file) 

In [52]:
position = []
count = 0

for doc in documents:
    if(len(doc.metadata['topic'].split()) > 1):
        print(doc.metadata['topic'])
        position.append(count)
    count += 1

print(position)

adrenaline sports
dining out
event planning
event planning
event planning
event planning
video games
[11, 20, 22, 27, 31, 35, 67]


In [58]:
documents[20].metadata['topic'] = "event planning"

In [53]:
count = 0

for doc in documents:
    if(len(doc.metadata['topic']) == 0):
        print(count)
    count += 1

In [59]:
temp = {}

for doc in documents:
    if(doc.metadata['topic'] not in temp):
        temp[doc.metadata['topic']] = 1

print(temp)

{'adventure': 1, 'travel': 1, 'challenge': 1, 'skydiving': 1, 'cityscape': 1, 'party': 1, 'beverages': 1, 'event planning': 1, 'music': 1, 'events': 1, 'cooking': 1, 'baking': 1, 'hobbies': 1, 'relationship': 1, 'swimming': 1, 'education': 1, 'motivation': 1, 'sports': 1, 'video games': 1, 'floorball': 1}


In [60]:
embeddings=HuggingFaceInferenceAPIEmbeddings(
    api_key=os.environ['HUGGING_FACE_ACCESS_TOKEN'],
    model_name='BAAI/bge-base-en-v1.5'
)

In [61]:
# Vector store
faiss_vectorstore_Yumin = FAISS.from_documents(documents, embeddings)

faiss_vectorstore_Yumin

<langchain_community.vectorstores.faiss.FAISS at 0x1afff8ea590>

In [62]:
## Saving Vector Store
faiss_vectorstore_Yumin.save_local("C:\\Roydon\\Github\\FYP_Application\\MuteCompanion\\MuteApp\\assets\\mockdata\\Yu Min\\faiss_vectorstore_Yumin")