In [19]:
# Import necessary libraries and modules for handling environment variables, file operations, and various APIs
import os
import glob
from dotenv import load_dotenv
import gradio as gr

In [20]:
# Install necessary langchain libraries using pip
!pip install langchain
!pip install -U langchain-community
!pip install langchain_openai
!pip install langchain_chroma



In [21]:
# Import necessary langchain components for document processing and visualization
from langchain.document_loaders import DirectoryLoader, TextLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np
import plotly.graph_objects as go
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings

In [22]:
# Initialize the OpenAI client.
# The API key is never written into the notebook: variables from a local .env
# file (if one exists) are loaded into the process environment, and the client
# then picks up OPENAI_API_KEY from there on its own.
from openai import OpenAI

load_dotenv()  # does nothing when no .env file is found
openai = OpenAI()

In [23]:
# Every entry directly under the knowledge-base directory is treated as one
# folder of documents to scan
KNOWLEDGE_BASE_PATTERN = "chain_reactions_knowledge_base_detailed/*"
folders = glob.glob(KNOWLEDGE_BASE_PATTERN)

def add_metadata(doc, doc_type):
    """Tag a document with its type and hand the same document back.

    The document is modified in place: its ``metadata`` mapping gains (or
    overwrites) a ``"doc_type"`` entry.

    Args:
        doc (Document): Document whose metadata is updated.
        doc_type (str): Value stored under the ``"doc_type"`` key.

    Returns:
        Document: The very same ``doc`` object that was passed in.
    """
    doc.metadata.update({"doc_type": doc_type})
    return doc

In [24]:
# Keyword arguments handed to each text loader: read the files as UTF-8
text_loader_kwargs = dict(encoding='utf-8')

In [25]:
# Load every markdown file found under each folder and tag each loaded
# document with the name of the folder it came from
documents = []
for folder in folders:
    doc_type = os.path.basename(folder)  # the folder name doubles as the document type
    loader = DirectoryLoader(
        folder,
        glob="**/*.md",
        loader_cls=TextLoader,
        loader_kwargs=text_loader_kwargs,
    )
    for doc in loader.load():
        documents.append(add_metadata(doc, doc_type))

# Cut the documents into overlapping chunks for embedding
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
chunks = text_splitter.split_documents(documents)

# Report how many chunks were produced and which document types were seen
found_doc_types = {doc.metadata['doc_type'] for doc in documents}
print(f"Total number of chunks: {len(chunks)}")
print(f"Document types found: {found_doc_types}")

Total number of chunks: 59
Document types found: {'contracts', 'company', 'products', 'employees'}


In [26]:
# NOTE(review): MODEL is not referenced by any cell visible in this notebook;
# the chat models below name their model explicitly.
MODEL = "gpt-4o-mini"
db_name = "vector_db"

# Initialize OpenAI embeddings.
# No key is passed explicitly: the client reads OPENAI_API_KEY from the
# environment, which keeps the credential out of the notebook.
embeddings = OpenAIEmbeddings()

# Remove existing vector store if it exists, so re-running this cell does not
# add the same chunks to the persisted collection a second time
if os.path.exists(db_name):
    Chroma(persist_directory=db_name, embedding_function=embeddings).delete_collection()

# Create vector store from document chunks
vectorstore = Chroma.from_documents(documents=chunks, embedding=embeddings, persist_directory=db_name)
# `_collection` is a private attribute of the Chroma wrapper, used here only to count entries
print(f"Vectorstore created with {vectorstore._collection.count()} documents")

Vectorstore created with 59 documents


In [27]:
# Fetch a single stored vector and measure its length to learn the
# dimensionality of the embeddings
collection = vectorstore._collection
first_record = collection.get(limit=1, include=["embeddings"])
sample_embedding = first_record["embeddings"][0]
dimensions = len(sample_embedding)
print(f"The vectors have {dimensions:,} dimensions")

The vectors have 1,536 dimensions


In [28]:
# Retrieve vector data, document text, and metadata
result = collection.get(include=['embeddings', 'documents', 'metadatas'])
vectors = np.array(result['embeddings'])
# NOTE: from here on `documents` refers to the document texts returned by the
# collection, replacing the list that was built while loading the files.
documents = result['documents']
metadatas = result['metadatas']
doc_types = [metadata['doc_type'] for metadata in metadatas]

# Marker colour per document type. Document types come from folder names, so a
# type that is not listed here gets a neutral fallback colour instead of
# aborting the cell with a ValueError.
type_colors = {'products': 'blue', 'employees': 'green', 'contracts': 'red', 'company': 'orange'}
colors = [type_colors.get(t, 'gray') for t in doc_types]

In [29]:
# Reduce the embeddings to two dimensions with t-SNE (fixed seed for repeatable layouts)
tsne = TSNE(n_components=2, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Create the 2D scatter plot using Plotly: one marker per stored vector,
# coloured by document type, with the type and the first 100 characters of the
# text shown on hover
fig = go.Figure(data=[go.Scatter(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    mode='markers',
    marker=dict(size=5, color=colors, opacity=0.8),
    text=[f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)],
    hoverinfo='text'
)])

# Axis titles are set on the regular x/y axes; the `scene` layout key only
# applies to 3D plots and would leave a 2D figure without axis titles.
fig.update_layout(
    title='2D Chroma Vector Store Visualization',
    xaxis_title='x',
    yaxis_title='y',
    width=800,
    height=600,
    margin=dict(r=20, b=10, l=10, t=40)
)

fig.show()

In [30]:
# Project the embeddings down to three dimensions with t-SNE
tsne = TSNE(n_components=3, random_state=42)
reduced_vectors = tsne.fit_transform(vectors)

# Hover label for each point: its document type plus the first 100 characters of its text
hover_labels = [f"Type: {t}<br>Text: {d[:100]}..." for t, d in zip(doc_types, documents)]

# One marker per stored vector, coloured by document type
scatter_3d = go.Scatter3d(
    x=reduced_vectors[:, 0],
    y=reduced_vectors[:, 1],
    z=reduced_vectors[:, 2],
    mode='markers',
    marker={'size': 5, 'color': colors, 'opacity': 0.8},
    text=hover_labels,
    hoverinfo='text',
)

# Assemble the figure and label the three scene axes
fig = go.Figure(data=[scatter_3d])
fig.update_layout(
    title='3D Chroma Vector Store Visualization',
    scene={'xaxis_title': 'x', 'yaxis_title': 'y', 'zaxis_title': 'z'},
    width=900,
    height=700,
    margin={'r': 20, 'b': 10, 'l': 10, 't': 40},
)

fig.show()

In [31]:
# Set up a conversational AI chat using OpenAI.
# The API key is intentionally not passed here: the client reads
# OPENAI_API_KEY from the environment, so no credential lives in the notebook.
llm = ChatOpenAI(
    temperature=0.7,
    model_name="gpt-4",  # You can use "gpt-3.5-turbo" as an alternative
)

# Alternative - if you'd like to use Ollama locally, uncomment this line instead
# llm = ChatOpenAI(temperature=0.7, model_name='llama3.2', base_url='http://localhost:11434/v1', api_key='ollama')

# Set up conversation memory to manage chat history
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Initialize a retriever using the vector store
retriever = vectorstore.as_retriever()

# Set up a conversation chain for retrieval-augmented generation (RAG)
conversation_chain = ConversationalRetrievalChain.from_llm(llm=llm, retriever=retriever, memory=memory)

# Query the conversation chain with a question
query = "Please explain what chain reaction africa is in a couple of sentences"
result = conversation_chain.invoke({"question": query})
print(result["answer"])

Chain Reactions Africa is a strategic communications and public relations firm that operates across Africa. They partner with corporations, governments, and institutions to provide full-service PR solutions, including media relations, digital campaigns, and crisis management. They aim to build trust and meaningful relationships for brands, as well as manage crises effectively.


In [32]:
def chat(question, history):
    """Answer a single user message through the conversation chain.

    Args:
        question (str): The user's message; sent to the chain under the
            ``"question"`` key.
        history (list): Chat history supplied by the caller. It is not read
            here; only ``question`` is forwarded to the chain.

    Returns:
        str: The ``"answer"`` entry of the chain's response.
    """
    response = conversation_chain.invoke({"question": question})
    answer = response["answer"]
    return answer

In [33]:
# Wrap `chat` in a Gradio chat UI and open it in a browser tab
chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)

from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

# Template text for the two messages that make up the prompt
SYSTEM_TEMPLATE = "You are a nice interactive chatbot who is expert in answering accurate questions about Chain reactions africa, the pr consultancy company. Give brief, accurate answers. If you don't know the answer, say so. Do not make anything up if you haven't been provided with relevant context."
HUMAN_TEMPLATE = "Context:\n{context}\n\nQuestion: {question}"

# System instructions first, then the human turn carrying the context and the question
system_message = SystemMessagePromptTemplate.from_template(SYSTEM_TEMPLATE)
human_message = HumanMessagePromptTemplate.from_template(HUMAN_TEMPLATE)

# Combine both messages into the prompt used by the conversation chain
chat_prompt = ChatPromptTemplate.from_messages([system_message, human_message])

* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.


In [34]:
# Set up a new conversational AI chat with OpenAI.
# The API key is intentionally not passed here: the client reads
# OPENAI_API_KEY from the environment, so no credential lives in the notebook.
llm = ChatOpenAI(
    temperature=0.7,
    model_name="gpt-4",  # You can use "gpt-3.5-turbo" as an alternative
)

# Set up conversation memory again for new chat instance
memory = ConversationBufferMemory(memory_key='chat_history', return_messages=True)

# Set up the retriever with specific retrieval settings
# NOTE(review): k=50 asks for up to 50 chunks per query, which is most of the
# 59-chunk store; confirm the combined context fits the model's context window.
retriever = vectorstore.as_retriever(search_kwargs={"k": 50})

# Configure conversation chain with specific prompt and settings
conversation_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    combine_docs_chain_kwargs={
        "prompt": chat_prompt,
        "document_variable_name": "context"  # This specifies the context insertion point
    }
)


In [35]:
def chat(question, history):
    """Answer a single user message through the conversation chain.

    Args:
        question (str): The user's message; sent to the chain under the
            ``"question"`` key.
        history (list): Chat history supplied by the caller. It is not read
            here; only ``question`` is forwarded to the chain.

    Returns:
        str: The ``"answer"`` entry of the chain's response.
    """
    response = conversation_chain.invoke({"question": question})
    answer = response["answer"]
    return answer

# Wrap `chat` in a Gradio chat UI and open it in a browser tab
chat_interface = gr.ChatInterface(chat, type="messages")
view = chat_interface.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.
