In [1]:
!pip install langchain openai python-dotenv pypdf
!pip install unstructured chromadb tiktoken
#for the abstracted example

Collecting langchain
  Downloading langchain-0.0.305-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-0.28.1-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-dotenv
  Downloading python_dotenv-1.0.0-py3-none-any.whl (19 kB)
Collecting pypdf
  Downloading pypdf-3.16.2-py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.3/276.3 kB[0m [31m28.8 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.1-py3-none-any.whl (27 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langsmith<0.1.0,>=0.0.38 (from langchain)
  Downloading langsmith-0.0.41-py3-none-any.w

## Use this if running on Google Colab

In [2]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/My Drive/Colab Notebooks/

Mounted at /content/drive/
/content/drive/My Drive/Colab Notebooks


## Initialise environment variables (link to Azure Resources)

In [3]:
from dotenv import load_dotenv
import os

# Load environment variables from the env file (path is relative to the
# current working directory).
load_dotenv("skillwise.env")

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    # Stop here with an actionable message instead of passing None to the
    # embedding / chat clients created in later cells.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Add it to skillwise.env in the current "
        "working directory or export it in the environment before running."
    )

## Importing Custom Data from Various Files

In [4]:
from langchain.document_loaders import PyPDFLoader
from langchain.document_loaders.csv_loader import CSVLoader


# One loader per data source; add further loaders to this list to index
# additional files.
loaders = [
    CSVLoader(file_path="data/SkillWise Dataset.csv", encoding='utf8'),
]

# Flatten the documents returned by every loader into a single list.
docs = [doc for loader in loaders for doc in loader.load()]

from langchain.embeddings import OpenAIEmbeddings

# Embedding model shared by index creation and index loading further down.
# NOTE(review): chunk_size=16 presumably caps how many texts are sent per
# embedding request (e.g. to match a provider batch limit) — confirm.
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY, chunk_size=16)

## Set up LLM model

In [5]:
from langchain.chat_models import ChatOpenAI

# Chat model used to answer questions: replies are capped at 300 tokens and
# each request is given a 30 second timeout.
llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    temperature=0.5,
    max_tokens=300,
    request_timeout=30,
)

# **Abstracted/Simplest Way**
Follow: https://python.langchain.com/docs/use_cases/question_answering.html


In [6]:
# from langchain.indexes import VectorstoreIndexCreator

# # Create your index
# index = VectorstoreIndexCreator(embedding=embeddings).from_loaders(loaders)

In [7]:
# Question-answering
# question = "Can you recommend me a Python course?"
# index.query(question, llm)

## More Detailed Way (more control)

1. Load your documents
2. Create your index (VectorStore)
3. Query your index

https://techcommunity.microsoft.com/t5/startups-at-microsoft/build-a-chatbot-to-query-your-documentation-using-langchain-and/ba-p/3833134

Facebook AI Similarity Search (Faiss) is a library for efficient similarity search and clustering of dense vectors. It contains algorithms that search in sets of vectors of any size, up to ones that possibly do not fit in RAM. It also contains supporting code for evaluation and parameter tuning.
https://python.langchain.com/docs/modules/data_connection/vectorstores/integrations/faiss


In [8]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


## Create your index (VectorStore)

In [9]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Gather the documents from every registered loader.
docs = [doc for loader in loaders for doc in loader.load()]

# Break the documents into overlapping chunks for embedding and vector storage.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
)
documents = text_splitter.split_documents(docs)

# Embed the chunks with the embedding model and build the FAISS vector store.
db = FAISS.from_documents(documents=documents, embedding=embeddings)

# Persist the vector store to disk; the next cell loads it from this path.
db.save_local("./dbs/documentation/faiss_index")

In [10]:

# Reload the FAISS vector store that was saved to disk in the previous cell.
vectorStore = FAISS.load_local("./dbs/documentation/faiss_index", embeddings)

# Wrap the store as a similarity-search retriever configured with k=2.
retriever = vectorStore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)


## Implement Memory

In [11]:
from langchain.memory import ConversationBufferWindowMemory
from langchain.prompts import (
    ChatPromptTemplate,
    MessagesPlaceholder,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.chains import LLMChain

# Prompt handed to the chain's document-combining step (see
# `combine_docs_chain_kwargs` where the chain is built). It has two template
# variables: {context} and {question}. In that step {context} is filled with
# the retrieved course documents, not with the chat history, so the prompt
# labels it accordingly.
prompt = ChatPromptTemplate(
    messages=[
        SystemMessagePromptTemplate.from_template(
            """You are a custom chatbot running on the SkillWise platform, a talent management and learning platform
            with AI-driven personalized course recommendations and gamification elements to make learning fun and engaging.
            You are having a conversation with a human, and you have a list of SkillWise courses to make personalised recommendations
            from. You are to only recommend courses listed in the Skillwise Dataset.
            If you do not know how to reply, simply respond with "I don't know. Can you provide another query?"
            Here are the relevant SkillWise course records retrieved for this query: {context}

            Here is a short snippet of how the conversation should look like:
            User: Hello!
            Chatbot: Hello! How can I assist you today?
            User: Thank you.
            Chatbot: You're welcome! Let me know if you need any further assistance.
            """
        ),
        HumanMessagePromptTemplate.from_template("User: {question}")
    ]
)





In [12]:
from IPython.core import history
# ConversationBufferWindowMemory keeps a sliding window of the most recent
# k exchanges, so the buffer does not keep growing as the chat goes on.
memory = ConversationBufferWindowMemory(
    k=8,
    memory_key="chat_history",
    return_messages=True,
)

# Use the vector store as a retriever.
# ConversationalRetrievalChain builds on retrieval QA by adding a chat-history
# component. Note that it only uses the message history to generate the
# question for the retriever; by default the history is not exposed to the
# chat LLM that writes the answer.
from langchain.chains import ConversationalRetrievalChain
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
    chain_type="stuff",
    combine_docs_chain_kwargs={'prompt': prompt},
)

def ask_question(qa, question):
    """Run ``question`` through the ``qa`` chain and print the wrapped answer.

    Args:
        qa: Chain object exposing ``run``; it is called with a dict holding
            the question under the ``"question"`` key.
        question: The user's query text.

    Returns:
        None. The answer is printed via ``print_long_text``.
    """
    answer = qa.run({"question": question})
    # print(memory.load_memory_variables({})) # for debugging
    print_long_text(answer)

def print_long_text(text, max_width=80):
    """Print ``text`` word-wrapped so that lines are at most ``max_width`` wide.

    Runs of whitespace (including newlines) are collapsed to single spaces
    before wrapping. Words are never split: a single word longer than
    ``max_width`` is printed on a line of its own. Printed lines carry no
    trailing whitespace, and nothing is printed for empty or blank input.

    Args:
        text: The string to print.
        max_width: Maximum number of characters per printed line.

    Returns:
        None. The wrapped text is written to standard output.
    """
    import textwrap  # local import keeps this cell self-contained

    # Collapse all whitespace first so wrapping operates on single-spaced words.
    normalized = " ".join(text.split())

    wrapped_lines = textwrap.wrap(
        normalized,
        width=max_width,
        break_long_words=False,   # keep URLs and long tokens intact
        break_on_hyphens=False,   # only break at spaces
    )
    for line in wrapped_lines:
        print(line)

# Interactive chat loop: read a query, answer it, repeat until the user quits.
print("Welcome to SkillWise AI. Type in your queries or submit 'q' to quit.")
while True:
    try:
        # Strip surrounding whitespace so that " q " still quits.
        query = input('User: ').strip()
    except (EOFError, KeyboardInterrupt):
        # Closed or interrupted input is treated like an explicit quit instead
        # of ending the cell with a traceback.
        print("Quiting...")
        break
    if query == 'q':
        print("Quiting...")
        break
    if not query:
        # Nothing typed: prompt again rather than sending an empty question
        # to the chain.
        continue
    ask_question(qa, query)


Welcome to SkillWise AI. Type in your queries or submit 'q' to quit.
User: Can you recommend me courses on Skillwise?
Chatbot: Sure! I can recommend courses from the SkillWise platform. Could you 
please provide me with some information about your interests or the specific 
topic you would like to learn? 
User: I would like to learn about Supply chain
Chatbot: Sure! I can recommend the "Supply Chain Management Specialization" on 
Coursera. It covers topics such as transportation, warehousing, inventory, 
logistics network design, and more. You can find more information about the 
course 
[here](https://www.coursera.org/specializations/supply-chain-management). Let 
me know if you need any further assistance! 
User: q
Quiting...
