In [1]:
import os

In [2]:
! pip install openai



In [3]:
#Fetching OPENAI_API_KEY from colab API keys
from google.colab import userdata
os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')

# Retrival Augmented Generation using OpenAI



In [4]:
import numpy as np
import pandas as pd


In [5]:
import tensorflow as tf
import tf_keras as keras

In [6]:
from openai import OpenAI

In [7]:
client = OpenAI()

## Document Ingestion

In [8]:
!pip install pypdf
!pip install python-docx
!pip install beautifulsoup4

Collecting pypdf
  Downloading pypdf-5.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading pypdf-5.3.1-py3-none-any.whl (302 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.0 kB[0m [31m9.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.3.1
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [9]:
import pypdf
import docx
import zipfile

In [10]:
local_zip = '/content/NLP_intents.zip'
zip_ref = zipfile.ZipFile(local_zip,'r')
zip_ref.extractall('/content')
zip_ref.close()

In [11]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [12]:
#Document Extraction

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file"""
    text = ""
    reader = pypdf.PdfReader(pdf_path)
    for page in reader.pages:
        text += page.extract_text() + "\n"
    return text

def extract_text_from_docx(docx_path):
    """Extract text from a Word file"""
    text = ""
    doc = docx.Document(docx_path)
    for para in doc.paragraphs:
        text += para.text + "\n"
    return text

def extract_text_from_txt(txt_path):
    """Extract text from a TXT file"""
    with open(txt_path, "r", encoding="utf-8") as file:
        return file.read()

def extract_text_from_html(fh):
  with open(fh, "r", encoding="utf-8") as file:
    content = file.readlines()
    print(content)#Contains list of html links
    for i in content:
      soup = BeautifulSoup(i, "html.parser")
      response = requests.get(soup.get_text())
      print(type(response.text))
      return response.text

def load_documents(folder_path):
    """Load all supported document types from a folder"""
    documents = []
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        if filename.endswith(".pdf"):
            text = extract_text_from_pdf(file_path)
        elif filename.endswith(".docx"):
            text = extract_text_from_docx(file_path)
        elif filename.endswith(".txt"):
            text = extract_text_from_txt(file_path)
        elif filename.endswith(".html"):
            text = extract_text_from_html(file_path)
        else:
            continue  # Skip unsupported file types
        documents.append({'filename': filename, 'text':text})
    return documents


In [13]:
fp = '/content/NLP_intents'
docs = load_documents(fp)
print(type(docs))

<class 'list'>


In [14]:
raw_text = ''
for d in docs:#(d is tuple i.e.(name, text))
  #raw_text += d[1]
  raw_text += str(d)

In [15]:
raw_text



##Text Splitting and Chunking

In [16]:
!pip install -q langchain

In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [18]:
my_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 800,
    chunk_overlap = 100,
    length_function=len
)

In [19]:
chunks = my_splitter.split_text(raw_text)

In [20]:
chunks

['{\'filename\': \'rule_word2.docx\', \'text\': "ICC Men’s U19 Cricket World Cup 2024 Playing Conditions 1 Preamble - The Spirit of Cricket Cricket owes much of its appeal and enjoyment to the fact that it should be played not only according to the Laws (which are incorporated within these Playing Conditions), but also within the Spirit of Cricket. The major responsibility for ensuring fair play rests with the captains, but extends to all players, match officials and, especially in junior cricket, teachers, coaches and parents. Respect is central to the Spirit of Cricket. Respect your captain, team-mates, opponents and the authority of the umpires. Play hard and play fair. Accept the umpire’s decision. Create a positive atmosphere by your own conduct, and encourage others to do likewise. Show',
 'Create a positive atmosphere by your own conduct, and encourage others to do likewise. Show self-discipline, even when things go against you. Congratulate the opposition on their successes, an

##Create Index

In [21]:
pre_upsert_df = pd.DataFrame(columns=['id', 'values', 'metadata'])

In [22]:
pre_upsert_df

Unnamed: 0,id,values,metadata


In [23]:
def generate_ids(number, size):
  import string, random
  ids=[]
  for i in range(number):
    res = ''.join(random.choices(string.ascii_letters, k=size))
    ids.append(res)
    if len(set(ids)) != i+1:
      i-=1
      ids.pop(-1)

  return ids

In [24]:
def get_embeddings(text, model='text-embedding-ada-002'):
  text = text.replace("\n"," ")
  return client.embeddings.create(input=text, model=model).data[0].embedding

In [25]:
def load_chunks(df, split_text):
  ids=generate_ids(len(split_text),7)
  i = 0
  for chunk in split_text:
    df.loc[i]= [ids[i],get_embeddings(chunk, model='text-embedding-3-small'),{'text':chunk}]
    i+=1
  return df

In [26]:
my_index_df = load_chunks(pre_upsert_df, chunks)

In [27]:
my_index_df

Unnamed: 0,id,values,metadata
0,PlxobYH,"[0.01995290070772171, 0.06670600175857544, 0.0...","{'text': '{'filename': 'rule_word2.docx', 'tex..."
1,dNTlYQT,"[0.029205966740846634, -0.005541421473026276, ...",{'text': 'Create a positive atmosphere by your...
2,rQVoNhg,"[0.011474181897938251, 0.02478327974677086, 0....",{'text': 'Each captain shall nominate 11 playe...
3,LxkEWvY,"[0.015696045011281967, 0.018519073724746704, 0...",{'text': 'nominated including those nominated ...
4,RlWIMKu,"[-0.004091864917427301, 0.03072209283709526, 0...",{'text': 'player or player support personnel w...
...,...,...,...
2019,xdrFONH,"[-0.007341206073760986, 0.004794763866811991, ...",{'text': 'competition or tournament held under...
2020,OpOnicS,"[-0.009653075598180294, 0.0016899119364097714,...",{'text': 'and binding determination. APPENDIX ...
2021,WBrjNlL,"[-0.013384937308728695, -0.011868387460708618,...",{'text': 'decides to award T20I status. ii) Te...
2022,NwSdxpt,"[0.002839159220457077, -0.009074818342924118, ...",{'text': 'against teams adjudged First-Class p...


In [28]:
my_index_df.to_csv('final_cricket.csv', index=False)

## Connect and Upsert to Vector Data Base: Pinecone

In [29]:
!pip install pinecone

Collecting pinecone
  Downloading pinecone-6.0.1-py3-none-any.whl.metadata (8.8 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-6.0.1-py3-none-any.whl (421 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/421.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━[0m [32m337.9/421.4 kB[0m [31m10.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.4/421.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone
Successfully installed pinecone-6.0.1 pinecone-plugin-interface-0.0.7


In [30]:
from pinecone import Pinecone

In [31]:
os.environ['PINECONE_API_KEY']='pcsk_jCydh_CFAHfTpuLgbKc6fgBwpD2qheFbRNjnMsMhhcfH26SnLgsAM5dFyFThSsgzXYqQf'

In [32]:
pc = Pinecone(api_key=os.environ['PINECONE_API_KEY'])

In [33]:
index = pc.Index('my-new-rag')

In [34]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 18767}},
 'total_vector_count': 18767,
 'vector_type': 'dense'}

In [35]:
my_index_from_csv = pd.read_csv('final_cricket.csv')

In [36]:
my_index_from_csv

Unnamed: 0,id,values,metadata
0,PlxobYH,"[0.01995290070772171, 0.06670600175857544, 0.0...","{'text': '{\'filename\': \'rule_word2.docx\', ..."
1,dNTlYQT,"[0.029205966740846634, -0.005541421473026276, ...",{'text': 'Create a positive atmosphere by your...
2,rQVoNhg,"[0.011474181897938251, 0.02478327974677086, 0....",{'text': 'Each captain shall nominate 11 playe...
3,LxkEWvY,"[0.015696045011281967, 0.018519073724746704, 0...",{'text': 'nominated including those nominated ...
4,RlWIMKu,"[-0.004091864917427301, 0.03072209283709526, 0...",{'text': 'player or player support personnel w...
...,...,...,...
2019,xdrFONH,"[-0.007341206073760986, 0.004794763866811991, ...",{'text': 'competition or tournament held under...
2020,OpOnicS,"[-0.009653075598180294, 0.0016899119364097714,...",{'text': 'and binding determination. APPENDIX ...
2021,WBrjNlL,"[-0.013384937308728695, -0.011868387460708618,...",{'text': 'decides to award T20I status. ii) Te...
2022,NwSdxpt,"[0.002839159220457077, -0.009074818342924118, ...",{'text': 'against teams adjudged First-Class p...


In [37]:
def prepare_DF(df):
  import json,ast
  try: df=df.drop('Unnamed: 0',axis=1)
  except: print('Unnamed Not Found')
  df['values']=df['values'].apply(lambda x: np.array([float(i) for i in x.replace("[",'').replace("]",'').split(',')]))
  df['metadata']=df['metadata'].apply(lambda x: ast.literal_eval(x))
  return df

In [38]:
index_df = prepare_DF(my_index_from_csv)

Unnamed Not Found


In [39]:
index_df

Unnamed: 0,id,values,metadata
0,PlxobYH,"[0.01995290070772171, 0.06670600175857544, 0.0...","{'text': '{'filename': 'rule_word2.docx', 'tex..."
1,dNTlYQT,"[0.029205966740846634, -0.005541421473026276, ...",{'text': 'Create a positive atmosphere by your...
2,rQVoNhg,"[0.011474181897938251, 0.02478327974677086, 0....",{'text': 'Each captain shall nominate 11 playe...
3,LxkEWvY,"[0.015696045011281967, 0.018519073724746704, 0...",{'text': 'nominated including those nominated ...
4,RlWIMKu,"[-0.004091864917427301, 0.03072209283709526, 0...",{'text': 'player or player support personnel w...
...,...,...,...
2019,xdrFONH,"[-0.007341206073760986, 0.004794763866811991, ...",{'text': 'competition or tournament held under...
2020,OpOnicS,"[-0.009653075598180294, 0.0016899119364097714,...",{'text': 'and binding determination. APPENDIX ...
2021,WBrjNlL,"[-0.013384937308728695, -0.011868387460708618,...",{'text': 'decides to award T20I status. ii) Te...
2022,NwSdxpt,"[0.002839159220457077, -0.009074818342924118, ...",{'text': 'against teams adjudged First-Class p...


In [40]:
def convert_data(chunk):
 'Converts a pandas dataframe to be a simple list of tuples, formatted how the `upsert()` method in the Pinecone Python client expects.'
 data = []
 for i in chunk.to_dict('records'):
  data.append(i)
 return data

In [41]:
def load_chunker(seq, size):
 'Yields a series of slices of the original iterable, up to the limit of what size is.'
 for pos in range(0, len(seq), size):
   yield seq.iloc[pos:pos + size]

### UPSERT to Pinecone

In [42]:
for load_chunk in load_chunker(index_df, 200):
  index.upsert(vectors=convert_data(load_chunk))

In [43]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 20767}},
 'total_vector_count': 20767,
 'vector_type': 'dense'}

## Retrieve Context

In [44]:
sample_text = 'What is ILLEGAL BOWLING ACTIONS?'

In [45]:
test_embeddings = get_embeddings(sample_text, model='text-embedding-3-small')

In [46]:
index.query(vector=test_embeddings, top_k=5, include_metadata=True)

{'matches': [{'id': 'sMgumUE',
              'metadata': {'text': 'constitutes an Illegal Bowling Action and '
                                   'these Illegal Bowling \n'
                                   'Regulations amongst all Players.\n'
                                   ' 2 ILLEGAL BOWLING ACTIONS\n'
                                   ' 2.1 An Illegal Bowling Action is a '
                                   'bowling action where the Player’s \n'
                                   'Elbow Extension exceeds 15 degrees, '
                                   'measured using the Standard \n'
                                   'Analysis Protocols from the point at which '
                                   'the bowling arm reaches \n'
                                   'the horizontal until the point at which '
                                   'the ball is released. Any Elbow \n'
                                   'Hyperextension shall be discounted for the '
                       

In [47]:
contexts = [item['metadata']['text'] for item in index.query(vector=test_embeddings, top_k=5, include_metadata=True)['matches']]

In [48]:
contexts

['constitutes an Illegal Bowling Action and these Illegal Bowling \nRegulations amongst all Players.\n 2 ILLEGAL BOWLING ACTIONS\n 2.1 An Illegal Bowling Action is a bowling action where the Player’s \nElbow Extension exceeds 15 degrees, measured using the Standard \nAnalysis Protocols from the point at which the bowling arm reaches \nthe horizontal until the point at which the ball is released. Any Elbow \nHyperextension shall be discounted for the purposes of determining  \nan Illegal Bowling Action.\n 2.2 Nothing contained herein shall override an Umpire’s responsibility and \ndiscretion\tto\tapply\tLaw\t24\tof\tthe\tLaws\tof\tCricket,\tas\tmodified\tby\tArticle\t \n2.1 above.\n 3 REPORTING PROCEDURE FOR A SUSPECTED ILLEGAL  \nBOWLING ACTION',
 'constitutes an Illegal Bowling Action and these Illegal Bowling \nRegulations amongst all Players.\n 2 ILLEGAL BOWLING ACTIONS\n 2.1 An Illegal Bowling Action is a bowling action where the Player’s \nElbow Extension exceeds 15 degrees, measu

In [49]:
def get_context(query, embed_model='text-embedding-3-small',k=5):
  query_embeddings = get_embeddings(query, model=embed_model)
  pinecone_response = index.query(vector =query_embeddings, top_k=k, include_metadata=True)
  contexts =[item['metadata']['text'] for item in pinecone_response['matches']]
  return contexts, query

### Prompt Engineering

In [50]:
import textwrap

In [71]:
def ask_gpt(system_prompt,user_prompt, model="gpt-3.5-turbo", temp=0.7):

  temperature_=temp

  completion = client.chat.completions.create(
    model=model,
    temperature=temperature_,
    messages=[
        {
            "role": "system",
            "content": system_prompt
        },
        {
            "role": "user",
            "content": user_prompt
        }
    ]
  )
  lines = (completion.choices[0].message.content).split("\n")
  lists = (textwrap.TextWrapper(width=90, break_long_words=False).wrap(line) for line in lines)
  return "\n".join("\n".join(list) for list in lists)

In [72]:
def augmented_query(user_query, embed_model='text-embedding-3-small', k=5):
  contexts, query = get_context(user_query, embed_model=embed_model, k=k)
  return "\n\n---\n\n".join(contexts)+"\n\n---\n\n"+query

In [73]:
my_user_prompt = augmented_query('What is National Cricket Federation?')

In [74]:
my_user_prompt

'National Cricket Federation. A national or regional entity which is a member of or\nis recognised by the ICC as the entity governing the sport of cricket in a country (or\ncollective group of countries associated for cricket purposes).\nNotice of Charge. As defined in Article 4.4.\nOne Day International Match. As defined by Section 33 of the ICC Classification of\nOfficial Cricket.\nPCT Code. The agreed set of core values in force from time to timewhich all Match\nOfficials and Match Official Support Personnel are required to operate in accordancewith\nin pursuance oftheir duties as Match Officials and Match Official Support Personnel,\nthe current version ofwhich is attached at Appendix 2.\t\nPlayer. Any cricketer who is selected in any playing or touring team or squad that is\n\n---\n\nNational Cricket Federation. A national or regional entity which is a member of or\nis recognised by the ICC as the entity governing the sport of cricket in a country (or\ncollective group of countrie

### System Prompt

In [75]:
# primer = f"""
# You are a question and answer bot. A highly intelligent system that answers user
# questions based on information provided by the user above each question.
# If the answer cannot be found in the information provided by the user, you truthfully
# answer, "I don't Know"
# """

In [101]:
primer = f"""
You are a question and answer bot. A highly intelligent system that answers user
questions based on information provided by the user above each question.
Avoid giving harmful and inappropriate answers.
If the answer cannot be found in the information provided by the user, you truthfully
answer, "I don't Know"
"""

In [102]:
import re

def validate_input(user_query):
  if re.search(r"system prompt|profanity|hate_speech_keywords", user_query):
    return "Sorry, your query contains inappropriate text/language."
  return user_query

In [103]:
def moderate_output(model_response):
  if re.search(r"system prompt|profanity|hate_speech_keywords", model_response):
    return "I'm sorry, I cannot provide an answer to that."
  return model_response

In [92]:
ask_gpt(system_prompt=primer, user_prompt=augmented_query('What is National Cricket Federation?'))

'National Cricket Federation is a national or regional entity that is a member of or\nrecognized by the ICC as the entity governing the sport of cricket in a country or a\ncollective group of countries associated for cricket purposes.'

In [93]:
ask_gpt(system_prompt=primer, user_prompt=augmented_query('Who is Sachin Tendulkar?'))

'Sachin Tendulkar is a former Indian cricketer widely regarded as one of the greatest\nbatsmen in the history of cricket. He is known for his numerous records and achievements\nin the sport, including being the highest run-scorer in international cricket. Tendulkar\nis often referred to as the "Master Blaster" and "Little Master" by cricket fans around\nthe world.'

In [104]:
ask_gpt(system_prompt=primer, user_prompt=augmented_query('Who is Elon Musk?'))

"I don't know."

Yodabot


In [79]:
def Yoda_AI(query):
  embed_model ='text-embedding-3-small'
  primer = f"""
  You are the Jedi Master Yoda. You are a wise and powerful teacher. You provide
  complete and consise answers from the information provided above the user prompt. If the answer cannot be found in the information provided, you truthfully
  say "I Know not" in a manner consistent with the light side of The Force.
  """
  llm_model='gpt-3.5-turbo'
  user_prompt=augmented_query(query,embed_model)
  return ask_gpt(primer, user_prompt, model=llm_model)

In [80]:
Yoda_AI('What is National Cricket Federation?')

'National Cricket Federation is a national or regional entity recognized by the ICC as the\ngoverning body for cricket in a specific country or group of countries associated for\ncricket purposes.'

In [81]:
Yoda_AI('Who is Katrina Kaif?')

'I Know not.'

##UI Interface: Gradio

In [82]:
!pip install gradio --upgrade
import gradio as gr



In [83]:
# def rag_pipeline(query):
#     augmented_query_text = augmented_query(query)
#     answer = ask_gpt(system_prompt=primer, user_prompt=augmented_query_text)
#     return answer

In [108]:
def rag_pipeline(query):
    validated_query = validate_input(query)
    if validated_query != query: # If query was modified return warning msg
        return validated_query
    augmented_query_text = augmented_query(validated_query)
    answer = ask_gpt(system_prompt=primer, user_prompt=augmented_query_text)
    moderated_answer = moderate_output(answer)
    return moderated_answer

In [109]:
iface = gr.Interface(
    fn=rag_pipeline,
    inputs=gr.Textbox(lines=2, placeholder="Enter your query here..."),
    outputs=gr.Textbox(),
    title="RAG: Expert Chatbot For Cricket RuleBook",
    description="Ask me anything about cricket rules you need to know.",
)

In [110]:
iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://2bdc204619fe1ece5b.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


