# RAG Implementation using ChromaDB Embeddings and Anthropic LLM model

# Install required packages

In [1]:
!pip install -q anthropic pandas chromadb
!pip install -q numpy

# Import the required packages

In [2]:
from anthropic import Anthropic
import os
import pandas as pd
import chromadb
import numpy as np
from chromadb.utils import embedding_functions

# Set the API Keys

In [21]:
# The key is read from the environment so it never has to appear in the notebook.
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')

# One client per service: ChromaDB for retrieval, Anthropic for generation.
client_chroma = chromadb.Client()
client_anthropic = Anthropic(api_key=anthropic_api_key)

In [15]:
# SECURITY: a literal Anthropic API key used to be hardcoded in this cell.
# Treat that key as compromised and revoke it. Supply the key through the
# ANTHROPIC_API_KEY environment variable instead of the notebook source.
anthropic_api_key = os.environ.get('ANTHROPIC_API_KEY')

# Load the data into a data frame
#### This dataset has been downloaded from Kaggle https://www.kaggle.com/datasets/suraj520/customer-support-ticket-dataset

In [4]:
# Folder holding the Kaggle CSV (presumably the Colab working directory — adjust if run elsewhere).
datapath_tickets = '/content'
tickets_csv_path = datapath_tickets + '/customer_support_tickets.csv'
df_tickets = pd.read_csv(tickets_csv_path)

# Print the row count, then show a two-row preview as the cell output.
print(len(df_tickets))
df_tickets.head(2)

8469


Unnamed: 0,Ticket ID,Customer Name,Customer Email,Customer Age,Customer Gender,Product Purchased,Date of Purchase,Ticket Type,Ticket Subject,Ticket Description,Ticket Status,Resolution,Ticket Priority,Ticket Channel,First Response Time,Time to Resolution,Customer Satisfaction Rating
0,1,Marisa Obrien,carrollallison@example.com,32,Other,GoPro Hero,2021-03-22,Technical issue,Product setup,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Social media,2023-06-01 12:15:36,,
1,2,Jessica Rios,clarkeashley@example.com,42,Female,LG Smart TV,2021-05-22,Technical issue,Peripheral compatibility,I'm having an issue with the {product_purchase...,Pending Customer Response,,Critical,Chat,2023-06-01 16:45:38,,


In [5]:
# Summarize the frame: column names, non-null counts and dtypes.
df_tickets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8469 entries, 0 to 8468
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Ticket ID                     8469 non-null   int64  
 1   Customer Name                 8469 non-null   object 
 2   Customer Email                8469 non-null   object 
 3   Customer Age                  8469 non-null   int64  
 4   Customer Gender               8469 non-null   object 
 5   Product Purchased             8469 non-null   object 
 6   Date of Purchase              8469 non-null   object 
 7   Ticket Type                   8469 non-null   object 
 8   Ticket Subject                8469 non-null   object 
 9   Ticket Description            8469 non-null   object 
 10  Ticket Status                 8469 non-null   object 
 11  Resolution                    2769 non-null   object 
 12  Ticket Priority               8469 non-null   object 
 13  Tic

# Data preparation for generating embeddings with Vector database Chromadb

In [6]:
# Build two parallel lists of strings: one document and one id per ticket.
# Iterating over the column values is positional, so (unlike looking rows up
# by label with df[col][i]) it still works when the frame's index is not
# 0..n-1, and it avoids a per-row label lookup.
ticket_description = [str(description) for description in df_tickets['Ticket Description']]
ticket_id = [str(identifier) for identifier in df_tickets['Ticket ID']]

# Create chromadb collection and load the data

#### Here we use ChromaDB's default embedding function to generate the embeddings. Other, more specialized embedding models (e.g. Voyage AI) could be used instead.

In [7]:
# Embedding function handed to the collection when it is created.
default_embed_func = embedding_functions.DefaultEmbeddingFunction()

cust_supp_tkt_collection = client_chroma.get_or_create_collection(
  name='cust_supp_tkt_collection', embedding_function=default_embed_func)

# Only the first MAX_TICKETS tickets are indexed, BATCH_SIZE per add() call.
# The batch size reflects the original note that add() limits how much can be
# inserted at once (TODO confirm the exact limit for the installed ChromaDB).
MAX_TICKETS = 1000
BATCH_SIZE = 500
for start in range(0, min(MAX_TICKETS, len(ticket_id)), BATCH_SIZE):
  stop = min(start + BATCH_SIZE, MAX_TICKETS)
  cust_supp_tkt_collection.add(ids=ticket_id[start:stop], documents=ticket_description[start:stop])

#### Executing a sample query to fetch the ticket whose description is most similar to the phrase 'not turning on'.

In [8]:
# Fetch the single closest match and include its embedding vector in the output.
sample_query_result = cust_supp_tkt_collection.query(
  query_texts=["not turning on"],
  n_results=1,
  include=['documents', 'embeddings'],
)
sample_query_result

{'ids': [['434']],
 'embeddings': [array([[-4.58637550e-02, -1.14503661e-02,  4.77238260e-02,
          -8.04173201e-02, -2.98854671e-02, -1.77736152e-02,
           2.57012085e-03,  5.20835891e-02,  2.21555531e-02,
           7.88760372e-03,  4.26727645e-02,  5.01059256e-02,
          -2.35065930e-02,  6.16935529e-02,  7.21299350e-02,
           1.31026283e-01, -1.76124778e-02, -7.65978023e-02,
           4.59535234e-03,  3.98420878e-02, -3.16244960e-02,
           9.60485451e-03, -9.11026436e-05,  3.42344120e-02,
          -1.71957184e-02,  7.50167761e-03,  9.48048569e-03,
          -2.32096463e-02, -3.89010087e-02, -3.35825747e-03,
           1.79355685e-02,  1.70920882e-02, -2.29298752e-02,
          -3.55100143e-03,  1.43859833e-02, -7.72016472e-04,
           1.06728321e-03, -5.83519302e-02, -7.86477476e-02,
          -3.29021551e-02,  3.62745374e-02, -4.21610512e-02,
          -7.14566931e-02,  3.26671712e-02,  4.64714691e-02,
           8.50813091e-03,  5.87139092e-02,  7.27938

#### Function for querying data from the collection

In [9]:
def query_collection(user_query, collection_name, n_results=5):
  """Run a text query against a collection and return the raw result.

  Args:
    user_query: Text to search for; sent as the only entry in ``query_texts``.
    collection_name: Despite the name, this is the collection *object* to
      search (anything exposing
      ``query(query_texts=..., include=..., n_results=...)``), not a string.
    n_results: Number of results to request. Defaults to 5, the value that
      was previously hard-coded.

  Returns:
    Whatever ``collection_name.query`` returns, unmodified. Only documents
    are explicitly requested via ``include``.
  """
  return collection_name.query(
    query_texts=[user_query],
    include=['documents'],
    n_results=n_results)

#### Execute the function and Inspect the response

In [10]:
user_query = "issues related to not turning on"
response = query_collection(user_query, cust_supp_tkt_collection)

# Index 0 presumably selects the results for the single query text that was sent.
matched_ids = response['ids'][0]
matched_documents = response['documents'][0]

# Print each hit as its ticket id followed by the ticket text.
for position, document in enumerate(matched_documents):
  ticket_label = 'Ticket ID: ' + str(matched_ids[position])
  print(ticket_label + '\n\n' + str(document + '\n\n'))

Ticket ID: 434

I'm facing a problem with my {product_purchased}. The {product_purchased} is not turning on. It was working fine until yesterday, but now it doesn't respond.

The user could also do this with This problem started occurring after the recent software update. I haven't made any other changes to the device.


Ticket ID: 880

I'm facing a problem with my {product_purchased}. The {product_purchased} is not turning on. It was working fine until yesterday, but now it doesn't respond.

My first thought was to put it I'm experiencing this issue on multiple devices of the same model, so it seems to be a widespread problem.


Ticket ID: 42

I'm facing a problem with my {product_purchased}. The {product_purchased} is not turning on. It was working fine until yesterday, but now it doesn't respond.

If I'd just switched to a The issue I'm facing is intermittent. Sometimes it works fine, but other times it acts up unexpectedly.


Ticket ID: 255

I'm facing a problem with my {product_pu

# Prompt generation with context for Anthropic Opus LLM model

In [19]:
def prep_user_query(user_query, collection_name):
  """Answer a query with the Anthropic model, using retrieved tickets as context.

  Looks up tickets for ``user_query`` via ``query_collection``, formats them
  into a context string, and sends the query plus context to the Anthropic
  Messages API through the notebook-level ``client_anthropic``.

  Args:
    user_query: The question to answer; also used as the retrieval query.
    collection_name: The collection object to search (passed straight to
      ``query_collection``).

  Returns:
    The object returned by ``client_anthropic.messages.create``.
  """
  # Search the collection that was passed in, not the notebook-level global.
  response = query_collection(user_query, collection_name)

  # One "Ticket ID: <id>" header plus the ticket text per retrieved record.
  query_result = ''.join(
    'Ticket ID: ' + str(ticket) + '\n\n' + str(document) + '\n\n'
    for ticket, document in zip(response['ids'][0], response['documents'][0]))

  prompt = f"Answer this query : '{user_query}' with the following context: '{query_result}'"

  # Return the model's reply; without this the caller only ever receives None.
  return client_anthropic.messages.create(
    model='claude-3-opus-20240229',
    system='You are an assistant with access to customer support tickets data. Use the data you have to handle queries',
    messages=[{'role': 'user', 'content': prompt}],
    max_tokens=1000)

In [None]:
# Ask the question against the ticket collection; the cell displays whatever the call returns.
user_query = "issues related to not turning on"
prep_user_query(user_query=user_query, collection_name=cust_supp_tkt_collection)