In [29]:
from elasticsearch import Elasticsearch

In [30]:
es = Elasticsearch(
    "https://aws-chatbot.es.us-central1.gcp.cloud.es.io",
    basic_auth=("elastic","YTEJkgvx3NX19njbGyDF40Lo")
   

)
es.ping()

True

### Prepare the data

In [31]:
import pandas as pd

df = pd.read_csv("../dataset/consolidated_data.csv").loc[:1500]
df.head()

Unnamed: 0,id,pattern,response
0,1,How can I run a web server on AWS?,"To run a web server on AWS, you can use Amazon..."
1,2,What AWS service should I use to host a scalab...,"For hosting scalable applications, consider us..."
2,3,How do I deploy a custom application on the cl...,"To deploy a custom application, use Amazon EC2..."
3,4,I need a dedicated environment for my applicat...,If you need dedicated resources for your appli...
4,5,What's the best way to get started with virtua...,Start with Amazon EC2. It offers a wide range ...


#### Check NA values


In [32]:
df.isna().value_counts()

id     pattern  response
False  False    False       1501
Name: count, dtype: int64

In [5]:
df.fillna("None", inplace=True)

### Convert the relevant field to Vector using BERT model

In [33]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-mpnet-base-v2')

In [34]:
df["ResponseVector"] = df["response"].apply(lambda x: model.encode(x))

In [35]:
df.head()

Unnamed: 0,id,pattern,response,ResponseVector
0,1,How can I run a web server on AWS?,"To run a web server on AWS, you can use Amazon...","[-0.00054931047, -0.05692641, -0.0006814774, 0..."
1,2,What AWS service should I use to host a scalab...,"For hosting scalable applications, consider us...","[-0.015269826, -0.012294186, -0.01100238, -0.0..."
2,3,How do I deploy a custom application on the cl...,"To deploy a custom application, use Amazon EC2...","[-0.012933653, -0.045452587, -0.0068273027, -0..."
3,4,I need a dedicated environment for my applicat...,If you need dedicated resources for your appli...,"[-0.009388072, -0.018510194, 0.0007667323, 0.0..."
4,5,What's the best way to get started with virtua...,Start with Amazon EC2. It offers a wide range ...,"[-0.059143938, -0.017358141, -0.0022847152, -0..."


In [36]:
es.ping()

True

### Create new index in ElasticSearch

In [37]:
from indexMapping import indexMapping

try:
    es.indices.create(index="all_patterns_v1", mappings=indexMapping) 
except Exception as e:
    pass

### Ingest the data into index

In [38]:
record_list = df.to_dict("records")

In [39]:
for record in record_list:
    try:
        es.index(index="all_patterns_1500", document=record, id=record["id"])
    except Exception as e:
        print(e)

In [40]:
es.count(index="all_patterns_1500")

ObjectApiResponse({'count': 1501, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

### Search the data

In [41]:
input_keyword = " Billing of Amazon EC2 systems begin and end?"
vector_of_input_keyword = model.encode(input_keyword)

query = {
    "field": "ResponseVector",
    "query_vector": vector_of_input_keyword,
    "k": 1,  # Set k to 1 to get only the top result
    "num_candidates": 1500,
}

res = es.knn_search(index="all_patterns_1500", knn=query, source=["pattern", "response"])
hits = res["hits"]["hits"]


if hits:
    best_match = hits[0]
    print("Best Matching Result:")
    print("Pattern:", best_match["_source"]["pattern"])
    print("Response:", best_match["_source"]["response"])
else:
    print("No matching results found.")


Best Matching Result:
Pattern: When does billing of my Amazon EC2 systems begin and end?
Response: Billing commences when Amazon EC2 initiates the boot sequence of an AMI instance. Billing ends when the instance terminates, which could occur through a web services command, by running "shutdown -h", or through instance failure. When you stop an instance, we shut it down but don't charge hourly usage for a stopped instance, or data transfer fees, but we do charge for the storage for any Amazon EBS volumes. To learn more, visit the AWS Documentation.


  res = es.knn_search(index="all_patterns_1500", knn=query, source=["pattern", "response"])


In [42]:
input_keyword = "Billing of Amazon EC2 systems begin and end?"
vector_of_input_keyword = model.encode(input_keyword)

query = {
    "field" : "ResponseVector",
    "query_vector" : vector_of_input_keyword,
    "k" : 3,
    "num_candidates" : 1500, 
}

res = es.knn_search(index="all_patterns_1500", knn=query , source=["pattern","response"])
res["hits"]["hits"]

  res = es.knn_search(index="all_patterns_1500", knn=query , source=["pattern","response"])


[{'_index': 'all_patterns_1500',
  '_id': '339',
  '_score': 0.90958804,
  '_ignored': ['response.keyword'],
  '_source': {'pattern': 'When does billing of my Amazon EC2 systems begin and end?',
   'response': 'Billing commences when Amazon EC2 initiates the boot sequence of an AMI instance. Billing ends when the instance terminates, which could occur through a web services command, by running "shutdown -h", or through instance failure. When you stop an instance, we shut it down but don\'t charge hourly usage for a stopped instance, or data transfer fees, but we do charge for the storage for any Amazon EBS volumes. To learn more, visit the AWS Documentation.'}},
 {'_index': 'all_patterns_1500',
  '_id': '340',
  '_score': 0.84812176,
  '_ignored': ['response.keyword'],
  '_source': {'pattern': 'What defines billable EC2 instance usage?',
   'response': 'Instance usages are billed for any time your instances are in a "running" state. If you no longer wish to be charged for your instance

In [43]:
rdf = df.sample(frac=0.01)

In [44]:
rdf.head()

Unnamed: 0,id,pattern,response,ResponseVector
268,269,Why should I use EFA?,"EFA brings the scalability, flexibility, and e...","[-0.008393067, -0.043162365, -0.02907724, 0.02..."
1261,1262,How do I know in which S3 Intelligent-Tiering ...,You can use Amazon S3 Inventory to report the ...,"[-0.008884707, -0.022729026, 0.010597382, 0.02..."
81,82,Will vCPU limits be available in all Regions?,vCPU-based instance limits are available in al...,"[-0.028409814, -0.057278637, -0.04101186, 0.01..."
794,795,How is AWS Lambda@Edge different from using AW...,The difference is that API Gateway and Lambda ...,"[0.012427445, -0.08264046, -0.05015783, -0.016..."
195,196,What are some of the ideal use cases for R6g i...,R6g instances deliver significant price perfor...,"[0.0057732402, -0.062491536, -0.02346015, 0.04..."


In [63]:
def search(input_keyword):
    # model = SentenceTransformer('all-mpnet-base-v2')
    # input_keyword = "Billing of Amazon EC2 systems begin and end?"
    vector_of_input_keyword = model.encode(input_keyword)

    query = {
        "field" : "ResponseVector",
        "query_vector" : vector_of_input_keyword,
        "k" : 3,
        "num_candidates" : 1500, 
    }

    res = es.knn_search(index="all_patterns_1500", knn=query , source=["pattern","response"])
    results = res["hits"]["hits"]

    return results




In [74]:
arr_of_actual_responses = rdf['response'].tolist()
arr_of_predicted_responses = []

for index, row in rdf.iterrows():
    result = search(row["pattern"])
    print(f"Pattern: {result[1]['_source']['pattern']} ")
    print(f"Response: {result[1]['_source']['response']}")
    arr_of_predicted_responses.append(result[1]['_source']['response'])

  res = es.knn_search(index="all_patterns_1500", knn=query , source=["pattern","response"])


Pattern: Why should I use EFA? 
Response: EFA brings the scalability, flexibility, and elasticity of cloud to tightly coupled HPC applications. With EFA, tightly coupled HPC applications have access to lower and more consistent latency and higher throughput than traditional TCP channels, enabling them to scale better. EFA support can be enabled dynamically, on-demand on any supported EC2 instance without pre-reservation, giving you the flexibility to respond to changing business/workload priorities.
Pattern: How do I get my data into S3 Intelligent-Tiering? 
Response: There are two ways to get data into S3 Intelligent-Tiering. You can directly PUT into S3 Intelligent-Tiering by specifying INTELLIGENT_TIERING in the x-amz-storage-class header or set lifecycle policies to transition objects from S3 Standard or S3 Standard-IA to S3 INTELLIGENT_TIERING.
Pattern: Will vCPU limits be available in all Regions? 
Response: vCPU-based instance limits are available in all commercial AWS Regions.


In [75]:
# print(rdf['response'].tolist())
print(arr_of_predicted_responses)

['EFA brings the scalability, flexibility, and elasticity of cloud to tightly coupled HPC applications. With EFA, tightly coupled HPC applications have access to lower and more consistent latency and higher throughput than traditional TCP channels, enabling them to scale better. EFA support can be enabled dynamically, on-demand on any supported EC2 instance without pre-reservation, giving you the flexibility to respond to changing business/workload priorities.', 'There are two ways to get data into S3 Intelligent-Tiering. You can directly PUT into S3 Intelligent-Tiering by specifying INTELLIGENT_TIERING in the x-amz-storage-class header or set lifecycle policies to transition objects from S3 Standard or S3 Standard-IA to S3 INTELLIGENT_TIERING.', 'vCPU-based instance limits are available in all commercial AWS Regions.', 'Lambda@Edge is optimized for latency-sensitive use cases where your end viewers are distributed globally. All the information you need to make a decision should be ava

In [83]:
def precision_for_k(arr_of_actual_responses, arr_of_predicted_responses, k):
    sum = 0

    for i in range(len(arr_of_actual_responses)-1):
        if (arr_of_actual_responses[i] == arr_of_predicted_responses[i]) :
            
            sum += 1
        # else:
            # print(f"{arr_of_actual_responses[i]} !=== {arr_of_predicted_responses[i]}")
    precision = sum / k if k > 0 else 0
    print(f"precision : {precision}")

k = len(rdf.index)
precision_for_k(arr_of_actual_responses, arr_of_predicted_responses, k)
    

precision : 0.2
