In [1]:
import boto3
import csv
import io
from pinecone import Pinecone
from tqdm import tqdm
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from compete_api_keys import *

# Connecting to Pinecone (and your 3rd party DB), then accessing data in S3

In [4]:
pc = Pinecone(api_key=PINECONE_KEY)
index = pc.Index("wiki-data-pod")

In [2]:
mongo_conn_str = f'mongodb+srv://{MONGO_USER}:{MONGO_PASSWORD}@mongo-cluster.o2nld2l.mongodb.net/'
client = MongoClient(mongo_conn_str, server_api=ServerApi('1'))

try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
    db = client.wiki_data_db
    collection = db.wiki_data_collection
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [7]:
s3_client = boto3.client("s3", aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, aws_session_token=AWS_SESSION_TOKEN)
bucket_name = "wikipedia-video-game-data"
object_key = "video-game-embeddings(1).csv"

obj = s3_client.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read().decode('utf-8')

In [16]:
lines = io.StringIO(data)
csv_reader = csv.reader(lines)

# Upsert the data into your "Pod"

This took 45 min for 23k iterations!

In [32]:
for row in tqdm(csv_reader):
    try:
        id, url, paragraph, vector_str = row
        vector = eval(vector_str)  # Convert string representation of list to actual list
        # Ensure all elements of the vector are floats
        vector = [float(v) for v in vector]
        # Prepare the pinecone_vector with the modified float type vector
        pinecone_vector = (id, vector, {"url": url, "paragraph": paragraph})
        # Upsert the single item; consider batching upserts for efficiency
        index.upsert(vectors=[pinecone_vector])
    except ValueError as e:
        print(f"Failed to parse row: {id}, {url}")
        continue

9988it [12:14, 13.60it/s]


In [34]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.23896,
 'namespaces': {'': {'vector_count': 23896}},
 'total_vector_count': 23896}

# Perform your "Hybrid" Search!

In [5]:
# I want to know information about Rocket League and its relationship to AAA games! A good proxy for this paragraph is the vector with ID '2251799813701581'.

vector_search = index.query(id="2251799813701581", top_k=200, include_metadata=True)

def compute_custom_score(paragraph, pinecone_score, wei):
    # Basic example: increase score if "AAA games" is mentioned in the paragraph
    keyword_bonus = 1.0 if "AAA games" in paragraph else 0.0
    return (1-wei)*pinecone_score + wei*keyword_bonus

filtered_results = []
for result in vector_search["matches"]:
    # Use result["id"] to fetch the paragraph from MongoDB
    document = collection.find_one({"_id": str(result["id"])})
    if document is not None:
        paragraph = document["paragraph"]
        pinecone_score = result["score"]
        # Compute the custom score using the function defined above
        custom_score = compute_custom_score(paragraph, pinecone_score, wei=0.3)
        # Append results including the custom score
        filtered_results.append((result["id"], paragraph, custom_score))
    else:
        print(f"No document found for _id: {result['id']}")

# Close the MongoDB client
client.close()

In [6]:
# Sort filtered_results based on the custom score in descending order and limit to top 5
filtered_results.sort(key=lambda x: x[2], reverse=True)
top_5_results = filtered_results[:5]

# Print the top 5 results
for result_id, paragraph, score in top_5_results:
    print(f"ID: {result_id}, Score: {score}, \n Paragraph: {paragraph} \n")

ID: 2251799813701581, Score: 0.9992272, 
 Paragraph: Psyonix's team were aware of past difficulties that they had with Battle-Cars and other racing games with online play and client-side prediction, and the issues that would arise from that with Rocket League's fast-paced play style. To solve this, the physics in the game are based on using the Bullet physics engine within the Unreal Engine 3's PhysX engine, which tracks the movement of all the cars and actors, allowing them to periodically re-synchronize the game state across players based on the stored physics states, which enabled players to have quick reactions from their client. At the time of Battle-Cars, Psyonix could not afford a dedicated server network and were forced to rely on individual hosts, which could lead to poor performance with slow Internet connections. With Rocket League, Psyonix was able to put a dedicated server network in place, writing their own service protocols to interface with Sony's and Valve's online ser