In [6]:
import boto3
import pandas as pd
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from compete_api_keys import *
from tqdm import tqdm
import csv
import io

In [3]:
s3_client = boto3.client("s3", aws_access_key_id=AWS_ACCESS_KEY_ID, aws_secret_access_key=AWS_SECRET_ACCESS_KEY, aws_session_token=AWS_SESSION_TOKEN)
bucket_name = "wikipedia-video-game-data"
object_key = "video-game-embeddings(1).csv"

obj = s3_client.get_object(Bucket=bucket_name, Key=object_key)
data = obj['Body'].read().decode('utf-8')

In [8]:
lines = io.StringIO(data)
csv_reader = csv.reader(lines)

In [4]:
mongo_conn_str = f'mongodb+srv://{MONGO_USER}:{MONGO_PASSWORD}@mongo-cluster.o2nld2l.mongodb.net/'
client = MongoClient(mongo_conn_str, server_api=ServerApi('1'))

try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
    db = client.wiki_data_db
    collection = db.wiki_data_collection
except Exception as e:
    print(e)

Pinged your deployment. You successfully connected to MongoDB!


In [9]:
for row in tqdm(csv_reader):
    try:
        id, url, paragraph, vector_str = row
        vector = eval(vector_str)  # Convert string representation of list to actual list
        vector = [float(v) for v in vector]  # Ensure all elements of the vector are floats
        # Prepare the document for MongoDB insertion
        mongo_document = {
            "_id": id,
            "url": url,
            "paragraph": paragraph,
            "vector": vector
        }
        # Insert the document into MongoDB; consider using `insert_many` for batching inserts for efficiency
        collection.insert_one(mongo_document)
    except ValueError as e:
        continue

40027it [37:06, 17.98it/s]


In [None]:
collection.create_search_index(
    {"definition":
        {"mappings": {"dynamic": True, "fields": {
            "vector" : {
                "dimensions": 1536,
                "similarity": "cosine",
                "type": "knnVector"
                }}}},
     "name": "vector_index"
    }
)

In [26]:
query = collection.find_one({"_id": "2251799813701581"})
query_vector = query["vector"]

In [24]:
results = collection.aggregate([
    {
        '$vectorSearch': {
            "index": "vector_index",
            "path": "vector",
            "queryVector": query["vector"],
            "numCandidates": 50,
            "limit": 5,
        }
    }
])

<pymongo.command_cursor.CommandCursor object at 0x137639fd0>


In [34]:
pipeline = [
    {
        '$vectorSearch': {
            "index": "vector_index",  # The name of your vector index
            "path": "vector",  # The path to the vector field in your documents
            "queryVector": query_vector,
            "numCandidates": 200,  # Adjust the number of candidates as needed
            "limit": 200  # Limit the number of results as needed
        }
    },
    {
        '$addFields': {
            'keyword_bonus': {
                '$cond': {
                    'if': {'$regexMatch': {'input': "$paragraph", 'regex': "AAA games"}},
                    'then': 1.0,
                    'else': 0.0
                }
            }
        }
    },
    {
        '$addFields': {
            'custom_score': {
                '$add': [
                    {'$multiply': [{'$subtract': [1, 0.3]}, '$score']},  # Adjust weight as necessary
                    {'$multiply': [0.3, '$keyword_bonus']}  # Adjust weight as necessary
                ]
            }
        }
    },
    {
        '$project': {
            '_id': 1,
            'paragraph': 1,
            'custom_score': 1
        }
    },
    {'$sort': {'custom_score': -1}},  # Sort by custom score descending
    {'$limit': 5}  # Limit the results if needed
]

results = list(collection.aggregate(pipeline))

In [37]:
for result in results:
    print(result['_id'], '\n', result['paragraph'], '\n \n')

2251799813701597 
 Rocket League was officially announced as the sequel to Battle-Cars in February 2014. Building on the effects from the lack of marketing with Battle-Cars, Psyonix developed a different marketing approach to Rocket League. This included engaging with YouTube and Twitch video game streamers with early release copies to help spread the word, recognizing that clips from the game would be readily shared through social media. They also opened the game to early alpha and beta testing for several months following the game's announcement. Davis noted that they otherwise did not spend any money on traditional marketing approaches. 
 

2251799813701582 
 Psyonix had at one point considered having Rocket League as a free-to-play game with microtransactions, inspired by Team Fortress 2 and Dota 2's models. Though they had put in efforts to establish a free-to-play model, Psyonix decided instead to switch to a traditional sale method, and offer only cosmetic elements as downloadab