In [1]:
import os
import base64
import boto3
import json
import pandas as pd
import requests
import urllib.request
from botocore.config import Config
from requests.auth import HTTPBasicAuth

In [27]:
# Collect the search results stored in every *.json file under movielens/.
# os.listdir() returns names in arbitrary order, so the list is sorted to keep the
# order of all_movie_data (and of everything derived from it) reproducible.
movie_files = sorted(f for f in os.listdir('movielens/') if f.endswith('.json'))
all_movie_data = []

for movie_file in movie_files:
    file_path = os.path.join('movielens/', movie_file)
    with open(file_path, 'r', encoding='utf-8') as f:
        payload = json.load(f)
    # Each file is read as {"data": {"searchResults": [...]}}; a missing or null
    # level contributes no results instead of raising.
    movie_info = (payload.get('data') or {}).get('searchResults') or []
    all_movie_data.extend(movie_info)

In [28]:
# One row per search result: keep only the nested 'movie' record of each entry.
movies_df = pd.DataFrame(
    [entry['movie'] for entry in all_movie_data]
)

In [29]:
import ssl
def save_image(image_url, directory, image_name, verify_ssl=False, timeout=30):
    """Download ``image_url`` and store it as ``directory/image_name``.

    Args:
        image_url: URL of the image to fetch.
        directory: Folder the image is written under.
        image_name: File name relative to ``directory``; leading '/' characters
            are stripped. May contain sub-folders, which are created as needed.
        verify_ssl: When True the server certificate is validated. Defaults to
            False to preserve this function's historical behaviour.
            NOTE(review): certificate checks are disabled by default, which allows
            man-in-the-middle tampering; pass verify_ssl=True wherever the local
            CA bundle works.
        timeout: Seconds to wait on the connection before giving up.

    Raises:
        urllib.error.URLError: if the download fails.
        OSError: if the target file cannot be written.
    """
    image_name = image_name.lstrip('/')
    full_image_path = os.path.join(directory, image_name)
    # Create the file's own parent folder (not just `directory`) so that an
    # image_name containing sub-folders can be written.
    parent_dir = os.path.dirname(full_image_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    ssl_context = ssl.create_default_context()
    if not verify_ssl:
        # Documented way to turn verification off; check_hostname has to be
        # cleared before verify_mode may be set to CERT_NONE.
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE

    # Finish the download before opening the target, so a failed request does
    # not leave an empty file behind.
    with urllib.request.urlopen(image_url, timeout=timeout, context=ssl_context) as response:
        image_data = response.read()
    with open(full_image_path, 'wb') as image_file:
        image_file.write(image_data)

In [20]:
# Download the w500 poster of every movie into images/.
for _, movie in movies_df.iterrows():
    poster_path = movie['posterPath']
    # A row without a poster holds NaN/None here; concatenating that to a str
    # raises TypeError, so skip it rather than abort the whole loop.
    if not isinstance(poster_path, str) or not poster_path:
        continue
    # Strip any leading '/' so the URL does not end up with '//' after the size
    # segment (save_image strips it from the file name in the same way).
    poster_url = 'https://image.tmdb.org/t/p/w500/' + poster_path.lstrip('/')
    save_image(poster_url, 'images/', poster_path)

In [30]:
# botocore settings shared by the Bedrock client: SigV4-signed requests against
# us-west-2, retried in 'standard' mode with max_attempts set to 10.
config = Config(
    region_name='us-west-2',
    signature_version='v4',
    retries=dict(max_attempts=10, mode='standard'),
)

# Runtime client used to invoke the embedding model.
bedrock_client = boto3.client("bedrock-runtime", config=config)

In [24]:
# Make sure both working folders exist before anything is read from or written to them.
for output_dir in ('images/', 'embeddings/'):
    os.makedirs(output_dir, exist_ok=True)

def get_embedding(image_path, title=None):
    """Request an embedding for an image, optionally combined with a text title.

    The image is sent base64-encoded as ``inputImage``; when ``title`` is truthy it
    is added as ``inputText``. The request goes to the
    ``amazon.titan-embed-image-v1`` model through ``bedrock_client``.

    Args:
        image_path: Path of the image file to embed.
        title: Optional text to embed together with the image.

    Returns:
        A ``(vector_json, image_name)`` tuple: the parsed JSON response body and
        the image file name up to its first '.'.
    """
    with open(image_path, "rb") as image_file:
        input_image = base64.b64encode(image_file.read()).decode('utf8')

    body = {"inputImage": input_image}
    if title:
        body["inputText"] = title

    # The notebook creates the runtime client as `bedrock_client`; the former
    # `bedrock_runtime` name is never defined and raised NameError on a fresh kernel.
    response = bedrock_client.invoke_model(
        body=json.dumps(body),
        modelId="amazon.titan-embed-image-v1",
        accept="application/json",
        contentType="application/json"
    )

    vector_json = json.loads(response['body'].read().decode('utf8'))
    # Keep the same stem rule that generate_document uses to find the file again.
    image_name = image_path.split("/")[-1].split(".")[0]

    return vector_json, image_name


In [25]:
# Embed every poster twice (image only, and image + title) and store each response
# under embeddings/. Iterates movies_df: no frame named `df` exists at this point
# on a fresh kernel.
for index, row in movies_df.iterrows():
    poster_path = row['posterPath']
    # Rows without a poster (NaN/None) have no image file to embed.
    if not isinstance(poster_path, str) or not poster_path:
        continue
    # Same location save_image writes to: images/ + name without its leading '/'.
    image_path = os.path.join('images/', poster_path.lstrip('/'))

    # '' -> image-only embedding; 'with_title_' -> image combined with the title.
    for prefix, title in (('', None), ('with_title_', row['title'])):
        vector_json, image_name = get_embedding(image_path, title)
        with open('embeddings/' + prefix + image_name + '.json', 'w') as f:
            json.dump(vector_json, f)


In [None]:
# Trimmed copy of the movie table. It is derived from movies_df rather than from
# `df` itself, so the cell works on a fresh kernel and can be re-run without
# raising KeyError for columns that were already dropped.
df = movies_df.drop(columns=['dvdReleaseDate', 'backdropPaths', 'youtubeTrailerIds', 'numRatings', 'avgRating'])

In [16]:
# Connection settings for the search cluster used by the cells below.
elastic_url = "https://127.0.0.1:9200"
user = "admin"
# Take the password from the OPENSEARCH_PASSWORD environment variable when it is
# set, so the real secret never has to be saved in the notebook; otherwise fall
# back to the placeholder, which must be edited by hand.
passwd = os.environ.get("OPENSEARCH_PASSWORD", "<YOUR_PASSWORD_HERE>")

# Requests are sent with verify=False, which makes urllib3 warn on every call;
# silence those warnings.
requests.packages.urllib3.disable_warnings()

try:
    # Without a timeout an unresponsive host would block the kernel indefinitely.
    health_response = requests.get(
        f"{elastic_url}/_cluster/health",
        auth=HTTPBasicAuth(user, passwd),
        verify=False,
        timeout=10,
    )
    print("Cluster Health Response:", health_response.status_code)
    print(health_response.json())
except requests.ConnectionError as conn_err:
    print(f"Connection failed: {conn_err}")
except requests.Timeout as timeout_err:
    print(f"Connection timed out: {timeout_err}")


Cluster Health Response: 200
{'cluster_name': 'opensearch-cluster', 'status': 'green', 'timed_out': False, 'number_of_nodes': 2, 'number_of_data_nodes': 2, 'discovered_master': True, 'discovered_cluster_manager': True, 'active_primary_shards': 5, 'active_shards': 10, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 0, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 100.0}


In [17]:
# Remove any existing copy of the index before it is (re)created. The response
# body is printed as-is, so a missing index simply shows the server's error JSON.
delete_response = requests.delete(
    f"{elastic_url}/multi-modal-embedding-index",
    auth=HTTPBasicAuth(user, passwd),
    verify=False,
)
print(delete_response.text)



{"error":{"root_cause":[{"type":"index_not_found_exception","reason":"no such index [multi-modal-embedding-index]","index":"multi-modal-embedding-index","resource.id":"multi-modal-embedding-index","resource.type":"index_or_alias","index_uuid":"_na_"}],"type":"index_not_found_exception","reason":"no such index [multi-modal-embedding-index]","index":"multi-modal-embedding-index","resource.id":"multi-modal-embedding-index","resource.type":"index_or_alias","index_uuid":"_na_"},"status":404}


In [19]:
# Index definition: k-NN enabled, one 1024-dimension vector field for the embedding
# plus the movie metadata fields copied into each document.
# NOTE(review): 1024 has to equal the length of the stored 'embedding' vectors — confirm.
mapping = {
    "settings": {"index.knn": True},
    "mappings": {
        "properties": {
            "titan_multimodal_embedding": {"type": "knn_vector", "dimension": 1024},
            # Full-text fields.
            **{field: {"type": "text"} for field in ("title", "plotSummary")},
            # Exact-match identifiers.
            **{field: {"type": "keyword"} for field in ("movieId", "imdbMovieId")},
            "posterPath": {"type": "text"},
        }
    }
}


In [20]:
# Create the index with the mapping defined above. Uses the connection settings
# the notebook actually defines (elastic_url / user / passwd); base_url, username
# and password are never assigned.
response = requests.put(
    f"{elastic_url}/multi-modal-embedding-index",
    auth=HTTPBasicAuth(user, passwd),
    verify=False,
    json=mapping,
)
# Fail loudly if the index was not created, instead of letting later documents be
# indexed without this mapping.
response.raise_for_status()

In [21]:
def generate_document(row_data):
    """Build the index document for one movie row.

    Loads ``embeddings/with_title_<stem>.json``, where ``<stem>`` is the last path
    segment of ``row_data['posterPath']`` up to its first '.', and combines its
    ``'embedding'`` value with the row's metadata fields.

    Args:
        row_data: Mapping-like row providing 'posterPath', 'title', 'plotSummary',
            'movieId' and 'imdbMovieId'.

    Returns:
        dict with 'titan_multimodal_embedding' followed by the five metadata fields.
    """
    poster_file = row_data['posterPath'].split('/')[-1]
    poster_stem = poster_file.split('.')[0]
    with open(f"embeddings/with_title_{poster_stem}.json") as file:
        embedding_data = json.load(file)

    document = {"titan_multimodal_embedding": embedding_data['embedding']}
    # Metadata is copied through unchanged, in the order the index mapping lists it.
    for field in ("title", "plotSummary", "movieId", "imdbMovieId", "posterPath"):
        document[field] = row_data[field]
    return document

In [31]:
# Index one document per movie. Uses the names the notebook actually defines:
# movies_df for the rows and elastic_url / user / passwd for the connection
# (df, base_url, username and password are undefined on a fresh kernel).
for idx, row_data in movies_df.iterrows():
    # generate_document derives the embedding file name from posterPath; rows
    # without one (NaN/None) have nothing to index.
    if not isinstance(row_data['posterPath'], str) or not row_data['posterPath']:
        continue
    doc = generate_document(row_data)
    response = requests.post(
        f"{elastic_url}/multi-modal-embedding-index/_doc",
        auth=HTTPBasicAuth(user, passwd),
        verify=False,
        json=doc,
    )
    # Stop at the first rejected document instead of silently dropping it.
    response.raise_for_status()