# Setup Environment Variables

In [48]:
from dotenv import load_dotenv
load_dotenv()

True

# Load Dataset

In [160]:
import pandas as pd

In [277]:
df_summaries = pd.read_csv("./MovieSummaries/plot_summaries.txt", delimiter="\t", usecols=[0, 1], names=["id", "text"])

In [278]:
df_metadata = pd.read_csv("./MovieSummaries/movie.metadata.tsv", delimiter="\t", usecols=[0, 2, 3, 5], names=["id", "movie_title", "release_date", "runtime"])

In [230]:
df = pd.merge(df_metadata, df_summaries, how="inner", on="id")

In [231]:
df = df.fillna(-1)
df.text = df.text.astype(str)

# Convert Summaries to Embeddings

In [135]:
from sentence_transformers import SentenceTransformer
model = SentenceTransformer("all-MiniLM-L6-v2")

sending upsert requests: 100%|██████████| 42204/42204 [12:47<00:00, 55.01it/s]  
collecting async responses:   0%|          | 0/423 [11:58<?, ?it/s]


In [None]:
summary_embeddings = model.encode(df.text)

In [47]:
import numpy as np
np.save("./MovieSummaries/summary_embeddings.npy", np.array(summary_embeddings))
np.save("./MovieSummaries/summary_id.npy", np.array(df.id))


# Load Embeddings

In [12]:
import numpy as np

# Load Embeddings
summary_embeddings = np.load("./MovieSummaries/summary_embeddings.npy")
ids = np.load("./MovieSummaries/summary_id.npy", allow_pickle=True)

In [13]:
ids

array([  975900,  9363483,   261236, ..., 34980460,   913762, 12476867])

# Upload to Pinecone

In [49]:
import pinecone
import os
import time

index_name = "cmu-movie-summaries"

In [50]:
api_key = os.getenv("PINECONE_API_KEY")
env = os.getenv("PINECONE_ENV") # found next to api key

In [51]:
pinecone.init(api_key=api_key, environment=env)

In [279]:
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# create new index
pinecone.create_index(
    name=index_name,
    metric="cosine",
    dimension=summary_embeddings.shape[1]
)

while not pinecone.describe_index(index_name).status["ready"]:
    time.sleep(1)

In [280]:
index = pinecone.GRPCIndex(index_name)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 0}},
 'total_vector_count': 0}

In [287]:
import json

df_embedding = pd.DataFrame({
    "wiki_id": ids,
    "values": summary_embeddings.tolist(),
})

df_export = pd.merge(df_embedding, df, left_on="wiki_id", right_on="id", how="inner")

df_json_string = df_export[["release_date", "runtime", "text"]].to_json(orient="records")

df_export["metadata"] = json.loads(df_json_string)
df_export["id"] = df_export["movie_title"].str.encode(encoding="ascii", errors="ignore")


In [282]:
df_export[["id", "values", "metadata"]].head()

Unnamed: 0,id,values,metadata
0,b'Ghosts of Mars',"[-0.030300872400403023, 0.05613691732287407, -...","{'release_date': '2001-08-24', 'runtime': 98.0..."
1,b'White Of The Eye',"[-0.07510953396558762, 0.0031548410188406706, ...","{'release_date': '1987', 'runtime': 110.0, 'te..."
2,b'A Woman in Flames',"[-0.06540518254041672, 0.020126990973949432, -...","{'release_date': '1983', 'runtime': 106.0, 'te..."
3,"b""The Sorcerer's Apprentice""","[-0.019909139722585678, 0.07503096014261246, -...","{'release_date': '2002', 'runtime': 86.0, 'tex..."
4,b'Little city',"[-0.07269977033138275, -0.07025031000375748, 0...","{'release_date': '1997-04-04', 'runtime': 93.0..."


In [290]:
df_export[df_export["id"] == ""]

Unnamed: 0,wiki_id,values,id,movie_title,release_date,runtime,text,metadata


In [283]:
index.upsert_from_dataframe(df_export[["id", "values", "metadata"]], batch_size=100)

sending upsert requests: 100%|█████████▉| 42200/42203 [00:58<00:00, 2018.48it/s]

PineconeException: UNKNOWN:Error received from peer ipv4:34.160.88.44:443 {created_time:"2023-09-02T00:38:31.060764-04:00", grpc_status:3, grpc_message:"Vector ID must not be empty"}

sending upsert requests: 100%|██████████| 42203/42203 [01:16<00:00, 2018.48it/s]

In [289]:
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.39821,
 'namespaces': {'': {'vector_count': 39821}},
 'total_vector_count': 39821}

# Search

In [197]:
query = "cloud high above London"

In [198]:
query_embedding = model.encode(query)

In [199]:
res = index.query(
    vector=query_embedding,
    top_k=3,
)

In [200]:
res

{'matches': [{'id': 'Mary Poppins',
              'metadata': {},
              'score': 0.16821064,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': 'Henry V',
              'metadata': {},
              'score': 0.14026843,
              'sparse_values': {'indices': [], 'values': []},
              'values': []},
             {'id': "The Sorcerer's Apprentice",
              'metadata': {},
              'score': 0.08186696,
              'sparse_values': {'indices': [], 'values': []},
              'values': []}],
 'namespace': ''}

In [None]:
pinecone.delete_index(index_name)