## Setup

In [1]:
import os
from dotenv import load_dotenv
from elasticsearch import Elasticsearch

load_dotenv()

client = Elasticsearch(
    "https://my-elasticsearch-project-eda2fa.es.us-east-1.aws.elastic.cloud:443",
    api_key=os.getenv("ELASTIC_SEARCH_IMDB")
)
index_name = "imdb-vdb"

mappings = {
    "properties": {
        "vector": {
            "type": "dense_vector",
            "dims": 3
        },
        "text": {
            "type": "text"
        }
    }
}
mapping_response = client.indices.put_mapping(index=index_name, body=mappings)

print(mapping_response)

{'acknowledged': True}


# Data dump

In [2]:
import os
import pandas as pd
from dotenv import load_dotenv
from elasticsearch import Elasticsearch, helpers

load_dotenv()

# Connect to Elasticsearch
client = Elasticsearch(
    "https://my-elasticsearch-project-eda2fa.es.us-east-1.aws.elastic.cloud:443",
    api_key=os.getenv("ELASTIC_SEARCH_IMDB")
)

index_name = "imdb-vdb"

# Ensure the index exists
if not client.indices.exists(index=index_name):
    client.indices.create(index=index_name)

# Load Data
df = pd.read_csv("C:/Users/saisu/Documents/Learning/RAG_project_imdb/data/imdb_top_1000.csv")

# Clean NaN values
df.fillna("", inplace=True)

# Convert DataFrame rows to Elasticsearch-compatible dictionaries
documents = [
    {
        "_index": index_name,
        "_id": row["Series_Title"],  # Use title as metadata (ID)
        "_source": {
            "released_year": row["Released_Year"],  # Updated field names
            "certificate": row["Certificate"],
            "runtime": row["Runtime"],
            "genre": row["Genre"],
            "imdb_rating": row["IMDB_Rating"],
            "overview": row["Overview"][:1000],  # Truncate long text
            "meta_score": row["Meta_score"],
            "director": row["Director"],
            "star1": row["Star1"],
            "star2": row["Star2"],
            "star3": row["Star3"],
            "star4": row["Star4"],
            "no_of_votes": row["No_of_Votes"],
            "gross": row["Gross"]
        }
    }
    for _, row in df.iterrows()
]

# Bulk Insert with Error Handling
try:
    bulk_response = helpers.bulk(client, documents)
    print("Bulk Insertion Completed:", bulk_response)
except helpers.BulkIndexError as e:
    print("BulkIndexError:", e.errors)  # Print specific errors for debugging


  df.fillna("", inplace=True)


Bulk Insertion Completed: (1000, [])


## Data Access

In [4]:
from elasticsearch import Elasticsearch

client = Elasticsearch(
    "https://my-elasticsearch-project-eda2fa.es.us-east-1.aws.elastic.cloud:443",
    api_key=os.getenv("ELASTIC_SEARCH_IMDB")
)

index_name = "imdb-vdb"

# Fetch all documents
response = client.search(index=index_name, body={"query": {"match_all": {}}}, size=10)

for hit in response["hits"]["hits"]:
    print(hit["_source"])


  response = client.search(index=index_name, body={"query": {"match_all": {}}}, size=10)


{'title': 'The Dark Knight', 'released_year': '2008', 'certificate': 'UA', 'runtime': '152 min', 'genre': 'Action, Crime, Drama', 'imdb_rating': 9.0, 'overview': 'When the menace known as the Joker wreaks havoc and chaos on the people of Gotham, Batman must accept one of the greatest psychological and physical tests of his ability to fight injustice.', 'meta_score': 84.0, 'director': 'Christopher Nolan', 'star1': 'Christian Bale', 'star2': 'Heath Ledger', 'star3': 'Aaron Eckhart', 'star4': 'Michael Caine', 'no_of_votes': 2303232, 'gross': '534,858,444'}
{'title': "Schindler's List", 'released_year': '1993', 'certificate': 'A', 'runtime': '195 min', 'genre': 'Biography, Drama, History', 'imdb_rating': 8.9, 'overview': 'In German-occupied Poland during World War II, industrialist Oskar Schindler gradually becomes concerned for his Jewish workforce after witnessing their persecution by the Nazis.', 'meta_score': 94.0, 'director': 'Steven Spielberg', 'star1': 'Liam Neeson', 'star2': 'Ralph

In [10]:
query = "Inception"

search_body = {
    "query": {
        "match": {
            "title": query  # Full-text search on movie title
        }
    }
}

response = client.search(index=index_name, body=search_body)

for hit in response["hits"]["hits"]:
    print(hit["_source"])


{'title': 'Inception', 'released_year': '2010', 'certificate': 'UA', 'runtime': '148 min', 'genre': 'Action, Adventure, Sci-Fi', 'imdb_rating': 8.8, 'overview': 'A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O.', 'meta_score': 74.0, 'director': 'Christopher Nolan', 'star1': 'Leonardo DiCaprio', 'star2': 'Joseph Gordon-Levitt', 'star3': 'Elliot Page', 'star4': 'Ken Watanabe', 'no_of_votes': 2067042, 'gross': '292,576,195'}


In [6]:
search_body = {
    "query": {
        "bool": {
            "must": [
                {"match": {"released_year": "2010"}},
                {"match": {"genre": "Sci-Fi"}}
            ]
        }
    }
}

response = client.search(index=index_name, body=search_body)

for hit in response["hits"]["hits"]:
    print(hit["_source"])


{'title': 'Inception', 'released_year': '2010', 'certificate': 'UA', 'runtime': '148 min', 'genre': 'Action, Adventure, Sci-Fi', 'imdb_rating': 8.8, 'overview': 'A thief who steals corporate secrets through the use of dream-sharing technology is given the inverse task of planting an idea into the mind of a C.E.O.', 'meta_score': 74.0, 'director': 'Christopher Nolan', 'star1': 'Leonardo DiCaprio', 'star2': 'Joseph Gordon-Levitt', 'star3': 'Elliot Page', 'star4': 'Ken Watanabe', 'no_of_votes': 2067042, 'gross': '292,576,195'}


In [7]:
search_body = {
    "query": {"match_all": {}},
    "sort": [{"imdb_rating": "desc"}],
    "size": 10
}

response = client.search(index=index_name, body=search_body)

for hit in response["hits"]["hits"]:
    print(hit["_source"]["title"], " - IMDb Rating:", hit["_source"]["imdb_rating"])


The Shawshank Redemption  - IMDb Rating: 9.3
The Godfather  - IMDb Rating: 9.2
The Dark Knight  - IMDb Rating: 9.0
The Godfather: Part II  - IMDb Rating: 9.0
12 Angry Men  - IMDb Rating: 9.0
Schindler's List  - IMDb Rating: 8.9
Pulp Fiction  - IMDb Rating: 8.9
The Lord of the Rings: The Return of the King  - IMDb Rating: 8.9
The Lord of the Rings: The Fellowship of the Ring  - IMDb Rating: 8.8
Forrest Gump  - IMDb Rating: 8.8


In [8]:
search_body = {
    "query": {
        "fuzzy": {
            "title": {
                "value": "Interstelar",
                "fuzziness": "AUTO"
            }
        }
    }
}

response = client.search(index=index_name, body=search_body)

for hit in response["hits"]["hits"]:
    print(hit["_source"]["title"])


Interstellar
