In [116]:
import kaggle # type: ignore
import pandas as pd # type: ignore
import numpy as np # type: ignore
import zipfile
import re
from elasticsearch import Elasticsearch # type: ignore
from sentence_transformers import SentenceTransformer # type: ignore
from sklearn.metrics.pairwise import cosine_similarity # type: ignore
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.preprocessing import normalize

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shlok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [117]:
# Download dataset from Kaggle
kaggle.api.authenticate()
dataset = "harshitshankhdhar/imdb-dataset-of-top-1000-movies-and-tv-shows"
kaggle.api.dataset_download_files(dataset)

In [118]:
# Extract the downloaded dataset
with zipfile.ZipFile("imdb-dataset-of-top-1000-movies-and-tv-shows.zip", "r") as zip_ref:
    zip_ref.extractall(".")

In [119]:
# Read the dataset into a DataFrame
movies = pd.read_csv("imdb_top_1000.csv")

In [120]:
columns_to_remove = ['Poster_Link', 'Released_Year', 'Certificate', 'Runtime', 'IMDB_Rating', 'Meta_score', 'Star1', 'Star2', 'Star3', 'Star4', 'No_of_Votes', 'Gross']
movies = movies.drop(columns=columns_to_remove)

In [121]:
# Clean the text data
def clean_text(text):
    # Remov stop words
    new_words = word_tokenize(text)
 
# Remove stopwords using NLTK
    new_filtered_words = [
    word for word in new_words if word.lower() not in stopwords.words('english')]
 
# Join the filtered words to form a clean text
    text = ' '.join(new_filtered_words)
    # Remove non-alphanumeric characters and extra spaces
    text = re.sub(r"[^\w\s]", "", str(text))
    text = re.sub(r"\s+", " ", text)

    # Convert text to lowercase
    # text = text.lower()

    
    return text

In [122]:
# Apply clean_text function to the Overview column
#movies["Overview"] = movies["Overview"].fillna('')
movies["text"] = movies['Series_Title'] + ' ' + movies['Overview'] + ' ' + movies['Director'] + ' ' + movies['Genre']
movies["clean_text"] = movies["text"].apply(clean_text)

In [123]:
# Select columns needed for embedding
df = movies[['Series_Title', 'clean_text', 'Overview', 'Director']]

In [124]:
df

Unnamed: 0,Series_Title,clean_text,Overview,Director
0,The Shawshank Redemption,Shawshank Redemption Two imprisoned men bond n...,Two imprisoned men bond over a number of years...,Frank Darabont
1,The Godfather,Godfather organized crime dynasty s aging patr...,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola
2,The Dark Knight,Dark Knight menace known Joker wreaks havoc ch...,When the menace known as the Joker wreaks havo...,Christopher Nolan
3,The Godfather: Part II,Godfather Part II early life career Vito Corle...,The early life and career of Vito Corleone in ...,Francis Ford Coppola
4,12 Angry Men,12 Angry Men jury holdout attempts prevent mis...,A jury holdout attempts to prevent a miscarria...,Sidney Lumet
...,...,...,...,...
995,Breakfast at Tiffany's,Breakfast Tiffany s young New York socialite b...,A young New York socialite becomes interested ...,Blake Edwards
996,Giant,Giant Sprawling epic covering life Texas cattl...,Sprawling epic covering the life of a Texas ca...,George Stevens
997,From Here to Eternity,Eternity Hawaii 1941 private cruelly punished ...,"In Hawaii in 1941, a private is cruelly punish...",Fred Zinnemann
998,Lifeboat,Lifeboat Several survivors torpedoed merchant ...,Several survivors of a torpedoed merchant ship...,Alfred Hitchcock


In [125]:
# Initialize Elasticsearch client
es = Elasticsearch(
    "http://localhost:9200",
    http_auth=("shloka", "shloka"),
    ca_certs="/Users/shlok/Downloads/elasticsearch-8.13.0-windows-x86_64/elasticsearch-8.13.0/config/certs/http_ca.crt")

In [126]:
es.ping()

True

In [127]:
# Clear existing index if it exists
if es.indices.exists(index="my_movies"):
    es.indices.delete(index="my_movies")

In [128]:
model = SentenceTransformer("all-mpnet-base-v2")

In [129]:
df["Overview_Vector"] = df["clean_text"].apply(lambda x: model.encode(x))
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["Overview_Vector"] = df["clean_text"].apply(lambda x: model.encode(x))


Unnamed: 0,Series_Title,clean_text,Overview,Director,Overview_Vector
0,The Shawshank Redemption,Shawshank Redemption Two imprisoned men bond n...,Two imprisoned men bond over a number of years...,Frank Darabont,"[0.020018049, 0.051489376, 0.0037729605, 0.008..."
1,The Godfather,Godfather organized crime dynasty s aging patr...,An organized crime dynasty's aging patriarch t...,Francis Ford Coppola,"[-0.041141756, 0.1136787, 0.0030414062, 0.0858..."
2,The Dark Knight,Dark Knight menace known Joker wreaks havoc ch...,When the menace known as the Joker wreaks havo...,Christopher Nolan,"[0.029547436, 0.0630262, 0.00023984986, -0.036..."
3,The Godfather: Part II,Godfather Part II early life career Vito Corle...,The early life and career of Vito Corleone in ...,Francis Ford Coppola,"[-0.027511343, 0.11960185, 0.0061571123, 0.041..."
4,12 Angry Men,12 Angry Men jury holdout attempts prevent mis...,A jury holdout attempts to prevent a miscarria...,Sidney Lumet,"[0.021142999, 0.04061857, 0.013984885, 0.00039..."


In [130]:
record_list = df.to_dict("records")
'''
es_index = {
            "mappings": {
                "properties": {
                    "Series_Title": {"type":"text"},
                    "clean_text": {"type":"text"},
                    "Overview": {"type": "text"},
                    "Overview_Vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
                }
            }
        }
'''
for record in record_list:
    try:
        es.index(index="my_movies", document=record)
    except Exception as e:
        print(e)

In [131]:
record_list[0]

{'Series_Title': 'The Shawshank Redemption',
 'clean_text': 'Shawshank Redemption Two imprisoned men bond number years finding solace eventual redemption acts common decency Frank Darabont Drama',
 'Overview': 'Two imprisoned men bond over a number of years, finding solace and eventual redemption through acts of common decency.',
 'Director': 'Frank Darabont',
 'Overview_Vector': array([ 2.00180486e-02,  5.14893755e-02,  3.77296051e-03,  8.15074332e-03,
        -5.36081977e-02,  2.86651216e-02,  3.33252847e-02, -6.83311466e-03,
         5.61203770e-02, -5.38253551e-03,  9.88662895e-03,  2.79864185e-02,
         4.45968173e-02, -1.81307383e-02, -3.20574455e-02, -4.59328815e-02,
        -2.59905402e-02,  3.55043225e-02,  1.11568219e-03,  1.44372405e-02,
         4.99056019e-02,  8.43293965e-03,  2.23187841e-02, -2.12483276e-02,
         1.06953513e-02, -8.69359076e-03,  5.19052846e-03,  1.23281982e-02,
        -3.81824523e-02, -2.82907765e-02,  2.95834336e-03,  5.43193556e-02,
         4

In [132]:
inputKeyword = "Women in sports"
ip_word = clean_text(inputKeyword)
vectorOfInputKeyword = model.encode(ip_word)

# Compute cosine similarity between search query and all sentences 
#cos_sim = util.cos_sim(vectorOfInputKeyword, embeddings)

def calculate_cosine_similarity(input_vector, target_vector):
    return cosine_similarity([input_vector], [target_vector])[0][0]

def calculate_l2_norm(input_vector, target_vector):
    vector1 = np.array([input_vector])
    vector2 = np.array([target_vector])
    normalized_vector1 = normalize(vector1.reshape(1, -1))
    normalized_vector2 = normalize(vector2.reshape(1, -1))
    l2_norm = np.linalg.norm(normalized_vector1 - normalized_vector2)

    return l2_norm

# res = es.search(
#     index="my_movies",
#     body={
#         "query": {
#             "multi_match": {
#                 "query": ip_word,
#                 "fields": ["Series_Title", "Genre", "Overview", "Director", "clean_text"]
#             }
#         },
#         "size": 1000
#     }
# )
# res = es.search(
#     index="my_movies",
#     body={
#         "query": {
#             "knn": {
#                 "Overview_Vector": {
#                     "vector": ip_word,
#                     "k": 10
#                 }
#             }
#         }
#     }
# )

res = es.search(
    index="my_movies",
    body={
        
            "knn": {
                "field": "Overview_Vector",
                "query_vector": vectorOfInputKeyword,
                "k": "20",
            }
        
    }
)
hits = res["hits"]["hits"]

# Sort hits based on cosine similarity
hits_sorted = sorted(hits, key=lambda x: calculate_cosine_similarity(vectorOfInputKeyword, x["_source"]["Overview_Vector"]), reverse=True)
# hits_sorted = sorted(hits, key=lambda x: calculate_l2_norm(vectorOfInputKeyword, x["_source"]["Overview_Vector"]), reverse=True)

for hit in hits_sorted:
    series_title = hit["_source"]["Series_Title"]
    ct = hit["_source"]["Overview"]
    overview_vector = hit["_source"]["Overview_Vector"]
    director = hit["_source"]["Director"]
    similarity_score = calculate_cosine_similarity(vectorOfInputKeyword, overview_vector)
    print("Series Title:", series_title)
    print("Overview:", ct)
    print("Director:", director)
    #print("Cosine Similarity Score:", similarity_score)
    print()  # Add a newline for better readability

Series Title: Chak De! India
Overview: Kabir Khan is the coach of the Indian Women's National Hockey Team and his dream is to make his all girls team emerge victorious against all odds.
Director: Shimit Amin

Series Title: The Hustler
Overview: An up-and-coming pool player plays a long-time champion in a single high-stakes match.
Director: Robert Rossen

Series Title: Dangal
Overview: Former wrestler Mahavir Singh Phogat and his two wrestler daughters struggle towards glory at the Commonwealth Games in the face of societal oppression.
Director: Nitesh Tiwari

Series Title: Million Dollar Baby
Overview: A determined woman works with a hardened boxing trainer to become a professional.
Director: Clint Eastwood

Series Title: Remember the Titans
Overview: The true story of a newly appointed African-American coach and his high school team on their first season as a racially integrated unit.
Director: Boaz Yakin

Series Title: The Hurricane
Overview: The story of Rubin 'Hurricane' Carter, a 

  res = es.search(


In [18]:
'''
import requests
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Define Elasticsearch endpoint and authentication credentials
ELASTICSEARCH_URL = 'http://localhost:9200'
USERNAME = 'shloka'
PASSWORD = 'shloka'

# Initialize TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Input keyword
input_keyword = "world two"

# Elasticsearch search request
res = requests.post(
    f'{ELASTICSEARCH_URL}/my_movies/_search',
    json={
        "query": {
            "multi_match": {
                "query": input_keyword,
                "fields": ["Series_Title", "clean_text", "Overview"]
            }
        },
        "size": 2
    },
    auth=(USERNAME, PASSWORD)  # Provide authentication credentials
)

# Extract hits from Elasticsearch response
hits = res.json().get("hits", {}).get("hits", [])

# Fit TF-IDF vectorizer with overview texts
overview_texts = [hit["_source"]["Overview"] for hit in hits]
tfidf_vectorizer.fit(overview_texts)

# Function to calculate cosine similarity
def calculate_cosine_similarity(input_text, target_text):
    input_vector = tfidf_vectorizer.transform([input_text])
    target_vector = tfidf_vectorizer.transform([target_text])
    return cosine_similarity(input_vector, target_vector)[0][0]

# Sort hits based on cosine similarity
hits_sorted = sorted(hits, key=lambda x: calculate_cosine_similarity(input_keyword, x["_source"]["Overview"]), reverse=True)

# Display search results
for hit in hits_sorted:
    series_title = hit["_source"]["Series_Title"]
    clean_text = hit["_source"]["Overview"]
    similarity_score = calculate_cosine_similarity(input_keyword, clean_text)
    print("Series Title:", series_title)
    print("Overview:", clean_text)
    print("Cosine Similarity Score:", similarity_score)
    print()
'''

Series Title: The Perks of Being a Wallflower
Overview: An introvert freshman is taken under the wings of two seniors who welcome him to the real world
Cosine Similarity Score: 0.25511925032778854

Series Title: Stalag 17
Overview: When two escaping American World War II prisoners are killed, the German P.O.W. camp barracks black marketeer, J.J. Sefton, is suspected of being an informer.
Cosine Similarity Score: 0.22478822287203665



In [19]:
import nltk
from nltk.corpus import stopwords
 
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shlok\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True