In [32]:
!pip install whoosh




In [33]:
!pip install elasticsearch




In [34]:
# Importar las bibliotecas necesarias
import pandas as pd
import re
from collections import defaultdict
from whoosh.fields import Schema, TEXT
from whoosh.index import create_in, open_dir
from whoosh.qparser import QueryParser
from whoosh import index
import os
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk

# Location of the movie-plots CSV; change this if the file lives in another directory.
file_path = '/content/wiki_movie_plots_deduped.csv'

# Read the CSV, keep only the two columns used downstream, and drop rows missing either one.
data = pd.read_csv(file_path).loc[:, ['Title', 'Plot']].dropna()

# Text normalization: lowercase, then strip every character that is not a-z, 0-9 or whitespace
def normalize_text(text):
    """Return ``text`` lowercased, with all characters outside a-z, 0-9 and whitespace removed.

    Removed characters are deleted, not replaced by a space, so "time-machine"
    becomes "timemachine".
    """
    disallowed = re.compile(r'[^a-z0-9\s]')
    return disallowed.sub('', text.lower())

# Replace each plot with its normalized form so the indexes are built from normalized text.
data['Plot'] = data['Plot'].map(normalize_text)


In [35]:

# Inverted-index construction
def build_manual_index(data):
    """Build an inverted index mapping each word to the titles whose plot contains it.

    Args:
        data: DataFrame with 'Title' and 'Plot' columns. Each plot is split on
            whitespace to obtain its words.

    Returns:
        defaultdict(list): word -> list of titles, in row order. Each row
        contributes its title at most once per word, even when the word occurs
        several times in that plot.
    """
    postings = defaultdict(list)
    # Iterating the two columns directly avoids building a Series per row (iterrows).
    for title, plot in zip(data['Title'], data['Plot']):
        # dict.fromkeys de-duplicates the words of this plot while keeping
        # first-occurrence order, so a repeated word does not append the title again.
        for word in dict.fromkeys(plot.split()):
            postings[word].append(title)
    return postings

# Query normalization. It applies the same rules used on the plots before indexing
# (lowercase, then drop everything except a-z, 0-9 and whitespace) so that query
# terms can match indexed terms; customize here if you add stemming or lemmatization.
def normalize_text(text):
    """Return ``text`` lowercased, with all characters outside a-z, 0-9 and whitespace removed."""
    return re.sub(r'[^a-z0-9\s]', '', text.lower())

# Query function for the manual inverted index
def search_manual_index(query, index):
    """Return the set of titles whose plot contains every word of ``query``.

    Args:
        query: free-text query; it is normalized with ``normalize_text`` and
            split on whitespace.
        index: mapping word -> iterable of titles (for example the result of
            ``build_manual_index``). A plain dict works as well as a defaultdict.

    Returns:
        set: titles present in the postings of all query words. An empty set is
        returned when the query has no words or when any word is not indexed.
    """
    words = normalize_text(query).split()
    if not words:
        # An empty or whitespace-only query matches nothing (previously an IndexError).
        return set()

    # .get() is used instead of index[word]: subscripting a defaultdict would
    # insert an empty entry for every unknown query word and grow the index.
    results = set(index.get(words[0], ()))
    for word in words[1:]:
        if not results:
            break  # the intersection cannot become non-empty again
        results.intersection_update(index.get(word, ()))
    return results


In [36]:
# Build the inverted index from the loaded data.
# NOTE: this rebinds the module-level name `index`, which until now referred to the
# `index` module brought in by `from whoosh import index`.
index = build_manual_index(data)

# Sanity check: report how many distinct words the index contains.
print(f"Índice invertido construido con {len(index)} palabras.")


Índice invertido construido con 181327 palabras.


In [37]:
# Example query 1: single-word lookup in the manual inverted index.
query = "man"
print("Películas encontradas:", search_manual_index(query, index))




In [38]:
# Example query 2: two words; search_manual_index intersects the postings of each word,
# so only titles found under every query word are returned.
query = "time machine"
print("Películas encontradas:", search_manual_index(query, index))


Películas encontradas: {'Cooliekkaran', 'The Big Parade', 'The Guardian', 'ABBA: The Movie', 'Honey, I Blew Up the Kid', 'Minions', 'Gallipoli', 'Go Tell the Spartans', 'The Invisible Woman', 'Judge Dredd', 'Robot', 'Dust', 'Lost In Space', 'A Sound of Thunder', 'Feast of July', 'Dr. Who and the Daleks', ' Dressmaker, TheThe Dressmaker', 'One Hour Photo', 'Journey 2: The Mysterious Island', 'Poketto Monsutā Daiyamondo to Pāru Diaruga VS Parukia VS Dākurai', "Everybody's Fine", "Spirit of '76", 'Dark City', 'Dimension 5', 'Dragon Ball Z: Fusion Reborn', 'Charlotte Gray', 'Miracle Mile', 'The Secret of the Sword', 'Summer Time Machine Blues', 'Mysterians !The Mysterians', 'The Bridge at Remagen', 'The Amazing Transparent Man', 'Puppet Master X: Axis Rising', 'The Jackal', 'Easy Come, Easy Go', 'Indru Netru Naalai', 'Miracles', 'Dennis the Menace Strikes Again', 'Toy Soldiers', 'Paprika', 'Aliens in the Attic', 'Flags of Our Fathers', 'Metalstorm: The Destruction of Jared-Syn', 'Dead Leav

In [44]:
# Example query 3: another single-word lookup in the manual inverted index.
query = "love"
print("Películas encontradas:", search_manual_index(query, index))




In [42]:
# Whoosh index setup: Title and Plot are both indexed as text and stored with each document.
schema = Schema(Title=TEXT(stored=True), Plot=TEXT(stored=True))
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair and is safe to re-run.
os.makedirs("indexdir2", exist_ok=True)

ix = create_in("indexdir2", schema)

# Add every movie to the index. Used as a context manager, the writer commits when
# the block finishes and cancels (releasing the index write lock) if an exception
# is raised, so a failed run does not leave the index locked.
with ix.writer() as writer:
    for title, plot in zip(data['Title'], data['Plot']):
        writer.add_document(Title=title, Plot=plot)

# Query the Whoosh index on the Plot field and print the title of each hit returned.
with ix.searcher() as searcher:
    query = QueryParser("Plot", ix.schema).parse("love")
    results = searcher.search(query)
    for result in results:
        print(result['Title'])

Azhagai Irukkirai Bayamai Irukkirathu
Hai Ram Charan
Orange
Neenade Naa
Innisai Mazhai
Kodanda Ramudu
Iru Mugan
Strange Magic
Pooveli
Dil Vil Pyar Vyar


In [43]:
# Create the Whoosh index used by search_whoosh_index (directory "indexdir3").
schema = Schema(Title=TEXT(stored=True), Plot=TEXT(stored=True))
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair and is safe to re-run.
os.makedirs("indexdir3", exist_ok=True)

ix = create_in("indexdir3", schema)

# Add every movie to the index. Used as a context manager, the writer commits when
# the block finishes and cancels (releasing the index write lock) if an exception
# is raised, so a failed run does not leave the index locked.
with ix.writer() as writer:
    for title, plot in zip(data['Title'], data['Plot']):
        writer.add_document(Title=title, Plot=plot)

# Whoosh search helper with a caller-controlled result limit
def search_whoosh_index(query, index_dir="indexdir3", limit=None):
    """Search the 'Plot' field of the Whoosh index stored in ``index_dir``.

    Args:
        query: query string, parsed with QueryParser against the index schema.
        index_dir: directory containing the Whoosh index to open.
        limit: forwarded to ``searcher.search`` as its ``limit`` argument.

    Returns:
        list: the stored 'Title' value of each hit, in the order returned by the search.
    """
    from whoosh.index import open_dir

    whoosh_index = open_dir(index_dir)
    parsed_query = QueryParser("Plot", whoosh_index.schema).parse(query)
    with whoosh_index.searcher() as searcher:
        # Stored fields must be read while the searcher is still open.
        titles = []
        for hit in searcher.search(parsed_query, limit=limit):
            titles.append(hit['Title'])
    return titles

# Compare the results of the manual inverted index with those of Whoosh
query = "man"

# Lookup in the manual inverted index
manual_results = search_manual_index(query, index)
print("Resultados del índice invertido manual:", manual_results)

# Lookup in Whoosh, capped at the number of manual matches. Whoosh rejects a limit
# below 1, so fall back to None (no cap) when the manual search found nothing.
whoosh_results = search_whoosh_index(query, limit=len(manual_results) or None)
print("Resultados en Whoosh:", whoosh_results)

