In [5]:
import pandas as pd 
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import faiss


In [16]:
# Show every row when a Series/DataFrame is displayed (the genre table is long).
pd.set_option('display.max_rows', None)

# Load the raw movie catalogue. The former mid-cell `df['genre'].value_counts()` was
# dropped: its result was neither displayed nor stored, so it only cost time.
df = pd.read_csv("movies_list.csv")

In [17]:
# Display how often each raw genre string occurs in the loaded catalogue.
df['genre'].value_counts()

genre
–¥—Ä–∞–º–∞                                 5773
–∫–æ–º–µ–¥–∏—è                               1796
–±–æ–µ–≤–∏–∫                                1684
–¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω—ã–π                        1603
–º—É–ª—å—Ç—Ñ–∏–ª—å–º—ã                           1502
–±–∏–æ–≥—Ä–∞—Ñ–∏—á–µ—Å–∫–∏–π                         803
–¥–µ—Ç–µ–∫—Ç–∏–≤                               777
—Ç—Ä–∏–ª–ª–µ—Ä                                441
–≤–æ–π–Ω–∞                                  378
–º–µ–ª–æ–¥—Ä–∞–º–∞                              206
—É–∂–∞—Å—ã                                  190
–ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è                            131
–≤–µ—Å—Ç–µ—Ä–Ω                                115
—Å–µ–º–µ–π–Ω—ã–π                                71
–º—É–∑—ã–∫–∞                                  69
–∫–æ—Ä–æ—Ç–∫–æ–º–µ—Ç—Ä–∞–∂–Ω—ã–π                        43
—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞                              32
–∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–∏–π                            23
–º—é–∑–∏–∫–ª                                  18
–º–∏—Å—Ç–∏

In [18]:
# Example mapping for replacing rare/outdated genres with broader ones.
# Keys are the genre names to replace, values are the genres they are folded into;
# genres that are not listed here are left unchanged by map_genres.
genre_mapping = {
    "–Ω—É–∞—Ä": "–¥—Ä–∞–º–∞",
    "–≤–µ—Å—Ç–µ—Ä–Ω": "–±–æ–µ–≤–∏–∫",
    "–º—é–∑–∏–∫–ª": "–∫–æ–º–µ–¥–∏—è",
    "–±–∏–æ–≥—Ä–∞—Ñ–∏—è": "–¥—Ä–∞–º–∞",
    "–∏—Å—Ç–æ—Ä–∏—á–µ—Å–∫–∏–π": "–¥—Ä–∞–º–∞",
    "—ç—Ä–æ—Ç–∏–∫–∞": "–¥—Ä–∞–º–∞",
    "–∞—Ä—Ç—Ö–∞—É—Å": "–¥—Ä–∞–º–∞",
    "–∫–æ–Ω—Ü–µ—Ä—Ç": "–º—É–∑—ã–∫–∞",
    '–º—É–∑—ã–∫–∞–ª—å–Ω—ã–π': '–º—É–∑—ã–∫–∞',
    "—Ç–æ–∫-—à–æ—É": "–¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω—ã–π",
    "—ç–∫—Å–ø–µ—Ä–∏–º–µ–Ω—Ç–∞–ª—å–Ω—ã–π": "–¥—Ä–∞–º–∞",
    "—Å–ø–æ—Ä—Ç–∏–≤–Ω—ã–π": "–¥—Ä–∞–º–∞",
    "—Å–µ–º–µ–π–Ω—ã–π": "–ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è",
    "–≤–æ–µ–Ω–Ω—ã–π": "–≤–æ–π–Ω–∞",
    '—Ä–µ–∞–ª–∏—Ç–∏-—à–æ—É': '–¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω—ã–π',
    "–∏–≥—Ä–æ–≤–æ–µ —à–æ—É": '–¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω—ã–π',
    "—Ñ–∏–ª—å–º-–∫–∞—Ç–∞—Å—Ç—Ä–æ—Ñ–∞": "–±–æ–µ–≤–∏–∫",
    '–∞–Ω–∏–º–µ': '–º—É–ª—å—Ç—Ñ–∏–ª—å–º—ã',
    '–∫—Ä–∏–º–∏–Ω–∞–ª': '–¥—Ä–∞–º–∞'
}

# Split each comma-separated genre string into a list of trimmed, lower-cased names;
# missing genres become an empty list and blank fragments are dropped.
df['genre_list'] = (
    df['genre']
    .fillna('')
    .apply(lambda raw: [piece.strip().lower() for piece in raw.split(',') if piece.strip()])
)

# Replace genres according to the mapping.
def map_genres(genres):
    """Return a new list with every genre swapped for its genre_mapping target.

    Genres that have no entry in genre_mapping are kept as they are; the order
    and length of the input are preserved.
    """
    replaced = []
    for name in genres:
        replaced.append(genre_mapping.get(name, name))
    return replaced

# Run the mapping over every row's genre list, replacing the column in place.
df['genre_list'] = df['genre_list'].apply(map_genres)

In [19]:
# Rebuild the genre string from the de-duplicated, alphabetically sorted genre list.
df['genre'] = df['genre_list'].apply(
    lambda names: ', '.join(sorted(set(names)))
)

In [20]:
# Display the genre frequencies again, now computed on the rebuilt genre strings.
df['genre'].value_counts()

genre
–¥—Ä–∞–º–∞                       5802
–∫–æ–º–µ–¥–∏—è                     1815
–±–æ–µ–≤–∏–∫                      1799
–¥–æ–∫—É–º–µ–Ω—Ç–∞–ª—å–Ω—ã–π              1606
–º—É–ª—å—Ç—Ñ–∏–ª—å–º—ã                 1502
–±–∏–æ–≥—Ä–∞—Ñ–∏—á–µ—Å–∫–∏–π               803
–¥–µ—Ç–µ–∫—Ç–∏–≤                     777
—Ç—Ä–∏–ª–ª–µ—Ä                      441
–≤–æ–π–Ω–∞                        382
–º–µ–ª–æ–¥—Ä–∞–º–∞                    206
–ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è                  202
—É–∂–∞—Å—ã                        190
–º—É–∑—ã–∫–∞                        70
–∫–æ—Ä–æ—Ç–∫–æ–º–µ—Ç—Ä–∞–∂–Ω—ã–π              43
—Ñ–∞–Ω—Ç–∞—Å—Ç–∏–∫–∞                    32
–º–∏—Å—Ç–∏–∫–∞                       17
—Ñ—ç–Ω—Ç–µ–∑–∏                       13
–±–∏–æ–≥—Ä–∞—Ñ–∏—á–µ—Å–∫–∏–π, –¥—Ä–∞–º–∞          7
–≤–æ–π–Ω–∞, –¥—Ä–∞–º–∞                   7
–¥–ª—è –≤–∑—Ä–æ—Å–ª—ã—Ö                   5
–±–æ–µ–≤–∏–∫, –¥—Ä–∞–º–∞                  4
–º—É–ª—å—Ç—Ñ–∏–ª—å–º—ã, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è       4
–¥—Ä–∞–º–∞, –ø—Ä–∏–∫–ª—é—á–µ–Ω–∏—è             

In [21]:
# Persist the cleaned table without the pandas index.
# NOTE(review): this writes to the same path the data was loaded from, so the raw genre
# strings are overwritten — confirm a backup of the original file exists.
# NOTE(review): the helper column `genre_list` is still part of df at this point and is
# written out as well — confirm that is intended.
df.to_csv('movies_list.csv', index=False)

In [3]:
# Load the sentence-embedding checkpoint used to vectorize movie descriptions.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2") 



In [4]:
# Encode every movie description with the loaded model, showing a progress bar.
# NOTE(review): unlike the later cells, `description` is not passed through fillna("")
# here — confirm the column has no missing values before encoding.
embeddings = model.encode(df['description'].tolist(),show_progress_bar=True)

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493/493 [00:27<00:00, 18.17it/s]


In [5]:
# Convert the encoder output to a float32 NumPy array before handing it to FAISS.
embeddings = np.array(embeddings).astype("float32")

In [6]:
# Create a FAISS flat L2 index sized to the embedding dimension and add all vectors.
# The vectors are added as-is; this cell performs no normalization.
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(embeddings)

In [4]:
import pandas as pd
import numpy as np
import faiss
import time
from sentence_transformers import SentenceTransformer
import wikipediaapi
from tqdm import tqdm

# --- Wikipedia settings ---
# User agent string passed to the Wikipedia client below.
USER_AGENT = "MovieRecommendationBot/1.0 (https://example.com/contact)"
# Client for the Russian-language Wikipedia ("ru").
wiki = wikipediaapi.Wikipedia(language="ru", user_agent=USER_AGENT)

# --- Fetching the plot from Wikipedia ---
def get_plot(title):
    """Return the text of the plot section of the Wikipedia article named *title*.

    The article is looked up through the module-level ``wiki`` client. The first
    of the candidate section headings that exists and has text wins.

    Args:
        title: Article title to look up.

    Returns:
        The section text, or an empty string when the page does not exist, has
        none of the candidate sections, or any error occurs (best effort: the
        error is printed, never raised).
    """
    try:
        # Throttle requests to avoid being blocked.
        time.sleep(0.5)

        page = wiki.page(title)

        # Missing articles yield no plot.
        if not page.exists():
            return ""

        # Look for a plot section, trying the different headings in use.
        # `page.sections` is a list of section objects, so a string membership
        # test never matches; sections have to be looked up by title instead.
        for section_title in ("–°—é–∂–µ—Ç", "–°–æ–¥–µ—Ä–∂–∞–Ω–∏–µ", "–§–∞–±—É–ª–∞", "Plot"):
            section = page.section_by_title(section_title)
            if section is not None and section.text:
                return section.text

        # No matching section found.
        return ""

    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –ø–æ–ª—É—á–µ–Ω–∏–∏ —Å—é–∂–µ—Ç–∞ –¥–ª—è '{title}': {str(e)}")
        return ""

# --- Step 1: load the data and add plots ---
df = pd.read_csv("movies_list.csv")
df["description"] = df["description"].fillna("")

# get_plot is called once per title, with a progress bar.
# NOTE(review): nothing is written to disk until step 6, so interrupting this loop
# discards every plot fetched so far.
tqdm.pandas(desc="üìñ –ü–æ–ª—É—á–µ–Ω–∏–µ —Å—é–∂–µ—Ç–æ–≤ —Å –í–∏–∫–∏–ø–µ–¥–∏–∏")
df["plot"] = df["movie_title"].progress_apply(get_plot)

# --- Step 2: combined text for the embeddings ---
df["full_text"] = df["description"] + "\n" + df["plot"]

# --- Step 3: generate the embeddings ---
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")
texts = df["full_text"].tolist()
embeddings = model.encode(texts, show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")

# --- Step 4: normalize the embeddings (for cosine similarity) ---
# Each row is divided in place by its own L2 norm.
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

# --- Step 5: create the FAISS index ---
index = faiss.IndexFlatIP(embeddings.shape[1])  # IP == cosine sim (after normalization)
index.add(embeddings)

# --- Step 6: save ---
np.save("movie_vectors1.npy", embeddings)
faiss.write_index(index, "index1.bin")
df.to_csv("movies_with_plots.csv", index=False)

print("‚úÖ –ì–æ—Ç–æ–≤–æ! –í—Å—ë —Å–æ—Ö—Ä–∞–Ω–µ–Ω–æ.")



üìñ –ü–æ–ª—É—á–µ–Ω–∏–µ —Å—é–∂–µ—Ç–æ–≤ —Å –í–∏–∫–∏–ø–µ–¥–∏–∏:   6%|‚ñã         | 1016/15756 [15:58<3:51:42,  1.06it/s]


KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Load the saved table, the embedding matrix and the FAISS index from disk.
df = pd.read_csv("movies_with_plots.csv")
embeddings = np.load("movie_vectors1.npy")
index = faiss.read_index("index1.bin")

# Load the same model that produced the stored vectors, so queries share their space.
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")


In [None]:
def recommend_by_text(user_text, top_k=5):
    """Print the movies whose indexed text is most similar to *user_text*.

    The query is embedded with the module-level ``model``, L2-normalized, and
    searched in the module-level ``index``; matching rows are read from ``df``.

    Args:
        user_text: Free-form description of what the user wants to watch.
        top_k: Number of neighbours requested from the index.

    Returns:
        None. The recommendations are printed.
    """
    query_vec = model.encode([user_text])
    query_vec = query_vec / np.linalg.norm(query_vec, axis=1, keepdims=True)
    _scores, indices = index.search(query_vec.astype("float32"), top_k)

    print(f"üéØ –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏ –ø–æ –æ–ø–∏—Å–∞–Ω–∏—é: '{user_text}'\n")
    for i in indices[0]:
        # The index pads with -1 when it has fewer than top_k results; iloc[-1]
        # would silently print the last row of the table instead.
        if i < 0:
            continue
        row = df.iloc[i]
        # Empty descriptions are read back from CSV as NaN (a float), which
        # cannot be sliced; treat anything that is not a string as empty.
        description = row['description']
        if not isinstance(description, str):
            description = ""
        print(f"‚Üí {row['movie_title']} ({row.get('year', '–≥–æ–¥ –Ω–µ —É–∫–∞–∑–∞–Ω')})")
        print(f"–û–ø–∏—Å–∞–Ω–∏–µ: {description[:200]}...")
        print("-" * 50)



In [None]:
# Example query: print five recommendations for a free-text description.
recommend_by_text("–∫–∏–±–µ—Ä–ø–∞–Ω–∫, –º—Ä–∞—á–Ω–æ–µ –±—É–¥—É—â–µ–µ, –≤–∏—Ä—Ç—É–∞–ª—å–Ω–∞—è —Ä–µ–∞–ª—å–Ω–æ—Å—Ç—å", top_k=5)


In [None]:
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
import os  # needed here because the file's other `import os` appears only further down

# SECURITY: an API key used to be hard-coded on this line. Secrets must never be
# committed to source; the value is now taken from the environment, and the key that
# was exposed here should be revoked and rotated.
GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
# –î–ª—è Groq Cloud
from langchain_groq import ChatGroq
from langchain_core.messages import HumanMessage, SystemMessage
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# === Model settings ===
# Sentence-transformers checkpoint loaded by load_model_and_index.
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# Chat model name passed to ChatGroq in get_groq_llm.
# NOTE(review): confirm this model id is still served by Groq before deploying.
GROQ_MODEL = "mixtral-8x7b-32768"  # or "llama3-70b-8192"

# === Data loading ===
@st.cache_data
def load_data():
    """Read the movie table from ``movies_list.csv`` and return it as a DataFrame.

    Wrapped in ``st.cache_data``.
    """
    return pd.read_csv("movies_list.csv")

# === Loading the model and the index ===
@st.cache_resource
def load_model_and_index():
    """Load the embedding model, the FAISS index and the stored movie vectors.

    Returns:
        A ``(model, index, vectors)`` tuple: the SentenceTransformer for
        MODEL_NAME, the index read from ``index.bin`` and the array loaded
        from ``movie_vectors.npy``.
    """
    encoder = SentenceTransformer(MODEL_NAME)
    stored_vectors = np.load("movie_vectors.npy")
    faiss_index = faiss.read_index("index.bin")
    return encoder, faiss_index, stored_vectors

# === Movie search ===
def find_similar_movies(query, model, index, df, top_k=5):
    """Return the rows of *df* that the index ranks closest to *query*.

    Args:
        query: Text to search for.
        model: Object whose ``encode(list_of_texts)`` returns an array of vectors.
        index: Object whose ``search(vectors, k)`` returns ``(scores, ids)``.
        df: Table whose row positions correspond to the ids stored in the index.
        top_k: Number of neighbours to request.

    Returns:
        A DataFrame with the matching rows in ranking order; it may hold fewer
        than *top_k* rows (or none) when the index has fewer results.
    """
    query_vec = model.encode([query]).astype('float32')
    _scores, neighbour_ids = index.search(query_vec, top_k)
    # The index pads missing results with -1; passing that to iloc would
    # return the last row of the table as a bogus match.
    hits = [int(i) for i in neighbour_ids[0] if i >= 0]
    return df.iloc[hits]

# === Connecting to Groq Cloud ===
def get_groq_llm():
    """Create the ChatGroq client used to generate the answers.

    The model name comes from GROQ_MODEL and the API key from the
    ``GROQ_API_KEY`` environment variable, read at call time.

    Returns:
        A configured ``ChatGroq`` instance.
    """
    # Imported locally so the function does not depend on a module-level
    # `import os` that is only executed after this definition.
    import os

    return ChatGroq(
        model=GROQ_MODEL,
        temperature=0.7,
        max_tokens=1000,
        timeout=None,
        api_key=os.getenv("GROQ_API_KEY")
    )

# === Formatting the results for the LLM ===
def format_docs(docs):
    """Render the retrieved movies as a numbered text block for the prompt.

    Args:
        docs: DataFrame of retrieved movies. ``movie_title`` is required;
            ``year``, ``genre`` and ``description`` are optional.

    Returns:
        One string with an entry per row, numbered 1..n in row order, each
        showing title, year, genre and the first 200 characters of the
        description.
    """
    formatted = []
    # Number by rank: the DataFrame index holds the original row labels of the
    # catalogue, not the position within the result list.
    for rank, (_, row) in enumerate(docs.iterrows(), start=1):
        # A missing description is NaN (a float) and cannot be sliced.
        description = row.get('description', '')
        if not isinstance(description, str):
            description = ''
        info = f"""
{rank}. **{row['movie_title']}** ({row.get('year', '?')})
   –ñ–∞–Ω—Ä: {row.get('genre', '–ù–µ —É–∫–∞–∑–∞–Ω')}
   –û–ø–∏—Å–∞–Ω–∏–µ: {description[:200]}...
"""
        formatted.append(info)
    return "\n".join(formatted)

# === RAG chain with Groq Cloud ===
def create_rag_chain(model, index, df):
    """Build the chain that retrieves movies, fills the prompt and calls the LLM.

    Args:
        model: Embedding model handed to find_similar_movies.
        index: Vector index handed to find_similar_movies.
        df: Movie table handed to find_similar_movies.

    Returns:
        A runnable; ``invoke({"query": text})`` (or ``invoke(text)``) returns
        the LLM answer as a string.
    """
    llm = get_groq_llm()

    prompt = ChatPromptTemplate.from_messages([
        ("system", """–¢—ã –∫–∏–Ω–æ–∫—Ä–∏—Ç–∏–∫ —Å —á—É–≤—Å—Ç–≤–æ–º —é–º–æ—Ä–∞.
–¢–≤–æ—è –∑–∞–¥–∞—á–∞: 
- –ü—Ä–æ–∞–Ω–∞–ª–∏–∑–∏—Ä–æ–≤–∞—Ç—å —Ñ–∏–ª—å–º—ã –∏–∑ –∫–æ–Ω—Ç–µ–∫—Å—Ç–∞
- –î–∞—Ç—å —à—É—Ç–ª–∏–≤—ã–µ, –Ω–æ —Ç–æ—á–Ω—ã–µ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏
- –û–±—ä—è—Å–Ω–∏—Ç—å, –ø–æ—á–µ–º—É –æ–Ω–∏ –ø–æ–¥—Ö–æ–¥—è—Ç –ø–æ–¥ –∑–∞–ø—Ä–æ—Å
- –ú–æ–∂–Ω–æ –¥–æ–±–∞–≤–∏—Ç—å –º–µ–º—ã –∏–ª–∏ —Å—Ä–∞–≤–Ω–µ–Ω–∏—è —Å –∏–∑–≤–µ—Å—Ç–Ω—ã–º–∏ —Ñ–∏–ª—å–º–∞–º–∏

–ï—Å–ª–∏ —Ñ–∏–ª—å–º—ã –Ω–µ –Ω–∞–π–¥–µ–Ω—ã ‚Äî —Ç–æ–∂–µ —Å–∫–∞–∂–∏ –æ–± —ç—Ç–æ–º, –Ω–æ —Å —é–º–æ—Ä–æ–º üòä"""),
        ("human", """
üîç –ó–∞–ø—Ä–æ—Å –ø–æ–ª—å–∑–æ–≤–∞—Ç–µ–ª—è: "{question}"
üé¨ –í–æ—Ç —Ñ–∏–ª—å–º—ã, –∫–æ—Ç–æ—Ä—ã–µ —è –Ω–∞—à—ë–ª:

{context}

üí¨ –û—Ç–≤–µ—Ç:""")
    ])

    def retrieve_and_format(inputs):
        # The chain is invoked with {"query": ...}; the text has to be pulled out
        # here, otherwise the whole dict is embedded and shown as the question.
        query = inputs["query"] if isinstance(inputs, dict) else inputs
        results = find_similar_movies(query, model, index, df, top_k=5)
        if len(results) == 0:
            return {"context": "–ù–∏—á–µ–≥–æ –Ω–µ –Ω–∞—à–ª–æ—Å—å...", "question": query}
        return {"context": format_docs(results), "question": query}

    # RunnablePassthrough forwards its input unchanged; it cannot select the
    # "query" field, so that happens inside retrieve_and_format.
    rag_chain = (
        RunnablePassthrough()
        | retrieve_and_format
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain

# === Streamlit UI ===
import os

# Key lookup order: Streamlit secret, then an already-exported environment variable,
# then the placeholder. Previously a valid exported key was overwritten by the
# placeholder whenever the secret was absent.
os.environ["GROQ_API_KEY"] = st.secrets.get(
    "GROQ_API_KEY", os.environ.get("GROQ_API_KEY", "–≤–∞—à_–∫–ª—é—á")
)

# Page title and wide layout, followed by the visible heading.
st.set_page_config(page_title="üé¨ –£–º–Ω—ã–µ —Ä–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏–∏", layout="wide")
st.title("ü§ñ –£–º–Ω—ã–π –ø–æ–∏—Å–∫ —Ñ–∏–ª—å–º–æ–≤ —á–µ—Ä–µ–∑ Groq Cloud")

# Load the movie table, the embedding model, the FAISS index and the stored vectors.
df = load_data()
model, full_index, vectors = load_model_and_index()

# Assemble the retrieval + LLM chain from the loaded objects.
rag_chain = create_rag_chain(model, full_index, df)

# === User input ===
user_query = st.text_input("–í–≤–µ–¥–∏—Ç–µ –∑–∞–ø—Ä–æ—Å, –Ω–∞–ø—Ä–∏–º–µ—Ä: '–§–∏–ª—å–º –ø—Ä–æ –ª—é–±–æ–≤—å –≤ —Å—Ç–∏–ª–µ –∞–Ω–∏–º–µ'")
if st.button("üîç –ù–∞–π—Ç–∏ –∏ —Å–ø—Ä–æ—Å–∏—Ç—å –ò–ò"):
    # Reject empty or whitespace-only input before calling the chain.
    if not user_query.strip():
        st.warning("‚ö†Ô∏è –ü–æ–∂–∞–ª—É–π—Å—Ç–∞, –≤–≤–µ–¥–∏—Ç–µ –∑–∞–ø—Ä–æ—Å!")
    else:
        with st.spinner("üß† –î—É–º–∞—é –Ω–∞–¥ —ç—Ç–∏–º..."):
            try:
                # The chain is invoked with a dict carrying the text under "query".
                answer = rag_chain.invoke({"query": user_query})
                st.markdown("### üí¨ –û—Ç–≤–µ—Ç –æ—Ç –ò–ò:")
                st.markdown(answer)
            except Exception as e:
                # Any failure in the chain is shown in the page instead of raised.
                st.error(f"‚ùå –û—à–∏–±–∫–∞: {e}")

In [8]:
# Load the data
df = pd.read_csv("movies_list.csv")

# Pre-process the descriptions: missing values become empty strings
df["description"] = df["description"].fillna("")

# Load the model
model = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2")

# Generate the embeddings
embeddings = model.encode(df["description"].tolist(), show_progress_bar=True)
embeddings = np.array(embeddings).astype("float32")

# Normalize for cosine similarity (each row divided in place by its L2 norm)
embeddings /= np.linalg.norm(embeddings, axis=1, keepdims=True)

# Build an inner-product index (equivalent to cosine similarity once the vectors are normalized)
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)  # Inner Product == Cosine similarity when normalized
index.add(embeddings)

# Save the vectors and the index; these are the files load_model_and_index reads
np.save("movie_vectors.npy", embeddings)
faiss.write_index(index, "index.bin")

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 493/493 [00:25<00:00, 19.21it/s]


In [18]:
# User query
user_query = "–ù–æ–≤–æ–≥–æ–¥–Ω—è—è –Ω–æ—á—å"
# NOTE(review): the query vector is not L2-normalized here, unlike the indexed
# embeddings; presumably that only scales the inner-product scores — confirm before
# relying on `distances` as cosine similarities.
query_vector = model.encode([user_query],convert_to_numpy=True).astype("float32")

# An earlier, now disabled variant of this cell encoded the query, scored it against
# every embedding with sklearn's cosine_similarity, took the top 5 via argsort and
# printed title, description and similarity. It was replaced by the FAISS search below.

# Number of neighbours to retrieve.
k = 9
distances, indices = index.search(query_vector, k)

# Print the results: title and full description of each retrieved row
for i in indices[0]:
    print(f"\nüé¨ –ù–∞–∑–≤–∞–Ω–∏–µ: {df.iloc[i]['movie_title']}")
    print(f"üìÑ –û–ø–∏—Å–∞–Ω–∏–µ: {df.iloc[i]['description']}")


üé¨ –ù–∞–∑–≤–∞–Ω–∏–µ: –° –ù–æ–≤—ã–º –≥–æ–¥–æ–º!
üìÑ –û–ø–∏—Å–∞–Ω–∏–µ: –î—Ä—É–∑—å—è —Å–æ–±–∏—Ä–∞—é—Ç—Å—è, —á—Ç–æ–±—ã –æ—Ç–ø—Ä–∞–∑–¥–Ω–æ–≤–∞—Ç—å –ù–æ–≤—ã–π –≥–æ–¥. –í —Ö–æ–¥–µ –≤–µ—á–µ—Ä–∞ –æ–Ω–∏ —Ä–µ—à–∞—é—Ç —Å—ã–≥—Ä–∞—Ç—å –≤ –∏–≥—Ä—É –∏ –ø–æ–ª–æ–∂–∏—Ç—å —Å–≤–æ–∏ —Ç–µ–ª–µ—Ñ–æ–Ω—ã –Ω–∞ –æ–±—â–µ–µ –æ–±–æ–∑—Ä–µ–Ω–∏–µ. –í–æ –≤—Ä–µ–º—è –∏–≥—Ä—ã —Ä–∞—Å–∫—Ä–æ–µ—Ç—Å—è –º–Ω–æ–≥–æ —Å–µ–∫—Ä–µ—Ç–æ–≤, —Ä–∞–∑–æ–±—å—é—Ç—Å—è —Å–µ—Ä–¥—Ü–∞ –∏ —Å–ª–æ–º–∞—é—Ç—Å—è —Å—É–¥—å–±—ã. –° –ù–æ–≤—ã–º –≥–æ–¥–æ–º!

üé¨ –ù–∞–∑–≤–∞–Ω–∏–µ: –û —á–µ–º –µ—â–µ –≥–æ–≤–æ—Ä—è—Ç –º—É–∂—á–∏–Ω—ã
üìÑ –û–ø–∏—Å–∞–Ω–∏–µ: –¢–∞–∫ —Å–ª—É—á–∏–ª–æ—Å—å, —á—Ç–æ –ø–æ—á—Ç–∏ –≤–µ—Å—å –¥–µ–Ω—å 31 –¥–µ–∫–∞–±—Ä—è –Ω–∞—à–∏ –≥–µ—Ä–æ–∏ –ø—Ä–æ–≤–æ–¥—è—Ç –≤–º–µ—Å—Ç–µ, –ö–∞–∫ –æ–Ω–∏ –ø—Ä–æ–≤–æ–¥—è—Ç –≤—Ä–µ–º—è –Ω–∞–∫–∞–Ω—É–Ω–µ –ù–æ–≤–æ–≥–æ –≥–æ–¥–∞? –†–∞–∑—É–º–µ–µ—Ç—Å—è, –≤ —Ä–∞–∑–≥–æ–≤–æ—Ä–∞—Ö. –û —á–µ–º? –í–æ-–ø–µ—Ä–≤—ã—Ö, –∫–æ–Ω–µ—á–Ω–æ, –æ –∂–µ–Ω—â–∏–Ω–∞—Ö ‚Äì —Ç–µ–º–∞-—Ç–æ –Ω–µ–∏—Å—á–µ—Ä–ø–∞–µ–º–∞—è. –ê –µ—â–µ –æ‚Ä¶

üé¨ –ù–∞–∑–≤–∞–Ω–∏–µ: –û–