In [None]:
import ast
import os

import openai
import pandas as pd
import tiktoken
from openai import OpenAI
from scipy.spatial.distance import cosine
# Shared API client; every embeddings and chat-completions request in this
# notebook goes through this object.
client = OpenAI()
# Also set the module-level key. Indexing os.environ raises KeyError when
# OPENAI_API_KEY is not defined.
# NOTE(review): OpenAI() is documented to read OPENAI_API_KEY from the
# environment by itself, so this assignment may be redundant — confirm.
openai.api_key = os.environ["OPENAI_API_KEY"]

In [None]:
# Load the movie table, then replace every missing cell with a single space so
# the per-row string handling further down never receives NaN.
top_movies_df = pd.read_csv('watcha_top_movies.csv')
top_movies_df = top_movies_df.fillna(" ")

In [None]:
def _parse_list_field(raw):
    """Turn a list-valued CSV cell into a list of strings without executing it.

    Args:
        raw: The cell value. Normally the text of a Python list literal such
            as ``"['a', 'b']"``; may also be blank, missing (None/NaN), an
            actual list/tuple, or arbitrary text.

    Returns:
        list[str]: The parsed items. Blank or missing cells give ``[]``; text
        that is not a valid Python literal is returned as a single item.
    """
    # Missing value (None or NaN, which is the only float unequal to itself).
    if raw is None or (isinstance(raw, float) and raw != raw):
        return []
    if isinstance(raw, (list, tuple)):
        return [str(item) for item in raw]

    text = str(raw).strip()
    if not text:
        # Cells blanked by fillna(" ") end up here instead of crashing.
        return []

    try:
        # literal_eval only accepts literals, so data in the CSV can never
        # run code the way eval() would.
        parsed = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return [text]

    if isinstance(parsed, (list, tuple)):
        return [str(item) for item in parsed]
    return [str(parsed)]


def combine_movie_info(row):
    """Build the text document for one movie row.

    Args:
        row: Mapping-like row (dict or pandas Series) with the keys
            ``title``, ``plot``, ``ratings_avg``, ``nations`` and ``genres``.
            ``title`` and ``plot`` must be strings; ``nations`` and ``genres``
            hold list literals as text and may be blank.

    Returns:
        str: The five labelled lines (title, plot, rating, nations, genres)
        joined by newlines and wrapped in triple backticks.
    """
    title = row['title'].strip()
    content = row['plot'].strip()
    # Keeps only the first three characters of the rating's text form,
    # i.e. truncation rather than rounding (4.25 -> "4.2").
    ratings = str(row['ratings_avg'])[:3]
    nations = ", ".join(_parse_list_field(row['nations'])).strip()
    genres = ", ".join(_parse_list_field(row['genres'])).strip()

    combined = "\n".join([
        f"제목: {title}",
        f"줄거리: {content}",
        f"평점: {ratings}",
        f"제작국가: {nations}",
        f"장르: {genres}",
    ])

    return f"```{combined}```"

# Build one text document per movie (row-wise apply). This "combined" column is
# the text that is later embedded, token-counted and returned as context.
top_movies_df["combined"] = top_movies_df.apply(combine_movie_info, axis=1)

In [None]:
def get_embedding(text, model="text-embedding-ada-002"):
    """Request an embedding for a single piece of text.

    Args:
        text: The string to embed. Newline characters are turned into spaces
            before the request is made.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    single_line = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line], model=model)
    first_item = response.data[0]
    return first_item.embedding

In [None]:
# Embed every combined document. get_embedding sends one API request per call,
# so this cell makes one request for each row of the table.
top_movies_df["embedding"] = top_movies_df.combined.apply(get_embedding)

In [None]:
# Count the tokens of each combined document with the cl100k_base encoding.
# create_context reads these counts to keep the retrieved context within its
# max_len budget.
tokenizer = tiktoken.get_encoding("cl100k_base")
top_movies_df['n_tokens'] = top_movies_df["combined"].apply(
    lambda document: len(tokenizer.encode(document))
)

In [None]:
# Display the embedding column as a quick check of what the embedding step stored.
top_movies_df['embedding']

In [None]:
def create_context(
    question, df, max_len=3000
):
    """Collect the documents closest to a question, within a token budget.

    The question is embedded with ``text-embedding-ada-002`` and compared, by
    cosine distance, with every vector in ``df["embedding"]``. Rows are then
    visited from nearest to farthest; each one adds its ``n_tokens`` plus 4 to
    a running total, and the walk stops at the first row that would push the
    total above ``max_len``.

    Args:
        question: Text of the user's question.
        df: DataFrame with ``embedding``, ``n_tokens`` and ``combined``
            columns. A ``distances`` column is written onto this frame.
        max_len: Upper bound for the running token total.

    Returns:
        str: The selected ``combined`` texts joined by ``"\\n\\n===\\n\\n"``
        (an empty string when nothing fits).
    """
    embedding_response = client.embeddings.create(
        input=question, model='text-embedding-ada-002'
    )
    question_vector = embedding_response.data[0].embedding

    def distance_to_question(document_vector):
        return cosine(question_vector, document_vector)

    # Stored on the caller's frame, exactly as callers may already expect.
    df["distances"] = df["embedding"].apply(distance_to_question)

    nearest_first = df.sort_values('distances', ascending=True)
    selected_texts = []
    tokens_used = 0
    for token_count, document in zip(nearest_first['n_tokens'], nearest_first["combined"]):
        tokens_used += token_count + 4
        if tokens_used > max_len:
            break
        selected_texts.append(document)

    separator = "\n\n===\n\n"
    return separator.join(selected_texts)
def answer_question(
    df,
    model="gpt-4-32k",
    question="스파이더맨 줄거리 얘기해줘.",
    max_len=3000,
    debug=False,
):
    """Answer a question using context retrieved from the movie table.

    The context is built with ``create_context`` and sent, together with the
    question and a request to reply in Korean, to the chat-completions API.

    Args:
        df: DataFrame handed to ``create_context``.
        model: Chat model name.
        question: The user's question.
        max_len: Token budget forwarded to ``create_context``.
        debug: When true, the retrieved context is printed before the request.

    Returns:
        str: The content of the first choice's message, or ``""`` if the
        request (or reading its response) raised; the error is printed.
    """
    # Retrieval happens outside the try block, so its errors propagate.
    context = create_context(question, df, max_len=max_len)

    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        system_message = {
            "role": "system",
            "content": "Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\n",
        }
        user_message = {
            "role": "user",
            "content": f"context: {context}\n\n---\n\n Question: {question}, 한국어로 대답해줘.",
        }
        completion = client.chat.completions.create(
            model=model,
            messages=[system_message, user_message],
            temperature=0.5,
        )
        first_choice = completion.choices[0]
        return first_choice.message.content
    except Exception as err:
        print("Error occurred:", err)
        return ""


In [None]:
# Demo: "Tell me the plot of Pulp Fiction." debug=True also prints the retrieved context.
answer_question(top_movies_df, question="펄프픽션 줄거리 얘기해줘.", debug=True)

In [None]:
# Demo: "Which movie has the highest rating?" The model only sees the rows that
# create_context retrieved by embedding distance within max_len, not the whole
# table, so the answer is limited to those rows.
answer_question(top_movies_df, question="가장 높은 평점을 가진 영화가 뭐야? ", debug=True)

In [None]:
# Demo: "Which movies did the US take part in producing?" As above, only the
# retrieved rows are available to the model.
answer_question(top_movies_df, question="미국에서 제작에 참여한 영화 뭐가 있어?", debug=True)

In [None]:
# Demo: "Which movies feature a doctor?" This tests retrieval on plot content.
answer_question(top_movies_df, question="의사가 나오는 영화 뭐 있어?", debug=True)