In [None]:
import pandas as pd
from transformers import pipeline, set_seed
from sentence_transformers import SentenceTransformer
import requests
from bs4 import BeautifulSoup
import chromadb
from nltk.tokenize import sent_tokenize
from tqdm.notebook import tqdm

In [None]:
# Load the table of films. Later cells read its 'link' column, whose values
# are passed to get_text as the URLs to download.
picture_info = pd.read_csv('../data/best_picture_2000.csv')

In [None]:
def get_text(link):
    """Download a page and return the paragraph text of its main content area.

    Parameters
    ----------
    link : str
        URL of the page to download.

    Returns
    -------
    str
        The text of every ``<p>`` element found under the element whose id is
        ``mw-content-text``, joined with newlines.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or the server answers with an HTTP
        error status.
    ValueError
        If the page has no element with id ``mw-content-text``.
    """
    # requests never times out by default; without a limit one stalled server
    # would hang the whole DataFrame.apply that drives this function.
    response = requests.get(link, timeout=30)
    # Fail loudly on 4xx/5xx instead of parsing an error page as article text.
    response.raise_for_status()

    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed (and to avoid the bs4 warning).
    soup = BeautifulSoup(response.text, 'html.parser')

    content = soup.find(id='mw-content-text')
    if content is None:
        raise ValueError(f"No element with id 'mw-content-text' found at {link}")

    # Extract plain text from each paragraph of the main content area.
    return '\n'.join(p.get_text() for p in content.find_all('p'))

In [None]:
# Download the page behind every 'link' value with get_text and store the
# extracted paragraph text in the 'body_text' column. get_text performs one
# HTTP request per call, so this cell needs network access.
picture_info['body_text'] = picture_info['link'].apply(get_text)

In [None]:
# Expose the row position as an 'index' column (the commented-out indexing code
# below builds document ids from it). The guard keeps this cell idempotent:
# calling reset_index() again would add a stray 'level_0' column, and a further
# run would raise because 'level_0' already exists.
if 'index' not in picture_info.columns:
    picture_info = picture_info.reset_index()

In [None]:
# Preview the first rows of picture_info after adding 'body_text' and 'index'.
picture_info.head()

In [None]:
# client = chromadb.PersistentClient(path="../chromadb")
# collection = client.create_collection("picture_info")

In [None]:
# def add_picture(picture):
#     sentences = sent_tokenize(picture['body_text'])
#     collection.add(
#         documents = sentences,
#         ids = [f'{picture["index"]}_{i}' for i in range(len(sentences))],
#         metadatas = [{'picture': picture['title']}] * len(sentences)
#     )

In [None]:
# import nltk
# import os
# nltk.data.path.append('/home/prahlad/nltk_data')
# nltk.download('punkt_tab', download_dir='/home/prahlad/nltk_data')

In [None]:
# for _, row in tqdm(picture_info.iterrows()):
#     print(row['title'])
#     add_picture(row)

In [None]:
# Load the question/answer set. Later cells read its 'question', 'film' and
# 'answer' columns.
questions = pd.read_csv('../data/QAs.csv')

In [None]:
# Display the loaded question/answer frame.
questions

In [None]:
# Open the on-disk Chroma store and fetch the existing "picture_info"
# collection. Nothing is created here: the collection must already be present
# under ../chromadb (the commented-out cells above contain the code that
# creates it and adds the sentences).
client = chromadb.PersistentClient(path="../chromadb")
collection = client.get_collection("picture_info")

In [None]:
# Number of records stored in the collection (quick check that it is populated).
collection.count()

In [None]:
def context(row, n_results=1):
    """Retrieve supporting text for one question, restricted to the question's film.

    Parameters
    ----------
    row : mapping
        A record with a 'question' entry (the question text) and a 'film'
        entry (the film title). Designed for ``DataFrame.apply(..., axis=1)``.
    n_results : int, default 1
        Number of sentences to retrieve. The default returns a single
        sentence, the same amount of text this function has always returned
        when it found a match.

    Returns
    -------
    str
        The retrieved sentences joined with newlines, in the order returned by
        the collection. Empty string if the collection holds no sentence whose
        'picture' metadata equals the film title.
    """
    question = row['question']
    movie_title = row['film']

    results = collection.query(
        # The title is prepended so it also contributes to the similarity search.
        query_texts=[f"{movie_title} {question}"],
        n_results=n_results,
        # Filter inside the vector store rather than after the fact: filtering
        # the global top hits afterwards yields an empty context whenever none
        # of them belongs to this film.
        where={'picture': movie_title},
    )

    # A single query text was sent, so its hits are in the first inner list.
    return '\n'.join(results['documents'][0])

In [None]:
def context2(question):
    """Return the five best-matching sentences for ``question``, newline-joined.

    The whole collection is searched; no restriction by film is applied.
    """
    hits = collection.query(query_texts=[question], n_results=5)
    # Only one query text is sent, so its documents are the first inner list.
    top_documents = hits['documents'][0]
    return '\n'.join(top_documents)

In [None]:
# Film-aware retrieval: build each question's context with context(), which
# only keeps text whose 'picture' metadata equals the row's 'film'.
# NOTE(review): the following cell assigns questions['context'] again using
# context2, so on a top-to-bottom run the values computed here are replaced.
questions['context']=  questions.apply(context, axis=1)

In [None]:
# Unfiltered retrieval: build each question's context with context2(), which
# joins the top 5 hits for the question text regardless of film.
# NOTE(review): this writes the same 'context' column as the previous cell and
# therefore overwrites the film-filtered contexts. Keep only the variant you
# intend to evaluate, or store the two in separate columns.
questions['context']=  questions['question'].apply(context2)

Question answering models

In [None]:
# Sentence-embedding model; the next cell uses it to encode the retrieved contexts.
encoder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

In [None]:
# Encode every context string separately; the result is a Series holding one
# encoder.encode() output per question.
# NOTE(review): `embeddings` is not read by any later cell visible here.
embeddings = questions['context'].apply(encoder.encode)

In [None]:
# Text-generation pipeline backed by the distilgpt2 checkpoint.
# NOTE(review): `decoder` is not used by any later cell visible here.
decoder = pipeline('text-generation', model='distilgpt2')

In [None]:
# Question-answering pipeline used by answer_generation.
# The checkpoint is named explicitly: without `model=` transformers falls back
# to a library-chosen default and warns about it, so results could change with
# the installed version. This id is the default the library reports for the
# "question-answering" task -- TODO confirm it matches the model used so far.
qa = pipeline(
    task="question-answering",
    model="distilbert/distilbert-base-cased-distilled-squad",
)

In [None]:
def answer_generation(row):
    """Answer one question from its retrieved context using the ``qa`` pipeline.

    ``row`` must provide 'question' and 'context' entries. The 'answer' field
    of the pipeline output is returned. If the pipeline raises ``ValueError``,
    the error is printed and the fixed string
    "Error: Unable to generate answer" is returned instead.
    """
    question, passage = row['question'], row['context']
    try:
        prediction = qa(question=question, context=passage)
    except ValueError as e:
        print(f"Error generating answer for question '{question}': {e}")
        return "Error: Unable to generate answer"
    return prediction['answer']

In [None]:
# Run the QA model on every row. Rows for which answer_generation caught a
# ValueError hold its error string instead of an answer.
questions['answer_generated'] = questions.apply(answer_generation, axis=1)

In [None]:
# Side-by-side view of each question, its reference answer and the generated answer.
questions[['question','answer','answer_generated']]

In [None]:
import numpy as np
# Exact-match score: number of rows whose generated answer is string-identical
# to the reference answer.
(questions['answer'] == questions['answer_generated']).sum()