In [28]:
import chromadb
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer
from tqdm.notebook import tqdm
from transformers import pipeline, set_seed

In [29]:
# Load the film table; later cells read its 'link' column to fetch each page.
picture_info = pd.read_csv(filepath_or_buffer='../data/best_picture_2000.csv')

In [30]:
def get_text(link, timeout=30):
    """Download a page and return the text of its main-content paragraphs.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30), so a
        stalled connection cannot hang the notebook indefinitely.

    Returns
    -------
    str
        Text of every ``<p>`` element found inside the element with
        ``id="mw-content-text"``, joined with newlines.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    ValueError
        If the page has no element with ``id="mw-content-text"``.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping the text of an error page.
    response.raise_for_status()
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results can differ
    # between environments.
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find(id='mw-content-text')
    if content is None:
        raise ValueError(f"No 'mw-content-text' element found at {link}")
    # Extract plain text from each paragraph of the main content area.
    return '\n'.join(p.get_text() for p in content.find_all('p'))

In [31]:
# Fetch the article text for every film (one HTTP request per row).
picture_info['body_text'] = picture_info['link'].map(get_text)

In [32]:
# Expose the row position as an 'index' column. Guarded so that re-running
# this cell does not add an extra 'level_0' column (and raise on a later run).
if 'index' not in picture_info.columns:
    picture_info = picture_info.reset_index()

In [33]:
# Preview the first rows, including the scraped 'body_text' column.
picture_info.head()

Unnamed: 0,index,title,link,year,body_text
0,0,Gladiator,https://en.wikipedia.org/wiki/Gladiator_(2000_...,2000,\n\nGladiator is a 2000 epic historical drama...
1,1,A Beautiful Mind,https://en.wikipedia.org/wiki/A_Beautiful_Mind...,2001,\n\nA Beautiful Mind is a 2001 American biogra...
2,2,Chicago,https://en.wikipedia.org/wiki/Chicago_(2002_film),2002,\n\nChicago is a 2002 American musical black c...
3,3,The Lord of the Rings: The Return of the King,https://en.wikipedia.org/wiki/The_Lord_of_the_...,2003,\n\nThe Lord of the Rings: The Return of the K...
4,4,Million Dollar Baby,https://en.wikipedia.org/wiki/Million_Dollar_Baby,2004,\n\nMillion Dollar Baby is a 2004 American spo...


In [34]:
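# One-time setup, kept for reference: creates the persistent "picture_info"
# collection that a later cell re-opens with get_collection(). Leave commented
# out when re-running the notebook against an existing ../chromadb store.
# client = chromadb.PersistentClient(path="../chromadb")
# collection = client.create_collection("picture_info")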

In [35]:
# def add_picture(picture):
#     sentences = sent_tokenize(picture['body_text'])
#     collection.add(
#         documents = sentences,
#         ids = [f'{picture["index"]}_{i}' for i in range(len(sentences))],
#         metadatas = [{'picture': picture['title']}] * len(sentences)
#     )

In [36]:
# import nltk
# import os
# nltk.data.path.append('/home/prahlad/nltk_data')
# nltk.download('punkt_tab', download_dir='/home/prahlad/nltk_data')

In [37]:
# for _, row in tqdm(picture_info.iterrows()):
#     print(row['title'])
#     add_picture(row)

In [38]:
# Load the evaluation set: one question and reference answer per film.
questions = pd.read_csv(filepath_or_buffer='../data/QAs.csv')

In [39]:
# Display the question/answer table loaded above.
questions

Unnamed: 0,film,question,answer
0,Gladiator,What was the worldwide gross of Gladiator?,$465.4 million
1,A Beautiful Mind,Who was consulted on the mathematical equation...,Dave Bayer
2,Chicago,Where did principal photography take place for...,"Toronto, Ontario, Canada."
3,The Lord of the Rings: The Return of the King,Who composed the score for the film The Lord o...,Howard Shore
4,Million Dollar Baby,What day was Million Dollar Baby released in t...,"December 15, 2004"
5,Crash,When was Crash released on DVD?,"September 6, 2005"
6,The Departed,How much did The Departed gross on opening wee...,$26.9 million
7,No Country for Old Men,The film No Country for Old Men is based on th...,Cormac McCarthy
8,Slumdog Millionaire,Slumdog Millionaire is based on what novel?,Q & A
9,The Hurt Locker,Where was The Hurt Locker filmed?,Jordan


In [40]:
# Re-open the on-disk Chroma store and fetch the existing collection by name.
client = chromadb.PersistentClient("../chromadb")
collection = client.get_collection(name="picture_info")

In [41]:
# Sanity check: number of entries currently stored in the collection.
collection.count()

4888

In [42]:
def context(row, n_results=5):
    """Retrieve context sentences for a question, restricted to its film.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``'question'`` and ``'film'``.
    n_results : int, optional
        Maximum number of sentences to retrieve (default 5).

    Returns
    -------
    str
        The retrieved sentences joined with newlines; an empty string if the
        query returns no documents.
    """
    question = row['question']
    movie_title = row['film']
    results = collection.query(
        query_texts=[f"{movie_title} {question}"],
        n_results=n_results,
        # Filter inside the vector store so every hit belongs to this film.
        # Filtering the global top hits afterwards (and stopping at the first
        # match) produced at most one sentence, and nothing at all whenever
        # no top hit came from the requested film.
        where={'picture': movie_title},
    )
    # A single query text was sent, so its hits are the first result list.
    return '\n'.join(results['documents'][0])

In [43]:
def context2(question):
    """Return the five closest stored sentences for ``question``.

    The question text is sent to ``collection`` as a single query, with no
    filtering by film, and the returned documents are joined with newlines.
    """
    hits = collection.query(query_texts=[question], n_results=5)
    # Only one query text was submitted, so take its (first) list of documents.
    top_documents = hits['documents'][0]
    return '\n'.join(top_documents)

In [44]:
# Film-restricted retrieval, stored in its own column. It used to be written
# to questions['context'], which the next cell overwrites immediately, so the
# result was computed and then discarded.
questions['context_filtered'] = questions.apply(context, axis=1)

In [45]:
# Unfiltered retrieval: this is the context the QA model reads below.
questions['context'] = questions['question'].map(context2)

Question answering models

In [46]:
# Sentence-embedding model used to encode the retrieved contexts.
encoder = SentenceTransformer(
    model_name_or_path='sentence-transformers/all-MiniLM-L6-v2'
)



In [47]:
# Encode each retrieved context string separately with the sentence encoder.
context_passages = questions['context']
embeddings = context_passages.apply(encoder.encode)

In [48]:
# Text-generation pipeline backed by the distilgpt2 checkpoint.
decoder = pipeline(task='text-generation', model='distilgpt2')

In [49]:
# Pin the checkpoint and revision instead of relying on the library default,
# which can change between transformers releases and triggers a warning.
# These are the values the unpinned call reported it had defaulted to.
qa = pipeline(
    task="question-answering",
    model="distilbert-base-cased-distilled-squad",
    revision="626af31",
)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [50]:
def answer_generation(row):
    """Run the QA pipeline on one row and return the extracted answer text.

    Reads ``row['question']`` and ``row['context']`` and passes them to ``qa``.
    If a ``ValueError`` is raised, the error is printed and a fixed
    placeholder string is returned instead of propagating the exception.
    """
    question, passage = row['question'], row['context']
    try:
        return qa(question=question, context=passage)['answer']
    except ValueError as e:
        print(f"Error generating answer for question '{question}': {e}")
        return "Error: Unable to generate answer"

In [51]:
# Generate one answer per question row from its retrieved context.
questions['answer_generated'] = questions.apply(answer_generation, axis='columns')

In [52]:
# Side-by-side view of each question, its reference answer and the model's answer.
questions[['question','answer','answer_generated']]

Unnamed: 0,question,answer,answer_generated
0,What was the worldwide gross of Gladiator?,$465.4 million,$465.5 million
1,Who was consulted on the mathematical equation...,Dave Bayer,Dave Bayer
2,Where did principal photography take place for...,"Toronto, Ontario, Canada.","New Orleans, Louisiana"
3,Who composed the score for the film The Lord o...,Howard Shore,Peter Jackson
4,What day was Million Dollar Baby released in t...,"December 15, 2004","December 15, 2004"
5,When was Crash released on DVD?,"September 6, 2005","September 6, 2005"
6,How much did The Departed gross on opening wee...,$26.9 million,$34.8 million
7,The film No Country for Old Men is based on th...,Cormac McCarthy,Cormac McCarthy
8,Slumdog Millionaire is based on what novel?,Q & A,Q & A
9,Where was The Hurt Locker filmed?,Jordan,3rd Zurich Film Festival


In [53]:
# Number of generated answers that equal the reference answer exactly.
# The comparison is a strict string match, so differences in punctuation or
# wording count as misses. numpy is imported with the other imports at the
# top of the notebook rather than in this cell.
np.sum(questions['answer'] == questions['answer_generated'])

16