### Dependencies

In [68]:
%%capture
!pip install -q -U openai langchain
!pip install -q -U requests
!pip install -q -U scrapy selenium
!apt install chromium-chromedriver
!pip install jq
!pip install -q -U faiss-cpu tiktoken

In [79]:
import os
import getpass
import numpy as np
import pandas as pd
from scrapy.selector import Selector
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
from tqdm import tqdm
import warnings
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.document_loaders.json_loader import JSONLoader

from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings import CacheBackedEmbeddings
from langchain.vectorstores import FAISS
from langchain.storage import LocalFileStore

from langchain.llms.openai import OpenAIChat

from langchain.chains import RetrievalQA
from langchain.callbacks import StdOutCallbackHandler


warnings.filterwarnings("ignore")

### Data Extraction

Use of Selenium to extract the information of review about the movie Barbie 2023

In [4]:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
driver = webdriver.Chrome(options=chrome_options)

In [5]:
url = "https://www.imdb.com/title/tt1517268/reviews/?ref_=tt_ov_rt"
driver.get(url)

In [6]:

sel = Selector(text = driver.page_source)
review_counts = sel.css('.lister .header span::text').extract_first().replace(',','').split(' ')[0]
more_review_pages = int(int(review_counts)/25)

In [7]:
for i in tqdm(range(more_review_pages)):
    try:
        css_selector = 'load-more-trigger'
        driver.find_element(By.ID, css_selector).click()
    except:
        pass

100%|██████████| 46/46 [00:01<00:00, 25.43it/s]


In [8]:
rating_list = []
review_date_list = []
review_title_list = []
author_list = []
review_list = []
review_url_list = []
error_url_list = []
error_msg_list = []
reviews = driver.find_elements(By.CSS_SELECTOR, 'div.review-container')

for d in tqdm(reviews):
    try:
        sel2 = Selector(text = d.get_attribute('innerHTML'))
        try:
            rating = sel2.css('.rating-other-user-rating span::text').extract_first()
        except:
            rating = np.NaN
        try:
            review = sel2.css('.text.show-more__control::text').extract_first()
        except:
            review = np.NaN
        try:
            review_date = sel2.css('.review-date::text').extract_first()
        except:
            review_date = np.NaN
        try:
            author = sel2.css('.display-name-link a::text').extract_first()
        except:
            author = np.NaN
        try:
            review_title = sel2.css('a.title::text').extract_first()
        except:
            review_title = np.NaN
        try:
            review_url = sel2.css('a.title::attr(href)').extract_first()
        except:
            review_url = np.NaN
        rating_list.append(rating)
        review_date_list.append(review_date)
        review_title_list.append(review_title)
        author_list.append(author)
        review_list.append(review)
        review_url_list.append(review_url)
    except Exception as e:
        error_url_list.append(url)
        error_msg_list.append(e)
review_df = pd.DataFrame({
    'Review_Date':review_date_list,
    'Author':author_list,
    'Rating':rating_list,
    'Review_Title':review_title_list,
    'Review':review_list,
    'Review_Url':review_url
    })

100%|██████████| 125/125 [00:01<00:00, 97.82it/s] 


In [41]:
review_df

Unnamed: 0,Review_Date,Author,Rating,Review_Title,Review,Review_Url
0,21 July 2023,LoveofLegacy,6,"Beautiful film, but so preachy\n","Margot does the best with what she's given, bu...",/review/rw9239935/?ref_=tt_urv
1,22 July 2023,imseeg,7,3 reasons FOR seeing it and 1 reason AGAINST.\n,The first reason to go see it:,/review/rw9239935/?ref_=tt_urv
2,22 July 2023,Natcat87,6,Too heavy handed\n,"As a woman that grew up with Barbie, I was ver...",/review/rw9239935/?ref_=tt_urv
3,31 July 2023,ramair350,10,"As a guy I felt some discomfort, and that's o...",As much as it pains me to give a movie called ...,/review/rw9239935/?ref_=tt_urv
4,24 July 2023,heatherhilgers,9,A Technicolor Dream\n,"Wow, this movie was a love letter to cinema. F...",/review/rw9239935/?ref_=tt_urv
...,...,...,...,...,...,...
120,30 July 2023,Rice-and-Beans34,6,I don't get all the hype...\n,This movie is many things. One may say it is a...,/review/rw9239935/?ref_=tt_urv
121,23 July 2023,mafiagirl-21431,8,So fun and charming!\n,So i've just got back from theaters and all i ...,/review/rw9239935/?ref_=tt_urv
122,24 July 2023,steveinadelaide,6,Cute but shallow\n,"So I headed to the cinema to watch Barbie, the...",/review/rw9239935/?ref_=tt_urv
123,21 July 2023,GusherPop,10,Greta shows real power of woman in Barbie\n,Greta Gerwig's Barbie explores creation myths ...,/review/rw9239935/?ref_=tt_urv


In [45]:
review_df.to_json("barbie.json", lines=True, orient="records")
#review_df.to_csv("barbie.csv")

In [46]:
json_loader = JSONLoader("barbie.json", jq_schema=".Review", json_lines=True, )
#csv_loader = CSVLoader(file_path="barbie.csv",source_column="Review",)

In [48]:
data = json_loader.load()

In [49]:
assert len(data) == 125

### Data processing

In [63]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, 
    chunk_overlap = 100,
    length_function = len,
)

In [64]:
documents = text_splitter.split_documents(data)

In [65]:
len(documents) 

148

In [67]:

assert len(documents) == 148

### Index data

Reference Vector Frameworks: FAISS, Annoy and ScANN

In [71]:
os.environ["OPENAI_API_KEY"] = getpass.getpass("Open AI API Key:")

In [72]:
#temporal folder to save information
store = LocalFileStore("cache/")
# Open AI Embeedings - Could be replaced by any other embeddings
core_embeddings_model = OpenAIEmbeddings()

embedder = CacheBackedEmbeddings.from_bytes_store(
    core_embeddings_model,
    store, 
    namespace=core_embeddings_model.model
)

# FAISS vector store using local file storage
vector_store = FAISS.from_documents(documents, embedder)

In [73]:
query = "How is Will Ferrell in this movie?"
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

for page in docs:
  print(page.page_content)

The first half was pretty enjoyable, fun, light, but it took itself too seriously by the second half. No longer allowing the talented cast, especially Gosling, to shine and make us laugh. It felt like the talents of Will Ferrell and Michael Cera were also somewhat underutilized. Interesting concept, had potential, but later in the movie, it definitely started to fall flat for me.
Okay maybe it was a 9.5 because of two flaws: First was the Will Ferrell character and his board that made their point but then became superfluos. Second was that it is definitely not a kids' movie (although maybe they would see things that I didn't - I mean to be fair, the few kids in the theatre were well behaved so perhaps the movie got their full attention as well).
I really wanted to enjoy this and I know that I am not the target audience but there were massive plot holes and no real flow. The film was very disjointed. Ryan Gosling as good as he is seemed to old to play Ken and Will Ferrell ruined every s

In [74]:
%%timeit -n 1 -r 1
query = "I really wanted to enjoy this and I know that I am not the target audience but there were massive plot holes and no real flow."
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

149 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [75]:
%%timeit
query = "I really wanted to enjoy this and I know that I am not the target audience but there were massive plot holes and no real flow."
embedding_vector = core_embeddings_model.embed_query(query)
docs = vector_store.similarity_search_by_vector(embedding_vector, k = 4)

166 ms ± 39.3 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### QA Pipeline

In [77]:
llm = OpenAIChat(temperature=0,)

In [78]:
retriever = vector_store.as_retriever()

In [80]:
# this is a callback to print the results of the LLM
handler = StdOutCallbackHandler()

qa_with_sources_chain = RetrievalQA.from_chain_type(
    llm=llm, 
    retriever=retriever,
    callbacks=[handler],
    return_source_documents=True
)

In [82]:

qa_with_sources_chain({"query" : "How was Will Ferrell in this movie? Explain"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'How was Will Ferrell in this movie? Explain',
 'result': "Will Ferrell's performance in this movie was not well-received. The reviewer mentions that Ferrell's character and his board made their point but then became superfluous, and that Ferrell ruined every scene he was in. They also state that the talents of Ferrell and Michael Cera were underutilized. Overall, it seems that the reviewer did not enjoy Ferrell's performance in the movie.",
 'source_documents': [Document(page_content="Okay maybe it was a 9.5 because of two flaws: First was the Will Ferrell character and his board that made their point but then became superfluos. Second was that it is definitely not a kids' movie (although maybe they would see things that I didn't - I mean to be fair, the few kids in the theatre were well behaved so perhaps the movie got their full attention as well).", metadata={'source': '/workspaces/ai-maker-space-llm-ops/A1-RAQA/notebooks/barbie.json', 'seq_num': 77}),
  Document(page_con