In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import AutoModelForCausalLM, AutoTokenizer
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.documents.base import Document
from langchain.vectorstores import FAISS
from gnews import GNews
from tqdm import tqdm
import torch
import json

# Target device shared by the LLM, its input tensors and the embedding model.
device = "cuda:0"

## Load the LLM

In [None]:
# Checkpoint to load, and the directory passed as the download cache for its weights.
# NOTE(review): the cache directory is an absolute, machine-specific path.
model_name = 'mistralai/Mistral-7B-Instruct-v0.2'
hub_cache_dir = '/mnt/esperanto/et/huggingface/hub'

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Weights are loaded as float16 and mapped onto `device`.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device,
    cache_dir=hub_cache_dir,
)

In [None]:
user_question = "Why was Sam Altman fired from OpenAI in November 2023?"

# Baseline run: the question is sent to the model on its own, without any
# retrieved context, wrapped in the [INST] ... [/INST] instruction tags.
inputs = tokenizer(f"[INST]{user_question}[/INST]", return_tensors='pt').to(device)
with torch.no_grad():  # inference only, no gradients needed
    outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

# `generate` returns prompt + continuation; keep only the newly generated ids.
prompt_length = inputs['input_ids'].shape[1]
# skip_special_tokens=True keeps markers such as the end-of-sequence token
# out of the printed answer.
answer = tokenizer.decode(outputs[0, prompt_length:], skip_special_tokens=True)
print(answer)

## Gathering news data

In [None]:
# Google News client; every search option is spelled out as a keyword argument.
google_news = GNews(
    language='en',
    country='US',
    period='365d',
    start_date=None,
    end_date=None,
    max_results=100,
)
# Search results for the 'openai' query; each entry's 'url' is used in the
# next cell to download the full article.
news_list = google_news.get_news('openai')

In [None]:
# Download the full text behind every search result. Scraping is best-effort:
# a result whose article cannot be fetched, or that comes back without text,
# is skipped and counted instead of aborting the whole run.
full_news_list = []
skipped_count = 0
for news in tqdm(news_list):
    try:
        news_article = google_news.get_full_article(news['url']).text
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt /
        # SystemExit (e.g. interrupting the kernel) still stop the loop.
        skipped_count += 1
        continue
    if not news_article:
        # An entry without text adds nothing to the retrieval database.
        skipped_count += 1
        continue
    full_news_list.append({
        'metadata': news,
        'article': news_article,
    })

# Make the number of silently dropped results visible.
print(f"Fetched {len(full_news_list)} articles, skipped {skipped_count}")

In [None]:
# Save the scraped articles (metadata + text) as indented JSON in database.json.
with open("database.json", "w") as db_file:
    db_file.write(json.dumps(full_news_list, indent=4))

## Prepare database for RAG

In [None]:
# Reload the saved articles from disk.
with open('./database.json') as db_file:
    data = json.load(db_file)

# One Document per article: the text becomes the page content, and the
# search-result record (or an empty dict when absent) becomes the metadata.
documents = [
    Document(page_content=entry['article'], metadata=entry.get('metadata', {}))
    for entry in data
]

In [None]:
# Splitter configured with chunk_size=1000 and chunk_overlap=100.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=1000,
)
# `documents` is rebound here: from whole articles to the chunks cut from them.
article_chunks = text_splitter.split_documents(documents)
documents = article_chunks

In [None]:

# Preview the first three chunks produced by the splitter.
documents[0:3]

In [None]:
embedding_model = "sentence-transformers/all-MiniLM-l6-v2"

# The embedding model is placed on the same device as the LLM.
embedder_kwargs = {'device': device}
embeddings = HuggingFaceEmbeddings(model_name=embedding_model, model_kwargs=embedder_kwargs)

In [None]:
# Sanity-check the embedder on the notebook's question. `user_question` is
# reused instead of a second copy of the same literal, and only the vector's
# length and first few components are displayed rather than every value.
query_embedding = embeddings.embed_query(user_question)
len(query_embedding), query_embedding[:5]

In [None]:
# Build a FAISS index over all chunks using the embedding model.
vector_db = FAISS.from_documents(documents, embeddings)

# Expose the index as a retriever configured with k=3.
top_k_search = {"k": 3}
retriever = vector_db.as_retriever(search_kwargs=top_k_search)

In [None]:
# Smoke-test the retriever with a short keyword query and display the hits.
sanity_query = 'Sam Altman'
relevant_docs = retriever.get_relevant_documents(sanity_query)
relevant_docs

## Build RAG pipeline

In [None]:
# System-style instruction placed ahead of the retrieved context.
bot_prompt = "You are a helpful chatbot assistant. Your role is to answer questions of a curious user. To help you with this, you are provided below some context that may or may not be relevant. You should decide wether to base your answer on the provided context to be as helpful and accurate as possible. DO NOT MENTION that you base your answer on the context if you do, act as if it was your own knowledge. In any case, you should always try to use your knoweldge to provide a helpful and consistent answer regardless of the quality of the context."

# Query the retriever once and reuse the result, instead of re-running the
# same search for every index. Slicing to the first three hits keeps the
# original cap while tolerating a retriever that returns fewer than three
# documents (indexing with range(3) would raise IndexError in that case).
retrieved_docs = retriever.get_relevant_documents(user_question)
context_prompt = '\n'.join(doc.page_content for doc in retrieved_docs[:3])

# Final prompt: instruction, then the context block, then the user's question.
full_prompt = f"""[INST]{bot_prompt}\n[CONTEXT]{context_prompt}[/CONTEXT]\nUSER: {user_question}[/INST]"""
print(full_prompt)

In [None]:
# RAG run: same generation settings as the baseline, but the prompt now
# carries the retrieved context.
inputs = tokenizer(full_prompt, return_tensors='pt').to(device)

with torch.no_grad():  # inference only, no gradients needed
    outputs = model.generate(**inputs, do_sample=False, max_new_tokens=500)

# `generate` returns prompt + continuation; keep only the newly generated ids.
prompt_length = inputs['input_ids'].shape[1]
# skip_special_tokens=True keeps markers such as the end-of-sequence token
# out of the printed answer.
answer = tokenizer.decode(outputs[0, prompt_length:], skip_special_tokens=True)
print(answer)

In [None]:

# Show the chunks the retriever returns for the user's question.
answer_sources = retriever.get_relevant_documents(user_question)
answer_sources