In [1]:
import pandas as pd
from transformers import BertModel, BertTokenizer
import torch
import numpy as np
import faiss

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained BERT tokenizer and encoder.
# Both come from the same checkpoint, so the identifier is named once.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertModel.from_pretrained(_BERT_CHECKPOINT)

def text_to_vector(text):
    """Embed text by mean-pooling the encoder's last hidden state.

    Uses the module-level ``tokenizer`` and ``model``.

    Parameters
    ----------
    text : str or list of str
        Text(s) to embed. Inputs longer than 512 tokens are truncated.

    Returns
    -------
    numpy.ndarray
        The last hidden state averaged over the token axis (dim 1), one row
        per input text.
    """
    # padding=True is a no-op for a single string but lets a list of texts of
    # different lengths be batched into one tensor.
    inputs = tokenizer(text, return_tensors="pt", truncation=True,
                       max_length=512, padding=True)

    # Inference only: skip autograd bookkeeping to save memory and time.
    with torch.no_grad():
        hidden = model(**inputs).last_hidden_state
        # Average over real tokens only; padded positions carry mask 0 and
        # must not dilute the mean. With no padding this equals mean(dim=1).
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        token_counts = mask.sum(dim=1).clamp(min=1)
        pooled = (hidden * mask).sum(dim=1) / token_counts

    return pooled.detach().numpy()

# Read the question/answer CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer='israel.csv')
df.head(n=5)



Unnamed: 0,Question,Answer
0,What is the current travel advisory for Gaza?,The advisory recommends not traveling to Gaza ...
1,Why should travelers reconsider visiting Israe...,Travelers should reconsider due to ongoing ter...
2,What precautions should U.S. citizens take if ...,Travelers should maintain situational awarenes...
3,Are there any specific travel restrictions for...,"Yes, U.S. government employees have travel res..."
4,What should individuals do if they absolutely ...,"They should prepare for an indefinite stay, ha..."


In [4]:
# Embed every question and every answer, stacking the per-text vectors row-wise
q_vectors = np.vstack(list(map(text_to_vector, df['Question'])))
a_vectors = np.vstack(list(map(text_to_vector, df['Answer'])))

In [None]:
# Build the FAISS index from the question embeddings.
# The CSV has no 'feature1'..'feature3' columns (its columns are
# 'Unnamed: 0', 'Question' and 'Answer'), so selecting them raises KeyError.
# The vectors to index are the embeddings already computed in `q_vectors`;
# `df` is already loaded, so the CSV is not re-read here.
# FAISS expects a C-contiguous float32 matrix with one vector per row.
data = np.ascontiguousarray(q_vectors, dtype='float32')

# Build the index
dimension = data.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(data)

# Optionally save the index to disk
faiss.write_index(index, 'your_index.faiss')


In [None]:
# Load the tokenizer and the RAG model
# RagTokenizer / RagTokenForGeneration are not among the names imported at the
# top of the notebook, so import them here to avoid a NameError.
from transformers import RagTokenizer, RagTokenForGeneration

# NOTE(review): these assignments rebind the global `tokenizer` and `model`
# that text_to_vector() reads, so text_to_vector() should not be called after
# this cell has run.
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq")

In [None]:
# Function to create FAISS index
def create_index(data, dimension):
    """Build a flat (exact) L2 FAISS index over ``data``.

    Parameters
    ----------
    data : array-like
        Matrix with one vector per row and ``dimension`` columns. It is
        converted to a C-contiguous float32 array before being added.
    dimension : int
        Dimensionality of the vectors.

    Returns
    -------
    faiss.IndexFlatL2
        Index containing every row of ``data``.

    Raises
    ------
    ValueError
        If ``data`` is not 2-D or its column count differs from ``dimension``.
    """
    # astype() alone keeps Fortran/strided layouts; FAISS needs C-contiguous
    # float32 memory.
    vectors = np.ascontiguousarray(data, dtype=np.float32)
    if vectors.ndim != 2 or vectors.shape[1] != dimension:
        raise ValueError(
            f"expected a 2-D array with {dimension} columns, "
            f"got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index

In [None]:
# Load CSV and create FAISS index
# NOTE(review): `st` is not imported in the visible part of this notebook;
# this cell needs `import streamlit as st` to run.
# NOTE(review): st.cache appears to be deprecated in recent Streamlit
# releases in favour of st.cache_resource -- confirm against the installed
# version.
@st.cache(allow_output_mutation=True)
def load_data(csv_file):
    """Read a CSV and build a FAISS index over its numeric columns.

    Parameters
    ----------
    csv_file : path or file-like
        Anything accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(index, df)``: the FAISS index built by ``create_index`` and the
        full DataFrame, text columns included.

    Raises
    ------
    ValueError
        If the CSV has no numeric columns to index.
    """
    df = pd.read_csv(csv_file)

    # Only numeric/bool columns can be cast to float32 for FAISS. Text columns
    # stay in `df` so callers can still look up the matching row's text.
    numeric = df.select_dtypes(include=['number', 'bool'])
    if numeric.shape[1] == 0:
        raise ValueError("CSV contains no numeric columns to index")

    data_vectors = numeric.to_numpy()
    index = create_index(data_vectors, data_vectors.shape[1])
    return index, df

In [None]:
# Streamlit interface
# NOTE(review): `st` is not imported in the visible part of this notebook;
# this cell needs `import streamlit as st` to run.
st.title('RAG with FAISS for Information Retrieval')

csv_file = st.file_uploader("Upload your CSV", type=['csv'])
if csv_file is not None:
    index, df = load_data(csv_file)
    st.success('Data loaded and index created!')

    # Text input for query
    query = st.text_area("Enter your query here:")
    if st.button('Generate Answer'):
        if query:
            # Encode the query. RagTokenForGeneration exposes its query
            # encoder as `question_encoder`; it has no `context_encoder`.
            input_ids = tokenizer(query, return_tensors="pt").input_ids
            with torch.no_grad():  # inference only
                retrieved = model.question_encoder(input_ids)[0]
            query_vectors = np.ascontiguousarray(
                retrieved.detach().numpy(), dtype='float32'
            )

            if query_vectors.shape[1] != index.d:
                # Searching with the wrong dimensionality would fail inside
                # FAISS with an opaque assertion; report it clearly instead.
                st.error(
                    f"Query embedding has {query_vectors.shape[1]} dimensions "
                    f"but the index was built with {index.d}."
                )
            else:
                # Retrieve the nearest row from the FAISS index
                distances, indices = index.search(query_vectors, 1)
                best = int(indices[0][0])
                if best < 0:
                    # FAISS returns -1 when it has no neighbour to offer;
                    # df.iloc[-1] would silently pick the last row instead.
                    st.error("No matching context was found in the index.")
                else:
                    context = df.iloc[best]['text_column_name']  # Adjust column name

                    # Generate answer using RAG
                    # NOTE(review): the separator below is a literal backslash
                    # followed by 'n', not a newline -- confirm this is intended.
                    # NOTE(review): the model is loaded without a retriever, so
                    # generate() may need context_input_ids / doc_scores rather
                    # than plain tokenizer output -- confirm against the RAG docs.
                    inputs = tokenizer(context + " \\n " + query, return_tensors="pt")
                    outputs = model.generate(**inputs)
                    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
                    st.write(answer)
        else:
            st.error("Please enter a query to generate an answer.")