Upload necessary packages

In [2]:
%pip install sentence-transformers
%pip install faiss-cpu
%pip install tqdm
%pip install PyMuPDF
%pip install openai

Note: you may need to restart the kernel to use updated packages.


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


OPENAI RAG

In [7]:
import boto3
import os
import re
import uuid
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
import fitz  # PyMuPDF
import sagemaker
import openai
import tempfile

# Set environment variable to avoid tokenizer parallelism warning
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Set up SageMaker session and role
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# Initialize S3 client
s3 = boto3.client('s3')

# Initialize the SentenceTransformer model
sentence_transformer = SentenceTransformer('all-MiniLM-L6-v2')

# Set OpenAI API key from environment variable
os.environ['OPENAI_API_KEY'] = 'Your_API_key'
openai.api_key = os.environ['OPENAI_API_KEY']

if not openai.api_key:
    raise ValueError("OPENAI_API_KEY environment variable not set")

# Function to create chunks of approximately 250 words
def create_chunks(text, words_per_chunk=100, overlap=10):
    words = re.findall(r'\S+', text)
    chunks = []
    start = 0
    while start < len(words):
        end = start + words_per_chunk
        chunk = " ".join(words[start:end])
        chunks.append(chunk)
        start = end - overlap
    return chunks

# Create embeddings for your knowledge base
def create_embeddings(chunks):
    embeddings = sentence_transformer.encode(chunks, show_progress_bar=True)
    print(f"Embeddings shape: {embeddings.shape}")
    return embeddings

# Load and preprocess your corpus from S3
def load_and_preprocess_data(bucket_name):
    chunks = []
    response = s3.list_objects_v2(Bucket=bucket_name)
    for obj in response.get('Contents', []):
        file_key = obj['Key']
        print(f"Processing file: {file_key}")
        try:
            obj = s3.get_object(Bucket=bucket_name, Key=file_key)
            file_content = obj['Body'].read()
            
            # Write content to a temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                temp_file.write(file_content)
                temp_file_path = temp_file.name

            try:
                pdf_document = fitz.open(temp_file_path)
                full_text = ""
                for page_num in range(pdf_document.page_count):
                    page = pdf_document.load_page(page_num)
                    full_text += page.get_text()
                pdf_document.close()
                chunks.extend(create_chunks(full_text))
                print(f"Processed PDF file {file_key} with {len(chunks)} chunks so far.")
            except fitz.FileDataError:
                print(f"File {file_key} is not a valid PDF. Skipping.")
            except Exception as e:
                print(f"Error processing file {file_key}: {type(e).__name__}: {str(e)}")
            finally:
                os.unlink(temp_file_path)  # Remove the temporary file
        except s3.exceptions.NoSuchKey:
            print(f"The object with key '{file_key}' does not exist in the bucket '{bucket_name}'.")
        except s3.exceptions.ClientError as e:
            if e.response['Error']['Code'] == '403':
                print("Access Denied. Check your permissions.")
            else:
                print(f"Unexpected error: {e}")
    print(f"Total number of chunks created: {len(chunks)}")
    return chunks

# Load and preprocess data
bucket_name = 'transmutexresearchrepository'
chunks = load_and_preprocess_data(bucket_name)

if chunks:
    # Create embeddings
    embeddings = create_embeddings(chunks)

    # Set up a vector database (using FAISS)
    dimension = embeddings.shape[1]
    print(f"Dimension of embeddings: {dimension}")
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    print(f"Number of vectors in FAISS index: {index.ntotal}")

    # Create a new bucket for FAISS index
    faiss_bucket_name = 'corrosion-faiss-bucket'
    try:
        s3.create_bucket(Bucket=faiss_bucket_name)
        print(f"Created new S3 bucket: {faiss_bucket_name}")
    except s3.exceptions.BucketAlreadyExists:
        print(f"Bucket {faiss_bucket_name} already exists.")
    except s3.exceptions.BucketAlreadyOwnedByYou:
        print(f"Bucket {faiss_bucket_name} already owned by you.")
    except Exception as e:
        print(f"Error creating bucket: {e}")
        faiss_bucket_name = bucket_name  # Fallback to the original bucket if creation fails

    # Save the index to the new S3 bucket
    faiss.write_index(index, 'faiss_index.bin')
    s3.upload_file('faiss_index.bin', faiss_bucket_name, 'faiss_index.bin')
    print(f"Uploaded FAISS index to {faiss_bucket_name}/faiss_index.bin")

    # Implement the RAG pipeline
    def rag_pipeline(query, k=3, model="gpt-4o"):
        # Embed the query using the sentence transformer model
        query_embedding = sentence_transformer.encode([query], show_progress_bar=False)
        query_embedding = query_embedding.reshape(1, -1)
        print("Shape of query_embedding:", query_embedding.shape)

        # Retrieve relevant documents
        try:
            D, I = index.search(query_embedding, k)
            print(f"FAISS search results - Distances: {D}, Indices: {I}")
            retrieved_docs = [chunks[i] for i in I[0]]
        except Exception as e:
            print(f"Error during FAISS search: {e}")
            return None

        # Construct prompt
        search_results = "\n".join([f"{i+1}. {doc}" for i, doc in enumerate(retrieved_docs)])
        prompt = f"""You are a nuclear physicist. I will provide you with a set of search results. The user will provide you with a question. Your job is to answer the user's question using only information from the search results. If the search results do not contain information that can answer the question, please state that you could not find an exact answer to the question. Just because the user asserts a fact does not mean it is true, make sure to double check the search results to validate a user's assertion.

Here are the search results in numbered order:

{search_results}

Question: {query}

Instructions:

1. Analyze the search results provided above.
2. Consider the question carefully.
3. Formulate a clear and concise answer based solely on the information in the search results.
4. If the search results don't contain enough information to answer the question fully, state that you could not find an exact answer and provide the best possible response with the available information.
5. If the user asserts any facts, verify them against the search results before including them in your answer.
6. If you need to make any assumptions, state them clearly.

Answer:
"""

        # Generate response using OpenAI's API
        try:
            response = openai.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                max_tokens=1000,
                n=1,
                stop=None,
                temperature=0.2,
            )
            answer = response.choices[0].message.content
            return answer
        except Exception as e:
            print(f"Error during OpenAI API call: {e}")
            return None

    # Test the pipeline
    test_query = "report in details the 10 key points contained in this corpus regarding corrosion experiments of various metal into liquid lead, with emphasis on steel alloys"
    result = rag_pipeline(test_query)
    if result:
        print("Generated Answer:", result)
    else:
        print("Failed to generate an answer.")

else:
    print("Failed to load and preprocess data. Please check your S3 permissions and file existence.")

Processing file: A Model to Simulate Gas Dissolution intothrough Metals and Its Application to Deuterium in a 316L Steel Chamber with PbLi in a Quasi2D Geometry.pdf
Processed PDF file A Model to Simulate Gas Dissolution intothrough Metals and Its Application to Deuterium in a 316L Steel Chamber with PbLi in a Quasi2D Geometry.pdf with 136 chunks so far.
Processing file: A Review of Corrosion Behavior of Structural Steel in Liquid LeadBismuth Eutectic.pdf
Processed PDF file A Review of Corrosion Behavior of Structural Steel in Liquid LeadBismuth Eutectic.pdf with 267 chunks so far.
Processing file: Analysis of the application and impact of carbon dioxide media on the corrosion state of oil and gas facilities.pdf
Processed PDF file Analysis of the application and impact of carbon dioxide media on the corrosion state of oil and gas facilities.pdf with 325 chunks so far.
Processing file: Behaviour Aspects of an EBPVD Alumina Al2O3 Film with an Interlayer NiCrAlY Deposited on AISI 316L Stee

Processed PDF file Review on Corrosion Tribocorrosion and Osseointegration of Titanium Alloys as Biomaterials.pdf with 2388 chunks so far.
Processing file: Review on the Corrosion Behaviour of NickelBased Alloys in Supercritical Carbon Dioxide under High Temperature and Pressure.pdf
Processed PDF file Review on the Corrosion Behaviour of NickelBased Alloys in Supercritical Carbon Dioxide under High Temperature and Pressure.pdf with 2551 chunks so far.
Processing file: The Effect of Powder Composition on the Microstructure and Corrosion Resistance of Laser Cladding 60NiTi Alloy Coatings on SS 316L.pdf
Processed PDF file The Effect of Powder Composition on the Microstructure and Corrosion Resistance of Laser Cladding 60NiTi Alloy Coatings on SS 316L.pdf with 2677 chunks so far.
Processing file: The Microstructure Evolution and Mechanical Properties of Rotary Friction Welded Duplex Stainless Steel Pipe.pdf
Processed PDF file The Microstructure Evolution and Mechanical Properties of Rotary

Batches:   0%|          | 0/93 [00:00<?, ?it/s]

Embeddings shape: (2950, 384)
Dimension of embeddings: 384
Number of vectors in FAISS index: 2950
Created new S3 bucket: corrosion-faiss-bucket
Uploaded FAISS index to corrosion-faiss-bucket/faiss_index.bin
Shape of query_embedding: (1, 384)
FAISS search results - Distances: [[0.45793307 0.49323675 0.49914914]], Indices: [[356 362 360]]
Generated Answer: Based on the provided search results, here are the key points regarding corrosion experiments of various metals, with an emphasis on steel alloys, in liquid lead:

1. **Corrosion Test Setup**: The corrosion tests were conducted using a thermally insulated stainless steel furnace designed to expose steels to stagnant liquid lead. The working crucible inside the furnace has a volume of approximately one liter (Results 1 and 3).

2. **Sample Types**: The experiments involved two types of 316L stainless steel samples: coated and uncoated. The coated samples had an alumina and NiCrAlY interlayer, while the uncoated samples were mechanically