# OpenShift with WatsonX and PostgreSQL for RAG

In [None]:
# Install the packages used by this notebook into the current environment.
!pip install --upgrade pip
!pip install "langchain==0.0.345" 
!pip install wget 
!pip install sentence-transformers 
# NOTE(review): chromadb is not imported in the visible cells - confirm it is still needed.
!pip install "chromadb==0.3.26" 
!pip install "ibm-watson-machine-learning>=1.0.333" 
# Alternative version pins kept for reference (currently disabled):
#!pip install ibm-watson-machine-learning==1.0.335
#!pip install "pydantic==1.10.0"
!pip install pydantic==1.10.11
!pip install python-dotenv
!pip install typing-inspect==0.8.0
!pip install typing_extensions==4.5.0
# PostgreSQL driver, PDF parsing and pgvector support for the vector-store cells.
!pip install psycopg2-binary
!pip install pypdf
!pip install pgvector

In [None]:
# Standard-library modules first, then python-dotenv.
import getpass
import os

from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

In [None]:
# watsonx.ai settings: the project id and API key come from the environment;
# each one is None when its variable is not set.
project_id = os.getenv("PROJECT_ID")
credentials = dict(
    # url="https://eu-de.ml.cloud.ibm.com",
    url="https://us-south.ml.cloud.ibm.com",
    apikey=os.getenv("API_KEY"),
)

In [None]:
# Take PROJECT_ID from the environment when it is defined; otherwise ask for it.
project_id = os.environ.get("PROJECT_ID")
if project_id is None:
    project_id = input("Please enter your project_id (hit enter): ")

In [None]:
import wget

url = 'https://raw.github.com/IBM/watson-machine-learning-samples/master/cloud/data/foundation_models/state_of_the_union.txt'
filename = 'state_of_the_union.txt'

# Fetch the sample text only when there is no local copy yet.
if not os.path.isfile(filename):
    wget.download(url, out=filename)

In [None]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter

# Read the downloaded text file (decoded as UTF-8) into LangChain documents.
loader = TextLoader(filename, encoding="utf-8")
documents = loader.load()

In [None]:
# Split the loaded documents into chunks (chunk_size=1000, no overlap).
# NOTE(review): `texts` is not referenced by the later visible cells - confirm it is still needed.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)

In [None]:
import os

from dotenv import load_dotenv

# Refresh the environment from .env, then read the PostgreSQL settings from it.
load_dotenv()
user, password, database, server = (
    os.getenv(name) for name in ("user", "password", "database", "server")
)

# Echo only the user and database names; the password is never printed.
print("User:", user)
print("Database:", database)

In [None]:
from urllib.parse import quote

# Build the PostgreSQL connection URI from the settings loaded above.
# The user name and password are percent-encoded so that characters such as
# "@", ":" or "/" inside them cannot break the URI structure. The earlier
# "postgresql+psycopg://" variant was dropped because it was overwritten
# immediately and never used.
CONNECTION_STRING = (
    f"postgresql://{quote(str(user), safe='')}:{quote(str(password), safe='')}"
    f"@{server}/{database}"
)

# Do not print CONNECTION_STRING: it embeds the database password.

In [None]:
import psycopg2

# Smoke-test the database settings with a trivial query.
# The user and database names come from the loaded settings rather than
# hard-coded "vectordb" values, so this check exercises the same configuration
# the connection string uses.
conn = psycopg2.connect(
    host=server,
    database=database,
    user=user,
    password=password,
)
try:
    # The cursor context manager closes the cursor when the block exits.
    with conn.cursor() as cur:
        cur.execute("SELECT 1")
        print(cur.fetchone())  # Should print (1,)
finally:
    # Always release the connection, even if the query fails.
    conn.close()


In [None]:
import os
from dotenv import load_dotenv
import psycopg2
from urllib.parse import quote

# Build the PostgreSQL connection URI used by the database cells below.
# The user name and password are percent-encoded so that characters such as
# "@", ":" or "/" inside them cannot break the URI structure.
CONNECTION_STRING = (
    f"postgresql://{quote(str(user), safe='')}:{quote(str(password), safe='')}"
    f"@{server}/{database}"
)
# Do not print CONNECTION_STRING: it embeds the database password.

In [None]:
# Enable the pgvector extension and create the `embeddings` table if missing.
conn = psycopg2.connect(CONNECTION_STRING)
try:
    # `with conn` commits when the block succeeds and rolls back if it raises;
    # the cursor context manager closes the cursor.
    with conn, conn.cursor() as cur:
        cur.execute("""
    CREATE EXTENSION IF NOT EXISTS vector;
    CREATE TABLE IF NOT EXISTS embeddings (
      id SERIAL PRIMARY KEY,
      embedding vector,
      text text,
      created_at timestamptz DEFAULT now()
    );
""")
finally:
    # psycopg2's `with conn` does not close the connection, so close it here
    # to avoid leaking it when the SQL fails.
    conn.close()

In [None]:
# Report whether the `embeddings` table exists and, if it does, list its columns.
conn = psycopg2.connect(CONNECTION_STRING)
try:
    with conn.cursor() as cur:
        # Check if the table exists
        cur.execute("SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_name = 'embeddings')")
        table_exists = cur.fetchone()[0]

        if table_exists:
            print("Table 'embeddings' exists!")

            # Get the schema of the table
            cur.execute("SELECT column_name, data_type FROM information_schema.columns WHERE table_name = 'embeddings'")
            schema = cur.fetchall()

            print("Schema of table 'embeddings':")
            for column in schema:
                print(f"  {column[0]}: {column[1]}")
        else:
            print("Table 'embeddings' does not exist.")
            # Keep `schema` defined, but skip the misleading schema listing
            # for a table that is not there.
            schema = []
finally:
    # Release the connection even when one of the queries fails.
    conn.close()

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.pgvector import PGVector

In [None]:
import os
import wget

pdf_folder_path = './rhods-doc'
filename = 'Vector_database.pdf'
url = 'https://github.com/ruslanmv/WatsonX-with-Langchain-PostgreSQL-with-pgvector/raw/master/rhods-doc/Vector_database.pdf'

# Create the target directory; exist_ok=True makes this idempotent and avoids
# the check-then-create race of a separate os.path.exists() test.
os.makedirs(pdf_folder_path, exist_ok=True)

full_path = os.path.join(pdf_folder_path, filename)

# Download the sample PDF only when it is not already on disk.
if not os.path.isfile(full_path):
    wget.download(url, out=full_path)


In [None]:

# Load the PDF files in pdf_folder_path into LangChain documents.
loader = PyPDFDirectoryLoader(pdf_folder_path)
docs = loader.load()

In [None]:
# Chunk the loaded PDF documents (chunk_size=1024, chunk_overlap=40).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1024,
    chunk_overlap=40,
)
all_splits_pdfs = text_splitter.split_documents(docs)


In [None]:
# Display the first chunk as a quick sanity check.
all_splits_pdfs[0]

In [None]:
# Remove NUL characters from the text of every chunk, in place.
# NOTE(review): presumably needed because PostgreSQL text values cannot
# contain NUL bytes - confirm.
for doc in all_splits_pdfs:
    doc.page_content = doc.page_content.replace('\x00', '')

In [None]:
COLLECTION_NAME = "documents_test"

# HuggingFace embeddings with the library's default model (none is specified).
embeddings = HuggingFaceEmbeddings()

# Build a PGVector store from the PDF chunks, using the embeddings above and
# the PostgreSQL connection string, under the collection name COLLECTION_NAME.
db = PGVector.from_documents(
    documents=all_splits_pdfs,
    embedding=embeddings,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [None]:
from ibm_watson_machine_learning.foundation_models.utils.enums import ModelTypes



In [None]:
# Identifier of the watsonx.ai foundation model to use, given as a string.
# Alternative using the ModelTypes enum (disabled):
#model_id = ModelTypes.GRANITE_13B_CHAT
model_id = "ibm/granite-13b-instruct-v2"

In [None]:
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods



In [None]:
# Re-read the watsonx.ai settings before the model is created.
# Prefer PROJECT_ID from the environment, but fall back to a project id that
# is already defined in this session (for example one typed in at the earlier
# prompt) instead of overwriting it with None.
project_id = os.getenv("PROJECT_ID") or globals().get("project_id")
credentials = {
    #"url":  "https://eu-de.ml.cloud.ibm.com",
    "url": "https://us-south.ml.cloud.ibm.com",
    "apikey": os.getenv("API_KEY", None)
}

In [None]:
# Display the configured watsonx.ai endpoint URL.
credentials.get("url")

In [None]:
# Confirm that an API key is configured without echoing the secret itself:
# cell outputs are saved with the notebook and are easily shared by accident.
bool(credentials.get("apikey"))

In [None]:
# Sampling-based text generation parameters (decoding method, token limits,
# temperature, top-k / top-p).
# NOTE(review): `parameters` is reassigned in a later cell before the
# WatsonxLLM instance is created, so these values appear to be unused - confirm.
parameters = {
    GenParams.DECODING_METHOD: DecodingMethods.SAMPLE.value,
    GenParams.MAX_NEW_TOKENS: 1000,
    GenParams.MIN_NEW_TOKENS: 50,
    GenParams.TEMPERATURE: 0.7,
    GenParams.TOP_K: 50,
    GenParams.TOP_P: 1
}

In [None]:
from ibm_watson_machine_learning.metanames import GenTextParamsMetaNames as GenParams
from ibm_watson_machine_learning.foundation_models.utils.enums import DecodingMethods
from langchain.llms import WatsonxLLM

In [None]:
# Greedy-decoding generation parameters passed to the model below.
# The decoding method is given as the enum's plain string value, matching the
# sampling-parameters cell, so the dict holds only plain str/int/list values.
parameters = {
    GenParams.DECODING_METHOD: DecodingMethods.GREEDY.value,
    GenParams.MIN_NEW_TOKENS: 1,
    GenParams.MAX_NEW_TOKENS: 200,
    # Stop generating as soon as the end-of-text marker is produced.
    GenParams.STOP_SEQUENCES: ["<|endoftext|>"]
}

In [None]:
from langchain.llms import WatsonxLLM

# Create the LangChain WatsonxLLM instance for the selected model, using the
# endpoint URL, API key, project id and generation parameters defined in the
# preceding cells.
watsonx_granite = WatsonxLLM(
    model_id=model_id,
    url=credentials.get("url"),
    apikey=credentials.get("apikey"),
    project_id=project_id,
    params=parameters
)