In [1]:
# Standard library imports
import os
from typing import Literal

# Third-party imports
import spacy
import trafilatura
import ibm_db
import ibm_db_dbi
from dotenv import load_dotenv
from pydantic import BaseModel, Field
from IPython.display import Image, display

# LangChain core imports
from langchain.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.tools.retriever import create_retriever_tool
from langchain_core.documents import Document
from langchain_core.messages import convert_to_messages
from langchain_core.vectorstores import InMemoryVectorStore

# LangChain community imports
from langchain_community.llms import LlamaCpp
from langchain.embeddings import LlamaCppEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy

# LangChain IBM imports
from langchain_ibm import ChatWatsonx
from ibm_watsonx_ai.metanames import GenTextParamsMetaNames as GenParams, EmbedTextParamsMetaNames

# LangChain DB2 imports
from langchain_db2 import db2vs
from langchain_db2.db2vs import DB2VS

# LangGraph imports
from langgraph.graph import MessagesState, StateGraph, START, END
from langgraph.prebuilt import ToolNode, tools_condition

# Read settings from the .env file in the current working directory.
# override=True lets values from the file replace variables already set in the process.
dotenv_path = os.path.join(os.getcwd(), ".env")
load_dotenv(dotenv_path, override=True)

True

In [2]:
# Connection settings and model paths are read from the environment
# (populated from .env); each one falls back to an empty string when unset.
DB_NAME = os.getenv("DB_NAME", "")
DB_HOST = os.getenv("DB_HOST", "")
DB_PORT = os.getenv("DB_PORT", "")
DB_PROTOCOL = os.getenv("DB_PROTOCOL", "")
DB_USER = os.getenv("DB_USER", "")
DB_PASSWORD = os.getenv("DB_PASSWORD", "")
LLM_PATH = os.getenv("LLM_PATH", "")
EMBEDDING_MODEL_PATH = os.getenv("EMBEDDING_MODEL_PATH", "")

# The connection string embeds the password, so it must never be printed or logged.
conn_str = (
    f"DATABASE={DB_NAME};hostname={DB_HOST};port={DB_PORT};"
    f"protocol={DB_PROTOCOL};uid={DB_USER};pwd={DB_PASSWORD}"
)

try:
    connection = ibm_db_dbi.connect(conn_str, '', '')
except Exception as e:
    # Report the driver's error and stop here: without `connection` the
    # vector-store cell further down would only fail later with a NameError
    # that hides the real cause.
    print(f"Connection failed! {e}")
    raise
else:
    print("Connection successful!")

Connection successful!


## 1. Load and Preprocess documents

In [3]:
# Source article used as the knowledge base for the RAG pipeline.
url = 'https://community.ibm.com/community/user/blogs/shaikh-quader/2024/05/07/building-an-in-db-linear-regression-model-with-ibm'
downloaded = trafilatura.fetch_url(url)

# Both steps can come back empty: fetch_url when the download fails, extract
# when no main text is found. Always bind `article` so later cells see a
# defined name, and only slice it when there is text to preview.
article = trafilatura.extract(downloaded) if downloaded else None

if article:
    print(article[:1000])  # Preview first 1000 chars
elif downloaded:
    print("Failed to extract article text.")
else:
    print("Failed to fetch content.")

Despite being one of the earlier machine learning techniques, linear regression continues to be a top choice among ML practitioners for a regression task. For the past three years, over 80% of the respondents to Kaggle’s annual state of data science and machine learning survey mentioned linear regression as a ML algorithm they most frequently use. IBM Db2 provides an in-database stored procedure (SP) for Linear Regression as part of its ML library, which is a collection of over 200 SPs for performing different ML tasks in the database. Using the linear regression SP and other functionality of DB2’s ML Library, ML practitioners can build and deploy linear regression models in the database when their ML dataset is available in a Db2 database. In this post, I will show you the following steps of building and using a linear regression pipeline using SQL with a Db2 database:
Let’s begin.
The Regression Task
In this exercise, I will use the GoSales dataset, which is available from this link.

In [4]:
# Load spaCy's "en_core_web_sm" pipeline; `nlp` is the module-level object the
# sentence chunker calls to split text into sentences.
nlp = spacy.load("en_core_web_sm")

In [5]:
def _trailing_overlap(sentences, overlap_words, budget):
    """Select the trailing sentences of a finished chunk to repeat in the next one.

    Sentences are taken from the end until at least ``overlap_words`` words are
    collected, then dropped from the front again until the selection fits in
    ``budget`` words. Returns ``(selected_sentences, word_count)``.
    """
    picked = []
    picked_len = 0
    for sentence in reversed(sentences):
        if picked_len >= overlap_words:
            break
        picked.insert(0, sentence)
        picked_len += len(sentence.split())

    # The overlap must leave room for the sentence that triggered the flush;
    # otherwise the next chunk would overflow again without making progress.
    while picked and picked_len > budget:
        picked_len -= len(picked.pop(0).split())

    return picked, picked_len


def overlapping_sentence_chunker(text, max_words=200, overlap_words=50):
    """Split ``text`` into sentence-aligned chunks that overlap by whole sentences.

    Sentences (as segmented by the module-level spaCy pipeline ``nlp``) are
    packed greedily into chunks of at most ``max_words`` whitespace-separated
    words. When a chunk is full, its trailing sentences totalling roughly
    ``overlap_words`` words are carried over to start the next chunk.

    Args:
        text: The document to split. Empty, blank or ``None`` input yields ``[]``.
        max_words: Word budget per chunk.
        overlap_words: Minimum number of words to repeat between consecutive
            chunks; the overlap is shortened when it would not leave room for
            the next sentence.

    Returns:
        A list of chunk strings, each made of sentences joined by one space.
        A single sentence longer than ``max_words`` is never split: it becomes
        a chunk of its own that exceeds the budget.
    """
    if not text or not text.strip():
        return []

    doc = nlp(text)
    sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]

    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in sentences:
        sentence_length = len(sentence.split())

        # Flush only when there is something to flush. Every sentence is then
        # consumed in this iteration, so the loop always terminates, even for
        # sentences longer than max_words.
        if current_chunk and current_length + sentence_length > max_words:
            chunks.append(" ".join(current_chunk))
            current_chunk, current_length = _trailing_overlap(
                current_chunk, overlap_words, max_words - sentence_length
            )

        current_chunk.append(sentence)
        current_length += sentence_length

    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [6]:
# Split the extracted article into sentence-aligned chunks (200-word budget,
# about 50 words of overlap), then report the count and preview the first chunk.
chunks = overlapping_sentence_chunker(article, max_words=200, overlap_words=50)
print(f"{len(chunks)} chunks created.")
print(chunks[0])

15 chunks created.
Despite being one of the earlier machine learning techniques, linear regression continues to be a top choice among ML practitioners for a regression task. For the past three years, over 80% of the respondents to Kaggle’s annual state of data science and machine learning survey mentioned linear regression as a ML algorithm they most frequently use. IBM Db2 provides an in-database stored procedure (SP) for Linear Regression as part of its ML library, which is a collection of over 200 SPs for performing different ML tasks in the database. Using the linear regression SP and other functionality of DB2’s ML Library, ML practitioners can build and deploy linear regression models in the database when their ML dataset is available in a Db2 database. In this post, I will show you the following steps of building and using a linear regression pipeline using SQL with a Db2 database:
Let’s begin. The Regression Task
In this exercise, I will use the GoSales dataset, which is availa

In [7]:
# Display the full list of chunks to inspect chunk boundaries and overlap.
chunks

['Despite being one of the earlier machine learning techniques, linear regression continues to be a top choice among ML practitioners for a regression task. For the past three years, over 80% of the respondents to Kaggle’s annual state of data science and machine learning survey mentioned linear regression as a ML algorithm they most frequently use. IBM Db2 provides an in-database stored procedure (SP) for Linear Regression as part of its ML library, which is a collection of over 200 SPs for performing different ML tasks in the database. Using the linear regression SP and other functionality of DB2’s ML Library, ML practitioners can build and deploy linear regression models in the database when their ML dataset is available in a Db2 database. In this post, I will show you the following steps of building and using a linear regression pipeline using SQL with a Db2 database:\nLet’s begin. The Regression Task\nIn this exercise, I will use the GoSales dataset, which is available from this l

## 2. Create a retriever tool

Set up embedding model

In [8]:
%%capture init_logs

# Embedding model loaded from the local file at EMBEDDING_MODEL_PATH.
# NOTE(review): n_ctx=256 is small next to the 200-word chunks built earlier;
# confirm that every chunk fits this window once tokenized.
embeddings = LlamaCppEmbeddings(
    model_path=EMBEDDING_MODEL_PATH,
    n_ctx=256,
    n_batch=1,
    verbose=False,
    n_gpu_layers=0,  # presumably keeps the embedding model on CPU — confirm
)

Set up vector store

In [9]:
# Build the Db2 vector store from the chunk texts: from_texts receives the
# chunks, the embedding model, the open Db2 connection, the target table name
# and the Euclidean distance strategy.
vectorstore = DB2VS.from_texts(
    chunks,
    embeddings,
    client=connection,
    table_name="Documents_EUCLIDEAN",
    distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
)

# NOTE(review): this k=2 retriever is reassigned (with k=3) in the LLM setup
# cell before anything uses it, so this value never takes effect.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})

## 3. Set up the LLM

In [10]:
%%capture init_logs

# Local LLM loaded from the model file at LLM_PATH, with generation settings
llm = LlamaCpp(
    model_path=LLM_PATH,
    n_gpu_layers=-1,
    max_tokens=750,        # Upper bound on tokens generated per answer
    n_ctx=4096,
    seed=42,
    temperature=0.3,       # Sampling temperature
    top_p=0.9,            # Nucleus sampling threshold
    top_k=40,             # Top-k sampling cutoff
    repeat_penalty=1.1,    # Penalty applied to repeated tokens
    verbose=False
)

# Limit how many chunks are retrieved so the stuffed prompt stays within n_ctx;
# this replaces the retriever created alongside the vector store
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}  # Retrieve the 3 most similar chunks
)

## 4. Define the RAG prompt

In [11]:
# Prompt that asks the model to synthesize one answer from all retrieved context.
# It has two placeholders, {context} and {question}.
# NOTE(review): the <|system|>/<|user|>/<|assistant|>/<|end|> markers presumably
# match the chat format of the model at LLM_PATH — confirm.
template = """<|system|>
You are a helpful assistant. When answering questions, synthesize information from all the provided context to give a comprehensive, well-structured response.<|end|>
<|user|>
Context Information:
{context}

Question: {question}

Instructions: 
- Use ALL relevant information from the context above
- Combine related points into a coherent answer
- Organize your response clearly with examples when available
- If multiple sources mention the same topic, integrate them smoothly<|end|>
<|assistant|>"""

# Wrap the template, declaring the two variables it expects.
prompt = PromptTemplate(
    template=template,
    input_variables=["context", "question"]
)

In [12]:
# Assemble the retrieval-QA chain from the LLM, the retriever and the custom
# prompt, using the 'stuff' chain type.
rag = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type='stuff',
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
    verbose=True,
    return_source_documents=True  # Adds the retrieved documents to the result under 'source_documents'
)

In [13]:
# Ask how to build the model, then show the answer followed by the chunks
# that were retrieved as context for it.
result = rag.invoke('How to build a linear regression model in Db2?')

divider = "=" * 50
print(divider)
print("ANSWER:")
print(result['result'])

print("\n" + divider)
print("RETRIEVED CONTEXT:")
for chunk_no, source_doc in enumerate(result['source_documents'], start=1):
    print(f"\n--- Chunk {chunk_no} ---")
    print(source_doc.page_content)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
ANSWER:
 To build a linear regression model in Db2, you need to follow several steps involving data preparation and model training. Here’s a comprehensive guide:

### 1. Data Preparation
- **Train/Test Split**: You first divide the records from the `GOSALES_FULL` table into two partitions: a training partition and a test partition.
  
  ```sql
  CALL IDAX.SPLIT_DATA(
    'intable=GOSALES.GOSALES_FULL, id=ID, traintable=GOSALES.GOSALES_TRAIN, testtable=GOSALES.GOSALES_TEST, fraction=0.8, seed=1'
  );
  ```

- **Handling Missing Values**: You then handle missing values in specific columns using the `IMPUTE_DATA` stored procedure.

  ```sql
  CALL IDAX.IMPUTE_DATA(
    'intable=GOSALES.GOSALES_TRAIN, incolumn=AGE, method=mean'
  );

  CALL IDAX.IMPUTE_DATA(
    'intable=GOSALES.GOSALES_TRAIN, method=replace, nominalValue=M, incolumn=GENDER'
  );
  ```

### 2. Model Training
- **Linear Regression**: After preparing th

In [14]:
# Ask how to evaluate the model, then show the answer followed by the chunks
# that were retrieved as context for it.
result = rag.invoke('How to evaluate a linear regression model in Db2?')

divider = "=" * 50
print(divider)
print("ANSWER:")
print(result['result'])

print("\n" + divider)
print("RETRIEVED CONTEXT:")
for chunk_no, source_doc in enumerate(result['source_documents'], start=1):
    print(f"\n--- Chunk {chunk_no} ---")
    print(source_doc.page_content)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
ANSWER:
 To evaluate a linear regression model in Db2, you can follow these steps:

1. **Model Training**: Ensure that the model has been trained using the `LINEAR_REGRESSION` stored procedure (SP). This SP will learn the intercept and coefficients of the linear regression model.

2. **Model Metadata Table**: The learned values of the intercept and coefficients are saved in a metadata table named after the model, e.g., `GOSALES_LINREG_MODEL`.

3. **Evaluation Metrics**: To evaluate the performance of your linear regression model, you can use various metrics such as:
   - **Mean Absolute Error (MAE)**: Measures the average absolute difference between predicted and actual values.
   - **Mean Squared Error (MSE)**): Measures the average squared difference between predicted and actual values.
   - **Root Mean Squared Error (RMSE)**): The square root of MSE, providing a measure in the same units as the target variable.