In [1]:
print("starting")

starting


In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
# ## langchain imports
# from langchain_text_splitter import RecursiveCharacterTextSplitter
# from langchain_community.document_loaders import TextLoader
# from langchain_openai import OpenAIEmbeddings
# from langchain.schema import Document

# ## vectorstores
# from langchain_community.vectorstores import Chroma

# ## utility imports
# import numpy as np
# from typing import List


## text splitters
from langchain_text_splitters import RecursiveCharacterTextSplitter

## document loaders
from langchain_community.document_loaders import TextLoader

## embeddings
from langchain_openai import OpenAIEmbeddings

## document schema
# from langchain.schema import Document
from langchain_core.documents import Document

## vector stores
from langchain_community.vectorstores import Chroma

## utility imports
import numpy as np
from typing import List


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
## create sample documents
sample_docs = [
    """
    Machine Learning Fundamentals
    
    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. There are three main 
    types of machine learning: supervised learning, unsupervised learning, and reinforcement 
    learning. Supervised learning uses labeled data to train models, while unsupervised 
    learning finds patterns in unlabeled data. Reinforcement learning learns through 
    interaction with an environment using rewards and penalties.
    """,
    
    """
    Deep Learning and Neural Networks
    
    Deep learning is a subset of machine learning based on artificial neural networks. 
    These networks are inspired by the human brain and consist of layers of interconnected 
    nodes. Deep learning has revolutionized fields like computer vision, natural language 
    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly 
    effective for image processing, while Recurrent Neural Networks (RNNs) and Transformers 
    excel at sequential data processing.
    """,
    
    """
    Natural Language Processing (NLP)
    
    NLP is a field of AI that focuses on the interaction between computers and human language. 
    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, 
    machine translation, and question answering. Modern NLP heavily relies on transformer 
    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand 
    context and relationships between words in text.
    """
]

sample_docs


['\n    Machine Learning Fundamentals\n\n    Machine learning is a subset of artificial intelligence that enables systems to learn \n    and improve from experience without being explicitly programmed. There are three main \n    types of machine learning: supervised learning, unsupervised learning, and reinforcement \n    learning. Supervised learning uses labeled data to train models, while unsupervised \n    learning finds patterns in unlabeled data. Reinforcement learning learns through \n    interaction with an environment using rewards and penalties.\n    ',
 '\n    Deep Learning and Neural Networks\n\n    Deep learning is a subset of machine learning based on artificial neural networks. \n    These networks are inspired by the human brain and consist of layers of interconnected \n    nodes. Deep learning has revolutionized fields like computer vision, natural language \n    processing, and speech recognition. Convolutional Neural Networks (CNNs) are particularly \n    effective f

In [5]:
## save sample documents to files
import tempfile
temp_dir=tempfile.mkdtemp()

for i,doc in enumerate(sample_docs):
    with open(f"doc_{i}.txt","w") as f:
        f.write(doc)



In [6]:
from langchain_community.document_loaders import DirectoryLoader,TextLoader

# Load documents from directory
loader = DirectoryLoader(
    "data", 
    glob="*.txt", 
    loader_cls=TextLoader,
    loader_kwargs={'encoding': 'utf-8'}
)
documents = loader.load()

print(f"Loaded {len(documents)} documents")
print(f"\nFirst document preview:")
print(documents[0].page_content[:200] + "...")


Loaded 3 documents

First document preview:

    Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experience without being explicitly programmed. Ther...


In [7]:
# document splitting
text_splitter  = RecursiveCharacterTextSplitter(
    chunk_size = 500,
    chunk_overlap = 50,
    length_function = len,
    separators = [" "]
)

chunks = text_splitter.split_documents(documents)



print(f"Created {len(chunks)} chunks from {len(documents)} documents")
print(f"\nChunk example:")
print(f"Content: {chunks[0].page_content[:150]}...")
print(f"Metadata: {chunks[0].metadata}")

Created 5 chunks from 3 documents

Chunk example:
Content: Machine Learning Fundamentals

    Machine learning is a subset of artificial intelligence that enables systems to learn 
    and improve from experie...
Metadata: {'source': 'data\\doc_0.txt'}


In [8]:
# performing embeddings
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [9]:
sample_text = "Machine Learning is fascinating"
embeddings = OpenAIEmbeddings()
embeddings


OpenAIEmbeddings(client=<openai.resources.embeddings.Embeddings object at 0x000001C2B63367E0>, async_client=<openai.resources.embeddings.AsyncEmbeddings object at 0x000001C2B77A66F0>, model='text-embedding-ada-002', dimensions=None, deployment='text-embedding-ada-002', openai_api_version=None, openai_api_base=None, openai_api_type=None, openai_proxy=None, embedding_ctx_length=8191, openai_api_key=SecretStr('**********'), openai_organization=None, allowed_special=None, disallowed_special=None, chunk_size=1000, max_retries=2, request_timeout=None, headers=None, tiktoken_enabled=True, tiktoken_model_name=None, show_progress_bar=False, model_kwargs={}, skip_empty=False, default_headers=None, default_query=None, retry_min_seconds=4, retry_max_seconds=20, http_client=None, http_async_client=None, check_embedding_ctx_length=True)

In [10]:
vectors = embeddings.embed_query(sample_text)
vectors

[-0.02172444760799408,
 0.016208980232477188,
 0.010213345289230347,
 -0.022516079246997833,
 -0.0037213172763586044,
 0.01783117651939392,
 4.82096329506021e-05,
 0.01027174387127161,
 -0.015547124668955803,
 -0.04134652763605118,
 0.007929293438792229,
 0.03628527745604515,
 -0.019128933548927307,
 -0.008234266191720963,
 -0.0013058676850050688,
 0.00581719446927309,
 0.03880292549729347,
 0.008811768144369125,
 -0.0005584409227594733,
 -0.008591149002313614,
 -0.031224025413393974,
 0.022048886865377426,
 -0.005914526060223579,
 -0.03441650792956352,
 -0.014898247085511684,
 0.0023018959909677505,
 0.003834871109575033,
 -0.03885483369231224,
 -0.012523352168500423,
 -0.002739888848736882,
 0.027590306475758553,
 -0.004736811853945255,
 -0.0170655008405447,
 -0.03981517627835274,
 -0.008513283915817738,
 -0.012211889959871769,
 -0.004152821376919746,
 0.0028583090752363205,
 -0.01670212857425213,
 -0.00013068816042505205,
 0.020076295360922813,
 0.02541007660329342,
 -0.008435418829

In [11]:
# # chromDB
# presist_directory = "./chroma_db"

# vectorstore=Chroma.from_documents(
#     documents = chunks,
#     embedding = OpenAIEmbeddings(),
#     presist_directory = presist_directory,
#     collection_name = "rag_collection"
# )



# print(f"Vector store created with {vectorstore._collection.count()} vectors")
# print(f"Persisted to: {persist_directory}")


## Create a Chromdb vector store
persist_directory="./chroma_db"

## Initialize Chromadb with Open AI embeddings
vectorstore=Chroma.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(),
    persist_directory=persist_directory,
    collection_name="rag_collection"
)

print(f"Vector store created with {vectorstore._collection.count()} vectors")
print(f"Persisted to: {persist_directory}")

Vector store created with 20 vectors
Persisted to: ./chroma_db


In [12]:
# similarity search 

In [13]:
# what is nlp 
query = "what is nlp ?"

similar_doc = vectorstore.similarity_search(query, k=3)

In [14]:
similar_doc

[Document(metadata={'source': 'data\\doc_2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand \n    context and relationships between words in text.'),
 Document(metadata={'source': 'data\\doc_2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These models use attention mec

In [15]:
query = "what is deep learning"

similar_docs = vectorstore.similarity_search(query, k=3)

In [16]:
similar_doc

[Document(metadata={'source': 'data\\doc_2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These models use attention mechanisms to understand \n    context and relationships between words in text.'),
 Document(metadata={'source': 'data\\doc_2.txt'}, page_content='Natural Language Processing (NLP)\n\n    NLP is a field of AI that focuses on the interaction between computers and human language. \n    Key tasks in NLP include text classification, named entity recognition, sentiment analysis, \n    machine translation, and question answering. Modern NLP heavily relies on transformer \n    architectures like BERT, GPT, and T5. These models use attention mec

In [17]:
# rag chain promt template , query  the rag system 

Initializeing llm

In [18]:
from langchain_openai import ChatOpenAI

llm=ChatOpenAI(
    model_name = "gpt-3.5-turbo"
)

In [19]:
res = llm.invoke("what is neural network")
res.content

'A neural network is a type of artificial intelligence model that is designed to mimic the way the human brain processes information. It is made up of interconnected nodes, or neurons, that work together to process and analyze complex data. Neural networks are commonly used in a variety of applications such as image recognition, natural language processing, and speech recognition. They can be trained to recognize patterns, make predictions, and solve complex problems.'

In [1]:
# another way
from langchain.chat_models.base import init_chat_model

llm = init_chat_model("openai:gpt-3.5-turbo")
# llm
res = llm.invoke("what is back propogation")
res.content

  from .autonotebook import tqdm as notebook_tqdm


'Backpropagation is a popular algorithm used in training deep learning neural networks. It involves updating the weights of the network by iteratively adjusting them in the direction that minimizes the error between the predicted output and the actual output of the network. This is achieved by calculating the gradient of the loss function with respect to the weights of the network, and then using this gradient to update the weights in a step-wise manner. By continuously adjusting the weights in the direction that minimizes the error, the network can learn to make more accurate predictions over time.'

##Modern rag

In [4]:
# from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

# from langchain.chains import create_retrieval_chain


# from langchain.chains.retrieval import create_retrieval_chain


ModuleNotFoundError: No module named 'langchain.chains'