In [1]:
import requests
from bs4 import BeautifulSoup
import sys

In [None]:
url = 'https://pandas.pydata.org/docs/user_guide/10min.html'
output_filename = 'documentation_content.txt'

In [None]:
response = requests.get(url, timeout=10)
response.raise_for_status()

In [None]:
response

In [None]:
soup = BeautifulSoup(response.content, 'html.parser')

In [None]:
soup

In [None]:
main_content = soup.find('div', class_='bd-content')

In [None]:
main_content

In [None]:
text_content = main_content.get_text(separator='\n', strip=True)

In [None]:
text_content

In [None]:
try:
    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(text_content)
    print(f"Success! Content saved to '{output_filename}'.")

except IOError as e:
    print(f"Error: Could not write to file '{output_filename}'. {e}")
    sys.exit(1)

In [None]:
with open("documentation_content.txt", "r", encoding="utf-8") as f:
    documentation_content = f.read()

In [None]:
from langchain_text_splitters import CharacterTextSplitter
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=100,
    chunk_overlap=0
)
texts = text_splitter.split_text(documentation_content)

In [None]:
texts

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sys

In [None]:
input_filename = 'documentation_content.txt'

In [None]:
try:
    with open(input_filename, 'r', encoding='utf-8') as f:
        documentation_text = f.read()
            
except FileNotFoundError:
    print(f"Error: The file '{input_filename}' was not found.")
    sys.exit(1)

print("Content read successfully.")

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)

In [None]:
chunks = text_splitter.split_text(documentation_text)

In [None]:
num_chunks = len(chunks)
print(f"Success! The text was split into {num_chunks} chunks.")

In [None]:
chunks[0]

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings

In [None]:
persist_directory = 'chroma_db'

In [None]:
embedding_model = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
embedding_model

In [None]:
vectordb = Chroma.from_texts(
    texts=chunks, 
    embedding=embedding_model,
    persist_directory=persist_directory
)

In [None]:
vectordb

In [None]:
import os
import sys
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA

In [None]:
load_dotenv()
PERSIST_DIRECTORY = 'chroma_db'
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

In [None]:
def create_qa_chain():
    embedding_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    
    vectordb = Chroma(
        persist_directory=PERSIST_DIRECTORY, 
        embedding_function=embedding_model
    )
    
    retriever = vectordb.as_retriever()
    
    #llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")
    llm = ChatOllama(model="llama3") 
    
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    return qa_chain

In [None]:
def ask_question(chain, query):
    result = chain.invoke({"query" : query})
    print("\n Answer: ")
    print(result["result"])

In [None]:
qa_chain = create_qa_chain()

question = "How do I create a Series in pandas?"

ask_question(qa_chain, question)

In [2]:
import requests
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter
import sys

In [24]:
count = int(input("Number of URL(s): "))
u = []
print(count)

for i in range(count):
    url = input(f"url {i}: ")
    u.append(url)
print(u)

Number of URL(s):  3


3


url 0:  https://pandas.pydata.org/docs/user_guide/10min.html
url 1:  https://pandas.pydata.org/docs/user_guide/dsintro.html
url 2:  https://pandas.pydata.org/docs/user_guide/basics.html


['https://pandas.pydata.org/docs/user_guide/10min.html', 'https://pandas.pydata.org/docs/user_guide/dsintro.html', 'https://pandas.pydata.org/docs/user_guide/basics.html']


In [3]:
urls = [
    'https://pandas.pydata.org/docs/user_guide/10min.html',
    'https://pandas.pydata.org/docs/user_guide/dsintro.html',
    'https://pandas.pydata.org/docs/user_guide/basics.html'
]

output_filename = 'docs_content.txt'

In [4]:
all_text_content = ""

print(f"Starting to process {len(urls)} URL(s)...")

for url in urls:
    print(f"Scraping content from: {url}")
    response = requests.get(url, timeout=10)
    response.raise_for_status()

    soup = BeautifulSoup(response.content, 'html.parser')
    
    main_content = soup.find('div', class_='bd-content')
        
    if main_content:
        text_content = main_content.get_text(separator='\\n', strip=True)
        all_text_content += text_content + "\\n\\n--- Page Break ---\\n\\n"
    else:
        print(f"Warning: Could not find the main content for {url}. The page structure might be different.")


with open(output_filename, 'w', encoding='utf-8') as f:
    f.write(all_text_content)
print(f"\\nSuccess! All content has been combined and saved to '{output_filename}'.")

Starting to process 3 URL(s)...
Scraping content from: https://pandas.pydata.org/docs/user_guide/10min.html
Scraping content from: https://pandas.pydata.org/docs/user_guide/dsintro.html
Scraping content from: https://pandas.pydata.org/docs/user_guide/basics.html
\nSuccess! All content has been combined and saved to 'docs_content.txt'.


In [5]:
with open("docs_content.txt", "r", encoding="utf-8") as f:
    docs_content = f.read()

In [6]:
docs_content



In [7]:
from langchain_text_splitters import CharacterTextSplitter
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=100,
    chunk_overlap=0
)
texts = text_splitter.split_text(docs_content)

In [8]:
texts



In [9]:
texts_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)

In [10]:
chunks = texts_splitter.split_text(docs_content)

In [11]:
num_chunks = len(chunks)
print(f"Success! The text was split into {num_chunks} chunks.")

Success! The text was split into 281 chunks.


In [12]:
chunks[0]

'User Guide\\n10 minutes to pandas\\n10 minutes to pandas\\n#\\nThis is a short introduction to pandas, geared mainly for new users.\nYou can see more complex recipes in the\\nCookbook\\n.\\nCustomarily, we import as follows:\\nIn [1]:\\nimport\\nnumpy\\nas\\nnp\\nIn [2]:\\nimport\\npandas\\nas\\npd\\nBasic data structures in pandas\\n#\\nPandas provides two types of classes for handling data:\\nSeries\\n: a one-dimensional labeled array holding data of any type\\nsuch as integers, strings, Python objects etc.\\nDataFrame\\n: a two-dimensional data structure that holds data like\na two-dimension array or a table with rows and columns.\\nObject creation\\n#\\nSee the\\nIntro to data structures section\\n.\\nCreating a\\nSeries\\nby passing a list of values, letting pandas create'

In [13]:
import os
import sys
from dotenv import load_dotenv

from langchain_openai import ChatOpenAI
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA

In [14]:
load_dotenv()
PERSIST_DIRECTORY = 'chroma_db_1'
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"

In [15]:
def create_qa_chain_0():
    embedding_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    
    vectordb = Chroma(
        persist_directory=PERSIST_DIRECTORY, 
        embedding_function=embedding_model
    )
    
    retriever = vectordb.as_retriever()
    
    llm = ChatOllama(model="llama3") 
    
    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    return qa_chain

In [16]:
def ask_question_0(chain, query):
    result = chain.invoke({"query" : query})
    print("\n Answer: ")
    print(result["result"])

In [17]:
qa_chain = create_qa_chain_0()

question = "How do I create a Series in pandas?"

ask_question_0(qa_chain, question)

  embedding_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
  from .autonotebook import tqdm as notebook_tqdm
  vectordb = Chroma(
  llm = ChatOllama(model="llama3")



 Answer: 
To create a series in pandas, you can use the `pd.Series()` function or the `Series` constructor. Here are some examples:

1. From a list:
```
import pandas as pd

my_list = [1, 2, 3, 4, 5]
my_series = pd.Series(my_list)
print(my_series)
```

This will create a series with integer values.

2. From a dictionary:
```
import pandas as pd

my_dict = {'a': 1, 'b': 2, 'c': 3}
my_series = pd.Series(my_dict)
print(my_series)
```

This will create a series with string keys and integer values.

3. From a scalar value (repeat the value for all indices):
```
import pandas as pd

scalar_value = 5
my_series = pd.Series([scalar_value] * 5, index=range(5))
print(my_series)
```

This will create a series with all elements equal to `scalar_value` and indices from 0 to 4.

Note that if you don't specify an index, pandas will automatically assign default integer indices starting from 0.


In [18]:
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

In [20]:
def create_qa_chain_1():
    embedding_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
    
    vectordb = Chroma(
        persist_directory=PERSIST_DIRECTORY, 
        embedding_function=embedding_model
    )
    
    retriever = vectordb.as_retriever()
    
    llm = ChatOllama(model="llama3")

    memory = ConversationBufferMemory(
        memory_key="chat_history", 
        return_messages=True
    )

    qa_chain = ConversationalRetrievalChain.from_llm(
        llm=llm,
        retriever=retriever,
        memory=memory
    )

    return qa_chain

In [22]:
qa_chain = create_qa_chain_1()

print("Chatbot ready! Type 'exit' to end the conversation.")

while True:
    query = input("Ask a question about pandas: ")
    
    if query.lower() == 'exit':
        print("Goodbye!")
        break
    
    if query:
        result = qa_chain.invoke({"question": query})
        print("Answer:", result['answer'])

Chatbot ready! Type 'exit' to end the conversation.


Ask a question about pandas:  How do I create a Series in pandas?


Answer: To create a Series in pandas, you can use the `pandas.Series()` function or pass a dictionary or iterable to the `pandas.DataFrame()` function.

Here are some examples:

1. Create a Series from a dictionary:
```
import pandas as pd
data = {'A': 1, 'B': 2, 'C': 3}
series = pd.Series(data)
print(series)
```

This will create a Series with the keys from the dictionary and values.

2. Create a Series from an iterable (like a list or tuple):
```
import pandas as pd
data = [1, 2, 3, 4, 5]
series = pd.Series(data)
print(series)
```

In this case, the indices will be automatically created based on the order of the values in the iterable.

3. Create a Series with specific index:
```
import pandas as pd
data = [1, 2, 3, 4, 5]
index = ['A', 'B', 'C', 'D', 'E']
series = pd.Series(data, index=index)
print(series)
```

In this case, you can specify the indices for your Series.

Remember to import pandas as `pd` before creating a Series.


Ask a question about pandas:  exit


Goodbye!


In [None]:
from langchain.agents import Tool, AgentExecutor, create_react_agent
from langchain import hub

In [None]:
llm = ChatOpenAI(temperature=0, model_name="gpt-4o-mini")

In [None]:
qa_chain = create_qa_chain(llm)

In [None]:
tools = [
    Tool(
        name="pandas_documentation_search",
        func=qa_chain.invoke,
        description="""Use this tool whenever you need to answer a question about the Python pandas library. This is your primary source for pandas-related queries."""
    )
]

In [None]:
prompt = hub.pull("")

In [None]:
agent = create_react_agent(llm, tools, prompt)

In [None]:
agent_executor = AgentExecutor(
    agent=agent,
    tools=tools,
    verbose=True,
    handle_parsing_errors=True
)

In [None]:
query = "How do I create a Series in pandas?"

In [None]:
result = agent_executor.invoke({"input": query})

In [None]:
result["output"]