<a href="https://colab.research.google.com/github/siripr4/pandas-tutor/blob/main/PandasTutor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download and run the Ollama API which is available at available at 0.0.0.0:11434
!curl https://ollama.ai/install.sh | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 11868    0 11868    0     0  11280      0 --:--:--  0:00:01 --:--:-- 11292>>> Downloading ollama...
100 11868    0 11868    0     0  10752      0 --:--:--  0:00:01 --:--:-- 10759
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


A script to run the Ollama API in a separate thread so that the notebook's main thread is not blocked

In [2]:
import os
import threading
import subprocess
import requests
import json

def ollama():
    os.environ['OLLAMA_HOST'] = '0.0.0.0:11434'
    os.environ['OLLAMA_ORIGINS'] = '*'
    subprocess.Popen(["ollama", "serve"])

ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()

In [3]:
# Download the model
from IPython.display import clear_output
!ollama pull llama3.1:8b
clear_output()

In [4]:
!pip install ollama

Collecting ollama
  Downloading ollama-0.3.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx<0.28.0,>=0.27.0 (from ollama)
  Downloading httpx-0.27.0-py3-none-any.whl.metadata (7.2 kB)
Collecting httpcore==1.* (from httpx<0.28.0,>=0.27.0->ollama)
  Downloading httpcore-1.0.5-py3-none-any.whl.metadata (20 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<0.28.0,>=0.27.0->ollama)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading ollama-0.3.1-py3-none-any.whl (10 kB)
Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.5-py3-none-any.whl (77 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.9/77.9 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h11-0.14.0-py3-none-any.whl (58 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.3/58.3 kB[0m [31m5.2 MB/s[0m et

In [5]:
import ollama

ollama.pull(model='llama3.1:8b')

{'status': 'success'}

In [6]:
generated = ollama.generate(model='llama3.1:8b', prompt='42 is the answer to the universe because...')
generated['response']

'A reference to Douglas Adams\' science fiction series "The Hitchhiker\'s Guide to the Galaxy"!\n\nIn the book, a supercomputer named Deep Thought is asked to calculate the "Answer to the Ultimate Question of Life, the Universe, and Everything." After 7.5 million years of computation, Deep Thought finally reveals that the answer is... 42.\n\nHowever, it\'s later revealed that the characters don\'t actually know what the ultimate question is, so the answer of 42 is essentially meaningless. The number 42 becomes a kind of joke or meme, symbolizing the idea that there might not be a definitive answer to life\'s greatest mysteries after all!\n\nSo, in short, "42" is the answer because it was a clever plot device and humorous commentary on our search for meaning in the universe!'

In [8]:
!pip install nest_asyncio



In [7]:
import asyncio
from ollama import AsyncClient

async def chat(prompt):
  message = {'role': 'user', 'content': prompt}
  response = await AsyncClient().chat(model='llama3.1:8b', messages=[message])

# Function to handle the chat interaction
async def chat_interaction(content):
    message = {'role': 'user', 'content': content}
    response = await AsyncClient().chat(model='llama3', messages=[message])
    return response

# Function to abstract the asyncio.run call
def run_async(func, *args):
    return asyncio.run(func(*args))

# asyncio.run(chat('Hello'))
user_content = "Why is the sky blue?"
response = run_async(chat_interaction, user_content)
print(response)

RuntimeError: asyncio.run() cannot be called from a running event loop

### Add functions to abstract ollama's chat API

In [18]:
import asyncio
from ollama import AsyncClient
import nest_asyncio

# Apply nest_asyncio to allow nested use of asyncio.run
nest_asyncio.apply()

# Asynchronous function to handle the chat interaction
async def _chat_interaction(content):
    message = {'role': 'user', 'content': content}
    response = await AsyncClient().chat(model='llama3.1:8b', messages=[message])
    return response

# Synchronous function to run the asynchronous chat interaction
def chat(content):
    loop = asyncio.get_event_loop()
    if loop.is_running():
        # If the event loop is already running, create a new task and wait for it
        future = asyncio.ensure_future(_chat_interaction(content))
        return loop.run_until_complete(future)
    else:
        return loop.run_until_complete(_chat_interaction(content))

In [21]:
  content = "Why is the sky blue?"
  response = chat(content)
  print(response['message'])

{'role': 'assistant', 'content': "The sky appears blue to us during the day because of a phenomenon called scattering, which involves the interaction between sunlight and the tiny molecules of gases in the atmosphere. Here's a simplified explanation:\n\n1. **Sunlight Composition**: Sunlight is made up of all the colors of the rainbow. When sunlight enters Earth's atmosphere, it encounters various particles like nitrogen (N2), oxygen (O2), and water vapor.\n\n2. **Scattering Effect**: These particles scatter the light in all directions but do so more efficiently for shorter wavelengths (like blue and violet) than for longer wavelengths (like red and orange). This is known as Rayleigh scattering, named after Lord Rayleigh who first described it in the late 19th century.\n\n3. **Perception of Color**: To our eyes on Earth's surface, the scattered blue light reaches us from all directions around the sky, making the sky appear blue. The longer wavelengths like red and orange are less scatte

## Dataset preparation

In [5]:
import zipfile
import os
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Unzip the HTML Files
zip_path = 'path_to_your_zip_file.zip'
extracted_path = 'extracted_html_files'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# Step 2: Parse Each HTML File and Generate QA Pairs
data = {'question': [], 'answer': []}

for root, dirs, files in os.walk(extracted_path):
    for file in files:
        if file.endswith('.html'):
            file_path = os.path.join(root, file)
            with open(file_path, 'r', encoding='utf-8') as f:
                soup = BeautifulSoup(f, 'html.parser')

                # Extract relevant sections
                for section in soup.find_all(['section', 'div']):  # Adjust tags as needed
                    heading = section.find(['h2', 'h3'])
                    if heading:
                        content = section.get_text(separator="\n").strip()

                        # "What is" questions
                        what_question = f"What is {heading.text}?"
                        data['question'].append(what_question)
                        data['answer'].append(content)

                        # "How to" questions if code snippets are present
                        code_snippets = section.find_all('code')
                        if code_snippets:
                            how_question = f"How to use {heading.text} in pandas?"
                            how_answer = "\n".join([code.get_text() for code in code_snippets])
                            data['question'].append(how_question)
                            data['answer'].append(how_answer)

# Step 3: Save as QA Pairs
df = pd.DataFrame(data)
df.to_csv('pandas_qa_dataset.csv', index=False)

print("QA pairs have been generated and saved to 'pandas_qa_dataset.csv'")

