<a href="https://colab.research.google.com/github/siripr4/pandas-tutor/blob/main/PandasTutor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Download and run the Ollama API which is available at available at 0.0.0.0:11434
!curl https://ollama.ai/install.sh | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0>>> Downloading ollama...
100 11868    0 11868    0     0  34999      0 --:--:-- --:--:-- --:--:-- 35008
############################################################################################# 100.0%
>>> Installing ollama to /usr/local/bin...
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


Start the Ollama API server (`ollama serve`) from a separate thread so that the notebook's main thread is not blocked.

In [12]:
import os
import threading
import subprocess
import requests
import json

def ollama():
    """Configure the Ollama environment variables and spawn ``ollama serve``.

    Sets ``OLLAMA_HOST`` to ``0.0.0.0:11434`` and ``OLLAMA_ORIGINS`` to ``*``
    in this process's environment, then launches the server as a child
    process without waiting for it to finish.
    """
    server_settings = {
        'OLLAMA_HOST': '0.0.0.0:11434',
        'OLLAMA_ORIGINS': '*',
    }
    os.environ.update(server_settings)

    serve_command = ["ollama", "serve"]
    subprocess.Popen(serve_command)

# Run ollama() on a separate thread; the Thread object is kept in
# `ollama_thread` so it can be inspected later.
ollama_thread = threading.Thread(target=ollama)
ollama_thread.start()

In [13]:
# Download the model
from IPython.display import clear_output
# Pull llama3.1:8b with the Ollama command-line client.
!ollama pull llama3.1:8b
# Clear this cell's output once the pull command has returned.
clear_output()

In [14]:
!pip install ollama



In [15]:
# NOTE: this import rebinds the name `ollama`, which an earlier cell used for
# the function that launches `ollama serve`; from here on `ollama` is the
# Python client module.
import ollama

# Pull llama3.1:8b through the Python client.
ollama.pull(model='llama3.1:8b')

{'status': 'success'}

In [6]:
# Ask the model to complete a prompt; the cell displays the 'response' field
# of the returned result.
generated = ollama.generate(model='llama3.1:8b', prompt='42 is the answer to the universe because...')
generated['response']

'You\'re referencing Douglas Adams\' science fiction series "The Hitchhiker\'s Guide to the Galaxy"!\n\nIn the book, a supercomputer named Deep Thought is asked to calculate the "Answer to the Ultimate Question of Life, the Universe, and Everything." After 7.5 million years of computation, Deep Thought finally reveals that the answer is... 42.\n\nHowever, the characters in the story then realize that they don\'t actually know what the ultimate question is, so the answer of 42 is essentially meaningless!\n\nThe series uses this concept to poke fun at the idea of a simple answer or solution to life\'s mysteries. The number 42 has since become a cultural reference point and meme, symbolizing the absurdity and complexity of trying to find definitive answers in life.\n\nSo, that\'s the "reason" behind 42 being the answer to the universe!'

In [16]:
!pip install nest_asyncio



### Add functions to abstract ollama's chat API

In [9]:
import asyncio
from ollama import AsyncClient
import nest_asyncio

# Apply nest_asyncio so that asyncio.run / run_until_complete can be called
# while an event loop is already running (nested use).
nest_asyncio.apply()

# Asynchronous function to handle the chat interaction
async def _chat_interaction(content):
    """Send ``content`` as a single user message to llama3.1:8b.

    Args:
        content: Text placed in the ``content`` field of the user message.

    Returns:
        The awaited result of ``AsyncClient().chat``.
    """
    client = AsyncClient()
    user_message = dict(role='user', content=content)
    return await client.chat(model='llama3.1:8b', messages=[user_message])

# Synchronous function to run the asynchronous chat interaction
def chat(content):
    """Run ``_chat_interaction(content)`` to completion and return its result.

    Synchronous wrapper around the asynchronous chat call.

    Args:
        content: Text of the user message, passed through unchanged.

    Returns:
        Whatever ``_chat_interaction`` returns.

    Raises:
        RuntimeError: If an event loop is already running in this thread and
            has not been made re-entrant (for example with
            ``nest_asyncio.apply()``).
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: let asyncio create one and clean
        # it up, instead of relying on the deprecated implicit loop creation
        # of asyncio.get_event_loop().
        return asyncio.run(_chat_interaction(content))
    # A loop is already running (as in a notebook kernel). Re-entering it
    # works only when the loop has been patched to allow nesting.
    return loop.run_until_complete(_chat_interaction(content))

In [None]:
  # Send one sample question through chat() and print the 'message' entry of
  # the response.
  content = "Why is the sky blue?"
  response = chat(content)
  print(response['message'])

ERROR:asyncio:Task exception was never retrieved
future: <Task finished name='Task-1' coro=<_chat_interaction() done, defined at <ipython-input-9-a81e98bea7f2>:9> exception=RemoteProtocolError('Server disconnected without sending a response.')>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/httpx/_transports/default.py", line 69, in map_httpcore_exceptions
    yield
  File "/usr/local/lib/python3.10/dist-packages/httpx/_transports/default.py", line 373, in handle_async_request
    resp = await self._pool.handle_async_request(req)
  File "/usr/local/lib/python3.10/dist-packages/httpcore/_async/connection_pool.py", line 216, in handle_async_request
    raise exc from None
  File "/usr/local/lib/python3.10/dist-packages/httpcore/_async/connection_pool.py", line 196, in handle_async_request
    response = await connection.handle_async_request(
  File "/usr/local/lib/python3.10/dist-packages/httpcore/_async/connection.py", line 101, in handle_async_reques

KeyboardInterrupt: 

## Dataset preparation

In [None]:
import zipfile
import os
from bs4 import BeautifulSoup
import pandas as pd

# Step 1: Unzip the HTML files
zip_path = 'pandas.zip'
extracted_path = 'extracted_html_files'

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# Step 2: Parse each HTML file and generate QA pairs
data = {'question': [], 'answer': []}

for root, dirs, files in os.walk(extracted_path):
    for file in files:
        if not file.endswith('.html'):
            continue

        file_path = os.path.join(root, file)
        with open(file_path, 'r', encoding='utf-8') as f:
            # BeautifulSoup reads the whole file here, so the handle can be
            # closed before the tree is walked.
            soup = BeautifulSoup(f, 'html.parser')

        # Extract relevant sections
        for section in soup.find_all(['section', 'div']):  # Adjust tags as needed
            # Only accept a heading that is a direct child of this container.
            # A recursive search would also match every ancestor <div>/<section>
            # of the heading, emitting the same question again with a much
            # larger (page-sized) answer.
            heading = section.find(['h2', 'h3'], recursive=False)
            if heading is None:
                continue

            # Collapse newlines and runs of spaces inside the heading text.
            title = ' '.join(heading.get_text().split())
            if not title:
                continue

            content = section.get_text(separator="\n").strip()

            # "What is" questions
            data['question'].append(f"What is {title}?")
            data['answer'].append(content)

            # "How to" questions if code snippets are present
            code_snippets = section.find_all('code')
            if code_snippets:
                data['question'].append(f"How to use {title} in pandas?")
                data['answer'].append(
                    "\n".join(code.get_text() for code in code_snippets)
                )

# Step 3: Save as QA pairs
df = pd.DataFrame(data)
df.to_csv('pandas_qa_dataset.csv', index=False)

print("QA pairs have been generated and saved to 'pandas_qa_dataset.csv'")

### Fine-tune the model using the dataset created above

In [None]:
!pip install -U transformers accelerate

In [None]:
!pip install --upgrade pyarrow
!pip install --upgrade --force-reinstall datasets
!pip install numpy==1.26.4
!pip install datasets

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from datasets import Dataset

# Load model and tokenizer
# NOTE(review): this is an 8B-parameter checkpoint loaded with default
# settings and appears to be access-gated on the Hub — confirm that the
# runtime has enough memory and valid credentials before running.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Padding requires a pad token; fall back to EOS when the tokenizer has none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load dataset. Empty CSV fields are read back as NaN, which the tokenizer
# cannot encode, so incomplete rows are dropped and the rest forced to str.
qa_frame = (
    pd.read_csv('pandas_qa_dataset.csv')
    .dropna(subset=['question', 'answer'])
    .astype({'question': str, 'answer': str})
)
dataset = Dataset.from_pandas(qa_frame, preserve_index=False)

# Upper bound on tokens per example. Without it, padding='max_length' pads
# every example to the tokenizer's model maximum.
MAX_LENGTH = 512

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch of question/answer pairs for causal-LM training.

    Args:
        examples: Batch mapping with 'question' and 'answer' lists of strings.

    Returns:
        The tokenizer output with an added 'labels' entry: a copy of
        'input_ids' in which padded positions are replaced by -100 so they
        are ignored by the loss.
    """
    tokens = tokenizer(
        examples['question'],
        examples['answer'],
        padding='max_length',
        truncation=True,
        max_length=MAX_LENGTH,
    )
    # Trainer needs labels to compute a loss; use the attention mask (not the
    # token id) to find padding, since the pad token may equal EOS.
    tokens['labels'] = [
        [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
        for ids, mask in zip(tokens['input_ids'], tokens['attention_mask'])
    ]
    return tokens

tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Hold out part of the data so evaluation is not run on the training examples.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # `evaluation_strategy` was renamed to `eval_strategy` in recent
    # transformers releases (the install cell upgrades to the latest).
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_datasets['train'],
    eval_dataset=split_datasets['test'],
)

# Fine-tune the model
trainer.train()