In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import pandas as pd

# Load the tokenizer and model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Define a tool for calculating correlation
def calculate_correlation(df, col1, col2):
    correlation = df[col1].corr(df[col2])
    return f"The correlation between {col1} and {col2} is {correlation:.2f}"

class LLMWithTools:
    def __init__(self, llm_pipeline):
        self.llm_pipeline = llm_pipeline
        self.tools = {}

    def bind_tools(self, tools):
        for tool in tools:
            self.tools[tool.__name__] = tool

    def invoke(self, prompt, **kwargs):
        # Check if the prompt matches any tool's function
        for tool_name, tool_func in self.tools.items():
            if tool_name in prompt:
                # Extract the arguments for the tool from kwargs
                return tool_func(**kwargs)
        
        # If no tool matches, use the LLM pipeline
        response = self.llm_pipeline(prompt, max_length=100)
        return response[0]['generated_text']

# Create the LLMWithTools instance and bind the correlation tool
llm_with_tools = LLMWithTools(llm_pipeline)
llm_with_tools.bind_tools([calculate_correlation])

# Load your CSV data
df = pd.read_csv('titanic.csv')

# Invoke the tool through the LLMWithTools instance
response = llm_with_tools.invoke(
    "calculate_correlation",
    df=df,
    col1='Age',
    col2='Fare'
)

print(response)


2024-06-26 09:33:15.340748: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

The correlation between Age and Fare is 0.11


In [3]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and model
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
llm_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

import pandas as pd

# Sample DataFrame for demonstration
data = {
    'Age': [22, 30, 24, 45, 36, 50],
    'Fare': [7.25, 71.83, 8.05, 8.05, 8.05, 51.86],
    'Survived': [1, 0, 1, 1, 0, 1]
}
df = pd.DataFrame(data)

# Extract column names and format data
columns = ", ".join(df.columns.tolist())
data_string = df.to_string(index=False)

# Define the question
question = "What is the correlation between the 'Age' and 'Fare' columns?"

# Define the PromptTemplate
class PromptTemplate:
    def __init__(self, input_variables, metadata, template):
        self.input_variables = input_variables
        self.metadata = metadata
        self.template = template

    def format(self, **kwargs):
        return self.template.format(**kwargs)

# Create the prompt template
template_string = '''
You are a planner who will plan the solution to the question --> {question}

You are working on this data:
{data}

Based on the user's question, come up with a detailed plan in plain English which is an exhaustive step-by-step plan to solve the problem.
Try to put maximum steps from your side to help the user.
This plan should involve individual tasks that, if executed correctly, will yield the correct answer.
Do not add any superfluous steps.
The result of the final step should be the final answer. Make sure that each step has all the information needed - do not skip steps.

Here are all the columns of the Data file. It will be helpful for you to understand the column details:
{columns}
'''

prompt_template = PromptTemplate(
    input_variables=['columns', 'data', 'question'],
    metadata={'lc_hub_owner': 'lk-ml', 'lc_hub_repo': 'csv_file_reader', 'lc_hub_commit_hash': 'b98687798ee3bf8e3896b4affde9208da4237007512bb3efdbff55222849adee'},
    template=template_string
)

# Format the prompt with the context
formatted_prompt = prompt_template.format(
    columns=columns,
    data=data_string,
    question=question
)

# Generate a response using the LLM pipeline
response = llm_pipeline(formatted_prompt, max_length=500)

# Print the response
print(response[0]['generated_text'])


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.



You are a planner who will plan the solution to the question --> What is the correlation between the 'Age' and 'Fare' columns?

You are working on this data:
 Age  Fare  Survived
  22  7.25         1
  30 71.83         0
  24  8.05         1
  45  8.05         1
  36  8.05         0
  50 51.86         1

Based on the user's question, come up with a detailed plan in plain English which is an exhaustive step-by-step plan to solve the problem.
Try to put maximum steps from your side to help the user.
This plan should involve individual tasks that, if executed correctly, will yield the correct answer.
Do not add any superfluous steps.
The result of the final step should be the final answer. Make sure that each step has all the information needed - do not skip steps.

Here are all the columns of the Data file. It will be helpful for you to understand the column details:
Age, Fare, Survived
Here is a sample of the data:
Age  Fare  Survived
  22  7.25         1
  30 71.83         0
  24  8.0