In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from transformers.agents import HfEngine, ReactCodeAgent

In [2]:
import HF_TOKEN
hf_token = HF_TOKEN.HF_TOKEN

In [3]:
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_id, token = hf_token)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    token = hf_token,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
from transformers import StoppingCriteria,TextStreamer

# Custom implementation of a stopping criteria
class MyStoppingCriteria(StoppingCriteria):
  def __init__(self, target_sequence, prompt):
      self.target_sequence = target_sequence
      self.prompt=prompt
      self.last=None
      self.new_token=''
  def __call__(self, input_ids, scores, **kwargs):
      # Get the generated text as a string

      generated_text = tokenizer.decode(input_ids[0])
      if self.last:
        self.new_token = self.new_token+ generated_text.replace(self.last,'')

      # Check if the target sequence appears in the generated text
      if self.target_sequence in self.new_token:


          return True  # Stop generation
      self.last=generated_text
      return False  # Continue generation

  def __len__(self):
      return 1

  def __iter__(self):
      yield self

In [6]:
# declare prompt_iteration variable as a global variable
# global prompt_iteration
prompt_iteration = 0
message_log_file = "/home/vp899/projects/llm_agent/Output/message_log.txt"
result_log_file = "/home/vp899/projects/llm_agent/Output/result_log.txt"
def llm_engine(messages, stop_sequences):
    input_ids = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    global prompt_iteration
    prompt_iteration += 1
    # prompt_iteration = 0
    iteration_message = f"Iteration **** {prompt_iteration} ****"
    # append the iteration message and messages to /home/vp899/projects/llama3_ft1/Output/message_log.txt
    with open(message_log_file, "a") as f:
        f.write(f"{iteration_message}\n")
        for message in messages:
            f.write(f"{message}\n")
    terminators = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
    outputs = model.generate(input_ids, max_new_tokens=1024, eos_token_id=terminators, do_sample=True, temperature=0.6, top_p=0.9,stopping_criteria=MyStoppingCriteria("Observation:", messages))
    response = outputs[0][input_ids.shape[-1]:]
    output = tokenizer.decode(response, skip_special_tokens=True)
    # write the interation message and output to /home/vp899/projects/llama3_ft1/Output/result_log.txt
    with open(result_log_file, "a") as f:
        f.write(f"{iteration_message}\n")
        f.write(f"{output}\n")
    print(output)
    return output

In [7]:
agent = ReactCodeAgent(
    tools=[],
    llm_engine=llm_engine,
    additional_authorized_imports=["numpy", "pandas", "matplotlib.pyplot", "seaborn"],
    max_iterations=10,
)

In [8]:
data_notes = """
Text part number: Unique id for each part included in the dataset
Type: Dataset defines the type of the part in this way....
DS: Double acting, single rod
SS: Single acting, single rod
Type_2: Dataset defines another type of the part in this way....
Welded= Welded construction
Tie_Rod= Tie rod construction
Attributes Rod ID: Internal ID for the rod in millimeters
Attributes Internal Bore: Internal bore of the cylinder in millimeters
Stroke_Adj: Stroke length in millimeters
Rod End Friction Weld: Dataset defines the rod end friction weld in this way....
Yes= Rod end is friction welded
No= Rod end is arc welded
Sensor: Dataset defines the sensor in this way....
Yes= Sensor is included
No= Sensor is not included
Nitrobar: Dataset defines the nitrobar in this way....
Yes= Nitrobar is included
No= Nitrobar is not included
Product_3: Name of the product line where the part is used
Closed_Length_mm: Closed length of the cylinder in millimeters
Cylinder Function: Description of the function performed by the part

"""

In [9]:
agent_user_prompt =   """You are an expert data analyst.
Please load the source file and analyze its content.
According to the variables you have, begin by listing 3 interesting questions that could be asked on this data. For example, you could ask: "What is the average internal bore of the cylinders?" or "How many cylinders have a sensor included?". Look into the distributions of numerical variables, and the counts of categorical variables. Also, look at the distributions of the numerical variables according to the categorical variables.
Then answer these questions one by one, by finding the relevant numbers.
In your final answer: summarize the insights based on the distributions and correlations you found between the variables. What are the implications of these correlations?
Your final answer should have at least 5 numbered and detailed parts. You must return the final answer as an argument to the final_answer tool. For example final_answer("final answer").
"""
agent_user_prompt1 =   """You are an expert data analyst.
Please load the source file and analyze its content.
According to the variables you have, begin by listing 3 interesting questions that could be asked on this data. Look into the distributions such as mean, percentiles, min, max for numerical variables, and the counts of categorical variables. Also, look at the distributions of the numerical variables according to the categorical variables.
Then answer these questions one by one, by finding the relevant numbers.
In your final answer: summarize the insights based on the distributions and correlations you found between the variables. What are the implications of these correlations?
Your final answer should have at least 5 numbered and detailed parts. You must return the final answer as an argument to the final_answer tool. For example final_answer("final answer").
"""

In [10]:
analysis = agent.run(
    agent_user_prompt1,
    # additional_notes=data_notes,
    #source_file="/home/vp899/projects/llama3_ft1/Input/AG_Cylinder_Function.csv",
    source_file = "/home/vp899/projects/llm_agent/Input/AG_Cylinder_Function1.csv",
)

[37;1mYou are an expert data analyst.
Please load the source file and analyze its content.
According to the variables you have, begin by listing 3 interesting questions that could be asked on this data. Look into the distributions such as mean, percentiles, min, max for numerical variables, and the counts of categorical variables. Also, look at the distributions of the numerical variables according to the categorical variables.
Then answer these questions one by one, by finding the relevant numbers.
In your final answer: summarize the insights based on the distributions and correlations you found between the variables. What are the implications of these correlations?
Your final answer should have at least 5 numbered and detailed parts. You must return the final answer as an argument to the final_answer tool. For example final_answer("final answer").

You have been provided with these initial arguments: {'source_file': '/home/vp899/projects/llm_agent/Input/AG_Cylinder_Function1.csv'}.

Thought: I will first load the source file and explore its content using the pandas library. Then, I will identify the numerical and categorical variables and calculate their distributions.

Code:
```py
import pandas as pd
import numpy as np

# Load the source file
df = pd.read_csv('/home/vp899/projects/llm_agent/Input/AG_Cylinder_Function1.csv')

# Print the first 5 rows of the dataframe
print(df.head())

# Print the summary statistics of the dataframe
print(df.describe())

# Print the count of unique values for each column
print(df.nunique())

# Print the count of missing values for each column
print(df.isnull().sum())

# Print the data types of each column
print(df.dtypes)
```<end_action>

Observation:



[33;1m==== Agent is executing the code below:[0m
[0m[38;5;60;03m# Identify numerical and categorical variables[39;00m
[38;5;7mnumerical_vars[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;7mdf[39m[38;5;109;01m.[39;00m[38;5;7mselect_dtypes[39m[38;5;7m([39m[38;5;7minclude[39m[38;5;109;01m=[39;00m[38;5;7m[[39m[38;5;144m'[39m[38;5;144mint64[39m[38;5;144m'[39m[38;5;7m,[39m[38;5;7m [39m[38;5;144m'[39m[38;5;144mfloat64[39m[38;5;144m'[39m[38;5;7m][39m[38;5;7m)[39m[38;5;109;01m.[39;00m[38;5;7mcolumns[39m
[38;5;7mcategorical_vars[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;7mdf[39m[38;5;109;01m.[39;00m[38;5;7mselect_dtypes[39m[38;5;7m([39m[38;5;7minclude[39m[38;5;109;01m=[39;00m[38;5;7m[[39m[38;5;144m'[39m[38;5;144mobject[39m[38;5;144m'[39m[38;5;7m][39m[38;5;7m)[39m[38;5;109;01m.[39;00m[38;5;7mcolumns[39m

[38;5;60;03m# Calculate distributions of numerical variables[39;00m
[38;5;109;01m

Thought: Now that I have loaded the source file and explored its content, I will identify the numerical and categorical variables and calculate their distributions.

Code:
```py
# Identify numerical and categorical variables
numerical_vars = df.select_dtypes(include=['int64', 'float64']).columns
categorical_vars = df.select_dtypes(include=['object']).columns

# Calculate distributions of numerical variables
for var in numerical_vars:
    print(f'Distribution of {var}:')
    print(df[var].describe())
    print()

# Calculate counts of categorical variables
for var in categorical_vars:
    print(f'Counts of {var}:')
    print(df[var].value_counts())
    print()

# Calculate distributions of numerical variables according to categorical variables
for var in numerical_vars:
    for cat_var in categorical_vars:
        print(f'Distribution of {var} according to {cat_var}:')
        print(df.groupby(cat_var)[var].describe())
        print()
```<end_action>

Observation:


[33;1mPrint outputs:[0m
[32;20mDistribution of Text Part Number:
count    3.700000e+01
mean     6.032598e+07
std      2.093527e+07
min      2.532004e+07
25%      4.745551e+07
50%      4.787470e+07
75%      8.439337e+07
max      9.050251e+07
Name: Text Part Number, dtype: float64

Distribution of Attributes_Rod_ID:
count     37.000000
mean      48.558378
std       22.355516
min       12.700000
25%       38.100000
50%       38.100000
75%       63.500000
max      139.700000
Name: Attributes_Rod_ID, dtype: float64

Distribution of Attributes_Internal_Bore:
count     37.000000
mean      91.802703
std       31.089418
min       25.400000
25%       76.200000
50%       95.250000
75%      114.300000
max      160.000000
Name: Attributes_Internal_Bore, dtype: float64

Distribution of Stroke_Adj:
count      37.000000
mean      562.097892
std       570.853325
min        69.850000
25%       203.200000
50%       525.000000
75%       762.000000
max      3470.402000
Name: Stroke_Adj, dtype: float64



Thought: Now that I have explored the content of the source file and calculated the distributions of the numerical and categorical variables, I will identify the interesting questions that could be asked on this data.

Code:
```py
# Identify interesting questions
interesting_questions = [
    "What is the average value of Attributes_Rod_ID for each Type?",
    "What is the distribution of Attributes_Internal_Bore for each Type_2?",
    "How does the value of Stroke_Adj change according to the value of Sensor?",
    "What is the average value of Closed_Length_mm for each Product_3?",
    "How does the value of Closed_Length_mm change according to the value of Cylinder_Function?"
]

# Print the interesting questions
for question in interesting_questions:
    print(question)
```<end_action>

Observation:


[33;1m==== Agent is executing the code below:[0m
[0m[38;5;60;03m# Answer the first question[39;00m
[38;5;7maverage_attributes_rod_id[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;7mdf[39m[38;5;109;01m.[39;00m[38;5;7mgroupby[39m[38;5;7m([39m[38;5;144m'[39m[38;5;144mType[39m[38;5;144m'[39m[38;5;7m)[39m[38;5;7m[[39m[38;5;144m'[39m[38;5;144mAttributes_Rod_ID[39m[38;5;144m'[39m[38;5;7m][39m[38;5;109;01m.[39;00m[38;5;7mmean[39m[38;5;7m([39m[38;5;7m)[39m
[38;5;109mprint[39m[38;5;7m([39m[38;5;7maverage_attributes_rod_id[39m[38;5;7m)[39m

[38;5;60;03m# Answer the second question[39;00m
[38;5;7mdistribution_attributes_internal_bore[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;7mdf[39m[38;5;109;01m.[39;00m[38;5;7mgroupby[39m[38;5;7m([39m[38;5;144m'[39m[38;5;144mType_2[39m[38;5;144m'[39m[38;5;7m)[39m[38;5;7m[[39m[38;5;144m'[39m[38;5;144mAttributes_Internal_Bore[39m[38;5;144m'[39m[38;5;7m]

Thought: Now that I have identified the interesting questions, I will answer them one by one.

Code:
```py
# Answer the first question
average_attributes_rod_id = df.groupby('Type')['Attributes_Rod_ID'].mean()
print(average_attributes_rod_id)

# Answer the second question
distribution_attributes_internal_bore = df.groupby('Type_2')['Attributes_Internal_Bore'].describe()
print(distribution_attributes_internal_bore)

# Answer the third question
value_stroke_adj = df.groupby('Sensor')['Stroke_Adj'].mean()
print(value_stroke_adj)

# Answer the fourth question
average_closed_length_mm = df.groupby('Product_3')['Closed_Length_mm'].mean()
print(average_closed_length_mm)

# Answer the fifth question
value_closed_length_mm = df.groupby('Cylinder_Function')['Closed_Length_mm'].mean()
print(value_closed_length_mm)
```<end_action>

Observation:


[33;1m==== Agent is executing the code below:[0m
[0m[38;5;60;03m# Summarize the insights[39;00m
[38;5;7minsights[39m[38;5;7m [39m[38;5;109;01m=[39;00m[38;5;7m [39m[38;5;7m[[39m
[38;5;7m    [39m[38;5;144m"[39m[38;5;144mThe average value of Attributes_Rod_ID for each Type is DS: 46.9 and SS: 67.4.[39m[38;5;144m"[39m[38;5;7m,[39m
[38;5;7m    [39m[38;5;144m"[39m[38;5;144mThe distribution of Attributes_Internal_Bore for each Type_2 is Tie Rod: 81.3, Tie-Rod: 127, and Welded: 92.4.[39m[38;5;144m"[39m[38;5;7m,[39m
[38;5;7m    [39m[38;5;144m"[39m[38;5;144mThe value of Stroke_Adj changes according to the value of Sensor, with No: 571.4 and Yes: 456.4.[39m[38;5;144m"[39m[38;5;7m,[39m
[38;5;7m    [39m[38;5;144m"[39m[38;5;144mThe average value of Closed_Length_mm for each Product_3 is COMBINE: 812.8, CULTIVATORS: 671.5, DISK HARROW: 784.7, HARROWS: 361.9, PLANTER: 970.1, RIGID: 1073, RIPPER: 641.4, and TRACTOR: 656.3.[39m[38;5;144m"[39m[38;5;7m

Thought: Now that I have answered the interesting questions, I will summarize the insights based on the distributions and correlations I found between the variables.

Code:
```py
# Summarize the insights
insights = [
    "The average value of Attributes_Rod_ID for each Type is DS: 46.9 and SS: 67.4.",
    "The distribution of Attributes_Internal_Bore for each Type_2 is Tie Rod: 81.3, Tie-Rod: 127, and Welded: 92.4.",
    "The value of Stroke_Adj changes according to the value of Sensor, with No: 571.4 and Yes: 456.4.",
    "The average value of Closed_Length_mm for each Product_3 is COMBINE: 812.8, CULTIVATORS: 671.5, DISK HARROW: 784.7, HARROWS: 361.9, PLANTER: 970.1, RIGID: 1073, RIPPER: 641.4, and TRACTOR: 656.3.",
    "The value of Closed_Length_mm changes according to the value of Cylinder_Function, with AXLE ADJUST CYLINDER: 852.2, FOLD CYLINDER: 1441.7, FRAME FOLD CYLINDER: 850.9, HEADER CYLINDER: 1060, HITCH CYLINDER: 355.6, HITCH LIFT CYLINDER: 460.4, TILLAGE CYLINDER: 738.2, 

[33;1m==== Agent is executing the code below:[0m
[0m[38;5;60;03m# Return the final answer[39;00m
[38;5;7mfinal_answer[39m[38;5;7m([39m[38;5;144m"[39m[38;5;144mThe average value of Attributes_Rod_ID for each Type is DS: 46.9 and SS: 67.4. The distribution of Attributes_Internal_Bore for each Type_2 is Tie Rod: 81.3, Tie-Rod: 127, and Welded: 92.4. The value of Stroke_Adj changes according to the value of Sensor, with No: 571.4 and Yes: 456.4. The average value of Closed_Length_mm for each Product_3 is COMBINE: 812.8, CULTIVATORS: 671.5, DISK HARROW: 784.7, HARROWS: 361.9, PLANTER: 970.1, RIGID: 1073, RIPPER: 641.4, and TRACTOR: 656.3. The value of Closed_Length_mm changes according to the value of Cylinder_Function, with AXLE ADJUST CYLINDER: 852.2, FOLD CYLINDER: 1441.7, FRAME FOLD CYLINDER: 850.9, HEADER CYLINDER: 1060, HITCH CYLINDER: 355.6, HITCH LIFT CYLINDER: 460.4, TILLAGE CYLINDER: 738.2, UNLOADER AUGER CYLINDER: 916, and UNLOADER CLUTCH CYLINDER: 195.6.[39m[38;5;1

Thought: Now that I have summarized the insights, I will return the final answer as an argument to the final_answer tool.

Code:
```py
# Return the final answer
final_answer("The average value of Attributes_Rod_ID for each Type is DS: 46.9 and SS: 67.4. The distribution of Attributes_Internal_Bore for each Type_2 is Tie Rod: 81.3, Tie-Rod: 127, and Welded: 92.4. The value of Stroke_Adj changes according to the value of Sensor, with No: 571.4 and Yes: 456.4. The average value of Closed_Length_mm for each Product_3 is COMBINE: 812.8, CULTIVATORS: 671.5, DISK HARROW: 784.7, HARROWS: 361.9, PLANTER: 970.1, RIGID: 1073, RIPPER: 641.4, and TRACTOR: 656.3. The value of Closed_Length_mm changes according to the value of Cylinder_Function, with AXLE ADJUST CYLINDER: 852.2, FOLD CYLINDER: 1441.7, FRAME FOLD CYLINDER: 850.9, HEADER CYLINDER: 1060, HITCH CYLINDER: 355.6, HITCH LIFT CYLINDER: 460.4, TILLAGE CYLINDER: 738.2, UNLOADER AUGER CYLINDER: 916, and UNLOADER CLUTCH CYLINDER: 195.6.")
```<en

In [11]:
print(analysis)

The average value of Attributes_Rod_ID for each Type is DS: 46.9 and SS: 67.4. The distribution of Attributes_Internal_Bore for each Type_2 is Tie Rod: 81.3, Tie-Rod: 127, and Welded: 92.4. The value of Stroke_Adj changes according to the value of Sensor, with No: 571.4 and Yes: 456.4. The average value of Closed_Length_mm for each Product_3 is COMBINE: 812.8, CULTIVATORS: 671.5, DISK HARROW: 784.7, HARROWS: 361.9, PLANTER: 970.1, RIGID: 1073, RIPPER: 641.4, and TRACTOR: 656.3. The value of Closed_Length_mm changes according to the value of Cylinder_Function, with AXLE ADJUST CYLINDER: 852.2, FOLD CYLINDER: 1441.7, FRAME FOLD CYLINDER: 850.9, HEADER CYLINDER: 1060, HITCH CYLINDER: 355.6, HITCH LIFT CYLINDER: 460.4, TILLAGE CYLINDER: 738.2, UNLOADER AUGER CYLINDER: 916, and UNLOADER CLUTCH CYLINDER: 195.6.
