In [None]:
!pip install langchain
!pip install langchain_experimental

In [None]:
# import json
# import torch
# from langchain import PromptTemplate, LLMChain
# from langchain.llms import HuggingFacePipeline
# from langchain.agents import initialize_agent, AgentType
# from langchain_experimental.tools import PythonREPLTool
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# # Check if CUDA is available and set the device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Load the open-source model
# model_name = "facebook/opt-1.3b"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# # Create a text-generation pipeline
# text_generation_pipeline = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_length=512,
#     temperature=0.7,
#     top_p=0.95,
#     repetition_penalty=1.15,
#     device=0 if device.type == "cuda" else -1  # Use GPU if available
# )

# # Create a LangChain LLM from the pipeline
# llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# # Create a Python agent
# python_repl = PythonREPLTool()
# tools = [python_repl]
# python_agent = initialize_agent(
#     tools,
#     llm,
#     agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
#     verbose=True,
#     handle_parsing_errors=True  # Handle parsing errors
# )

# def process_financial_query(context, question, script):
#     script = script.replace('\\n', '\n')
#     # Execute the Python code directly using the PythonREPLTool
#     result = python_repl.run(script)
#     return result

# # Load data from JSON file
# with open('/content/python_script_langchain.json', 'r') as f:
#     dataset = json.load(f)

# # Process each item in the dataset
# for item in dataset:
#     result = process_financial_query(item['context'], item['question'], item['answer'])
#     print(f"Context: {item['context']}")
#     print(f"Question: {item['question']}")
#     print(f"Answer: {result}")
#     print("---")

In [None]:
!pip install bert_score

In [None]:
!pip install evaluate

In [None]:
!pip install datasets
!pip install rouge_score

In [5]:
import json
import re
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
from langchain.agents import initialize_agent, AgentType
from langchain_experimental.tools import PythonREPLTool
from datasets import load_metric
import numpy as np
import evaluate

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the open-source model
model_name = "facebook/opt-1.3b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

# Create a text-generation pipeline
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=512,
    temperature=0.7,
    top_p=0.95,
    repetition_penalty=1.15,
    device=0 if device.type == "cuda" else -1  # Use GPU if available
)

# Create a LangChain LLM from the pipeline
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Create a Python agent
python_repl = PythonREPLTool()
tools = [python_repl]
python_agent = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
    handle_parsing_errors=True
)

def preprocess_and_extract(text):
    # Remove any leading/trailing whitespace and dollar signs
    text = text.strip().replace('$', '')

    # If there's a colon, take everything after the last colon
    if ':' in text:
        parts = text.split(':')
        text = parts[-1].strip()

    # Extract all numbers (including negative and decimal)
    numbers = re.findall(r'-?(?:\d{1,3}(?:,\d{3})*|\d+)(?:\.\d+)?', text)

    # If we found numbers, return them as floats
    if numbers:
        return [float(num.replace(',', '')) for num in numbers]

    # If no numbers, return the text split into words
    return [word.lower() for word in re.findall(r'\b\w+\b', text)]

def normalize_number(num):
    # Convert to string and remove trailing zeros after decimal point
    return '{:f}'.format(num).rstrip('0').rstrip('.')

def calculate_accuracy(pred, ref):
    pred_processed = preprocess_and_extract(pred)
    ref_processed = preprocess_and_extract(ref)

    # If both are lists of numbers
    if isinstance(pred_processed, list) and isinstance(ref_processed, list) and all(isinstance(x, (int, float)) for x in pred_processed + ref_processed):
        # Normalize numbers to remove trailing zeros
        pred_normalized = [normalize_number(x) for x in pred_processed]
        ref_normalized = [normalize_number(x) for x in ref_processed]
        return 1 if pred_normalized == ref_normalized else 0
    # If both are lists of words
    if isinstance(pred_processed, list) and isinstance(ref_processed, list):
        return 1 if set(pred_processed) == set(ref_processed) else 0

    # If formats don't match
    return 0

def process_financial_query(context, question, script):
    script = script.replace('\\n', '\n')
    # Execute the Python code directly using the PythonREPLTool
    result = python_repl.run(script)
    return result

def evaluate_metrics(predictions, references):
    tokenized_predictions = [pred.split() for pred in predictions]
    tokenized_references = [[ref.split()] for ref in references]

    rouge = load_metric('rouge')
    bleu = load_metric('bleu')
    meteor = load_metric('meteor')
    bertscore = load_metric('bertscore')
    google_bleu = evaluate.load("google_bleu")

    # Compute ROUGE
    rouge_result = rouge.compute(predictions=predictions, references=references)
    rouge_score = rouge_result['rougeL'].mid.fmeasure

    # Compute BLEU
    bleu_result = bleu.compute(predictions=tokenized_predictions, references=tokenized_references)
    bleu_score = bleu_result['bleu']

    # Compute METEOR
    meteor_result = meteor.compute(predictions=predictions, references=references)
    meteor_score = meteor_result['meteor']

    # Compute BERTScore
    bertscore_results = bertscore.compute(predictions=predictions, references=references, lang="en")
    bertscore_precision = np.mean(bertscore_results['precision'])
    bertscore_recall = np.mean(bertscore_results['recall'])
    bertscore_f1 = np.mean(bertscore_results['f1'])

    google_bleu_result = google_bleu.compute(predictions=predictions, references=references)
    google_bleu_score = np.mean(google_bleu_result['google_bleu'])

    results = {
        "rouge": rouge_score,
        "bleu": bleu_score,
        "meteor": meteor_score,
        "bertscore_precision": bertscore_precision,
        "bertscore_recall": bertscore_recall,
        "bertscore_f1": bertscore_f1,
        "googlebleuscore": google_bleu_score
    }
    return results

# Load data from JSON files
with open('/content/python_script_langchain.json', 'r') as f:
    dataset = json.load(f)

with open('/content/test_pyscript.json', 'r') as f:
    reference_answers = json.load(f)

accuracy_scores = []
all_predictions = []
all_references = []

# Process each item in the dataset and collect predictions and references
for i, item in tqdm(enumerate(dataset), total=len(dataset), desc="Processing queries"):
    result = process_financial_query(item['context'], item['question'], item['answer'])
    all_predictions.append(result)
    all_references.append(reference_answers[i]['answer'])

    accuracy = calculate_accuracy(result, reference_answers[i]['answer'])
    accuracy_scores.append(accuracy)

    print(f"\nContext: {item['context'][:100]}{'...' if len(item['context']) > 100 else ''}")
    print(f"Question: {item['question']}")
    # print(f"Python Script Generated: {item['answer']}")
    print(f"Prediction: {result}")
    print(f"Reference: {reference_answers[i]['answer']}")
    print(f"Accuracy: {accuracy}")
    print("---")

# Calculate and print metrics
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Average Accuracy: {average_accuracy:.2f}")

# Calculate other metrics
metric_results = evaluate_metrics(all_predictions, all_references)
print(f"ROUGE Score: {metric_results['rouge']:.2f}")
print(f"BLEU Score: {metric_results['bleu']:.2f}")
print(f"METEOR Score: {metric_results['meteor']:.2f}")
print(f"BERTScore Precision: {metric_results['bertscore_precision']:.3f}")
print(f"BERTScore Recall: {metric_results['bertscore_recall']:.3f}")
print(f"BERTScore F1: {metric_results['bertscore_f1']:.3f}")
print(f"GLEU Score: {metric_results['googlebleuscore']:.2f}")

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/177 [00:00<?, ?B/s]

  warn_deprecated(
  warn_deprecated(
Processing queries:  49%|████▉     | 74/150 [00:00<00:00, 373.33it/s]


Context: Actuarial assumptions The Group’s scheme liabilities are measured using the projected unit credit me...
Question: What does the Weighted average actuarial assumptions consist of?
Prediction: Weighted average actuarial assumptions consist of: Rate of inflation, Rate of increase in salaries, Discount rate

Reference: Rate of inflation, Rate of increase in salaries, Discount rate
Accuracy: 1
---

Context: Actuarial assumptions The Group’s scheme liabilities are measured using the projected unit credit me...
Question: How much is the 2019 rate of inflation?
Prediction: 2019 rate of inflation: 2.9

Reference: 2.9
Accuracy: 1
---

Context: Actuarial assumptions The Group’s scheme liabilities are measured using the projected unit credit me...
Question: How much is the 2018 rate of inflation?
Prediction: 2018 rate of inflation: 2.9

Reference: 2.9
Accuracy: 1
---

Context: Actuarial assumptions The Group’s scheme liabilities are measured using the projected unit credit me...
Question

Processing queries:  75%|███████▍  | 112/150 [00:00<00:00, 343.87it/s]


Context: Lines of Credit The following table summarizes our available lines of credit and committed and uncom...
Question: How much money has not been committed as of December 31, 2019 for total available lines of credit?
Prediction: Money not committed as of December 31, 2019 for total available lines of credit: $206.69999999999982 million

Reference: 206.7
Accuracy: 1
---

Context: Lines of Credit The following table summarizes our available lines of credit and committed and uncom...
Question: What is the percentage of used lines of credit to Total available lines of credit as of December 31, 2019?
Prediction: Percentage of used lines of credit to Total available lines of credit as of December 31, 2019: 7.36%

Reference: 7.36
Accuracy: 1
---

Context: Lines of Credit The following table summarizes our available lines of credit and committed and uncom...
Question: What is the difference between the Unused lines of credit for 2019 and 2018?
Prediction: Difference between the Unused li

Processing queries: 100%|██████████| 150/150 [00:00<00:00, 346.45it/s]
  rouge = load_metric('rouge')



Prediction: Total number of sites acquired and constructed during 2014: 8450

Reference: 8450
Accuracy: 1
---

Context: united parcel service , inc . and subsidiaries notes to consolidated financial statements 2014 ( con...
Question: what portion of the balance of unrecognized tax benefits as of december 2017 will impact the effective tax rate?
Prediction: Portion of unrecognized tax benefits impacting the effective tax rate: 37.74647887323944

Reference: 37.7%
Accuracy: 0
---

Context: table of contents recoverability of goodwill is measured at the reporting unit level and begins with...
Question: what is the average weighted-average useful life for all those intangible assets , in number of years?
Prediction: Average weighted-average useful life for all intangible assets: 21.25

Reference: 21.25
Accuracy: 1
---

Context: stock performance graph : the graph below shows the cumulative total shareholder return assuming the...
Question: what was the five year change in value of the o 20

Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

The repository for rouge contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/rouge.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

The repository for bleu contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bleu.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

The repository for meteor contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/meteor.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

The repository for bertscore contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/bertscore.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading builder script:   0%|          | 0.00/8.64k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ROUGE Score: 0.31
BLEU Score: 0.23
METEOR Score: 0.33
BERTScore Precision: 0.822
BERTScore Recall: 0.878
BERTScore F1: 0.849
GLEU Score: 0.23




In [None]:
# import os
# from huggingface_hub import HfApi

# os.environ["HF_TOKEN"] = ""

# hf_token = os.getenv("HF_TOKEN")
# if not hf_token:
#     raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")

# api = HfApi()
# user = api.whoami(token=hf_token)
# print(f"Logged in as: {user['name']}")

Logged in as: rjanant


In [None]:
# import json
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
# from langchain import PromptTemplate, LLMChain
# from langchain.llms import HuggingFacePipeline
# from langchain.agents import initialize_agent, AgentType
# from langchain_experimental.tools import PythonREPLTool

# # Check if CUDA is available and set the device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Load the Llama 2 8B model
# model_name = "meta-llama/Llama-2-7b-chat-hf"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# # Create a text-generation pipeline
# text_generation_pipeline = pipeline(
#     "text-generation",
#     model=model,
#     tokenizer=tokenizer,
#     max_length=1024,
#     temperature=0.7,
#     top_p=0.95,
#     repetition_penalty=1.15,
#     max_new_tokens = 250,
#     #device=0 if device.type == "cuda" else -1  # Use GPU if available
# )

# # Create a LangChain LLM from the pipeline
# llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# # Create a Python agent
# python_repl = PythonREPLTool()
# tools = [python_repl]
# python_agent = initialize_agent(
#     tools,
#     llm,
#     agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
#     verbose=True,
#     handle_parsing_errors=True
# )

# # Create a prompt template for generating Python scripts
# script_prompt = PromptTemplate(
#     input_variables=["context", "question"],
#     template="""
#     [INST] Given the following financial context and question, write a Python script to answer the question:

#     Context: {context}

#     Question: {question}

#     Your script should:
#     1. Extract relevant information from the context
#     2. Perform necessary calculations
#     3. Print the final answer

#     Provide only the Python code without any additional explanations. [/INST]
#     """
# )

# # Create an LLMChain for script generation
# script_chain = LLMChain(llm=llm, prompt=script_prompt)

# def generate_and_execute_script(context, question):
#     # Generate the Python script
#     generated_script = script_chain.run(context=context, question=question)

#     # Clean up the generated script
#     generated_script = generated_script.strip()
#     if generated_script.startswith("```python"):
#         generated_script = generated_script[9:]
#     if generated_script.endswith("```"):
#         generated_script = generated_script[:-3]

#     print("Generated Script:")
#     print(generated_script)
#     print("---")

#     # Execute the generated script
#     result = python_repl.run(generated_script)
#     return result

# # Load data from JSON file
# with open('/content/script_data.json', 'r') as f:
#     dataset = json.load(f)

# # Process each item in the dataset
# for item in dataset:
#     result = generate_and_execute_script(item['context'], item['question'])
#     print(f"Context: {item['context']}")
#     print(f"Question: {item['question']}")
#     print(f"Answer: {result}")
#     print("---")

In [None]:
# import json
# import torch
# from transformers import AutoTokenizer, AutoModelForCausalLM

# # Check if CUDA is available and set the device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(f"Using device: {device}")

# # Load the Code Llama 7B model
# model_name = "codellama/CodeLlama-13b-Instruct-hf"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")

# def generate_script(context, question):
#     prompt = f"""[INST] Given the following financial context and question, write a Python script to answer the question:

# Context: {context}

# Question: {question}

# Your script should:
# 1. Extract relevant information from the context
# 2. Perform necessary calculations
# 3. Print the final answer

# Provide only the Python code without any additional explanations. [/INST]

# ```python
# """

#     inputs = tokenizer(prompt, return_tensors="pt").to(device)

#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=500,
#             temperature=0.7,
#             top_p=0.95,
#             repetition_penalty=1.15,
#             do_sample=True
#         )

#     response = tokenizer.decode(outputs[0], skip_special_tokens=True)

#     # Extract the Python code from the response
#     script_start = response.find("```python")
#     script_end = response.rfind("```")

#     if script_start != -1 and script_end != -1:
#         script = response[script_start+9:script_end].strip()
#     else:
#         script = response[len(prompt):].strip()

#     return script

# # Load data from JSON file
# with open('/content/script_data.json', 'r') as f:
#     dataset = json.load(f)

# # Generate scripts for each item in the dataset
# for i, item in enumerate(dataset):
#     script = generate_script(item['context'], item['question'])

#     # Save the generated script to a file
#     with open(f'generated_script_{i}.py', 'w') as f:
#         f.write(script)

#     print(f"Generated script for item {i}:")
#     print(script)
#     print("---")

# print("All scripts have been generated and saved.")

In [None]:
# import torch
# torch.cuda.empty_cache()

In [None]:
# from transformers import AutoTokenizer, AutoModelForCausalLM
# import torch
# import json

# # Load the model and tokenizer
# model_name = "meta-llama/Meta-Llama-3-8B"  # You'll need access to this model
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name)

# def generate_code_fewshot(context, question, examples, max_new_tokens=150):
#     # Construct the prompt with few-shot examples
#     prompt = "Given a context and a question, generate Python code to answer the question. Here are some examples:\n\n"

#     for ex in examples:
#         prompt += f"Context: {ex['context']}\n"
#         prompt += f"Question: {ex['question']}\n"
#         prompt += f"Python code:\n{ex['answer']}\n\n"

#     prompt += f"Now, generate code for the following:\n"
#     prompt += f"Context: {context}\n"
#     prompt += f"Question: {question}\n"
#     prompt += "Python code:"

#     # Tokenize the prompt
#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

#     # Generate the response
#     with torch.no_grad():
#         outputs = model.generate(
#             **inputs,
#             max_new_tokens=max_new_tokens,
#             num_return_sequences=1,
#             temperature=0.7,
#             top_p=0.95,
#             do_sample=True
#         )

#     # Decode and return the generated code
#     generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     return generated_code.split("Python code:")[-1].strip()

# # Prepare your few-shot example
# file_path = "script_data.json"
# with open(file_path, "r") as f:
#     examples = json.load(f)

# # Example usage
# context = "Sales by Contract Type: Substantially all of our contracts are fixed-price type contracts. Sales included in Other contract types represent cost plus and time and material type contracts. On a fixed-price type contract, we agree to perform the contractual statement of work for a predetermined sales price. On a cost-plus type contract, we are paid our allowable incurred costs plus a profit which can be fixed or variable depending on the contract’s fee arrangement up to predetermined funding levels determined by the customer. On a time-and-material type contract, we are paid on the basis of direct labor hours expended at specified fixed-price hourly rates (that include wages, overhead, allowable general and administrative expenses and profit) and materials at cost. The table below presents total net sales disaggregated by contract type (in millions):\nTable:\n\t\tYears Ended September 30,\t\n\t2019\t2018\t2017\nFixed Price\t$  1,452.4\t$  1,146.2\t$  1,036.9\nOther\t44.1\t56.7\t70.8\nTotal sales\t$1,496.5\t$1,202.9\t$1,107.7"
# question = "What is the change in Other in 2019 from 2018?"
# generated_code = generate_code_fewshot(context, question, examples)
# print(generated_code)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]