# LangChain 기반 추론 및 성능 평가

## 추론 테스트 및 Response 개수 확인

In [1]:
!huggingface-cli login --token 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

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
The token `llm` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
The current active token is: `llm`


In [5]:
import jsonlines
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from tqdm import tqdm
from datasets import Dataset
import re
import time
from langchain import HuggingFacePipeline, LLMChain, PromptTemplate
from langchain.prompts import PromptTemplate

import os

# Hugging Face API token.
# Read from the environment instead of hardcoding it, so the secret never
# lives in the notebook. When HF_TOKEN is unset this is None, which leaves
# authentication to the credentials cached by `huggingface-cli login`
# (see the transformers `token` argument docs).
# SECURITY: the token that was previously hardcoded here has been exposed and
# should be revoked and replaced.
hf_token = os.environ.get("HF_TOKEN")

# Model and tokenizer path
model_id = "ukparkk/gemma-7b-r16-master"
# model_id = "ukparkk/gemma-2b-r16-master"

# Load model and tokenizer; use the GPU when one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    token=hf_token
).to(device)

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    token=hf_token
)

# Set up text generation pipeline (sampling with a low temperature, at most
# 200 newly generated tokens per prompt)
hf_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
    max_new_tokens=200,
    temperature=0.3,
    do_sample=True,
    top_k=5,
    repetition_penalty=1.1
)

# Initialize LangChain's HuggingFacePipeline
llm = HuggingFacePipeline(pipeline=hf_pipeline)

# Define LangChain Prompt Template; `instruction` is the only input variable
prompt_template = PromptTemplate(input_variables=["instruction"],
                                 template="Instruction: {instruction}\n\nResponse:")

# Create LangChain LLMChain with the prompt and model
llm_chain = LLMChain(llm=llm, prompt=prompt_template)

# Read the `instruction` field of every record in the test set
test_data_path = '/workspace/dataset/test_data.jsonl'

with jsonlines.open(test_data_path) as reader:
    instructions = [obj['instruction'] for obj in reader]

# Build a Dataset holding the instructions and add a "prompt" column that
# wraps each instruction in the Instruction/Response template
dataset = Dataset.from_dict({"instruction": instructions}).map(
    lambda example: {"prompt": f"Instruction: {example['instruction']}\n\nResponse:"}
)

# Terms whose presence in a response (case-insensitive) marks it as an example
# of the given SWC category
category_patterns = {
    "SWC-101": ["swc-101", "integer overflow", "integer underflow"],
    "SWC-107": ["swc-107", "reentrancy"],
    "SWC-110": ["swc-110", "assert violation"],
    "SWC-113": ["swc-113", "dos with failed call"],
    "SWC-114": ["swc-114", "transaction order dependence"]
}

# `responses` collects every generated answer in input order;
# `categories` keeps the first (instruction, response) pair seen per category
responses = []
categories = {}

# Time the whole generation pass
total_start_time = time.time()

for instruction in tqdm(instructions, desc="Generating responses"):
    # Run the chain and keep only the text after the last "Response:" marker
    generated_response = llm_chain.run(instruction=instruction)
    response = generated_response.rsplit("Response:", 1)[-1].strip()
    responses.append(response)

    # Record this response as the example for every category it mentions
    # that does not have an example yet
    for category, terms in category_patterns.items():
        if category in categories:
            continue
        if any(re.search(term, response, re.IGNORECASE) for term in terms):
            categories[category] = (instruction, response)

total_end_time = time.time()
print(f"\nTotal time for generating all responses: {total_end_time - total_start_time:.2f} seconds")

# Summary counts
print(f"\nNumber of responses generated: {len(responses)}")
print(f"\nTotal Unique Categories with Examples: {len(categories)}\n")
print("=" * 145)

# Show the stored example for each category that was found
for category, example in categories.items():
    example_instruction, example_response = example
    print(f"{category} - Example")
    print("-" * 20)
    print(f"Instruction:\n{example_instruction}\n")
    print(f"Response:\n{example_response}\n")
    print("=" * 145)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating responses: 100%|█████████████████████████| 1000/1000 [20:08<00:00,  1.21s/it]


Total time for generating all responses: 1208.08 seconds

Number of responses generated: 1000

Total Unique Categories with Examples: 2

SWC-107 - Example
--------------------
Instruction:
Identify the vulnerability in this smart contract function and provide a revised version:

 'function checkIfNameValid(string _nameStr) public view returns(bool) { bytes32 _name = _nameStr.nameFilter(); if (pIDxName_[_name] == 0) return (true); else return (false); }'

Response:
The given code is vulnerable to reentrancy attack, which can be fixed by adding an additional modifier that checks for non-reentrant calls before executing any other functions or statements within it - such as "nonReentrant" from OpenZeppelin Contracts library; this will ensure that only one transaction at once has access into our system while also preventing malicious actors who might try exploiting vulnerabilities through multiple concurrent transactions

SWC-101 - Example
--------------------
Instruction:
In the line `ass




## F1-Score 측정

In [3]:
import jsonlines
import re
from sklearn.metrics import f1_score

# Keyword patterns per category; keys are the lowercase SWC identifiers
category_patterns = {
    "swc-101": ["swc-101", "integer overflow", "integer underflow"],
    "swc-107": ["swc-107", "reentrancy"],
    "swc-110": ["swc-110", "assert violation"],
    "swc-113": ["swc-113", "dos with failed call"],
    "swc-114": ["swc-114", "transaction order dependence"]
}


def predict_category(response: str) -> str:
    """Return the SWC category predicted for a generated response.

    The response is lowercased and checked against the keywords of each
    category in ``category_patterns`` in dictionary order.

    Args:
        response: Generated response text.

    Returns:
        The key of the first category with at least one matching keyword,
        or ``"unknown"`` when no keyword matches.
    """
    lowered = response.lower()  # lowercase so matching ignores case
    matching_categories = (
        category
        for category, keywords in category_patterns.items()
        if any(re.search(keyword, lowered) for keyword in keywords)
    )
    # The first match wins; fall back to "unknown" when nothing matched
    return next(matching_categories, "unknown")

# Load the ground-truth category of every record in test_data.jsonl
# (lowercased so it is comparable with the keys of `category_patterns`)
test_data_path = '/workspace/dataset/test_data.jsonl'

with jsonlines.open(test_data_path) as reader:
    ground_truth = [obj['category'].lower() for obj in reader]

# `responses` is the list of generated responses built by the inference cell.
# Labels and responses are paired by position, so refuse to score when the
# two lists differ in length instead of failing mid-loop or silently scoring
# a truncated set.
if len(responses) != len(ground_truth):
    raise ValueError(
        f"Cannot compute F1: {len(responses)} generated responses "
        f"but {len(ground_truth)} ground truth labels"
    )

# Predict a category for each generated response
predictions = [predict_category(response) for response in responses]

# Compute the weighted F1 score
f1 = f1_score(ground_truth, predictions, average='weighted')
print(f"F1 Score: {f1:.4f}")


F1 Score: 0.3345


## Cosine Similarity 측정

In [4]:
import jsonlines
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Read the reference response of every record in the test set
test_data_path = '/workspace/dataset/test_data.jsonl'

with jsonlines.open(test_data_path) as reader:
    ground_truth_responses = [obj['response'] for obj in reader]

# `responses` is expected to already hold the generated responses;
# report both sizes so a mismatch is visible
print(f"Number of ground truth responses: {len(ground_truth_responses)}")
print(f"Number of generated responses: {len(responses)}")

if len(responses) == len(ground_truth_responses):
    print(f"Total pairs for cosine similarity calculation: {len(ground_truth_responses)}")

    # Fit a single TF-IDF vocabulary over reference and generated texts together
    vectorizer = TfidfVectorizer()
    combined_responses = ground_truth_responses + responses
    tfidf_matrix = vectorizer.fit_transform(combined_responses)

    # The first half of the rows are reference vectors, the rest are generated ones
    boundary = len(ground_truth_responses)
    ground_truth_vectors = tfidf_matrix[:boundary]
    generated_vectors = tfidf_matrix[boundary:]

    # Similarity of each reference with the generated response at the same index
    similarities = []
    for i in range(boundary):
        pair_score = cosine_similarity(ground_truth_vectors[i], generated_vectors[i])
        similarities.append(pair_score[0][0])

    # Mean similarity over all pairs
    average_similarity = sum(similarities) / len(similarities)
    print(f"Average Cosine Similarity: {average_similarity:.4f}")

    # Show up to 5 individual scores
    print("\nExample Cosine Similarities:")
    for example_number, score in enumerate(similarities[:5], start=1):
        print(f"Example {example_number}: {score:.4f}")
else:
    print("Warning: The number of generated responses does not match the ground truth responses.")


Number of ground truth responses: 1000
Number of generated responses: 1000
Total pairs for cosine similarity calculation: 1000
Average Cosine Similarity: 0.1057

Example Cosine Similarities:
Example 1: 0.1027
Example 2: 0.2985
Example 3: 0.1230
Example 4: 0.1172
Example 5: 0.1117
