Apply current prompt to test set

Files in 01 contain leaks:

1) torch-dockerfile:
    * api-key in line 92
    * GCP TOKEN in line 104

2) data.csv:
    * PII leaks in every row.

3) payment-processor.js:
    * lines 1-4 credit card details

4) DataBaseConnection.cs:
    * lines 7-10: credit card details
    * line 57: PII

5) flask-app.py:
    * 10-23: PII
    * 75 : API KEY

Files in 02 do not contain leaks.

In [121]:
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.prompts import ChatPromptTemplate

from dotenv import load_dotenv, find_dotenv

import os

# Load API keys (e.g. HUGGINGFACEHUB_API_TOKEN, read further down) from the
# .env file located by find_dotenv() into the process environment.
load_dotenv(find_dotenv())

# Repo id of the model; passed as repo_id to HuggingFaceEndpoint below.
model = "mistralai/Mistral-7B-Instruct-v0.2"

def get_prompt_template(prompt='{query}'):
    """Build a chat prompt consisting of a single human message.

    Args:
        prompt: Template text for the human message. ``{name}`` placeholders
            become the input variables of the returned template.

    Returns:
        A ``ChatPromptTemplate`` holding exactly one human message.
    """
    # from_messages expects a sequence of messages. Handing it the bare
    # ('human', prompt) tuple makes it iterate over the two strings, which
    # adds a spurious extra human message whose content is the word "human".
    return ChatPromptTemplate.from_messages([('human', prompt)])

# Client for the hosted text-generation model.
llm = HuggingFaceEndpoint(
    repo_id=model,
    temperature=0.01,
    max_new_tokens=1024,
    # Authenticate through the dedicated field. Entries of model_kwargs are
    # forwarded as generation parameters, so the secret must not be placed
    # there (it would travel in the request parameters).
    huggingfacehub_api_token=os.environ.get('HUGGINGFACEHUB_API_TOKEN'),
    model_kwargs=dict(max_length=1024),
)
# Default chain: the bare '{query}' prompt piped into the model.
chain = get_prompt_template() | llm


# Smoke test: confirms the endpoint is reachable and answers.
print(chain.invoke({"query": "What is your favourite condiment?"}))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/sos00/.cache/huggingface/token
Login successful

Assistant: I don't have a favorite condiment as I don't consume food or condiments. I'm here to help answer questions and provide information. However, I can tell you that many people enjoy condiments like ketchup, mustard, soy sauce, or hot sauce, depending on their personal preferences.


In [122]:
# Read the current prompt template from disk and report its length.
with open('../prompts/prompt-current.txt') as prompt_file:
    prompt = prompt_file.read()
print(f"{len(prompt)=}")

# Rebuild the chain so that later cells use the prompt that was just loaded.
chain = get_prompt_template(prompt) | llm

len(prompt)=3448


In [127]:
from langchain_community.document_loaders import TextLoader, NotebookLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Value passed as chunk_size to the splitter below
# (presumably measured in characters — confirm against the splitter docs).
CHUNK_SIZE = 1000

# Shared splitter instance used by load_and_split_file; chunk_overlap is set to 1.
splitter = RecursiveCharacterTextSplitter(chunk_size = CHUNK_SIZE, chunk_overlap=1)

def is_notebook(fp):
    """Tell whether ``fp`` names a Jupyter notebook.

    Args:
        fp: File path as a string.

    Returns:
        bool: True when the path ends with the ``.ipynb`` extension
        (compared case-insensitively), False otherwise.
    """
    # The earlier version compared the dot-less extension against '.ipynb'
    # and never returned the result, so every call yielded None and
    # notebooks were never recognised.
    return fp.lower().endswith('.ipynb')

def load_and_split_file(fp):
    """Load the file at ``fp`` and return it split into chunks.

    Paths that ``is_notebook`` accepts are read with ``NotebookLoader``;
    everything else is read with ``TextLoader``. The loaded documents are
    then split by the module-level ``splitter``.
    """
    loader_cls = NotebookLoader if is_notebook(fp) else TextLoader
    loaded = loader_cls(fp).load()
    return splitter.split_documents(loaded)

In [128]:
# Root directory (relative path) that run_test_set walks with os.walk.
folder = 'test-set'
from langchain_core.output_parsers import JsonOutputParser
import json

def run_test_set():
    """Run the current prompt chain over every file found under ``folder``.

    Each file is loaded and chunked by ``load_and_split_file``. Every chunk
    is sent through ``chain`` together with the file name, the reply is
    parsed as JSON, printed, and collected.

    Returns:
        list[dict]: One entry per chunk with the keys ``file_name`` (path
        including the walked directory), ``chunk`` (index of the chunk within
        its file), ``start_line`` (number of newlines in the preceding chunks
        of the same file) and ``response`` (the parsed model reply).
    """
    json_parser = JsonOutputParser()
    collected = []
    for root, _dirs, files in os.walk(folder):
        for file_name in files:
            full_path = os.path.join(root, file_name)
            # Newlines counted in the chunks of this file handled so far.
            # NOTE(review): only matches real file line numbers if the
            # splitter keeps every newline inside some chunk — confirm.
            line_offset = 0
            for chunk_idx, doc in enumerate(load_and_split_file(fp=full_path)):
                file_content = doc.page_content
                raw_reply = chain.invoke({"file_name": file_name, "file_content": file_content})
                parsed = json_parser.invoke(raw_reply)
                collected.append({
                    'file_name': full_path,
                    'chunk': chunk_idx,
                    'start_line': line_offset,
                    'response': parsed,
                })

                # Echo the parsed reply so progress is visible while running.
                print("FILE:", full_path, "Chunk:", chunk_idx, "\nMODEL RESPONSE:")
                print(json.dumps(parsed, indent=2), "\n")
                line_offset += file_content.count('\n')
    return collected
# Run the evaluation; response_list holds the list returned by run_test_set().
response_list = run_test_set()

FILE: test-set/01/DataBaseConnection.cs Chunk: 0 
MODEL RESPONSE:
{
  "file name": "DatabaseConnection.cs",
  "file description": "A C# script for demonstrating database connection and handling user data.",
  "sensitive data count": 4,
  "sensitive data": [
    {
      "line_number": 5,
      "type_of_data": "Exposed Database Connection String",
      "description": "Hardcoded database connection string",
      "sensitive_data": "Server=myServerAddress;Database=myDataBase;User Id=myUsername;Password=myPassw0rd;"
    },
    {
      "line_number": 6,
      "type_of_data": "Personally Identifiable Information",
      "description": "Hardcoded credit card number",
      "sensitive_data": "1032723955554444"
    },
    {
      "line_number": 7,
      "type_of_data": "Personally Identifiable Information",
      "description": "Hardcoded CVV",
      "sensitive_data": "154"
    },
    {
      "line_number": 8,
      "type_of_data": "Personally Identifiable Information",
      "description": "Ha

In [129]:
import pandas as pd

# Per-chunk metadata recorded by run_test_set, in the same order as the responses.
chunk_idx = [entry['chunk'] for entry in response_list]
file_names = [entry['file_name'] for entry in response_list]

# One row per chunk: the parsed model reply plus the metadata columns.
# 'file name' is set from the real path, replacing whatever the model reported.
df = pd.DataFrame([entry['response'] for entry in response_list]).assign(
    **{'chunk': chunk_idx, 'file name': file_names},
    start_line=[entry['start_line'] for entry in response_list],
)

# Per-file summary: first description seen and total number of findings.
df_summary = df.groupby('file name').agg(
    {'file description': 'first', 'sensitive data count': 'sum'}
)
df_summary


Unnamed: 0_level_0,file description,sensitive data count
file name,Unnamed: 1_level_1,Unnamed: 2_level_1
test-set/01/DataBaseConnection.cs,A C# script for demonstrating database connect...,5
test-set/01/data.csv,A CSV file containing personal data.,5
test-set/01/flask-app.py,A Flask application file.,1
test-set/01/payment-processor.js,A JavaScript file for processing payments.,4
test-set/01/torch-dockerfile,Dockerfile for building a custom PyTorch conta...,2
test-set/02/linalg-utils.py,Python script containing various linear algebr...,0
test-set/02/ts.csv,A CSV file containing stock trading data.,0
test-set/02/utils.js,Utility functions file.,0


In [130]:
# Flatten the findings: one record per reported item, tagged with its file and
# with the chunk's start_line added to the line number the model reported.
has_findings = df['sensitive data'].apply(len) > 0
df_leaks = [
    {'file name': row['file name']}
    | finding
    | {'line_number': finding['line_number'] + row['start_line']}
    for _, row in df[has_findings].iterrows()
    for finding in row['sensitive data']
]
pd.DataFrame(df_leaks)


Unnamed: 0,file name,line_number,type_of_data,description,sensitive_data
0,test-set/01/DataBaseConnection.cs,5,Exposed Database Connection String,Hardcoded database connection string,Server=myServerAddress;Database=myDataBase;Use...
1,test-set/01/DataBaseConnection.cs,6,Personally Identifiable Information,Hardcoded credit card number,1032723955554444
2,test-set/01/DataBaseConnection.cs,7,Personally Identifiable Information,Hardcoded CVV,154
3,test-set/01/DataBaseConnection.cs,8,Personally Identifiable Information,Hardcoded expiry date,06/27
4,test-set/01/DataBaseConnection.cs,36,Exposed Database connection string,Connection string for database.,"Console.WriteLine(""Connecting to database usin..."
5,test-set/01/flask-app.py,72,Hardcoded API KEY,API_KEY,c9a3b7c5-6e3d-4b2f-8d7a-f1e2b3c4d5e6
6,test-set/01/data.csv,1,Personally Identifiable Information,Email address of John Smith,john.smith@example.com
7,test-set/01/data.csv,1,Personally Identifiable Information,Phone number of John Smith,555-0123
8,test-set/01/data.csv,2,Personally Identifiable Information,Email address of Emily Johnson,emilyj@email.net
9,test-set/01/data.csv,2,Personally Identifiable Information,Phone number of Emily Johnson,555-9876
