In [None]:
# suppress all warnings raised from this point on
# NOTE(review): the blanket 'ignore' filter also silences pandas and deprecation warnings — consider narrowing it
import warnings
warnings.filterwarnings('ignore')

In [None]:
# standard-library imports
from pathlib import Path

# third-party imports
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from huggingface_hub import hf_hub_download

from lingua import Language, LanguageDetectorBuilder

In [None]:
# language detector covering every supported language, built with its
# language models pre-loaded
detector = (
    LanguageDetectorBuilder
    .from_all_languages()
    .with_preloaded_language_models()
    .build()
)

## Benign Prompts - [No Robots](https://huggingface.co/datasets/HuggingFaceH4/no_robots)

In [None]:
# downloading both parquet splits of the dataset and stacking them
# (test split first, then train split)
no_robots = pd.concat([
    pd.read_parquet(
        hf_hub_download(
            repo_id='HuggingFaceH4/no_robots',
            filename=f'data/{split}-00000-of-00001.parquet',
            repo_type='dataset',
        )
    )
    for split in ('test', 'train')
])

In [None]:
# overview of the stacked frame before any cleaning
no_robots.info()

In [None]:
# cleaning pipeline for the benign prompts, written as a single chain
no_robots = (
    no_robots
    # dropping duplicated prompts
    .drop_duplicates(subset=['prompt'])
    # dropping missing prompts
    .loc[lambda frame: frame['prompt'].notna()]
    # dropping empty prompts
    .loc[lambda frame: frame['prompt'] != '']
    # keeping only the prompt text and its category
    [['prompt', 'category']]
)

In [None]:
# (rows, columns) of the benign prompts at this point of the notebook
no_robots.shape

In [None]:
# share of each category among the benign prompts
# value_counts(normalize=True) returns fractions in [0, 1]; they are scaled to
# 0-100 so the bars agree with the 'Percentage (%)' axis label
ax = (
    no_robots['category']
    .value_counts(normalize=True)
    .mul(100)
    .plot(kind='bar', figsize=(8,3))
)
ax.set_title('Category Distribution', size=14)
ax.set_ylabel('Percentage (%)')
# the bars are prompt categories, so the axis is labelled accordingly
ax.set_xlabel('Category')
plt.xticks(rotation=25)
plt.show()

## Malicious Prompts - [Hackaprompt](https://huggingface.co/datasets/hackaprompt/hackaprompt-dataset)

In [None]:
# fetching the dataset's parquet file, then loading it into a frame
hackaprompt_file = hf_hub_download(
    repo_id='hackaprompt/hackaprompt-dataset',
    filename='hackaprompt.parquet',
    repo_type='dataset',
)
hackaprompt = pd.read_parquet(hackaprompt_file)

In [None]:
# overview of the raw frame before any filtering
hackaprompt.info()

In [None]:
# filtering successful prompts that carry a usable user input
# - missing/empty inputs are removed *before* the templating step below, because
#   str.replace raises on a non-string needle and, given '', inserts the
#   placeholder between every character of the prompt
# - .copy() detaches the result from the raw frame, so the column assignment
#   below is a plain write instead of a write into a slice
hackaprompt = hackaprompt.loc[
    (hackaprompt['correct'] == True)
    & hackaprompt['user_input'].notna()
    & (hackaprompt['user_input'] != '')
].copy()

# creating system prompt column: the user's text inside the full prompt is
# swapped for a placeholder, leaving only the template around it
hackaprompt['prompt'] = [
    full_prompt.replace(user_input, '{USER PROMPT}')
    for full_prompt, user_input in zip(hackaprompt['prompt'], hackaprompt['user_input'])
]

hackaprompt = (
    hackaprompt
    # renaming columns: the template becomes 'system_prompt' and the user's
    # text becomes 'prompt'
    .rename(columns={'prompt': 'system_prompt', 'user_input': 'prompt'})
    # dropping duplicated user prompts
    .drop_duplicates(subset=['prompt'])
    # selecting columns
    [['level', 'system_prompt', 'prompt', 'completion', 'expected_completion', 'model', 'token_count']]
)

In [None]:
# (rows, columns) of the malicious prompts at this point of the notebook
hackaprompt.shape

In [None]:
# example of system prompt: the first 'system_prompt' value among the rows with level == 1
hackaprompt.loc[hackaprompt['level'] == 1, 'system_prompt'].iloc[0]

In [None]:
def _language_name(text):
    """Return the lower-cased name of the language detected in `text`.

    The detector is queried exactly once per text; when it returns nothing
    the text is reported as 'unrecognized'.
    """
    language = detector.detect_language_of(text)
    return language.name.lower() if language else 'unrecognized'


# detecting prompt language
# only the 'prompt' column is needed, so the function is mapped over that
# column instead of being applied to whole rows
hackaprompt['prompt_language'] = hackaprompt['prompt'].map(_language_name)

In [None]:
# share of the ten most frequent detected languages
# value_counts(normalize=True) returns fractions in [0, 1]; they are scaled to
# 0-100 so the bars agree with the 'Percentage (%)' axis label
ax = (
    hackaprompt['prompt_language']
    .value_counts(normalize=True)
    .mul(100)
    .head(10)
    .plot(kind='bar', figsize=(4,3))
)
ax.set_title('Language Distribution', size=14)
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Language')
plt.xticks(rotation=85)
plt.show()

In [None]:
# inspecting the prompts whose detected language is 'shona'
# author's note: these are mismatched languages, caused by special characters in the prompt
hackaprompt.loc[hackaprompt['prompt_language'] == 'shona', 'prompt']

In [None]:
# share of each level among the malicious prompts
# value_counts(normalize=True) returns fractions in [0, 1]; they are scaled to
# 0-100 so the bars agree with the 'Percentage (%)' axis label
ax = (
    hackaprompt['level']
    .value_counts(normalize=True)
    .mul(100)
    .plot(kind='bar', figsize=(4,3))
)
ax.set_title('Level Distribution', size=14)
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Level')
plt.xticks(rotation=0)
plt.show()

In [None]:
# share of each target model among the malicious prompts
# value_counts(normalize=True) returns fractions in [0, 1]; they are scaled to
# 0-100 so the bars agree with the 'Percentage (%)' axis label
ax = (
    hackaprompt['model']
    .value_counts(normalize=True)
    .mul(100)
    .plot(kind='bar', figsize=(4,3))
)
ax.set_title('Model Distribution', size=14)
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Model Name')
plt.xticks(rotation=10)
plt.show()

In [None]:
# cumulative distribution of the token count, drawn as a filled step histogram
fig, ax = plt.subplots(figsize=(4,3))
sns.histplot(
    data=hackaprompt,
    x='token_count',
    stat='percent',
    element='step',
    fill=True,
    cumulative=True,
    ax=ax,
)

# title and axis labels
ax.set_title('CDF of Token Count', size=14)
ax.set(ylabel='Percentage (%)', xlabel='Token Count')

# the x axis is restricted to 0-200 tokens, with a tick every 50
ax.set_xlim(0, 200)
ax.set_xticks(list(range(0, 201, 50)))
plt.show()

## Prompt Injection Dataset

In [None]:
# sampling the hackaprompt dataset down to as many rows as there are benign
# prompts, labelling every sampled row with 1 and keeping the two columns of
# the final dataset
malicious = (
    hackaprompt
    .sample(n=len(no_robots), random_state=42)
    .assign(label=1)
    [['prompt', 'label']]
)

In [None]:
# labelling every benign prompt with 0
no_robots = no_robots.assign(label=0)

# keeping the two columns of the final dataset
bening = no_robots.loc[:, ['prompt', 'label']]

In [None]:
# concatenating datasets: the malicious rows come first, followed by the benign rows
data = pd.concat([malicious, bening])

In [None]:
# overview of the combined frame
data.info()

In [None]:
# share of each label in the combined dataset
# value_counts(normalize=True) returns fractions in [0, 1]; they are scaled to
# 0-100 so the bars agree with the 'Percentage (%)' axis label
ax = (
    data['label']
    .value_counts(normalize=True)
    .mul(100)
    .plot(kind='bar', figsize=(4,3))
)
ax.set_title('Label Distribution', size=14)
ax.set_ylabel('Percentage (%)')
ax.set_xlabel('Label')
plt.xticks(rotation=0)
plt.show()

In [None]:
# shuffling data
data = shuffle(data, random_state=42)

# train, val, and test split
# - 80% of the rows go to train
# - the held-out 20% is divided 70/30 into validation and test
# random_state is fixed on both calls so that re-running the notebook writes
# the same three files; without it every run produced a different split
df_train, df_test = train_test_split(data, test_size=0.2, random_state=42)
df_val, df_test = train_test_split(df_test, test_size=0.3, random_state=42)

In [None]:
# number of samples in each split
# len() counts rows, whereas DataFrame.size counts rows x columns
len(df_train), len(df_val), len(df_test)

In [None]:
# saving datasets
# to_csv does not create missing folders, so the output directory is created
# first (a no-op when it already exists)
output_dir = Path('data')
output_dir.mkdir(parents=True, exist_ok=True)

df_train.to_csv(output_dir / 'prompt_injection_train.csv', index=False)
df_val.to_csv(output_dir / 'prompt_injection_val.csv', index=False)
df_test.to_csv(output_dir / 'prompt_injection_test.csv', index=False)