In [None]:
# supress any warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# python imports
import string

# third-party imports
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from gensim.models import Word2Vec

## Load Data

In [None]:
# Paths are relative to the notebook's working directory.
train_csv_path = 'data/prompt_injection_train.csv'
test_csv_path = 'data/prompt_injection_test.csv'

# The train and test splits are stored as separate CSV files.
prompt_injection_train = pd.read_csv(train_csv_path)
prompt_injection_test = pd.read_csv(test_csv_path)

In [None]:
# For each split, the 'prompt' column is the model input and the 'label'
# column is the target.
X_train, y_train = prompt_injection_train['prompt'], prompt_injection_train['label']
X_test, y_test = prompt_injection_test['prompt'], prompt_injection_test['label']

## Preprocess the text data

In [None]:
# English stop words from NLTK, kept as a set for fast membership tests.
stop_words = set(stopwords.words('english'))

# Translation table that deletes every character in string.punctuation.
_punctuation_table = str.maketrans('', '', string.punctuation)


def preprocess(text):
    """Normalise a piece of text for embedding.

    Steps, in order: lowercase the text, delete all characters found in
    ``string.punctuation``, tokenise with NLTK's ``word_tokenize`` and drop
    tokens that appear in ``stop_words``.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    cleaned = text.lower().translate(_punctuation_table)
    kept_tokens = [token for token in word_tokenize(cleaned) if token not in stop_words]
    return ' '.join(kept_tokens)

In [None]:
# Clean both splits with the same preprocessing function so that train and
# test text go through identical normalisation.
X_train, X_test = X_train.apply(preprocess), X_test.apply(preprocess)

## Creating vectors

In [None]:
# Split each cleaned training prompt on whitespace into a list of tokens.
sentences = list(map(str.split, X_train))

# Fit the embedding model on the training split only.
# NOTE(review): with several workers, gensim training is presumably not
# bit-for-bit reproducible between runs; confirm whether that matters here.
w2v_model = Word2Vec(
    sentences,
    window=5,
    min_count=5,
    workers=4,
)

In [None]:
def vectorize(sentence):
    """Embed a sentence as the mean of its in-vocabulary Word2Vec vectors.

    Parameters
    ----------
    sentence : str
        Text whose whitespace-separated tokens are looked up in
        ``w2v_model.wv``. Tokens missing from the vocabulary are skipped.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``w2v_model.wv.vector_size``. It is all zeros
        when the sentence has no in-vocabulary token (including the empty
        string).
    """
    keyed_vectors = w2v_model.wv
    words_vecs = [keyed_vectors[word] for word in sentence.split() if word in keyed_vectors]
    if not words_vecs:
        # Take the dimensionality from the trained model instead of hard-coding
        # it, so the fallback always matches the real embeddings and the
        # per-sentence vectors can be stacked into one 2-D array.
        return np.zeros(keyed_vectors.vector_size)
    return np.array(words_vecs).mean(axis=0)

# Replace each split's text with its sentence embeddings, one vector per
# sentence, collected into a single NumPy array.
X_train = np.array(list(map(vectorize, X_train)))
X_test = np.array(list(map(vectorize, X_test)))

## Train classification models

### Random Forest 

In [None]:
# Random forests are randomised (bootstrap samples and feature sub-sampling).
# Fix the seed so the reported metrics are reproducible on a fresh re-run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Predict on the test split and print each metric on its own line.
y_pred = clf.predict(X_test)

for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix normalised over the true labels.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    normalize='true',
    cmap='Blues',
)

### Gradient Boosting

In [None]:
# Seed the estimator so its internal randomness (e.g. tie-breaking between
# equally good splits) does not change the reported metrics between re-runs.
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Predict on the test split and print each metric on its own line.
y_pred = clf.predict(X_test)

for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix normalised over the true labels.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    normalize='true',
    cmap='Blues',
)

### SVC

In [None]:
# Support vector classifier with every hyperparameter left at its default.
clf = SVC()
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test split and print each metric on its own line.
y_pred = clf.predict(X_test)

for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix normalised over the true labels.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    normalize='true',
    cmap='Blues',
)

### XGBoost

In [None]:
# XGBoost classifier with every hyperparameter left at its default.
clf = XGBClassifier()
clf.fit(X=X_train, y=y_train)

In [None]:
# Predict on the test split and print each metric on its own line.
y_pred = clf.predict(X_test)

for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix normalised over the true labels.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    normalize='true',
    cmap='Blues',
)

### Naive Bayes

In [None]:
# The features are averaged Word2Vec embeddings: dense, real-valued and
# typically containing negative components. MultinomialNB expects
# non-negative, count-like features and raises ValueError on negative input,
# so use GaussianNB, the Naive Bayes variant for continuous features.
clf = GaussianNB()
clf.fit(X_train, y_train)

In [None]:
# Predict on the test split and print each metric on its own line.
y_pred = clf.predict(X_test)

for metric_label, metric_fn in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 score:', f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

In [None]:
# Confusion matrix normalised over the true labels.
ConfusionMatrixDisplay.from_predictions(
    y_true=y_test,
    y_pred=y_pred,
    normalize='true',
    cmap='Blues',
)