In [None]:
# Mount Google Drive into the Colab runtime; the dataset path and the model
# output directory used further down both live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import nltk
from nltk.corpus import stopwords
from wordcloud import WordCloud

import pandas as pd
import random, time
from babel.dates import format_date, format_datetime, format_time
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, accuracy_score
import torch
from torch import Tensor
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import torch.nn.functional as F

import transformers, os
from transformers import BertModel, AutoModel, AdamW, get_linear_schedule_with_warmup, BertTokenizer, BertForSequenceClassification

In [None]:
# Read the dataset from the mounted Drive folder.
# Later cells read the `request` (text) and `label` columns of this frame.
df = pd.read_csv("/content/drive/MyDrive/Classification Model/dataset.csv")

In [None]:
# Quick sanity check: pandas summary statistics of the loaded frame.
df.describe()

In [None]:
import nltk
# Fetch the NLTK stop-word corpus; `stopwords.words('english')` in the next
# cell reads from it.
nltk.download('stopwords')

In [None]:
# Start from NLTK's English stop words and add a few extra noise tokens that
# should not show up in the clouds.
stop_words = stopwords.words('english')
stop_words.extend(['u', 'wa', 'ha', 'would', 'com'])
# Get a string of the legitimate request text only (label == 0).
# The variable keeps its historical name; it holds the legitimate requests here.
data_text_fake = ",".join(txt.lower() for txt in df.request[df.label==0])

# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50,
                      max_words=100,
                      stopwords=stop_words,
                      scale=5,
                      background_color="white").generate(data_text_fake)

# Display the generated image:
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# Title names the class actually plotted (label 0 = legitimate requests).
plt.title('Most repeated words in all legitimate requests',fontsize=15)
plt.show()

In [None]:
# Get a string of the malicious request text only (label == 1).
# The variable name is reused from the previous cell and overwrites its value.
data_text_fake = ",".join(txt.lower() for txt in df.request[df.label==1])

# Create and generate a word cloud image:
wordcloud = WordCloud(max_font_size=50,
                      max_words=100,
                      stopwords=stop_words,
                      scale=5,
                      background_color="white").generate(data_text_fake)

# Display the generated image:
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# The title previously repeated the label-0 caption; this cloud shows the
# malicious class, so say so.
plt.title('Most repeated words in all malicious requests',fontsize=15)
plt.show()

In [None]:
# Number of rows per label value, i.e. the class balance of the dataset.
df['label'].value_counts()

In [None]:
# Plot the count of malicious and legitimate requests: one bar per `label`
# value, as a visual check of the class balance.
sns.countplot(x='label', data=df, palette = "Set1")

In [None]:
import spacy
import string
# Load the small English spaCy pipeline used by spacy_tokenizer below.
nlp = spacy.load("en_core_web_sm")
# NOTE: this rebinds `stop_words`, replacing the NLTK-based list built for the
# word clouds with spaCy's default stop words.
stop_words = nlp.Defaults.stop_words
# A single string holding the ASCII punctuation characters.
punctuations = string.punctuation

In [None]:
# Text pre-processing helper built on the spaCy pipeline
def spacy_tokenizer(sentence):
    """Normalise one sentence with spaCy and return it as a single string.

    Every token is lemmatised, lower-cased and stripped of surrounding
    whitespace. Tokens found in `stop_words` or in `punctuations` are
    discarded and the remaining lemmas are joined with single spaces.
    The result is a string, not a list of tokens.
    """
    lemmas = (token.lemma_.lower().strip() for token in nlp(sentence))

    kept = []
    for lemma in lemmas:
        # `punctuations` is assigned from string.punctuation (a str), so the
        # second test is a substring check; it also rejects empty lemmas.
        if lemma in stop_words or lemma in punctuations:
            continue
        kept.append(lemma)

    return " ".join(kept)

In [None]:
# Pre-process every request text; spacy_tokenizer is called once per row and
# the cleaned string is stored in a new `tokenize` column.
df['tokenize'] = df['request'].apply(spacy_tokenizer)

In [None]:
# Install the sentence-transformers package needed by the embedding cells below.
!pip install sentence_transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used to turn each pre-processed request into a vector.
# NOTE: the name `model` is rebound to the ALBERT classifier further down.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Encode all pre-processed requests in one batched call instead of invoking
# model.encode() once per row through .apply(); the encoder batches a list of
# sentences internally, which is far faster on large frames. Values may differ
# from per-row encoding only at floating-point noise level.
embedding_matrix = model.encode(df['tokenize'].tolist())
# Keep the original column layout: one 1-D vector per row, aligned on df's index.
df['embeddings'] = pd.Series(list(embedding_matrix), index=df.index)
df.head()

In [None]:
# Features are the per-row embedding vectors, targets are the labels; both are
# converted to plain Python lists for scikit-learn.
X = df['embeddings'].to_list()
y = df['label'].to_list()

In [None]:
from sklearn.model_selection import train_test_split
# Stratified 80/20 split of the embedding features. The shuffle seed is fixed
# so the split, and therefore the reported metrics, is reproducible across
# kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# **LogisticRegression Model**

In [None]:
# Train a logistic-regression baseline (default hyper-parameters) on the
# sentence-embedding features.
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression()
LR.fit(X_train,y_train)

In [None]:
from sklearn import metrics
# Score the baseline on the held-out embedding split. Precision and recall are
# computed with scikit-learn's default settings for these scorers.
predicted = LR.predict(X_test)
print("Logistic Regression Accuracy:",metrics.accuracy_score(y_test, predicted))
print("Logistic Regression Precision:",metrics.precision_score(y_test, predicted))
print("Logistic Regression Recall:",metrics.recall_score(y_test, predicted))

# **Albert Model**

In [None]:
pip install transformers[torch]

In [None]:
# Upgrade transformers to the latest release (unpinned, so the installed
# version depends on when the notebook is run).
!pip install --upgrade transformers

In [None]:
# Upgrade accelerate alongside transformers (also unpinned).
!pip install --upgrade accelerate

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import softmax
from sklearn.metrics import precision_recall_fscore_support, classification_report, roc_auc_score
from sklearn.metrics import f1_score, roc_auc_score, precision_score, recall_score, accuracy_score, confusion_matrix

from transformers import Trainer, TrainingArguments
from transformers.file_utils import cached_property
from typing import Tuple


In [None]:
# Hugging Face checkpoint id the tokenizer and classifier are loaded from.
model_name = "albert/albert-base-v2"
# Drive folder used as the base of the Trainer output directory.
your_path = "/content/drive/MyDrive/Classification Model/AlbertClassify-maliciousContent2"

In [None]:
from transformers import AlbertForSequenceClassification, AlbertTokenizer

# Load the pretrained tokenizer and a sequence-classification model from the
# same checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(model_name)
# NOTE: rebinds `model`, which previously held the SentenceTransformer encoder.
model = AlbertForSequenceClassification.from_pretrained(model_name)

In [None]:
class EncodeDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer output, with optional labels.

    `encodings` is a mapping from field name to a per-example sequence and must
    contain an 'input_ids' entry, which defines the dataset length. When
    `labels` is given, each item also carries a 'labels' entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is taken from the 'input_ids' field.
        return len(self.encodings['input_ids'])

    def __getitem__(self, idx):
        # Slice the idx-th entry out of every encoded field.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx]
        if self.labels is None:
            return sample
        sample['labels'] = self.labels[idx]
        return sample

In [None]:
# The X_train / X_test produced by the embedding split are plain lists of
# vectors, and X_valid is not defined by any earlier cell, so `.request` and
# `.label` cannot be resolved on them. Re-split the raw frame into
# train / validation / test DataFrames for the ALBERT pipeline.
# Proportions are 70 / 10 / 20 (0.125 of the remaining 80% = 10% overall);
# both splits are stratified on the label and seeded for reproducibility.
albert_frame = df[['request', 'label']]
X_train, X_test = train_test_split(albert_frame,
                                   test_size=0.2,
                                   stratify=albert_frame['label'],
                                   random_state=42)
X_train, X_valid = train_test_split(X_train,
                                    test_size=0.125,
                                    stratify=X_train['label'],
                                    random_state=42)

# Tokenize the training requests (truncated to 64 tokens, padded to the longest
# sequence in this call) and pair them with their labels.
train_dataset = EncodeDataset(tokenizer(X_train.request.tolist(),
                                        max_length=64,
                                        truncation=True,
                                        padding='longest'), X_train.label.tolist())

In [None]:
# Validation set: tokenize the requests (truncated to 64 tokens, padded to the
# longest sequence in this call) and pair them with their labels.
# Expects X_valid to expose `.request` and `.label` (e.g. a DataFrame with
# those columns).
eval_dataset = EncodeDataset(tokenizer(X_valid.request.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), X_valid.label.tolist())

In [None]:
# Test set: tokenize the requests (truncated to 64 tokens, padded to the
# longest sequence in this call) and pair them with their labels.
# Expects X_test to expose `.request` and `.label` (e.g. a DataFrame with
# those columns).
test_dataset = EncodeDataset(tokenizer(X_test.request.tolist(),
                                       max_length=64,
                                       truncation=True,
                                       padding='longest'), X_test.label.tolist())

In [None]:
# Prefer the GPU when one is available and fall back to the CPU otherwise, so
# the override below never hands out a CUDA device on a CPU-only runtime.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class TrAr(TrainingArguments):
    """TrainingArguments variant whose `_setup_devices` returns the module-level `device`.

    NOTE(review): the visible cells build `training_args` from plain
    TrainingArguments, so this subclass appears unused — confirm before relying on it.
    """
    @cached_property
    def _setup_devices(self) -> "torch.device":
        # Returns a single device object (the annotation previously claimed a tuple).
        return device

In [None]:
# Pick the first CUDA device when available, otherwise the CPU, and move the
# classifier onto it. This rebinds the module-level `device`.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
print(device)
# Trailing semicolon suppresses the model repr in the cell output.
model.to(device);

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for a prediction object.

    `pred` must expose `label_ids` (ground-truth labels) and `predictions`
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)

    # The fourth element of the tuple (support) is not needed.
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    accuracy = accuracy_score(y_true, y_hat)

    return dict(
        accuracy=accuracy,
        f1=weighted[2],
        precision=weighted[0],
        recall=weighted[1],
    )

In [None]:
# transformers renamed the `evaluation_strategy` argument to `eval_strategy`,
# and newer releases reject the old keyword. Because the notebook upgrades
# transformers to whatever is current, pick the keyword this installed version
# actually declares so the cell works on both old and new releases.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in getattr(TrainingArguments, '__dataclass_fields__', {})
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir=your_path+'/FINAL_VERS',   # output directory
    num_train_epochs=3,
    warmup_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    logging_steps=10,
    eval_steps=100,                       # evaluate every 100 optimizer steps
    learning_rate=2e-5,
    save_steps=1000,
    gradient_accumulation_steps=2,        # effective train batch = 16 * 2 per device
    **{eval_strategy_key: 'steps'},       # step-based evaluation schedule
    )

In [None]:
# Wire the model, arguments, datasets and metric callback into a Trainer.
# NOTE(review): newer transformers releases may prefer a different keyword than
# `tokenizer=` for passing the tokenizer — confirm against the installed version.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
   args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset,           # evaluation dataset
    tokenizer=tokenizer,
    compute_metrics  = compute_metrics   # called on each evaluation pass
)

In [None]:
# Fine-tune the classifier with the configuration above.
trainer.train()

In [None]:
# Run the fine-tuned model on the held-out test set; the result is consumed by
# get_metrics below via its `.predictions` and `.label_ids` attributes.
pred = trainer.predict(test_dataset)

In [None]:
# Report evaluation metrics for a set of predictions against their labels
def get_metrics(preds):
    """Print evaluation metrics for a prediction object and plot its confusion matrix.

    `preds` must expose `.predictions` (two score columns per example) and
    `.label_ids` (ground-truth labels). Prints weighted precision / recall /
    F1 of the argmax predictions, the ROC AUC of the softmax probability of
    class 1, a classification report and a small table of metrics at several
    probability thresholds, then draws a confusion-matrix heatmap.

    Returns the weighted F1 score of the argmax predictions.
    """
    logits, y_true = preds.predictions, preds.label_ids

    # Hard decisions: highest-scoring class per example.
    y_pred = np.argmax(logits, axis=1).flatten()
    weighted = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    pr, rec, f = weighted[0], weighted[1], weighted[2]

    print("precision", pr)
    print("recall", rec)
    print("fscore_weighted", f)

    # Soft decisions: turn the two score columns into probabilities so the
    # class-1 probability can be thresholded.
    probs = softmax(np.array([[float(a), float(b)] for a, b in logits]), axis = 1)
    print("roc_auc", roc_auc_score(y_true, probs[:, 1]))

    rows = []
    for threshold in [0.7,0.8,0.9, 1]:
        thresholded = (probs[:, 1] >= threshold).astype(int)
        rows.append([
            threshold,
            round(f1_score(y_true, thresholded, average='weighted'),2),
            round(precision_score(y_true, thresholded),2),
            round(recall_score(y_true, thresholded),2),
            round(accuracy_score(y_true, thresholded),2),
        ])

    # Best weighted F1 first.
    df_metrics = pd.DataFrame(
        data = rows, columns = ['threshold','f1','prec','rec','acc']
    ).sort_values(by='f1', ascending=False)

    print(classification_report(y_true, y_pred))

    print(df_metrics.head())

    # Rows/columns are ordered [1, 0] to match the tick labels set below.
    cm = confusion_matrix(y_true, y_pred, labels=[1,0])
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax = ax, cmap='Blues', fmt="d")

    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    for axis in (ax.xaxis, ax.yaxis):
        axis.set_ticklabels(['Malicious', 'Legitimate'])

    return f

# Evaluate the test-set predictions; the returned weighted F1 is shown as the
# cell output.
get_metrics(pred)