In [1]:
from google.colab import drive
drive.mount("/content/drive")
data_dir = "/content/drive/My Drive/studia/PW/NLP"

Mounted at /content/drive


In [2]:
training_percent = 0.8
testing_percent = 0.2
dataset_path = "/content/drive/My Drive/studia/PW/NLP/Phishing_Email.csv"
intended_device = "gpu" #cpu or gpu
learning_rate = 2e-5
rand_seed = 2025
max_len = 80
batch_size = 32
epochs = 2

In [3]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
import nltk
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

In [4]:
def preprocess_dataset(path):
    nltk.download('stopwords')
    sw = stopwords.words('english')
    df = pd.read_csv(path)

    df['Email Text'] = df['Email Text'].astype('str')
    df['Email Type'] = df['Email Type'].astype('str')
    mapping = {'Safe Email': 0, 'Phishing Email': 1}
    df['Email Type'] = df['Email Type'].map(mapping)
    df.rename(columns={'Email Type': 'label'}, inplace=True)
    df.rename(columns={'Email Text': 'text'}, inplace=True)

    #clean message bodies
    df['text'] = df['text'].apply(lambda x: clean_text(x, sw))
    return df


def clean_text(text,sw):

    # delete new lines and tabs
    text = text.replace("\n", " ").replace("\t", " ").strip()
    #lowercase
    text = text.lower()
    #change not used symbols to space
    text = re.sub(r"[^a-zA-Z?.!,$]+", " ", text)
    #remove links
    text = re.sub(r"http\S+", "",text)
    text = re.sub(r"http", "",text)
    text = re.sub(r"enron", "",text)
    #remove html tags
    html=re.compile(r'<.*?>')
    text = html.sub(r'',text)
    #remove punctuations
    #punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_' + "#"
    #for p in punctuations:
    #    text = text.replace(p,'')
    #text = re.sub(r'[{}]'.format(re.escape(punctuations)), '', text)
    text = [word.lower() for word in text.split() if word.lower() not in sw]
    text = " ".join(text) #removing stopwords
    return text

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

def calc_max_len(text):
    # get length of all the messages in the train set
    seq_len = [len(i.split()) for i in text]
    hist, bin_edges = np.histogram(seq_len, bins=len(seq_len))
    hist_table = pd.DataFrame({'Message lenght': [f"{int(bin_edges[i])} - {int(bin_edges[i+1])}" for i in range(len(bin_edges)-1)],'Frequency': hist})
    print(hist_table)

    # Calculate cumulative frequency
    cumulative_frequency = np.cumsum(hist)

    # Total frequency
    total_frequency = cumulative_frequency[-1]

    # Find the bin edge that covers the desired percentage
    cutoff_frequency = total_frequency * 0.5 #0.9
    bin_index = np.searchsorted(cumulative_frequency, cutoff_frequency)

    # Return the upper edge of the bin covering the specified percentage
    return int(bin_edges[bin_index + 1])


def main():
    if intended_device == "gpu":
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    elif intended_device == "cpu":
        device = torch.device("cpu")
    print("Using device: "+str(device))
    #preprocess
    df = preprocess_dataset(dataset_path)

    print("Loaded dataset, info:")
    print(df.info())

    emails = df.text.values
    labels = df.label.values

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    input_ids = []
    attention_masks = []

    #max_len = calc_max_len(emails)
    print(max_len)


    # for each email tokenize, add [CLS] and [SEP] tokens, maps tokens to ids, pad or truncate and create attention masks for [PAD] tokens
    for email in emails:
        encoded_dict = tokenizer.encode_plus(
                            email,
                            add_special_tokens = True, #[CLS] and [SEP]
                            max_length = max_len, #pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,   #attention masks.
                            return_tensors = 'pt',     # Return pytorch tensors
                    )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    #convert lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)

    #convert to TensorDataset.
    dataset = TensorDataset(input_ids, attention_masks, labels)

    #training/testing set split
    train_size = int(training_percent * len(dataset))
    val_size = int(testing_percent * len(dataset))

    #split randomly
    train_dataset, tmp_dataset = random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(rand_seed))
    val_dataset, test_dataset_tmp = random_split(tmp_dataset, [int(training_percent * len(tmp_dataset)), int(testing_percent * len(tmp_dataset))], generator=torch.Generator().manual_seed(rand_seed))

    #DataLoaders for training and validation sets.
    #samples in random order.
    train_dataloader = DataLoader(
                train_dataset,
                sampler = RandomSampler(train_dataset),
                batch_size = batch_size
                )

    #for validation sequentially.
    validation_dataloader = DataLoader(
                val_dataset,
                sampler = SequentialSampler(val_dataset),
                batch_size = batch_size
                )

    #load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased", #12-layer BERT model, with an uncased vocab
        num_labels = 2, #2 for binary classification.
        output_attentions = False, #whether the model returns attentions weights.
        output_hidden_states = False, #whether the model returns all hidden-states.
    )

    model = model.to(device)

    optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = 1e-8 #default
                )

    #total number of training steps
    total_steps = len(train_dataloader) * epochs

    #learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0, # Default value in run_glue.py
                                                num_training_steps = total_steps)



    random.seed(rand_seed)
    np.random.seed(rand_seed)
    torch.manual_seed(rand_seed)
    torch.cuda.manual_seed_all(rand_seed)
    training_stats = []
    total_t0 = time.time()

    # For each epoch...
    for epoch_i in range(0, epochs):
        #training: perform one full pass over the training set.
        print("")
        print('Epoch {:} / {:} '.format(epoch_i + 1, epochs))
        print('Training...')
        # Measure how long the training epoch takes.
        t0 = time.time()
        total_train_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Unpack this training batch from our dataloader.
            # As we unpack the batch, we'll also copy each tensor to the device using the `to` method.
            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            optimizer.zero_grad()
            output = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)
            loss = output.loss
            total_train_loss += loss.item()
            # Perform a backward pass to calculate the gradients.
            loss.backward()
            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)
        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))
        #               Validation
        # After the completion of each training epoch, measure our performance on
        # our validation set.
        print("")
        print("Running Validation...")
        t0 = time.time()
        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()
        # Tracking variables
        total_eval_accuracy = 0
        best_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0
        # Evaluate data for one epoch
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():
                output= model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask,
                                    labels=b_labels)
            loss = output.loss
            total_eval_loss += loss.item()
            # Move logits and labels to CPU if we are using GPU
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)
        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)
        if avg_val_accuracy > best_eval_accuracy:
            torch.save(model, 'bert_model')
            best_eval_accuracy = avg_val_accuracy
        #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        #print("  Validation took: {:}".format(validation_time))
        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch_i + 1,
                'Training Loss': avg_train_loss,
                'Valid. Loss': avg_val_loss,
                'Valid. Accur.': avg_val_accuracy,
                'Training Time': training_time,
                'Validation Time': validation_time
            }
        )
    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

    model = torch.load('bert_model')

    test_input_ids, test_attention_masks, test_labels = zip(*test_dataset_tmp)
    # Tworzenie nowego TensorDataset bez etykiet
    test_dataset = TensorDataset(torch.stack(test_input_ids), torch.stack(test_attention_masks))
    test_labels_list = [label.item() for label in test_labels]


    test_dataloader = DataLoader(
                test_dataset, # The validation samples.
                sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
                batch_size = batch_size # Evaluate with this batch size.
            )


    predictions = []
    for batch in test_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            with torch.no_grad():
                output= model(b_input_ids,
                                    token_type_ids=None,
                                    attention_mask=b_input_mask)
                logits = output.logits
                logits = logits.detach().cpu().numpy()
                pred_flat = np.argmax(logits, axis=1).flatten()

                predictions.extend(list(pred_flat))


    print("Test results: ")
    conf_matrix = confusion_matrix(test_labels_list, predictions)

    confusion_df = pd.DataFrame(
        conf_matrix,
        index=["Actual Negative", "Actual Positive"],
        columns=["Predicted Negative", "Predicted Positive"]
    )

    # Display the labeled confusion matrix
    print("Confusion Matrix:")
    print(confusion_df)


    # Generowanie raportu
    report = classification_report(test_labels_list, predictions, target_names=["Negative", "Positive"])
    print("\nClassification Report:")
    print(report)

In [None]:
main()

Using device: cpu


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Loaded dataset, info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18650 entries, 0 to 18649
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  18650 non-null  int64 
 1   text        18650 non-null  object
 2   label       18650 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 437.2+ KB
None


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


80




model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Epoch 1 / 2 
Training...
