In [1]:
from google.colab import drive
drive.mount("/content/drive")
data_dir = "/content/drive/My Drive/studia/PW/NLPDATA/"

Mounted at /content/drive


In [2]:
import sys
sys.path.append('/content/drive/studia/PW/NLPDATA')

In [3]:
# training dataset
PARAM2 = data_dir + "7000_training_dataset.csv"
# test dataset
PARAM3 = data_dir + "500_test_dataset.csv"
# this is variable for spliting training dataset into train and validation
# test data set is already spited to other file
PARAM4 = 0.2
# numers of epochs
PARAM5 = 4
# clean stopwords
PARAM6= 1
# batch_size
PARAM7 = 16
# AdamW lr
PARAM8 = 1e-3
# Max_seq_len
PARAM9 = 512

In [4]:
# import config and class and function

# import libraries
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import transformers
from transformers import AutoModel, BertTokenizerFast
import time
import datetime
import gc
import random
import nltk
import re
import ssl
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

# this part is downloading stopwoards to clean dataset before learning
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('stopwords')
from nltk.corpus import stopwords

device = torch.device("cuda")
print(device)

class BERT_Arch(nn.Module):

    def __init__(self, bert):
      super(BERT_Arch, self).__init__()
      self.bert = bert
      # dropout layer
      self.dropout = nn.Dropout(0.1)
      # relu activation function
      self.relu =  nn.ReLU()
      # dense layer 1
      self.fc1 = nn.Linear(768,512)
      # dense layer 2 (Output layer)
      self.fc2 = nn.Linear(512,2)
      #softmax activation function
      self.softmax = nn.LogSoftmax(dim=1)

    #define the forward pass
    def forward(self, sent_id, mask):
      #pass the inputs to the model
#      _, cls_hs = self.bert(sent_id, attention_mask=mask)
      cls_hs = self.bert(sent_id, attention_mask=mask)[1]
      x = self.fc1(cls_hs)
      x = self.relu(x)
      x = self.dropout(x)
      # output layer
      x = self.fc2(x)
      # apply softmax activation
      x = self.softmax(x)
      return x

sw = stopwords.words('english')

def clean_text(text, clean_stopwords):
    text = text.lower() # because using bert-base-uncased
    text = re.sub(r"http", "",text) #Removing URLs
    text = re.sub(r"enron", "",text) #Removing enron
    text = re.sub(r"[^a-zA-Z?.!,¿$]+", " ", text) #Removing special characters
    html = re.compile(r'<.*?>')
    text = html.sub(r'',text) #Removing html tags
    #remove punctuations
#    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`" + '_' + "#"
#    for p in punctuations:
#        text = text.replace(p,'')
#    text = re.sub(r'[{}]'.format(re.escape(punctuations)), '', text)
    if clean_stopwords:
        text = [word for word in text.split() if word not in sw]
    text = " ".join(text) #removing stopwords
    return text


def import_data():
    # This part is responsible for importing and preparing data
    df_training = pd.read_csv(PARAM2)
    df_test = pd.read_csv(PARAM3)

    df_training['text'] = df_training['text'].astype('str')
    df_training['text'] = df_training['text'].apply(lambda x: clean_text(x, PARAM6))

    # check class distribution
    print(df_training['label'].value_counts(normalize = True))
    return df_training, df_test


def format_time(seconds):
    """Convert seconds into a formatted string (e.g., HH:MM:SS)."""
    mins, secs = divmod(seconds, 60)
    hours, mins = divmod(mins, 60)
    return f"{int(hours):02}:{int(mins):02}:{int(secs):02}"


# function for evaluating the model
def evaluate(model, val_dataloader, cross_entropy):
  print("\nEvaluating...")
  # deactivate dropout layers
  model.eval()
  total_loss, total_accuracy = 0, 0
  # empty list to save the model predictions
  total_preds = []
  t0 = time.time()
  # iterate over batches
  for step,batch in enumerate(val_dataloader):
    # Progress update every 50 batches.
    if step % 50 == 0 and not step == 0:
      # Calculate elapsed time in minutes.
      elapsed = format_time(time.time() - t0)
      t0 = time.time()
      # Report progress.
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))
      print(elapsed)
    # push the batch to device
    batch = [t.to(device) for t in batch]
    sent_id, mask, labels = batch
    # deactivate autograd
    with torch.no_grad():
      # model predictions
      preds = model(sent_id, mask)
      # compute the validation loss between actual and predicted values
      loss = cross_entropy(preds,labels)
      total_loss = total_loss + loss.item()
      preds = preds.detach().cpu().numpy()
      total_preds.append(preds)
  # compute the validation loss of the epoch
  avg_loss = total_loss / len(val_dataloader)
  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)
  return avg_loss, total_preds


# function to train the model
def train(model, train_dataloader, optimizer, cross_entropy):
  model.train()
  total_loss, total_accuracy = 0, 0
  # empty list to save model predictions
  total_preds=[]
  t0 = time.time()
  # iterate over batches
  for step,batch in enumerate(train_dataloader):
    # progress update after every 50 batches.
    if step % 50 == 0 and not step == 0:
      print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))
    # push the batch to device
    batch = [r.to(device) for r in batch]
    sent_id, mask, labels = batch
    # clear previously calculated gradients
    model.zero_grad()
    # get model predictions for the current batch
    preds = model(sent_id, mask)
    # compute the loss between actual and predicted values
    loss = cross_entropy(preds, labels)
    # add on to the total loss
    total_loss = total_loss + loss.item()
    # backward pass to calculate the gradients
    loss.backward()
    # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    # update parameters
    optimizer.step()
    # model predictions are stored on GPU. So, push it to CPU
    preds=preds.detach().cpu().numpy()
    # append the model predictions
    total_preds.append(preds)
  elapsed = format_time(time.time() - t0)
  t0 = time.time()
  print(elapsed)
  # compute the training loss of the epoch
  avg_loss = total_loss / len(train_dataloader)
  # predictions are in the form of (no. of batches, size of batch, no. of classes).
  # reshape the predictions in form of (number of samples, no. of classes)
  total_preds  = np.concatenate(total_preds, axis=0)
  #returns the loss and predictions
  return avg_loss, total_preds


def main():

    df_training, df_test = import_data()
    # prepare training set and validation set
    train_text, val_text, train_labels, val_labels = train_test_split(
        df_training['text'],
        df_training['label'],
        random_state=2018,
        test_size=PARAM4,
        stratify=df_training['label'])
    # prepare test set
    test_text = df_test['text']  # Features (email text)
    test_text = test_text.astype('str')
    test_labels = df_test['label']  # Labels (e.g., ham/spam or 0/1)


    # import BERT-base pretrained model
    bert = AutoModel.from_pretrained('bert-base-uncased')
    # Load the BERT tokenizer
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


    # get length of all the messages in the train set
    seq_len = [len(i.split()) for i in train_text]
    hist, bin_edges = np.histogram(seq_len, bins=len(seq_len))
    hist_table = pd.DataFrame({'Message lenght': [f"{int(bin_edges[i])} - {int(bin_edges[i+1])}" for i in range(len(bin_edges)-1)],'Frequency': hist})
    cumulative_frequency = np.cumsum(hist)
    # Total frequency
    total_frequency = cumulative_frequency[-1]
    # Find the bin edge that covers the desired percentage
    cutoff_frequency = total_frequency * 0.9
    bin_index = np.searchsorted(cumulative_frequency, cutoff_frequency)
    # Return the upper edge of the bin covering the specified percentage
    if int(bin_edges[bin_index + 1]) > int(PARAM9):
        print("max len wiekszy, prawda")
        max_seq_len = int(PARAM9)
    else:
        print("max len mniejszy falsz")
        max_seq_len = int(bin_edges[bin_index + 1])
    print(max_seq_len)


    # tokenize and encode sequences in the training set
    tokens_train = tokenizer.batch_encode_plus(
        train_text.tolist(),
        max_length = max_seq_len,
        padding="max_length",
        truncation=True,
        return_token_type_ids=False
    )
    # tokenize and encode sequences in the validation set
    tokens_val = tokenizer.batch_encode_plus(
        val_text.tolist(),
        max_length = max_seq_len,
        padding="max_length",
        truncation=True,
        return_token_type_ids=False
    )
    # tokenize and encode sequences in the test set
    tokens_test = tokenizer.batch_encode_plus(
    test_text.tolist(),
    max_length = max_seq_len,
    padding="max_length",
    truncation=True,
    return_token_type_ids=False
    )
    # for train set
    train_seq = torch.tensor(tokens_train['input_ids'])
    train_mask = torch.tensor(tokens_train['attention_mask'])
    train_y = torch.tensor(train_labels.tolist())
    # for validation set
    val_seq = torch.tensor(tokens_val['input_ids'])
    val_mask = torch.tensor(tokens_val['attention_mask'])
    val_y = torch.tensor(val_labels.tolist())
    # for test set
    test_seq = torch.tensor(tokens_test['input_ids'])
    test_mask = torch.tensor(tokens_test['attention_mask'])
    test_y = torch.tensor(test_labels.tolist())


    #define a batch size
    batch_size = PARAM7
    # wrap tensors
    train_data = TensorDataset(train_seq, train_mask, train_y)
    # sampler for sampling the data during training
    train_sampler = RandomSampler(train_data)
    # dataLoader for train set
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    # wrap tensors
    val_data = TensorDataset(val_seq, val_mask, val_y)
    # sampler for sampling the data during training
    val_sampler = SequentialSampler(val_data)
    # dataLoader for validation set
    val_dataloader = DataLoader(val_data, sampler = val_sampler, batch_size=batch_size)

    # freeze all the parameters
    for param in bert.parameters():
        param.requires_grad = False


    # pass the pre-trained BERT to our define architecture
    model = BERT_Arch(bert)
    # push the model to GPU or CPU as selected in the begining
    model = model.to(device)
    # define the optimizer
    optimizer = AdamW(model.parameters(), lr = PARAM8)
    class_wts = compute_class_weight('balanced', classes=np.unique(train_labels), y=train_labels)
    print("Class wts ", class_wts)
    # convert class weights to tensor
    weights= torch.tensor(class_wts,dtype=torch.float)
    weights = weights.to(device)
    # loss function
    cross_entropy  = nn.NLLLoss(weight=weights)
    # number of training epochs
    epochs = PARAM5

    # set initial loss to infinite
    best_valid_loss = float('inf')
    # empty lists to store training and validation loss of each epoch
    train_losses=[]
    valid_losses=[]

# TRAIN MODEL
    #for each epoch
    for epoch in range(epochs):
        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
        train_loss, _ = train(model, train_dataloader, optimizer, cross_entropy) #train model
        valid_loss, _ = evaluate(model, val_dataloader, cross_entropy) #evaluate model
        #save the best model
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), 'saved_weights.pt')
        # append training and validation loss
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        print(f'\nTraining Loss: {train_loss:.3f}')
        print(f'Validation Loss: {valid_loss:.3f}')
# TEST MODEL
    #load weights of best model
    path = 'saved_weights.pt'
    model.load_state_dict(torch.load(path))
    # get predictions for test data
    with torch.no_grad():
        preds = model(test_seq.to(device), test_mask.to(device))
        preds = preds.detach().cpu().numpy()
    # model's performance
    preds = np.argmax(preds, axis = 1)
    print(classification_report(test_y, preds))

    # confusion matrix
    pd.crosstab(test_y, preds)

main()


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


cuda
label
0    0.605143
1    0.394857
Name: proportion, dtype: float64


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

max len mniejszy falsz
426


RuntimeError: Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx