In [1]:
!pip install transformers datasets evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m60.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>

In [2]:
# Importing the libraries needed
import pandas as pd
import torch
import transformers
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertModel, DistilBertTokenizer

In [3]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [4]:
# Import the csv into pandas dataframe and add the headers
# df = pd.read_csv('./data/newsCorpora.csv', sep='\t', names=['ID','TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df = pd.read_csv('train.csv')
# df.head()
# # Removing unwanted columns and only leaving title of news and the category which will be the target
# df = df[['TITLE','CATEGORY']]
df = df[['category', 'noisyTextDescription']]
# df.head()

# # Converting the codes to appropriate categories using a dictionary
# my_dict = {
#     'e':'Entertainment',
#     'b':'Business',
#     't':'Science',
#     'm':'Health'
# }

# def update_cat(x):
#     return my_dict[x]

# df['CATEGORY'] = df['CATEGORY'].apply(lambda x: update_cat(x))

encode_dict = {}

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

df['encode_cat'] = df['category'].apply(lambda x: encode_cat(x))

In [5]:
NUM_CATEGORIES = len(encode_dict)

In [6]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

In [7]:
class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.noisyTextDescription[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.encode_cat[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [8]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = Triage(train_dataset, tokenizer, MAX_LEN)
testing_set = Triage(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (21627, 3)
TRAIN Dataset: (17302, 3)
TEST Dataset: (4325, 3)


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, NUM_CATEGORIES)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [11]:
model = DistillBERTClass()
model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [12]:
# Creating the loss function and optimizer
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [13]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [16]:
# Defining the training function on the 80% of the dataset for tuning the distilbert model

def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%100==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 100 steps: {loss_step}")
            print(f"Training Accuracy per 100 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [17]:
for epoch in range(EPOCHS):
    train(epoch)

Training Loss per 100 steps: 1.7249985933303833
Training Accuracy per 100 steps: 50.0
Training Loss per 100 steps: 1.7913762165768312
Training Accuracy per 100 steps: 54.20792079207921
Training Loss per 100 steps: 1.8023425852926216
Training Accuracy per 100 steps: 55.472636815920396
Training Loss per 100 steps: 1.7671549383092957
Training Accuracy per 100 steps: 56.81063122923588
Training Loss per 100 steps: 1.7211507552505432
Training Accuracy per 100 steps: 57.98004987531172
Training Loss per 100 steps: 1.6819196656911435
Training Accuracy per 100 steps: 59.231536926147704
Training Loss per 100 steps: 1.6464695735600545
Training Accuracy per 100 steps: 60.56572379367721
Training Loss per 100 steps: 1.6054334724255532
Training Accuracy per 100 steps: 61.84022824536377
Training Loss per 100 steps: 1.5653174461868966
Training Accuracy per 100 steps: 62.92134831460674
Training Loss per 100 steps: 1.5359616835460017
Training Accuracy per 100 steps: 63.73473917869035
Training Loss per 100

In [26]:
def valid(model, testing_loader):
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    tr_loss = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%100==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    
    return epoch_accu


In [27]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
Validation Loss per 100 steps: 1.251245379447937
Validation Accuracy per 100 steps: 50.0




Validation Loss per 100 steps: 1.136500413538796
Validation Accuracy per 100 steps: 75.24752475247524
Validation Loss per 100 steps: 1.1780003362997848
Validation Accuracy per 100 steps: 74.12935323383084
Validation Loss per 100 steps: 1.176026630936271
Validation Accuracy per 100 steps: 74.4186046511628
Validation Loss per 100 steps: 1.1324862592442524
Validation Accuracy per 100 steps: 75.06234413965088
Validation Loss per 100 steps: 1.0876250431581054
Validation Accuracy per 100 steps: 76.24750499001996
Validation Loss per 100 steps: 1.095513837005999
Validation Accuracy per 100 steps: 76.37271214642263
Validation Loss per 100 steps: 1.0832101873490168
Validation Accuracy per 100 steps: 76.67617689015692
Validation Loss per 100 steps: 1.098886678616206
Validation Accuracy per 100 steps: 76.34207240948814
Validation Loss per 100 steps: 1.1027598477627143
Validation Accuracy per 100 steps: 76.19311875693674
Validation Loss per 100 steps: 1.0791728045646247
Validation Accuracy per 100 

RuntimeError: ignored

In [None]:
# import pandas as pd
# from datasets import Dataset
# import torch
# from torch import nn
# from torch.utils.data import Dataset, DataLoader
# import torch.nn.functional as F

In [None]:
# from torch import cuda
# device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
# from sklearn.model_selection import train_test_split

# from transformers import DistilBertModel, DistilBertTokenizer
# tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [None]:
# raw_data = pd.read_csv('train.csv')
# print(raw_data.iloc[1])
# train_texts = raw_data['noisyTextDescription']
# print(train_texts[1])

# cat_2_ind = {}
# ind_2_cat = {}
# def convert_category(category):
#     if category not in cat_2_ind:
#         cat_2_ind[category] = len(cat_2_ind)
#         ind_2_cat[cat_2_ind[category]] = category
#     return cat_2_ind[category]

# NUM_CATEGORIES = len(cat_2_ind)

# train_labels = raw_data['category'].apply(convert_category)
# print(train_labels[1])

# train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.2)
# print(train_texts[0])

# train_texts.reset_index(drop=True)
# val_texts.reset_index(drop=True)
# train_labels.reset_index(drop=True)
# val_labels.reset_index(drop=True)

In [None]:
# class DescriptionData(Dataset):
#     def __init__(self, texts, labels, tokenizer):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.len = len(self.labels)

#     def __getitem__(self, idx):
#         desc = self.texts.iloc[idx]
#         inputs = self.tokenizer.encode_plus(
#             desc,
#             None,
#             add_special_tokens=True,
#             max_length=512,
#             pad_to_max_length=True,
#             return_token_type_ids=True,
#             truncation=True
#         )
#         ids = inputs['input_ids']
#         mask = inputs['attention_mask']

#         return {
#             'ids': torch.tensor(ids, dtype=torch.long),
#             'mask': torch.tensor(mask, dtype=torch.long),
#             'targets': torch.tensor(self.labels[idx], dtype=torch.long)
#         }

#     def __len__(self):
#         return self.len

# train_dataset = DescriptionData(train_texts, train_labels, tokenizer)
# val_dataset = DescriptionData(val_texts, val_labels, tokenizer)

# TRAIN_BATCH_SIZE = 4
# VALID_BATCH_SIZE = 2
# EPOCHS = 1
# LEARNING_RATE = 1e-05

# train_params = {
#     'batch_size': TRAIN_BATCH_SIZE,
#     'shuffle': True,
#     'num_workers': 0
#     }

# val_params = {
#     'batch_size': VALID_BATCH_SIZE,
#     'shuffle': True,
#     'num_workers': 0
# }

# train_loader = DataLoader(train_dataset, **train_params)
# val_loader = DataLoader(val_dataset, **val_params)

In [None]:
# class BERTClass(nn.Module):
#     def __init__(self):
#         super(BERTClass, self).__init__()
#         self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
#         self.pre_classifier = torch.nn.Linear(768, 768)
#         self.dropout = torch.nn.Dropout(0.3)
#         self.classifier = torch.nn.Linear(768, NUM_CATEGORIES)

#     def forward(self, input_ids, attention_mask):
#         output_1 = self.bert(input_ids=input_ids, attention_mask=attention_mask)
#         hidden_state = output_1[0]
#         pooler = hidden_state[:, 0]
#         pooler = self.pre_classifier(pooler)
#         pooler = F.relu(pooler)
#         pooler = self.dropout(pooler)
#         output = self.classifier(pooler)
#         return output

# model = BERTClass()
# model.to(device)

In [None]:
# loss_function = torch.nn.CrossEntropyLoss()
# optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [None]:
# def calcuate_accu(big_idx, targets):
#     n_correct = (big_idx==targets).sum().item()
#     return n_correct

In [None]:
# def train(epoch):
#     tr_loss = 0
#     n_correct = 0
#     nb_tr_steps = 0
#     nb_tr_examples = 0
#     model.train()
#     for _,data in enumerate(train_loader, 0):
#         ids = data['ids'].to(device, dtype = torch.long)
#         mask = data['mask'].to(device, dtype = torch.long)
#         targets = data['targets'].to(device, dtype = torch.long)

#         outputs = model(ids, mask)
#         loss = loss_function(outputs, targets)
#         tr_loss += loss.item()
#         big_val, big_idx = torch.max(outputs.data, dim=1)
#         n_correct += calcuate_accu(big_idx, targets)

#         nb_tr_steps += 1
#         nb_tr_examples+=targets.size(0)
        
#         if _%5000==0:
#             loss_step = tr_loss/nb_tr_steps
#             accu_step = (n_correct*100)/nb_tr_examples 
#             print(f"Training Loss per 5000 steps: {loss_step}")
#             print(f"Training Accuracy per 5000 steps: {accu_step}")

#         optimizer.zero_grad()
#         loss.backward()
#         # # When using GPU
#         optimizer.step()

#     print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
#     epoch_loss = tr_loss/nb_tr_steps
#     epoch_accu = (n_correct*100)/nb_tr_examples
#     print(f"Training Loss Epoch: {epoch_loss}")
#     print(f"Training Accuracy Epoch: {epoch_accu}")

#     return 

In [None]:
# for epoch in range(EPOCHS):
#     train(epoch)

In [None]:
# def valid(model, testing_loader):
#     model.eval()
#     n_correct = 0; n_wrong = 0; total = 0
#     with torch.no_grad():
#         for _, data in enumerate(testing_loader, 0):
#             ids = data['ids'].to(device, dtype = torch.long)
#             mask = data['mask'].to(device, dtype = torch.long)
#             targets = data['targets'].to(device, dtype = torch.long)
#             outputs = model(ids, mask).squeeze()
#             loss = loss_function(outputs, targets)
#             tr_loss += loss.item()
#             big_val, big_idx = torch.max(outputs.data, dim=1)
#             n_correct += calcuate_accu(big_idx, targets)

#             nb_tr_steps += 1
#             nb_tr_examples+=targets.size(0)
            
#             if _%5000==0:
#                 loss_step = tr_loss/nb_tr_steps
#                 accu_step = (n_correct*100)/nb_tr_examples
#                 print(f"Validation Loss per 100 steps: {loss_step}")
#                 print(f"Validation Accuracy per 100 steps: {accu_step}")
#     epoch_loss = tr_loss/nb_tr_steps
#     epoch_accu = (n_correct*100)/nb_tr_examples
#     print(f"Validation Loss Epoch: {epoch_loss}")
#     print(f"Validation Accuracy Epoch: {epoch_accu}")
    
#     return epoch_accu


In [None]:
# print('This is the validation section to print the accuracy and see how it performs')
# print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

# acc = valid(model, testing_loader)
# print("Accuracy on test data = %0.2f%%" % acc)