In [15]:
import pandas as pd
import numpy as np
import spacy
import re
import random
import time

from sklearn.feature_extraction.text import CountVectorizer
import torch

from datasets import load_dataset



In [2]:
from transformers import BertModel, BertConfig,BertTokenizer, BertForSequenceClassification,AdamW, get_linear_schedule_with_warmup
from transformers.models.bert.modeling_bert import BertForPreTrainingOutput
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
#configuration = BertConfig()
#print(configuration)
#model = BertModel(configuration)
#print(model)
#config = model.config    
#print(config)

In [None]:

#dataset = load_dataset(
#   'imdb')


#dataset['train'].to_csv('imdb_train.csv')
#dataset['test'].to_csv('imdb_test.csv')

In [None]:
#dataset['train'][0]

In [4]:
df_train = pd.read_csv('imdb_train.csv')
df_test = pd.read_csv('imdb_test.csv')

print(df_train.head(10))
print(df_test.head(10))

   Unnamed: 0                                               text  label
0           0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1           1  "I Am Curious: Yellow" is a risible and preten...      0
2           2  If only to avoid making this type of film in t...      0
3           3  This film was probably inspired by Godard's Ma...      0
4           4  Oh, brother...after hearing about this ridicul...      0
5           5  I would put this at the top of my list of film...      0
6           6  Whoever wrote the screenplay for this movie ob...      0
7           7  When I first saw a glimpse of this movie, I qu...      0
8           8  Who are these "They"- the actors? the filmmake...      0
9           9  This is said to be a personal film for Peter B...      0
   Unnamed: 0                                               text  label
0           0  I love sci-fi and am willing to put up with a ...      0
1           1  Worth the entertainment value of a rental, esp...

In [5]:
df_train['label'].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [6]:
df_test['label'].value_counts()

1    12500
0    12500
Name: label, dtype: int64

In [7]:
df_train['text'].to_csv("df_train.txt",index=False,header=True)
df_test['text'].to_csv("df_test.txt",index=False,header=True)


In [7]:
train_input = pd.read_csv("df_train.txt",delimiter="\t")
print("Training Data.....")
print(len(train_input))
print(train_input.iloc[0,:])

test_input = pd.read_csv("df_test.txt",delimiter="\t")
print("Test Data.....")
print(len(test_input))
print(test_input.iloc[0,:])

Training Data.....
25000
text    I rented I AM CURIOUS-YELLOW from my video sto...
Name: 0, dtype: object
Test Data.....
25000
text    I love sci-fi and am willing to put up with a ...
Name: 0, dtype: object


In [8]:
vocab = BertForPreTrainingOutput()
print(vocab)
token = BertTokenizer("df_train.txt")
print(token)

BertForPreTrainingOutput(loss=None, prediction_logits=None, seq_relationship_logits=None, hidden_states=None, attentions=None)
PreTrainedTokenizer(name_or_path='', vocab_size=24905, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})


In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [10]:
def preprocessing_data(df_input): 
    
    Ids = []
    Masks = []

    for i in range(len(df_input)):
        
        inputs = tokenizer.encode_plus(str(df_input.iloc[i,:]),add_special_tokens=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors="pt")
        Ids.append(inputs.get('input_ids'))
        Masks.append(inputs.get('attention_mask'))
    
    Ids = torch.cat(Ids)
    Masks = torch.cat(Masks)
    return Ids, Masks

In [23]:
### Finding the loss based on the predicted label and target.

def loss_function_1(logit,label):
    predict = np.argmax(logit['logits'].detach().numpy(),axis=1).flatten()
    labels = label.numpy()
    loss = np.sum(predict==labels)/len(labels)
    return loss

def loss_function_2(logit,label):
    func = torch.nn.CrossEntropyLoss()
    loss = func(logit['logits'],label)
    return(loss)
    

In [None]:
##### As we have a huge input data, before training the entire data, we will take a sample of 100 records and train the model.
#### This section will take first 100 records from train data and test the model and find the loss.
#### Then predict label of 50 sample records from train data and learn the metrics. 

sample_input = train_input[0:100]
sample_ids,sample_masks = preprocessing_data(sample_input)
sample_label = torch.tensor(df_train['label'][0:100])

batch_size = 32

sample_data = TensorDataset(sample_ids, sample_masks, sample_label)
sample_sampler = RandomSampler(sample_data)
sample_dataloader = DataLoader(sample_data, sampler=sample_sampler, batch_size=batch_size)


# Get the GPU device name.
device = torch.device("cuda")

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


loss_1 = 0
loss_2 = 0
sample_loss_1 = 0
sample_loss_2 = 0

epochs = 5


seed_num = 42

random.seed(seed_num)
np.random.seed(seed_num)
torch.manual_seed(seed_num)


for epoch in range(0,epochs):
    
    initial_time = time.time()
    
    print(f"epochs: {epoch} ")
    print("   ")
    
    for sample_step,sample_batch in enumerate(sample_dataloader):
        
      
        sample_input_ids = sample_batch[0].to(device)    
        sample_atten_masks = sample_batch[1].to(device)
        sample_labels = sample_batch[2].to(device)

        sample_model = BertForSequenceClassification.from_pretrained(
                "bert-base-uncased", 
                num_labels = 2,                     
                output_attentions = False, 
                output_hidden_states = False, 
                    )
        
        total_steps = len(sample_dataloader) * epoch

        sample_optimizer = AdamW(sample_model.parameters(),
                          lr=5e-5,    # Default learning rate
                          eps=1e-8    # Default epsilon value
                          )
        
        sample_scheduler = get_linear_schedule_with_warmup(sample_optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
        sample_model.zero_grad()

        sample_logits = sample_model(sample_input_ids,sample_atten_masks)

        loss_1 = loss_function_1(sample_logits,sample_labels)
        sample_loss_1 += loss_1
        loss_2 = loss_function_2(sample_logits,sample_labels)
        sample_loss_2 += loss_2.item()

        loss_2.backward()

        avg_train_loss = sample_loss_2 / len(sample_dataloader)

        torch.nn.utils.clip_grad_norm_(sample_model.parameters(), 1.0)

        sample_optimizer.step()

        sample_scheduler.step()
        
        final_time = time.time() - initial_time
        
        if (sample_step % 20 and step != 0):            
                        
            print(f"step: {sample_step},  sample_batch: {sample_batch}, average_training_loss: {avg_train_loss}, time_taken: {final_time}")



No GPU available, using the CPU instead.
epochs: 0 
   


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

epochs: 1 
   


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
## Get the Input Ids and Attention Masks for train and test data.

train_ids, train_masks = preprocessing_data(train_input)
test_ids, test_masks = preprocessing_data(test_input)

In [None]:
train_label = torch.tensor(df_train['label'])
test_label  = torch.tensor(df_test['label'])


In [None]:
## Create Train and Test dataset using DataLoader. 

batch_size = 32

train_data = TensorDataset(train_ids, train_masks, train_label)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)


test_data = TensorDataset(test_ids, test_masks, test_label)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

In [None]:
# Get the GPU device name.
device = torch.device("cuda")

if torch.cuda.is_available():       
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


for step,batch in enumerate(train_dataloader):
    
    input_ids = batch[0].to(device)    
    atten_masks = batch[1].to(device)
    labels = batch[2].to(device)
    
    model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased", 
            num_labels = 2,                     
            output_attentions = False, 
            output_hidden_states = False, 
                )
    
    optimizer = AdamW(model.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )
    model.zero_grad()
    
    loss,logits = model(input_ids,atten_masks,labels=labels)
    
    print(logits)