In [None]:
import pandas as pd
import numpy as np
import spacy
import re

from sklearn.feature_extraction.text import CountVectorizer
import torch

from datasets import load_dataset



In [None]:
from transformers import BertModel, BertConfig,BertTokenizer, BertForSequenceClassification,AdamW, get_linear_schedule_with_warmup
from transformers.models.bert.modeling_bert import BertForPreTrainingOutput
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
#configuration = BertConfig()
#print(configuration)
#model = BertModel(configuration)
#print(model)
#config = model.config    
#print(config)

In [None]:

#dataset = load_dataset(
#   'imdb')


#dataset['train'].to_csv('imdb_train.csv')
#dataset['test'].to_csv('imdb_test.csv')

In [None]:
#dataset['train'][0]

In [None]:
# Load the two IMDB splits that were exported to CSV earlier, then preview
# the first ten rows of each (train first, test second).
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ('imdb_train.csv', 'imdb_test.csv')
)

print(df_train.head(10), df_test.head(10), sep="\n")

In [None]:
# Class balance of the training split: number of rows per label value.
df_train.loc[:, 'label'].value_counts()

In [None]:
# Class balance of the test split: number of rows per label value.
df_test.loc[:, 'label'].value_counts()

In [None]:
# Write only the review text of each split to its own file: a single
# header line followed by one record per review, without the index column.
df_train.loc[:, 'text'].to_csv(path_or_buf="df_train.txt", header=True, index=False)
df_test.loc[:, 'text'].to_csv(path_or_buf="df_test.txt", header=True, index=False)


In [None]:
# Read the text dumps back in. They are produced by Series.to_csv with its
# default comma separator and quoting, so they must be parsed with read_csv's
# default (comma) delimiter. Splitting on tabs instead breaks on any unquoted
# review that contains a literal tab character and can leave the text rows
# out of step with the labels that are taken from df_train / df_test.
train_input = pd.read_csv("df_train.txt")
print("Training Data.....")
print(len(train_input))
print(train_input.iloc[0,:])

test_input = pd.read_csv("df_test.txt")
print("Test Data.....")
print(len(test_input))
print(test_input.iloc[0,:])

In [None]:
# Exploratory cell: instantiates an (empty) BertForPreTrainingOutput and a
# BertTokenizer built from "df_train.txt", printing each object.
# NOTE(review): BertForPreTrainingOutput looks like a model-output container,
# not a vocabulary, so the name `vocab` is misleading -- confirm the intent.
vocab = BertForPreTrainingOutput()
print(vocab)
# NOTE(review): the first positional argument of BertTokenizer is presumably a
# WordPiece vocabulary file; here it receives the raw review dump, so every
# line of that file would be treated as a vocabulary entry -- verify. The
# pretrained `tokenizer` created in the next cell is what the preprocessing
# function actually uses.
token = BertTokenizer("df_train.txt")
print(token)

In [None]:
# Tokenizer shared by the preprocessing step, loaded from the
# 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased'
)

In [None]:
def preprocessing_data(df_input, max_length=512, tok=None):
    """Tokenize raw review text into fixed-length BERT inputs.

    Parameters
    ----------
    df_input : pandas.DataFrame or pandas.Series
        Source of the raw text. For a DataFrame the first column is used;
        a Series is used as-is.
    max_length : int, optional
        Length every sequence is padded or truncated to. Defaults to 512.
    tok : callable, optional
        Tokenizer to use. Defaults to the notebook-level ``tokenizer``.

    Returns
    -------
    Ids : torch.Tensor
        Token ids, one row per input text, ``max_length`` columns.
    Masks : torch.Tensor
        Attention masks with the same shape as ``Ids``.
    """
    encoder = tok if tok is not None else tokenizer

    # Pull out the actual text values. Calling str() on a whole row yields the
    # pandas *repr* of that row (column label, text elided with "...", and a
    # "Name: ..., dtype: object" footer), which is not the review itself.
    if isinstance(df_input, pd.DataFrame):
        text_column = df_input.iloc[:, 0]
    else:
        text_column = pd.Series(df_input)
    texts = text_column.astype(str).tolist()

    # Nothing to tokenize: return correctly shaped empty tensors instead of
    # failing on an empty batch.
    if not texts:
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # truncation=True keeps long reviews within max_length so every row has
    # the same width; padding='max_length' fills the short ones.
    encoded = encoder(
        texts,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_attention_mask=True,
        return_tensors="pt",
    )
    Ids = encoded['input_ids']
    Masks = encoded['attention_mask']
    return Ids, Masks

In [None]:
##### The full dataset is large, so before training on all of it we smoke-test the pipeline on a small sample.
#### This cell takes the first 100 training records, runs one training pass over them and prints the loss per batch.
#### Predicting labels for 50 sample records and computing metrics is a later step and is not done in this cell.

sample_input = train_input[0:100]
sample_ids,sample_masks = preprocessing_data(sample_input)
sample_label = torch.tensor(df_train['label'][0:100])

batch_size = 32

sample_data = TensorDataset(sample_ids, sample_masks, sample_label)
sample_sampler = RandomSampler(sample_data)
sample_dataloader = DataLoader(sample_data, sampler=sample_sampler, batch_size=batch_size)


# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


# Build the model and its optimizer ONCE, before the loop. Re-creating them
# for every batch would reload the pretrained weights each time and discard
# whatever the previous step learned.
sample_model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
            )
# The batches are moved to `device`, so the model has to live there as well.
sample_model.to(device)

sample_optimizer = AdamW(sample_model.parameters(),
                  lr=5e-5,    # Default learning rate
                  eps=1e-8    # Default epsilon value
                  )
sample_model.train()

for sample_step,sample_batch in enumerate(sample_dataloader):

    sample_input_ids = sample_batch[0].to(device)
    sample_atten_masks = sample_batch[1].to(device)
    sample_labels = sample_batch[2].to(device)

    # Clear gradients left over from the previous batch.
    sample_model.zero_grad()

    # Passing labels makes the model return the classification loss as the
    # first output and the logits as the second. Integer indexing works
    # whether the model returns a plain tuple or a ModelOutput object.
    sample_outputs = sample_model(sample_input_ids,
                                  attention_mask=sample_atten_masks,
                                  labels=sample_labels)
    sample_loss, sample_logits = sample_outputs[0], sample_outputs[1]

    sample_loss.backward()
    sample_optimizer.step()

    print(f'step {sample_step}: loss = {sample_loss.item():.4f}')
    print(sample_logits)

In [None]:
## Tokenize the complete train and test text to obtain, for each split,
## the input-id tensor and the matching attention-mask tensor.

train_ids, train_masks = preprocessing_data(df_input=train_input)
test_ids, test_masks = preprocessing_data(df_input=test_input)

In [None]:
# Label tensors for both splits, built from the underlying integer arrays
# of the 'label' columns.
train_label = torch.tensor(df_train['label'].to_numpy())
test_label = torch.tensor(df_test['label'].to_numpy())


In [None]:
## Wrap the tokenized tensors in datasets and batch them with DataLoaders.

batch_size = 32

# Each dataset item is a (token ids, attention mask, label) triple.
train_data = TensorDataset(train_ids, train_masks, train_label)
test_data = TensorDataset(test_ids, test_masks, test_label)

# Training uses a RandomSampler; the test split uses a SequentialSampler.
train_sampler = RandomSampler(train_data)
test_sampler = SequentialSampler(test_data)

train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


# Build the model and its optimizer ONCE, before the loop. Re-creating them
# for every batch would reload the pretrained weights each time, so no
# training progress would ever accumulate.
model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
            )
# The batches are moved to `device`, so the model has to live there as well.
model.to(device)

optimizer = AdamW(model.parameters(),
                  lr=5e-5,    # Default learning rate
                  eps=1e-8    # Default epsilon value
                  )
model.train()

for step,batch in enumerate(train_dataloader):

    input_ids = batch[0].to(device)
    atten_masks = batch[1].to(device)
    labels = batch[2].to(device)

    # Clear gradients left over from the previous batch.
    model.zero_grad()

    # With labels supplied, the first output is the loss and the second the
    # logits. Index the result rather than tuple-unpacking it: unpacking a
    # dict-like ModelOutput yields its keys, not its values.
    outputs = model(input_ids, attention_mask=atten_masks, labels=labels)
    loss, logits = outputs[0], outputs[1]

    # Back-propagate and apply the parameter update for this batch.
    loss.backward()
    optimizer.step()

    print(f'step {step}: loss = {loss.item():.4f}')
    print(logits)