In [1]:
import sys

import numpy as np
import torch
import random

import matplotlib.pyplot as plt


import torch
from torch import nn, optim
from torch import load
from torch.nn import functional as F
from torch import autograd

from torchvision import datasets

from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader

import time

import sys
from pathlib import Path

from numba import njit

import os

import gc

from tqdm import tqdm

#from easyntk.explicit import explicit_ntk

In [None]:
import pandas as pd

# The dataset is a tab-separated file read without a header row, so the
# delimiter and the column names are given explicitly.
df = pd.read_csv(
    "../DATA/cola_public/raw/in_domain_train.tsv",
    delimiter='\t',
    header=None,
    names=['sentence_sources', 'label', 'label_note', 'sentence'],
)

# Build a reduced frame holding only the columns used below; `drop` returns a
# new frame, so the originally loaded `df` is left untouched.
data = df.drop(columns=['sentence_sources', 'label_note'])

# Plain numpy views of the text and the labels.
sentences = data['sentence'].values
labels = data['label'].values
data.head()

from transformers import BertTokenizer

# Load the tokenizer of the uncased BERT-base checkpoint; text is lower-cased.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Sanity check on the first sentence: raw text, its tokens, and the ids those
# tokens map to.
first_sentence = sentences[0]
first_tokens = tokenizer.tokenize(first_sentence)

print(' Original: ', first_sentence)
print('Tokenized: ', first_tokens)
print('Token IDs: ', tokenizer.convert_tokens_to_ids(first_tokens))

# Encode each sentence into its list of token ids. `encode` tokenizes the text
# and maps the tokens to ids; add_special_tokens=True adds the '[CLS]' and
# '[SEP]' markers.
input_ids = [
    tokenizer.encode(sentence, add_special_tokens=True)
    for sentence in sentences
]
    
from keras.preprocessing.sequence import pad_sequences

MAX_LEN = 128

# Bring every id sequence to exactly MAX_LEN entries: longer sequences are cut
# at the end ("post" truncating), shorter ones are padded at the end
# ("post" padding).
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, truncating="post", padding="post")

# One attention mask per sentence: 1 for every positive token id, 0 wherever
# the id is 0 (the padded positions).
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in input_ids
]
    
from sklearn.model_selection import train_test_split

# Split token ids, attention masks and labels in ONE call so that all three are
# partitioned by the same shuffle. Two separate calls only stay aligned as long
# as their `random_state` and `test_size` arguments are kept identical by hand.
(train_inputs, validation_inputs,
 train_masks, validation_masks,
 train_labels, validation_labels) = train_test_split(
    input_ids, attention_masks, labels,
    test_size=0.2, random_state=0,
)

# Convert the split arrays/lists into torch tensors so they can be wrapped in a
# TensorDataset and moved to the GPU batch by batch.
train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)

train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)

train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

# Number of examples per training batch.
batch_size = 32

# Training loader: each dataset item is the tuple
# (input_ids, attention_mask, label). The sequential sampler, together with
# shuffle=False, walks the dataset in its stored order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = SequentialSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
    shuffle=False,
)

In [3]:
from transformers import BertForSequenceClassification, AdamW, BertConfig

In [None]:
# Fine-tune one BERT sequence classifier per seed and save each model's weights.
device = 'cuda'

# Create the checkpoint folder up front: otherwise a missing directory would
# only surface at torch.save, after a full 10-epoch training run.
CHECKPOINT_DIR = './MANY_BERT_MODELS'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)

# The dataset and loader do not depend on the seed, so they are built once.
# NOTE: the tuple order here is (input_ids, labels, attention_mask); the batch
# indices used in the training loop below follow this order.
train_data = TensorDataset(train_inputs, train_labels, train_masks)
train_dataloader = DataLoader(train_data, batch_size=64, shuffle=False)

for SEED in range(3,11):
    # Seed torch before the model is instantiated so that everything drawn
    # from torch's RNG afterwards is tied to SEED.
    torch.manual_seed(SEED)

    model = BertForSequenceClassification.from_pretrained(
        "bert-base-uncased",
        num_labels = 2,
        output_attentions = False,
        output_hidden_states = False,
    )

    model.to(device)

    optimizer = AdamW(model.parameters(),
                  lr = 2e-5,
                  eps = 1e-8)

    # Put the model in training mode (as opposed to eval mode).
    model.train()
    for epoch in range(10):
        for batch in train_dataloader:
            optimizer.zero_grad()
            # Unpack ids, labels and mask for this batch (see tuple order above).
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)
            b_input_mask = batch[2].to(device)

            outputs = model(b_input_ids,
                   token_type_ids=None,
                   attention_mask=b_input_mask,
                   labels=b_labels)

            # The first element of the model output is used as the loss.
            loss = outputs[0]
            loss.backward()
            optimizer.step()

    torch.save(model.state_dict(),'{}/BERT-base_SEED{}.pt'.format(CHECKPOINT_DIR, SEED))