In [None]:
"""
Installing Pretrained Bert
"""
pip install pytorch_pretrained_bert



#Importing Required Libraries

In [None]:
import os
from tqdm import tqdm_notebook as tqdm
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
import torch.optim as optim
from pytorch_pretrained_bert import BertTokenizer
import nltk
import pdb
from pytorch_pretrained_bert import BertModel
from sklearn.model_selection import train_test_split
import sys
import pickle
from sklearn.metrics import accuracy_score
import csv



"""
Checking if the machine has the "GPU" unit for the computation otherwise selecting the "CPU"
"""

def get_device():
    """Return ``'cuda'`` when torch reports an available CUDA device, otherwise ``'cpu'``."""
    if torch.cuda.is_available():
        return 'cuda'
    return 'cpu'

"""
Getting the BERT tokenizer for the "bert-base-cased" model.
This model is pretrained on thousands of Books and Wikipedia Articles.
"""

def get_tokenizer():
    """Load the pretrained 'bert-base-cased' tokenizer with lower-casing disabled."""
    return BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

class PosDataset(data.Dataset):
    """Dataset of POS-tagged sentences converted to token-piece ids for BERT.

    Each sentence is wrapped in ``[CLS]`` / ``[SEP]`` markers whose tag is
    ``<pad>``. Every word is split into pieces by ``tokenizer``; only the first
    piece of a word carries the word's tag, the remaining pieces are labelled
    ``<pad>``.
    """

    def __init__(self, tagged_sents,tokenizer,tag2idx,idx2tag):
        """Store the tokenizer / tag maps and split sentences into words and tags.

        Args:
            tagged_sents: iterable of sentences, each a sequence of
                ``(word, tag)`` pairs.
            tokenizer: object providing ``tokenize(word)`` and
                ``convert_tokens_to_ids(tokens)``.
            tag2idx: mapping from tag string to integer id; must contain
                ``"<pad>"`` and every tag that occurs in ``tagged_sents``.
            idx2tag: inverse mapping (stored, not used by this class).
        """
        sents, tags_li = [], [] # list of lists
        self.tokenizer = tokenizer
        self.tag2idx = tag2idx
        self.idx2tag = idx2tag
        for sent in tagged_sents:
            words = [word_pos[0] for word_pos in sent]
            tags = [word_pos[1] for word_pos in sent]
            sents.append(["[CLS]"] + words + ["[SEP]"])
            tags_li.append(["<pad>"] + tags + ["<pad>"])
        self.sents, self.tags_li = sents, tags_li

    def __len__(self):
        """Number of sentences in the dataset."""
        return len(self.sents)

    def __getitem__(self, idx):
        """Return ``(words, x, is_heads, tags, y, seqlen)`` for sentence ``idx``.

        ``words`` and ``tags`` are space-joined strings (including the
        ``[CLS]``/``[SEP]`` and ``<pad>`` markers); ``x`` is the list of piece
        ids, ``y`` the list of tag ids aligned with ``x``, ``is_heads`` holds 1
        for the first piece of each word and 0 otherwise, and ``seqlen`` is
        ``len(y)``.
        """
        words, tags = self.sents[idx], self.tags_li[idx] # words, tags: string list

        # We give credits only to the first piece.
        x, y = [], [] # list of ids
        is_heads = [] # list. 1: the token is the first piece of a word
        for w, t in zip(words, tags):
            tokens = self.tokenizer.tokenize(w) if w not in ("[CLS]", "[SEP]") else [w]
            if not tokens:
                # The tokenizer may return no pieces for a word (e.g. one made
                # only of characters it strips). Without a placeholder the word
                # would contribute a label and a head flag but no id, breaking
                # the alignment asserted below.
                tokens = ["[UNK]"]
            xx = self.tokenizer.convert_tokens_to_ids(tokens)

            is_head = [1] + [0]*(len(tokens) - 1)

            t = [t] + ["<pad>"] * (len(tokens) - 1)  # <PAD>: no decision
            yy = [self.tag2idx[each] for each in t]  # (T,)

            x.extend(xx)
            is_heads.extend(is_head)
            y.extend(yy)

        assert len(x)==len(y)==len(is_heads), "len(x)={}, len(y)={}, len(is_heads)={}".format(len(x), len(y), len(is_heads))

        # seqlen
        seqlen = len(y)

        # to string
        words = " ".join(words)
        tags = " ".join(tags)
        return words, x, is_heads, tags, y, seqlen

"""
This function is responsible for providing proper padding to the sentences according to the batch size.
"""

def pad(batch):
    '''Collate dataset samples into one batch, right-padding id lists with 0.

    Each sample is the 6-tuple produced by the dataset:
    (words, x, is_heads, tags, y, seqlen). The id lists ``x`` and ``y`` are
    padded with 0 (the <pad> id) up to the longest ``seqlen`` in the batch and
    returned as LongTensors; the other fields are returned as plain lists.
    '''
    words = [sample[0] for sample in batch]
    is_heads = [sample[2] for sample in batch]
    tags = [sample[3] for sample in batch]
    seqlens = [sample[-1] for sample in batch]
    longest = np.array(seqlens).max()

    def _right_padded(field):
        # 0: <pad>
        return [sample[field] + [0] * (longest - len(sample[field])) for sample in batch]

    x = torch.LongTensor(_right_padded(1))
    y = torch.LongTensor(_right_padded(-2))

    return words, x, is_heads, tags, y, seqlens

"""
BERT Layer Architecture
This class is a deep neural network that wraps the pretrained BERT implementation and
adds a Linear layer that projects BERT's 768-dimensional output vectors to the number of tags.
"""

class Net(nn.Module):
    """Pretrained 'bert-base-cased' encoder followed by a linear tag classifier.

    Args:
        vocab_size: number of output classes of the linear layer (the size of
            the tag vocabulary); must be given, the ``None`` default is not
            usable by ``nn.Linear``.
        device: device that the inputs are moved to inside ``forward``.
    """

    def __init__(self, vocab_size=None,device = None):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # 768 is the width of the encoder output that is fed to the classifier.
        self.fc = nn.Linear(768, vocab_size)
        self.device = device

    def forward(self, x, y):
        """Return ``(logits, y, y_hat)`` for token ids ``x`` and labels ``y``.

        ``y`` is only moved to ``self.device`` and handed back; ``y_hat`` is the
        argmax of ``logits`` over the last dimension. In evaluation mode the
        encoder runs under ``torch.no_grad()``.
        """
        x, y = x.to(self.device), y.to(self.device)

        if self.training:
            self.bert.train()
            all_layers, _ = self.bert(x)
        else:
            self.bert.eval()
            with torch.no_grad():
                all_layers, _ = self.bert(x)

        # Only the last element of the encoder's first output is classified.
        enc = all_layers[-1]

        logits = self.fc(enc)
        return logits, y, logits.argmax(-1)

"""
This function is responsible for training the extra layer on top of the pretrained BERT model
to fine-tune the BERT model on our dataset.
For each epoch, we use batching to speed up training.
"""

def train(model, iterator, optimizer, criterion):
    """Run one pass over ``iterator``, taking an optimizer step per batch.

    Each batch is the 6-tuple ``(words, x, is_heads, tags, y, seqlens)``.
    ``model(x, y)`` must return ``(logits, y, y_hat)``; logits and labels are
    flattened before being passed to ``criterion``. The loss is printed every
    10th step.
    """
    model.train()
    for step, (words, x, is_heads, tags, y, seqlens) in enumerate(iterator):
        optimizer.zero_grad()
        logits, y, _ = model(x, y) # logits: (N, T, VOCAB), y: (N, T)

        flat_logits = logits.view(-1, logits.shape[-1]) # (N*T, VOCAB)
        flat_targets = y.view(-1)  # (N*T,)

        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()

        # monitoring
        if step % 10 == 0:
            print("step: {}, loss: {}".format(step, loss.item()))

"""
This function is responsible for evaluating the test results and saving the 
predictions in results file which can be further used for calculating the 
Accuracy.
At the end, this function also calculates the accuracy on the test dataset
"""


def eval(model, iterator,tag2idx,idx2tag):
    """Evaluate ``model`` on ``iterator``, write per-token results and print accuracy.

    For every sentence the predictions of non-head pieces are discarded, the
    remaining ids are mapped through ``idx2tag`` and one ``word gold pred``
    line per word (excluding the [CLS]/[SEP] positions) is written to the file
    ``result`` in the current working directory; sentences are separated by a
    blank line. The file is then read back and the token accuracy is printed
    as ``acc=<value>``.

    Note: this function shadows the built-in ``eval`` in this notebook.

    Args:
        model: callable returning ``(logits, y, y_hat)`` for ``(x, y)``;
            must provide ``eval()``.
        iterator: iterable of ``(words, x, is_heads, tags, y, seqlens)`` batches.
        tag2idx: tag -> id mapping, used to compare gold and predicted tags.
        idx2tag: id -> tag mapping, used to decode predictions.
    """
    model.eval()

    Words, Is_heads, Tags, Y_hat = [], [], [], []
    with torch.no_grad():
        for batch in iterator:
            words, x, is_heads, tags, y, seqlens = batch

            _, _, y_hat = model(x, y)  # y_hat: (N, T)

            Words.extend(words)
            Is_heads.extend(is_heads)
            Tags.extend(tags)
            Y_hat.extend(y_hat.cpu().numpy().tolist())

    ## gets results and save
    with open("result", 'w') as fout:
        for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
            # is_heads is unpadded, so zip also drops the padded tail of y_hat.
            y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
            preds = [idx2tag[hat] for hat in y_hat]
            assert len(preds)==len(words.split())==len(tags.split())
            for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
                fout.write("{} {} {}\n".format(w, t, p))
            fout.write("\n")

    ## calc metric
    # Read the file once and close it deterministically.
    with open("result", 'r') as fin:
        rows = [line.split() for line in fin.read().splitlines() if len(line) > 0]
    y_true = np.array([tag2idx[row[1]] for row in rows])
    y_pred = np.array([tag2idx[row[2]] for row in rows])

    if len(y_true) == 0:
        # Nothing was evaluated; avoid a 0/0 division.
        acc = float("nan")
    else:
        acc = (y_true==y_pred).astype(np.int32).sum() / len(y_true)

    print("acc=%.2f"%acc)

def test(model, iterator,tag2idx,idx2tag):
	model.eval()

	Words, Is_heads, Tags, Y, Y_hat = [], [], [], [], []
	with torch.no_grad():
		for i, batch in enumerate(iterator):
			words, x, is_heads, tags, y, seqlens = batch

		_, _, y_hat = model(torch.tensor(x), torch.tensor(y))  # y_hat: (N, T)

		Words.extend(words)
		Is_heads.extend(is_heads)
		Tags.extend(tags)
		Y.extend(y.numpy().tolist())
		Y_hat.extend(y_hat.cpu().numpy().tolist())

	## get results
	for words, is_heads, tags, y_hat in zip(Words, Is_heads, Tags, Y_hat):
		y_hat = [hat for head, hat in zip(is_heads, y_hat) if head == 1]
		preds = [idx2tag[hat] for hat in y_hat]
		assert len(preds)==len(words.split())==len(tags.split())
		ret_arr = []
		for w, t, p in zip(words.split()[1:-1], tags.split()[1:-1], preds[1:-1]):
			#print("{} {}".format(w, p))
			ret_arr.append(tuple((w,p)))
	return ret_arr
		
            

def construct_input(sent):
    """Wrap a whitespace-separated sentence as a one-sentence tagged corpus.

    Every word is paired with the placeholder tag '-NONE-'; the result is a
    list containing a single list of ``(word, '-NONE-')`` tuples.
    """
    return [[(token, '-NONE-') for token in sent.split()]]


In [None]:
"""
Mounting the Drive
"""
# Colab-only step: mounts Google Drive at /content/drive. The data files read
# by the later cells live under /content/drive/MyDrive/NLP_Project.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
"""
Function to get train_data from the data file
"""
def create_train_data(data_path="/content/drive/MyDrive/NLP_Project/wsj1.train.original"):
  """Read a tab-separated, blank-line-delimited corpus into tagged sentences.

  Each non-blank line is split on tabs; column 1 is taken as the word and
  column 4 as its tag. A line whose first column is empty ends the current
  sentence. A final sentence that is not followed by a blank line is kept,
  and runs of blank lines do not produce empty sentences.

  Args:
    data_path: path of the file to read; defaults to the training file on the
      mounted drive.

  Returns:
    list of sentences, each a list of ``(word, tag)`` tuples.
  """
  sentence_in = []
  temp = []
  with open(data_path) as f:
    for sen in f:
      sentenceSplit = sen.rstrip("\r\n").split("\t")
      if sentenceSplit[0] == "":
        # Sentence boundary: store what has been collected, if anything.
        if temp:
          sentence_in.append(temp)
        temp = []
      else:
        temp.append((sentenceSplit[1], sentenceSplit[4]))
  # The file may end without a trailing blank line; do not lose that sentence.
  if temp:
    sentence_in.append(temp)
  return sentence_in

Training and Saving the Model using the Training Dataset


In [None]:
"""
Training the Model
Using Train_iter batch size=8,Eval_iter batch size=8,Adam Optimizer,learning rate=0.000012

Evaluating using the built model on test_iter and using tag2idx and idx2tag
"""



def train_model(model_dir):
    """Fine-tune the tagger on the training data and save it under ``model_dir``.

    Steps: read the tagged sentences, build the tag <-> id maps, hold out 10%
    of the sentences for evaluation (the split is not seeded), run one
    training pass, evaluate, then write ``pytorch_model.bin`` (the whole model
    object) and ``tags.pkl`` (``[tag2idx, idx2tag]``) into ``model_dir``.

    Args:
        model_dir: existing directory that receives the saved model and tag maps.
    """
    tagged_sents= create_train_data()
    print("tagged are:",tagged_sents[:10])

    # Build both maps from ONE list so they are exact inverses. "<pad>" must be
    # id 0 (the loss ignores index 0) and the placeholder "-NONE-" used at
    # prediction time takes the last id. Sorting makes the ids reproducible
    # across runs; discarding the two special names guards against them also
    # appearing as data tags, which would otherwise create duplicate entries.
    data_tags = set(word_pos[1] for sent in tagged_sents for word_pos in sent)
    data_tags.discard("<pad>")
    data_tags.discard("-NONE-")
    tags = ["<pad>"] + sorted(data_tags) + ["-NONE-"]
    tag2idx = {tag:idx for idx, tag in enumerate(tags)}
    idx2tag = {idx:tag for idx, tag in enumerate(tags)}

    train_data, test_data = train_test_split(tagged_sents, test_size=.1)

    device = get_device()
    tokenizer = get_tokenizer()
    print(device)

    model = Net(vocab_size=len(tag2idx),device = device)
    model.to(device)
    model = nn.DataParallel(model)

    train_dataset = PosDataset(train_data,tokenizer,tag2idx,idx2tag)
    eval_dataset = PosDataset(test_data,tokenizer,tag2idx,idx2tag)

    train_iter = data.DataLoader(dataset=train_dataset,
                                 batch_size=8,
                                 shuffle=True,
                                 num_workers=1,
                                 collate_fn=pad)
    test_iter = data.DataLoader(dataset=eval_dataset,
                                batch_size=8,
                                shuffle=False,
                                num_workers=1,
                                collate_fn=pad)

    optimizer = optim.Adam(model.parameters(), lr = 0.000012)

    # Index 0 is "<pad>": padded positions and non-head pieces add no loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    train(model, train_iter, optimizer, criterion)
    eval(model, test_iter,tag2idx,idx2tag)

    print("Saving model...")
    torch.save(model, model_dir + "/pytorch_model.bin")
    print("Model saved")
    tags_arr = [tag2idx,idx2tag]
    print("Pickling tags...")
    with open(model_dir +"/tags.pkl","wb") as fp:
        pickle.dump(tags_arr,fp)
    print("Pickling complete...")


if __name__== "__main__":
    # Expects the output directory as the first command-line argument.
    if (len(sys.argv) < 2):
        print("Specify model dir to save")
    else:
        try:
            os.mkdir(sys.argv[1])
        except FileExistsError:
            # Re-using an existing directory is fine. Any other failure
            # (missing parent, permissions, ...) is a real error and is allowed
            # to propagate instead of being reported as "already exists".
            print("Directory already exists")
        train_model(sys.argv[1])

tagged are: [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), ('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], [('Rudolph', 'NNP'), ('Agnew', 'NNP'), (',', ','), ('55', 'CD'), ('years', 'NNS'), ('old', 'JJ'), ('and', 'CC'), ('former', 'JJ'), ('chairman', 'NN'), ('of', 'IN'), ('Consolidated', 'NNP'), ('Gold', 'NNP'), ('Fields', 'NNP'), ('PLC', 'NNP'), (',', ','), ('was', 'VBD'), ('named', 'VBN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('of', 'IN'), ('this', 'DT'), ('British', 'JJ'), ('industrial', 'JJ'), ('conglomerate', 'NN'), ('.', '.')], [('A', 'DT'), ('form', '

100%|██████████| 213450/213450 [00:00<00:00, 293339.88B/s]


cuda


100%|██████████| 404400730/404400730 [00:34<00:00, 11603897.37B/s]


step: 0, loss: 3.8522393703460693
step: 10, loss: 3.4179584980010986
step: 20, loss: 3.025223731994629
step: 30, loss: 2.6896438598632812
step: 40, loss: 2.2808938026428223
step: 50, loss: 2.1795730590820312
step: 60, loss: 1.6330621242523193
step: 70, loss: 1.4763284921646118
step: 80, loss: 1.1412686109542847
step: 90, loss: 1.1118792295455933
step: 100, loss: 0.9331042766571045
step: 110, loss: 0.6677235960960388
step: 120, loss: 0.5689499974250793
step: 130, loss: 0.5204455256462097
step: 140, loss: 0.6345505118370056
step: 150, loss: 0.39207538962364197
step: 160, loss: 0.5753714442253113
step: 170, loss: 0.47636979818344116
step: 180, loss: 0.25342056155204773
step: 190, loss: 0.2884799540042877
step: 200, loss: 0.3245893120765686
step: 210, loss: 0.2099679857492447
step: 220, loss: 0.26114901900291443
step: 230, loss: 0.3720555007457733
step: 240, loss: 0.21794283390045166
step: 250, loss: 0.1581994891166687
step: 260, loss: 0.2596586048603058
step: 270, loss: 0.1857927143573761

In [None]:
# NOTE(review): per the output below, this installs the PyPI package `bert`
# (2.2.0, with its dependency `erlastic`). No visible cell imports a module
# named `bert`; the BERT model and tokenizer come from pytorch_pretrained_bert.
# This cell looks unnecessary - confirm before keeping it.
!pip install bert

Collecting bert
  Downloading bert-2.2.0.tar.gz (3.5 kB)
Collecting erlastic
  Downloading erlastic-2.0.0.tar.gz (6.8 kB)
Building wheels for collected packages: bert, erlastic
  Building wheel for bert (setup.py) ... [?25l[?25hdone
  Created wheel for bert: filename=bert-2.2.0-py3-none-any.whl size=3766 sha256=9a789f7ca59880eab09c43a69f98b603d0934082e26199bfc1980ee77ed41aa4
  Stored in directory: /root/.cache/pip/wheels/bb/31/1b/c05f362e347429b7436954d1a2280fe464731e8f569123a848
  Building wheel for erlastic (setup.py) ... [?25l[?25hdone
  Created wheel for erlastic: filename=erlastic-2.0.0-py3-none-any.whl size=6795 sha256=462765c7f0f539cc712c545a808e695e437de0970967705eec8a585a892c1dff
  Stored in directory: /root/.cache/pip/wheels/94/f1/b4/0b98b1e94775da6a0b1130e342d22af05cd269e1172c19f40f
Successfully built bert erlastic
Installing collected packages: erlastic, bert
Successfully installed bert-2.2.0 erlastic-2.0.0


In [None]:
"""
Getting the Dev data from WSJ Penn treebank dataset
"""

data_path = "/content/drive/MyDrive/NLP_Project/wsj1.dev.original"

# sentence_in: one list of words per sentence.
# actual_tag: gold tags of all tokens, flattened in file order.
sentence_in=[]
actual_tag=[]
with open(data_path) as f:
    sentences = f.readlines()

temp=[]
for sen in sentences:
    sentenceSplit = sen.strip("\n").split("\t")
    if sentenceSplit[0]=="":
        # A line with an empty first column ends the current sentence.
        sentence_in.append(temp)
        temp=[]
    else:
        # Column 1 is the word, column 4 the gold tag.
        temp.append(sentenceSplit[1])
        actual_tag.append(sentenceSplit[4])

# If the file does not end with a blank line the last sentence is still
# pending here; keep it so that sentence_in stays aligned with actual_tag.
if temp:
    sentence_in.append(temp)
    temp=[]

# One space-joined string per sentence: the input format of construct_input.
res= [' '.join(i) for i in sentence_in]

In [None]:
"""
Predicting the Dev data tags using the loaded model
batchsize=8,shuffle=False,num_workers=1,collate_fn=pad
"""

# Filled by test_model: one list of (word, predicted_tag) tuples per Dev sentence.
predicted_tags=[]
def test_model(model_dir):
    """Tag every Dev sentence in the global ``res`` with the saved model.

    Loads ``pytorch_model.bin`` and ``tags.pkl`` from ``model_dir`` (the files
    written by ``train_model``) and appends the predictions for each sentence
    to the global ``predicted_tags``.

    Args:
        model_dir: directory that contains the saved model and tag maps.
    """
    tokenizer = get_tokenizer()

    print("Loading model ...")
    # NOTE: torch.load / pickle.load execute code embedded in the file; only
    # load artefacts produced by this notebook.
    model = torch.load(model_dir + "/pytorch_model.bin")
    print("Loading model complete")
    print("Loading Pickling tags...")
    with open(model_dir + "/tags.pkl","rb") as fp:
        tags_arr = pickle.load(fp)
    print("Loading Pickling tags complete")
    tag2idx, idx2tag = tags_arr[0], tags_arr[1]

    for text in res:
        # Each sentence becomes its own one-item dataset, so the loader yields
        # exactly one batch per sentence.
        rt_test_dataset = PosDataset(construct_input(text),tokenizer,tag2idx,idx2tag)
        rt_test_iter = data.DataLoader(dataset=rt_test_dataset,
                                       batch_size=8,
                                       shuffle=False,
                                       num_workers=1,
                                       collate_fn=pad)

        ret_arr = test(model, rt_test_iter,tag2idx,idx2tag)
        predicted_tags.append(ret_arr)
if __name__== "__main__":
    # The model directory is taken from the first command-line argument.
    if len(sys.argv) >= 2:
        test_model(sys.argv[1])
    else:
        print("Specify model dir to load model")


Loading model ...
Loading model complete
Loading Pickling tags...
Loading Pickling tags complete




In [None]:
# NOTE(review): rebinds `res` from a list of sentence strings to a list of
# one-element lists. The Dev `test_model` above iterates `res` and passes each
# item to construct_input, which calls .split() on it, so re-running the
# prediction cell after this one would fail - confirm whether this cell is needed.
res= [[' '.join(i)] for i in sentence_in]

In [None]:
"""
Calculating Accuracy for the Dev dataset using predicted tags and actual tags
"""
# predicted_tags holds one list of (word, predicted_tag) tuples per sentence.
flat_list = [item for sublist in predicted_tags for item in sublist]

# Keep only the predicted tag of each pair. A comprehension is used so that no
# loop variable overwrites the global `res` (the list of Dev sentences).
tags = [word_and_tag[1] for word_and_tag in flat_list]

# Score against the gold tags of exactly the tokens that were predicted,
# instead of a hard-coded token count.
assert len(tags) <= len(actual_tag), "more predictions than gold tags"
acc= accuracy_score(actual_tag[:len(tags)], tags)
acc

0.9764510351526926

We can observe that we got an accuracy of about 97.65% for the Dev data of WSJ Penn Treebank Dataset

Getting Test data from WSJ Penn treebank dataset

In [None]:
data_path = "/content/drive/MyDrive/NLP_Project/wsj1.test.original"

# sentence_in_test: one list of words per sentence.
# actual_tag_test: gold tags of all tokens, flattened in file order.
sentence_in_test=[]
actual_tag_test=[]
with open(data_path) as f:
    sentences_test = f.readlines()

temp_test=[]
for sen in sentences_test:
    sentenceSplit_test = sen.strip("\n").split("\t")
    if sentenceSplit_test[0]=="":
        # A line with an empty first column ends the current sentence.
        sentence_in_test.append(temp_test)
        temp_test=[]
    else:
        # Column 1 is the word, column 4 the gold tag.
        temp_test.append(sentenceSplit_test[1])
        actual_tag_test.append(sentenceSplit_test[4])

# If the file does not end with a blank line the last sentence is still
# pending here; keep it so the sentences stay aligned with actual_tag_test.
if temp_test:
    sentence_in_test.append(temp_test)
    temp_test=[]

# One space-joined string per sentence: the input format of construct_input.
res_test= [' '.join(i) for i in sentence_in_test]

Predicting the Tags for test data using the saved model

In [None]:


# Filled by test_model: one list of (word, predicted_tag) tuples per Test sentence.
predicted_tags_test=[]
def test_model(model_dir):
    """Tag every Test sentence in the global ``res_test`` with the saved model.

    Loads ``pytorch_model.bin`` and ``tags.pkl`` from ``model_dir`` (the files
    written by ``train_model``) and appends the predictions for each sentence
    to the global ``predicted_tags_test``.

    NOTE(review): this redefines the ``test_model`` declared in the Dev
    prediction cell; after this cell runs, the name refers to this version.

    Args:
        model_dir: directory that contains the saved model and tag maps.
    """
    tokenizer = get_tokenizer()

    print("Loading model ...")
    # NOTE: torch.load / pickle.load execute code embedded in the file; only
    # load artefacts produced by this notebook.
    model = torch.load(model_dir + "/pytorch_model.bin")

    print("Loading model complete")
    print("Loading Pickling tags...")
    with open(model_dir + "/tags.pkl","rb") as fp:
        tags_arr = pickle.load(fp)
    print("Loading Pickling tags complete")
    tag2idx, idx2tag = tags_arr[0], tags_arr[1]

    for text in res_test:
        # Each sentence becomes its own one-item dataset, so the loader yields
        # exactly one batch per sentence.
        rt_test_dataset = PosDataset(construct_input(text),tokenizer,tag2idx,idx2tag)
        rt_test_iter = data.DataLoader(dataset=rt_test_dataset,
                                       batch_size=8,
                                       shuffle=False,
                                       num_workers=1,
                                       collate_fn=pad)

        ret_arr_test = test(model, rt_test_iter,tag2idx,idx2tag)
        predicted_tags_test.append(ret_arr_test)
if __name__== "__main__":
    # The model directory is taken from the first command-line argument.
    if len(sys.argv) >= 2:
        test_model(sys.argv[1])
    else:
        print("Specify model dir to load model")


Loading model ...
Loading model complete
Loading Pickling tags...
Loading Pickling tags complete




In [None]:
# NOTE(review): rebinds `res_test` from a list of sentence strings to a list of
# one-element lists. The Test `test_model` above iterates `res_test` and passes
# each item to construct_input, which calls .split() on it, so re-running the
# prediction cell after this one would fail - confirm whether this cell is needed.
res_test= [[' '.join(i)] for i in sentence_in_test]

Calculating the Accuracy score for Test data using actual_tag_test(Actual tags for test data) and tags_test(Predicted tags for test data)

In [None]:
"""
Calculating the Accuracy score for Test data using actual_tag_test(Actual tags for test data) and tags_test(Predicted tags for test data)

"""
# predicted_tags_test holds one list of (word, predicted_tag) tuples per sentence.
flat_list_test = [item for sublist in predicted_tags_test for item in sublist]

# Keep only the predicted tag of each pair. A comprehension is used so that no
# loop variable overwrites the global `res`.
tags_test = [word_and_tag[1] for word_and_tag in flat_list_test]

# Score against the gold tags of exactly the tokens that were predicted,
# instead of a hard-coded token count.
assert len(tags_test) <= len(actual_tag_test), "more predictions than gold tags"
acc= accuracy_score(actual_tag_test[:len(tags_test)], tags_test)
acc

0.9758048344054175

In [None]:
len(tags_test)

129654

In [None]:
"""
Printing the list of predicted tags
"""
tags_test

['JJ',
 'NNS',
 'IN',
 'DT',
 'NNP',
 'NNPS',
 'CC',
 'NNP',
 'NNP',
 'VBD',
 'NN',
 'WDT',
 'MD',
 'VB',
 'WRB',
 'DT',
 'JJ',
 'JJ',
 'NN',
 'NN',
 'MD',
 'VB',
 'NN',
 ',',
 'VBG',
 'DT',
 'JJ',
 'NN',
 'TO',
 'DT',
 'NN',
 'POS',
 'NN',
 'IN',
 'JJ',
 'NNS',
 '.',
 'DT',
 'NN',
 ',',
 'WP$',
 'NNS',
 'VBP',
 'NNP',
 'NNP',
 'NNP',
 '-LRB-',
 'NNP',
 ',',
 'NNP',
 '-RRB-',
 ',',
 'MD',
 'VB',
 'DT',
 'NNP',
 'NNP',
 'NNP',
 'IN',
 'VBG',
 'JJ',
 'NN',
 'NN',
 'IN',
 'VBG',
 'DT',
 'JJ',
 'NN',
 'CC',
 'NN',
 'NN',
 'NN',
 'WDT',
 'MD',
 'RB',
 'VB',
 'VBN',
 'IN',
 'DT',
 'JJ',
 'NN',
 '.',
 'DT',
 'NN',
 'VBZ',
 'TO',
 'VB',
 'DT',
 'NNP',
 'TO',
 'NNP',
 'NNS',
 'RB',
 ',',
 'IN',
 'DT',
 'NN',
 'VBZ',
 'JJ',
 'JJ',
 'NN',
 '.',
 '``',
 'JJ',
 'NN',
 '``',
 'JJ',
 "''",
 'NN',
 'VBZ',
 'JJ',
 'CC',
 'JJ',
 ',',
 'RB',
 'RBR',
 'JJ',
 'IN',
 'JJ',
 'NNP',
 'NN',
 ',',
 "''",
 'VBD',
 'NNP',
 'NNP',
 'NNP',
 '-LRB-',
 'NNP',
 ',',
 'NNP',
 '-RRB-',
 ',',
 'DT',
 'NN',
 'POS',
 'JJ',

In [None]:
"""
Calculating Accuracy for the test data using tags_test(predicted_tags) list and actual_tag_test(actual tags)
"""
# Cross-check of accuracy_score: count positions where prediction and gold tag agree.
count = sum(
    1
    for predicted, actual in zip(tags_test, actual_tag_test)
    if predicted == actual
)
check = count / len(tags_test)
# Bare expression so the cell actually displays the value.
check

0.9758048344054175


In [None]:
"""
Converting test data of WSJ Penn tree bank  into CSV
"""
# Read the test file as tab-separated rows and close the handle when done.
# QUOTE_NONE keeps a literal double-quote token as ordinary text instead of
# letting it start a quoted field; newline="" is the csv module's
# recommended way to open files.
with open('/content/drive/MyDrive/NLP_Project/wsj1.test.original', newline="") as test_file:
    test_2 = csv.reader(test_file, delimiter="\t", quoting=csv.QUOTE_NONE)
    data_test_2 = list(test_2)

In [None]:
data_test_2[0]

['1', 'Influential', '_', 'JJ', 'JJ', '_', '2', 'NMOD']

In [None]:
len(data_test_2)

135117

In [None]:
"""
Removing empty lists from the data_test
"""
# Drop the empty rows from the parsed file (presumably the blank sentence
# separators - the remaining count below matches len(tags_test)).
# Note: this rebinds the global `res` used earlier for the Dev sentences.
res = [ele for ele in data_test_2 if ele != []]

In [None]:
len(res)

129654

In [None]:
len(tags_test)

129654

Appending Predicted tags to the Data list

In [None]:
# Rows and predictions are paired by position, so they must be the same length.
assert len(res) == len(tags_test), "row / prediction count mismatch"
for i in range(len(res)):
  # Rebuild each row from its 8 original columns plus the prediction, so the
  # predicted tag always sits at index 8 and re-running this cell does not
  # stack further copies onto the rows.
  res[i] = res[i][:8] + [tags_test[i]]


Getting the Predictions for Test data to evaluate the performance of the Model Accuracy

In [None]:
"""
Getting the predicted output file for test file 
"""
# Write one "<column 0>\t<column 1>\t<predicted tag>" line per token row.
# The with-block closes the file even if a write fails.
with open('/content/drive/MyDrive/NLP_Project/pred.out', 'w') as pred:
    for row in res:
        # Index 8 is the predicted tag appended to each row in the cell above.
        pred.write(str(row[0]) + '\t' + str(row[1]) + '\t' + str(row[8]) + '\n')