In [1]:
!pip install transformers conllu datasets



In [2]:
from datasets import load_dataset , get_dataset_config_names

In [3]:
# Fetching en_ewts train, validation splits
train_dataset = load_dataset('universal_dependencies', 'en_ewt', split = 'train')
validation_dataset = load_dataset('universal_dependencies', 'en_ewt', split = 'validation')

In [4]:
# Find total pos

total_pos = set()
total_sentences = train_dataset.num_rows

for i in range(0,total_sentences):
  for pos in train_dataset[i]['xpos']:
    total_pos.add(pos)

total_pos

{'$',
 "''",
 ',',
 '-LRB-',
 '-RRB-',
 '.',
 ':',
 'ADD',
 'AFX',
 'CC',
 'CD',
 'DT',
 'EX',
 'FW',
 'GW',
 'HYPH',
 'IN',
 'JJ',
 'JJR',
 'JJS',
 'LS',
 'MD',
 'NFP',
 'NN',
 'NNP',
 'NNPS',
 'NNS',
 None,
 'PDT',
 'POS',
 'PRP',
 'PRP$',
 'RB',
 'RBR',
 'RBS',
 'RP',
 'SYM',
 'TO',
 'UH',
 'VB',
 'VBD',
 'VBG',
 'VBN',
 'VBP',
 'VBZ',
 'WDT',
 'WP',
 'WP$',
 'WRB',
 'XX',
 '``'}

Preprocessing data

UD dataset has arond 50 xpos.For the sake of simplicity I am prepocessing data to have only 6 labels.

1. NOUN - NN
2. ADJECTIVE - JJ
3. VERB - VB
4. INTERJECTION - IN
5. ADVERB - ADV
6. OTHERS - OTHERS

In [5]:
total_sentences = train_dataset.num_rows
train_X = [row['tokens'] for row in train_dataset]
train_Y = [row['xpos'] for row in train_dataset]

def preprocess(train_Y):
  for i in range(0,len(train_Y)):
    for j in range(0, len(train_Y[i])):
      pos = train_Y[i][j]
      if(pos in ['NN', 'NNP', 'NNS']):
        pos = 'NN'
      elif(pos in ['VB', 'VBD', 'VBG']):
        pos = 'VB'
      elif(pos != 'JJ' and pos != 'IN' and pos != 'RB'):
        pos = 'OTHERS'
      train_Y[i][j] = pos
  return train_Y
  
train_Y = preprocess(train_Y)

In [6]:
# Find total pos

total_pos = set()
total_sentences = train_dataset.num_rows

for i in range(0,total_sentences):
  for pos in train_Y[i]:
    total_pos.add(pos)

total_pos= sorted(total_pos)
total_pos

['IN', 'JJ', 'NN', 'OTHERS', 'RB', 'VB']

Featuraization

We define some static features to convert the input token as a feature
1. If the given token is a 'NOUN' then the corresponding feature is true else false {0, 1}
2. Similarly for other pos tags - 4 others
3. Transition features : 1 if given token is a NOUN and previous token is VERB.
4. Similarly for other combinations - 24 others


In [7]:
def featurize_word(word, pos, prev_pos):
  feature = [0 for i in range(42)]

  for i in range(0, len(total_pos)):
    if(pos == total_pos[i]):
      feature[i]=1

  # transition features
  for i in range(0,len(total_pos)):
    for j in range(0,len(total_pos)):
      if(total_pos[i] == pos and total_pos[j] == prev_pos):
        feature[len(total_pos)-1+(i+1)*(j+1)] = 1

  return feature

def featurize_sentence(sentence,pos):
  # Sentence is a list of tokens
  sentence_feature = []
  for i in range(0,len(sentence)):
    if(i == 0):
      sentence_feature.append(featurize_word(sentence[i],pos[i],'OTHER'))
    else:
      sentence_feature.append(featurize_word(sentence[i],pos[i],pos[i-1]))
  return sentence_feature

def featurize_dataset(train_X, train_Y):
  dataset_features = []
  for i in range(0, len(train_X)):
    dataset_features.append(featurize_sentence(train_X[i], train_Y[i]))
  return dataset_features

In [8]:
features = featurize_dataset(train_X, train_Y)

len(features)

12543

Baseline model - Most frequent baseline

The baseline model is defined as follows:
1. If the word appeared previously in test dataset , assign the the most frequent label for that word in test dataset
2. For new word assign the overall most frequent label

In [9]:
from collections import defaultdict

# Overall most frequent label
def overall_most_frequent_label(train_Y):
  freq_dict = defaultdict(int)
  for i in range(0, len(train_Y)):
    for j in range(0, len(train_Y[i])):
      freq_dict[train_Y[i][j]]+=1
  frequency_sorted_list = sorted(freq_dict, key= lambda l: (-freq_dict[l], l))
  print(f'labels in the order of frequencies : {frequency_sorted_list}')
  return frequency_sorted_list[0]

overall_most_frequent_label(train_Y)


labels in the order of frequencies : ['OTHERS', 'NN', 'IN', 'VB', 'JJ', 'RB']


'OTHERS'

In [10]:
from collections import Counter

# token wise most frequent label
def tokenwise_most_frequent_label(train_X, train_Y):
  freq_dict = defaultdict(list)
  for i in range(0, len(train_X)):
    for j in range(0, len(train_X[i])):
      freq_dict[train_X[i][j]].append(train_Y[i][j])
  tokenwise_frequent_label = defaultdict(int)
  for key, values in freq_dict.items():
    most_frequent_label = Counter(values).most_common(1)[0][0]
    tokenwise_frequent_label[key] = most_frequent_label
  return tokenwise_frequent_label
 

tokenwise_frequent_labels = tokenwise_most_frequent_label(train_X, train_Y)
print(len(tokenwise_frequent_labels))

20132


In [11]:
class BaseModel():
    def __init__(self, overall_most_frequent_label, tokenwise_most_frequent_label):
      self.find_overall_most_frequent_label = overall_most_frequent_label
      self.find_tokenwise_most_frequent_label = tokenwise_most_frequent_label

    def predict(self, input):
      # input is a sentence -- a list of words/tokens
      output = []
      for word in input:
        if(word in self.tokenwise_most_frequent_label.keys()):
          output.append(self.tokenwise_most_frequent_label[word])
        else:
          output.append(self.overall_most_frequent_label)
      return output

    def train(self, train_X, train_Y):
      self.overall_most_frequent_label = self.find_overall_most_frequent_label(train_Y)
      self.tokenwise_most_frequent_label = self.find_tokenwise_most_frequent_label(train_X, train_Y)

Evaluation metrics

For this I have appended the ground truths for each sentencese into a list. Same for predicted outputs.
Them calculate performance metrics on them.

In [12]:
from sklearn import metrics


def evaluate_metrics(predicted_labels, ground_truth):
  # accuracy
  accuracy = metrics.accuracy_score(ground_truth, predicted_labels)
  print(f'accuracy : {accuracy}')

  # macroprecision
  macro_precision = metrics.precision_score(ground_truth, predicted_labels, average = 'macro')
  print(f'macro precision : {macro_precision}')

  # macrorecall
  macro_recall = metrics.recall_score(ground_truth, predicted_labels, average = 'macro')
  print(f'macro recall : {macro_recall}')

  # microprecision
  micro_precision  = metrics.precision_score(ground_truth, predicted_labels, average = 'micro')
  print(f'micro precision : {micro_precision}')

  # microrecall.
  micro_recall  = metrics.recall_score(ground_truth, predicted_labels, average = 'micro')
  print(f'micro recall : {micro_recall}')

In [13]:
base_model = BaseModel(overall_most_frequent_label, tokenwise_most_frequent_label)
base_model.train(train_X, train_Y)

total_sentences = validation_dataset.num_rows
validation_X = [row['tokens'] for row in validation_dataset]
validation_Y = [row['xpos'] for row in validation_dataset]
validation_Y = preprocess(validation_Y)

prediction_Y = []
for i in range(0, len(validation_X)):
  prediction_Y.append(base_model.predict(validation_X[i]))

labels in the order of frequencies : ['OTHERS', 'NN', 'IN', 'VB', 'JJ', 'RB']


In [14]:
validation_Y[0]

['IN', 'OTHERS', 'NN', 'OTHERS', 'OTHERS', 'NN', 'OTHERS']

In [15]:
prediction_Y[0]

['IN', 'OTHERS', 'NN', 'OTHERS', 'OTHERS', 'NN', 'OTHERS']

In [16]:
appended_prediction_labels = []
for p in prediction_Y:
  appended_prediction_labels+=p
appended_ground_truth = []
for g in validation_Y:
  appended_ground_truth+=g
evaluate_metrics(appended_prediction_labels, appended_ground_truth)

accuracy : 0.8603893555224115
macro precision : 0.8775669633288364
macro recall : 0.817050865808997
micro precision : 0.8603893555224115
micro recall : 0.8603893555224115


In [None]:
import torch
import torch.nn as nn

class LinearChainCRF(nn.Module):
    def __init__(self, num_tags, num_features):
        super().__init__()
        # Initialize weights
        self.num_tags = num_tags
        self.feature_weights = nn.Parameter(torch.randn(1, num_features))

    def _forward_alg(self, features):
        # Forward algorithm for computing the partition function (Z score)
        # TODO: implement this
        pass

    def _score_sequence(self, features, tags):
        # Calculate the score of a given sequence
        score = torch.zeros(1)
        for feature in features:
            score += torch.dot(feature, self.feature_weights)
        return score

    def neg_log_likelihood_loss(self, features, tags):
        # Negative log likelihood loss
        # Z score - numerator
        pass

    def _viterbi_decode(self, features):
        # TODO: implement this
        pass

    def forward(self, features):
        # TODO: implement this 
        pass
