In [1]:
import os
import sys,re
import numpy as np
from copy import deepcopy
from scipy import sparse
from tqdm.notebook import tqdm
from collections import Counter,defaultdict

In [2]:
# import lightgbm as lgb
# from lightgbm import LGBMClassifier 


from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB


from sklearn.model_selection import GridSearchCV
from sklearn import model_selection

In [3]:
train_file_conllu  = r'.\TreeBanks\universal_hindi\hi_hdtb-ud-train.conllu'
train_file_sentences  = r'.\TreeBanks\universal_hindi\hi_hdtb-ud-train.txt'


val_file_conllu  = r'.\TreeBanks\universal_hindi\hi_hdtb-ud-dev.conllu'
val_file_sentences  = r'.\TreeBanks\universal_hindi\hi_hdtb-ud-dev.txt'


test_file_conllu  = r'.\TreeBanks\universal_hindi\hi_hdtb-ud-test.conllu'
test_file_sentences  = r'.\TreeBanks\universal_hindi\hi_hdtb-ud-test.txt'


In [4]:
def read_file(file_name):
    f = open(file_name,encoding = 'utf-8')
    lines = f.readlines()
    for i in range(len(lines)):
        lines[i] = lines[i].rstrip().split('\t')
    f.close()
    return lines

In [5]:
class defaultList(list):
    """A list that returns a default value if index out of bounds."""
    def __init__(self, default=None):
        self.default = default
        list.__init__(self)

    def __getitem__(self, index):
        try:
            return list.__getitem__(self, index)
        except IndexError:
            return self.default

In [6]:
SHIFT = 0; RIGHT = 1; LEFT = 2
MOVES = [SHIFT, RIGHT, LEFT]

In [7]:
class Parse(object):
    def __init__(self,size):
        
        self.size = size
        self.heads = [None]*(size + 1)
        self.left_arc = []
        self.right_arc = []
        
        for i in range(size+1):
            self.left_arc.append(defaultList(0))
            self.right_arc.append(defaultList(0))
            
        
        
    def add_arc(self,head,child):
        
        self.heads[child] = head
        
        if child < head:
            self.left_arc[head].append(child)
        else:
            self.right_arc[head].append(child)

In [8]:
def transition(move, i, stack, parse):
    
    
    global SHIFT, RIGHT, LEFT
    if move == SHIFT:
        stack.append(i)
        return i + 1
    elif move == RIGHT:
        parse.add_arc(stack[-2], stack.pop())
        return i
    elif move == LEFT:
        parse.add_arc(i, stack.pop())
        return i

In [9]:
class DependencyParser:
    def __init__(self,train_treebank,test_treebank=None):
        self.features = list()
        self.labels = list()
        self.train_treebank = train_treebank
        self.test_treebank = test_treebank
        
        self.feature_vocabulary = defaultdict()
        self.label_vocabulary = defaultdict()
        
        self.reverse_features = list()
        self.reverse_labels = list()
        
        self.classifier = None
        self.max_features = 0
    
    def check_if_tree_is_connected(self,begin,dependent_head,token_heads):     
        current = begin
        while current is not 0:
            current = token_heads[current]
            if current == dependent_head:
                return True
        
        return False
    
    def check_projectivity(self,line_tokens):
        token_heads = defaultdict()
        
        for fields in line_tokens:
            token_id = fields[0]
            token_head = fields[3]
            
            if token_heads.get(token_id) == None: # not seen before
                token_heads[token_id] = token_head
        
        for dependent in token_heads:
            
            dependent_head = token_heads[dependent]
            left = 1 + min(dependent_head,dependent)
            right = max(dependent_head,dependent)
            
            for i in range(left,right):
                
                if self.check_if_tree_is_connected(i,dependent_head,token_heads) == False:
                    return False
        
        return True
    
    def check_condition(self,buffer,stack,dependencies = {},condition=None):
        
        if(condition == '0'):
            return len(buffer) is 0 and len(stack) is 1 and stack[0] == 0
        
        elif(condition == '1'):
            #condition for shift
            return len(stack) < 2 and len(buffer) > 0 #stack has only root and items are still in buffer
    
        elif(condition == '2'): #condition for left arc
            stack_top = stack[-1]
            stack_2nd = stack[-2]
            
            return dependencies.get(stack_top) is not None and (stack_top,stack_2nd) in dependencies[stack_top]
        
        elif(condition == '3'): #condition for right arc
            stack_top = stack[-1]
            stack_2nd = stack[-2]
            
            return dependencies.get(stack_2nd) is not None and (stack_2nd,stack_top) in dependencies[stack_2nd]
        

                
    def tree_to_actions(self,buffer,stack,arcs,dependencies):
        def perform_shift(buffer,stack,arcs,configuration_states,gold_transitions):
                configuration_states.append((list(buffer), list(stack), list(arcs)))
    
                gold_transitions.append("shift")
    
                stack.append(buffer.pop())
        
        def perform_arc(arc_label,buffer,stack,arcs,configuration_states,gold_transitions,arc_type):
            configuration_states.append((list(buffer), list(stack), list(arcs)))
            
            action = f'{arc_type}_{arc_label}'
            
            gold_transitions.append(action)
            
            
            left = stack[-1]
            right = stack[-2]
            
            if arc_type == 'left':
                arcs.append((arc_label,left,right))
                stack.pop(-2)
                
            elif arc_type == 'right':
                arcs.append((arc_label,right,left))
                stack.pop()
        
        configuration_states = list() #[(buf,stk,arc)...]
        gold_transitions = list() #[shift or left_rel or right_rel] actions for each state
        
        seen_dependencies = defaultdict()
        
        while( len(buffer) >= 0 ):
            if self.check_condition(buffer,stack,condition = '0'):
                return configuration_states , gold_transitions
            
            if self.check_condition(buffer,stack,condition = '1'):
                perform_shift(buffer,stack,arcs,configuration_states,gold_transitions)
                continue
                
            stack_top = stack[-1]
            stack_2nd = stack[-2]
            
            if self.check_condition(buffer,stack,dependencies,condition = '2'):
                arc_label = dependencies[stack_top][(stack_top,stack_2nd)]
                
                perform_arc(arc_label, buffer,stack,arcs,
                           configuration_states,gold_transitions,
                           arc_type = 'left')
                
                seen_dependencies[stack_2nd] = 1
                
            elif self.check_condition(buffer,stack,dependencies,condition = '3'):
                if( dependencies.get(stack_top) is not None and any([child not in seen_dependencies for _,child in dependencies[stack_top]]) ):
                    
                    perform_shift(buffer,stack,arcs,configuration_states,gold_transitions)
                    
                else:                 
                    arc_label = dependencies[stack_2nd][(stack_2nd,stack_top)]
                
                    perform_arc(arc_label, buffer,stack,arcs,
                           configuration_states,gold_transitions,
                           arc_type = 'right')
                
                    seen_dependencies[stack_top] = 1

            else:
                perform_shift(buffer,stack,arcs,configuration_states,gold_transitions)
                
        
    def extract_configs_transitions(self,line_tokens):
        
        stack,buffer,arcs = list(),list(),list()
        dependencies = defaultdict(lambda: defaultdict())
        
        stack.append(0) #add root
        
        for tokens in reversed(line_tokens):
            tok_id,word,x_pos,head,dependecy_relation = tokens
            
            head_tok_id = (head,tok_id)
                
            dependencies[head][head_tok_id] = dependecy_relation
            
            buffer.append(tok_id)
            
        return self.tree_to_actions(buffer,stack,arcs,dependencies)
    
    def extract_features(self,current_configuration,current_transtion,
                            tree_dependencies,tokens_dict,POS_tags):

        
        def get_stackcontext_features(context_type,stack):
            
            stack_len = len(stack)
            if context_type == 'words':
                if stack_len >= 4:
                    return tokens_dict[stack[-1]],tokens_dict[stack[-2]],tokens_dict[stack[-3]]
                elif stack_len >=3:
                    return tokens_dict[stack[-1]],tokens_dict[stack[-2]],''
                elif stack_len >= 2:
                    return tokens_dict[stack[-1]],'',''
                else:
                    return '','',''
            
            if context_type == 'POS_tags':
                if stack_len >= 4:
                    return POS_tags[stack[-1]],POS_tags[stack[-2]],POS_tags[stack[-3]]
                elif stack_len >=3:
                    return POS_tags[stack[-1]],POS_tags[stack[-2]],''
                elif stack_len >= 2:
                    return POS_tags[stack[-1]],'',''
                else:
                    return '','',''
                
                
        def get_buffercontext_features(context_type,buffer):
            buffer_len = len(buffer)
            num_words = len(tokens_dict) - 1
            
            if buffer_len > 0:
                if context_type == 'words':
                    if buffer_len >=3:
                        return tokens_dict[buffer[-1]],tokens_dict[buffer[-2]],tokens_dict[buffer[-3]]

                    elif buffer_len >= 2:
                        return tokens_dict[buffer[-1]], tokens_dict[buffer[-2]],''

                    elif buffer_len>=1:
                        return tokens_dict[buffer[-1]],'',''

                if context_type == 'POS_tags':
                    if buffer_len >=3:
                        return POS_tags[buffer[-1]],POS_tags[buffer[-2]],POS_tags[buffer[-3]]

                    elif buffer_len >= 2:
                        return POS_tags[buffer[-1]], POS_tags[buffer[-2]],''

                    elif buffer_len>=1:
                        return POS_tags[buffer[-1]],'',''
            else:
                return '','',''
                        
        def get_node_children(word,data,subtree):
            
            if word == -1 :
                return 0,'',''
            
            dependents = subtree[word]
            valency = len(dependents)
            
            if not valency:
                return 0,'',''
            
            elif valency == 1:
                return 1,data[dependents[-1]],''
            else:
                return valency,data[dependents[-1]],data[dependents[-2]]
            
            
        def get_stack_features(idx,stack,tokens_dict,POS_tags):
            if idx == 'top':
                return tokens_dict[stack[-1]],POS_tags[stack[-1]]
            elif idx == '2nd_top':
                return tokens_dict[stack[-2]],POS_tags[stack[-2]]
        
        def get_buffer_features(buffer,tokens_dict,POS_tags):
                return tokens_dict[buffer[-1]],POS_tags[buffer[-1]]
            
        def get_two_word_features(tokens_dict,stack,POS_tags):
            return tokens_dict[stack[-1]],POS_tags[stack[-1]],POS_tags[stack[-2]]
        
        def insert_to_features(features,feat_i,feat_j,feat_k):
            features[feat_i] = 1
            features[feat_j] = 1
            features[feat_k] = 1
              
        
        label = current_transtion
        buffer,stack,arcs = current_configuration       
        features = defaultdict()
        
        stack_len = len(stack)
        buff_len = len(buffer)
        
        if stack_len > 1:
            stack_top = stack[-1] 
            
        else:
            stack_top = -1
        
        if len(buffer)>=1:
            curr_word = buffer[-1]
        else:
            curr_word = -1
            
        
        #print(f'st top {stack_top}, curr word {curr_word}')
        
        if len(stack) > 0:
            top_token,top_pos = get_stack_features('top',stack,tokens_dict,POS_tags)
        
            top_feature_1 = f'stack_top_{top_token}'
            top_feature_2 = f'stack_pos_{top_pos}'
            top_feature_3 = f'stack_top_pos_{top_token}_{top_pos}'
            
            insert_to_features(features,top_feature_1,top_feature_2,top_feature_3)
        
        if len(stack) > 1:
            top2nd_token,top2nd_pos = get_stack_features('2nd_top',stack,tokens_dict,POS_tags)
        
            top2nd_feature_4 = f'stack2nd_top_{top2nd_token}'
            top2nd_feature_5 = f'stack2nd_pos_{top2nd_pos}'
            top2nd_feature_6 = f'stack2nd_top_pos_{top2nd_token}_{top2nd_pos}'
            
            insert_to_features(features,top2nd_feature_4,top2nd_feature_5,top2nd_feature_6)
            
        if len(buffer) > 0:
            buf_token,buf_pos = get_buffer_features(buffer,tokens_dict,POS_tags)
        
            buf_feature_7 = f'buffer_top_{buf_token}'
            buf_feature_8 = f'buffer_pos_{buf_pos}'
            buf_feature_9 = f'buffer_top_pos_{buf_token}_{buf_pos}'
            
            insert_to_features(features,buf_feature_7,buf_feature_8,buf_feature_9)
            
            
        if len(stack) > 1 : 
            stack_top1 , pos_top1 , pos_2nd_top1 = get_two_word_features(tokens_dict,stack,POS_tags)
            
            two_word_feature_10 = f'stack_top_pos_top_pos_2ndtop_{stack_top1}_{pos_top1}_{pos_2nd_top1}'
            features[two_word_feature_10] = 1
        
        
        word_st_top,word_st_2ndtop,word_st_3rdtop = get_stackcontext_features('words',stack)
        POS_st_top,POS_st_2ndtop,POS_st_3rdtop = get_stackcontext_features('POS_tags',stack)
        
        
        word_buf_top,word_buf_2ndtop,word_buf_3rdtop = get_buffercontext_features('words',buffer)
        POS_buf_top,POS_buf_2ndtop,POS_buf_3rdtop = get_buffercontext_features('POS_tags',buffer)
        
        
        bufleft_word_val,bufleft_word_child1,bufleft_word_child2 = get_node_children(curr_word,tokens_dict,
                                                                            tree_dependencies.left_arc)
        bufleft_tag_val,bufleft_tag_child1,bufleft_tag_child2 = get_node_children(curr_word,POS_tags,
                                                                            tree_dependencies.left_arc)
        
        bufright_word_val,bufright_word_child1,bufright_word_child2 = get_node_children(curr_word,tokens_dict,
                                                                            tree_dependencies.right_arc)
        _,bufright_tag_child1,bufright_tag_child2 = get_node_children(curr_word,POS_tags,
                                                                            tree_dependencies.right_arc)

        
        stleft_word_val,stleft_word_child1,stleft_word_child2 = get_node_children(stack_top,tokens_dict,
                                                                            tree_dependencies.left_arc)
        _,stleft_tag_child1,stleft_tag_child2 = get_node_children(stack_top,POS_tags,
                                                                            tree_dependencies.left_arc)
        
        
        stright_word_val,stright_word_child1,stright_word_child2 = get_node_children(stack_top,tokens_dict,
                                                                            tree_dependencies.right_arc)
        _,stright_tag_child1,stright_tag_child2 = get_node_children(stack_top,POS_tags,
                                                                            tree_dependencies.right_arc)
        
        distance = 0
        
        if stack_top != 0 and curr_word != -1:
            distance  = min(curr_word - stack_top,5)
        
        word_feats = defaultdict(list)
        tag_feats = defaultdict(list)
        
        word_feats['stack_word'] = [word_st_top,word_st_2ndtop,word_st_3rdtop]
        word_feats['buffer_word'] = [word_buf_top,word_buf_2ndtop,word_buf_3rdtop]
        
        word_feats['left_parse_buffer'] = [bufleft_word_child1,bufleft_word_child2]
        word_feats['right_parse_buffer'] = [bufright_word_child1,bufright_word_child2]
        
        word_feats['left_parse_stack'] = [stleft_word_child1,stleft_word_child2]
        word_feats['right_parse_stack'] = [stright_word_child1,stright_word_child2]
        
        tag_feats['stack_pos'] =  [POS_st_top,POS_st_2ndtop,POS_st_3rdtop]
        tag_feats['buffer_pos'] = [POS_buf_top,POS_buf_2ndtop,POS_buf_3rdtop]
        
        tag_feats['left_parse_buffer'] = [bufleft_tag_child1,bufleft_tag_child2]
        tag_feats['right_parse_buffer'] = [bufright_tag_child1,bufright_tag_child2]
        
        tag_feats['left_parse_stack'] = [stleft_tag_child1,stleft_tag_child2]
        tag_feats['right_parse_stack'] =[stright_tag_child1,stright_tag_child2]
                
#         for feat_name,curr_feat in word_feats.items():
#             for word in curr_feat:
#                 if word:
#                     features[f'word = {word}']=1
                    
#         for feat_name,curr_feat in tag_feats.items():
#             for tag in curr_feat:
#                 if word:
#                     features[f'tag = {tag}']=1
        
        context_features = defaultdict(tuple)
        
        context_features['word_tag_pairs'] = (
            (word_st_top,POS_st_top),
            (word_buf_top,POS_buf_top),
            (word_buf_2ndtop,POS_buf_2ndtop),
            (word_buf_3rdtop,POS_buf_3rdtop)
                         )
        
        context_features['bigram_features'] = (
            (word_st_top,word_buf_top),
            (POS_st_top,POS_buf_top),
            (POS_buf_top,POS_buf_2ndtop),
#             (word_buf_top,POS_buf_top,word_st_top),
#             (word_buf_top,POS_buf_top,POS_st_top),
#             (word_st_top,POS_st_top,word_buf_top),
#             (word_st_top,POS_st_top,POS_buf_top),
#             (word_st_top,POS_st_top,word_buf_top,POS_buf_top)
        )
        
        context_features['trigram_features'] = (
            (POS_buf_top,POS_buf_2ndtop,POS_buf_3rdtop),
            (POS_st_top,POS_buf_top,POS_buf_2ndtop),
            (POS_st_top,POS_st_2ndtop,POS_buf_top),
            (POS_st_top,stright_tag_child1,POS_buf_top),
            (POS_st_top,POS_buf_top,bufleft_tag_child1),
            (POS_st_top,stleft_tag_child1,stleft_tag_child2),
            (POS_st_top,stright_tag_child1,stright_tag_child2),
            (POS_buf_top,bufleft_tag_child1,bufleft_tag_child2),
            (POS_st_top,POS_st_2ndtop,POS_st_3rdtop)
                            )
        
        context_features['word_val_feat'] = (
            (word_st_top,stright_word_val),
            (word_st_top,stleft_word_val),
            (word_buf_top,bufleft_word_val)
                        )
        
        context_features['tag_val_feat'] = (
            (POS_st_top,stright_word_val),
            (POS_st_top,stleft_word_val),
            (POS_buf_top,bufleft_word_val)
                       )
        
        context_features['distance_feat'] = (
            (word_st_top,distance),
            (word_buf_top,distance),
            (POS_st_top,distance),
            (POS_buf_top,distance),
            ('tag '+ POS_st_top + POS_buf_top,distance),
            ('word_ft ' + word_st_top + word_buf_top,distance)
        )
        
        
        for k,v in context_features.items():
            if k == 'word_val_feat':
                for (word,val) in v:
                    #print((word,val))
                    if word or val:
                        features[f'word = {word} val = {val}']=1
                        
            elif k == 'tag_val_feat':
                for (tag,val) in v:
                    #print((tag,val))
                    if tag or val:
                        features[f'tag = {tag} val = {val}']=1
                        
            elif k == 'distance_feat':
                for (word_ft,dist) in v:
                    #print((word_ft,dist))
                    if word_ft or dist:
                        features[f'{word_ft} distance = {dist}']=1

            
#             elif k == 'trigrams_features':
#                 for (ft1,ft2,ft3) in v:
#                     #print((ft1,ft2,ft3))
#                     if ft1 or ft2 or ft3:
#                         features[f'trigram = {ft1} {ft2} {ft3}']=1
                        
                        
#             elif k == 'bigram_features':
#                 for ft1,ft2 in v:
#                     if ft1 or ft2:
#                         features[f'biigram = {ft1} {ft2}']=1

#              if k == 'word_tag_pairs':
#                 for (word,tag) in v:
#                     #print((word,tag))
#                     if word or tag:
#                         features[f'word = {word} tag = {tag}']=1

                                                
        label = current_transtion
        
        return features,label

       
    def create_reverse_mappings(self,label_id,feature_id):
        self.reverse_labels = [0] * label_id # empty array of number of labels
        self.reverse_features = [0] * feature_id # empty array of number of features
        
        for feature in tqdm(self.feature_vocabulary):
            self.reverse_features[self.feature_vocabulary[feature]] = feature
        for label in tqdm(self.label_vocabulary):
            self.reverse_labels[self.label_vocabulary[label]] = label

        
    
    def convert_to_training_data(self,):
        
        label_id = 0
        feature_id = 0 
        
        feature_counts = defaultdict(int)
        total_features = len(self.features)
        
        print('Building Training Data...')
        
        for i in tqdm(range(total_features)):
            for feature in self.features[i]:
                feature_counts[feature] += 1
                
            if self.label_vocabulary.get(self.labels[i]) is None:
                self.label_vocabulary[self.labels[i]] = label_id
                label_id += 1
                
        for feature in tqdm(feature_counts):
            if feature_counts[ feature ] > 5 and self.feature_vocabulary.get(feature) is None:
                self.feature_vocabulary[ feature ] = feature_id
                feature_id += 1
        
        print('building cache')
        self.create_reverse_mappings(label_id,feature_id)
        
        self.max_features = feature_id
        x_train = sparse.lil_matrix((total_features,feature_id))
        y_train = list()
        
        for i in tqdm(range(total_features)):
            for feature in self.features[i]:
                
                if self.feature_vocabulary.get(feature) is not None:
                    f_count = self.feature_vocabulary[feature]
                    x_train[i,f_count] = 1
                
            y_train.append(self.label_vocabulary[self.labels[i]])
        
        return x_train,y_train

    
        
    def getChildren(self,state,transitions):
        
        num_words = len(state[0][0])
        dependencies = Parse(num_words)


        for i in range(1,len(state)):
            
            curr_buff = deepcopy(state[i][0])
            curr_stack = deepcopy(state[i][1])
            curr_arc = deepcopy(state[i][2])
            
            
            #print(transitions[i-1])
            #print(curr_buff)
            #print(curr_stack)
            #print(curr_arc)

            if transitions[i-1] == 'shift':
                #print('-'*50)
                continue
            
            elif transitions[i-1].startswith('left'):
                head = curr_arc[-1][1]
                child = curr_arc[-1][2]
                
                dependencies.add_arc(head,child)
                
            elif transitions[i-1].startswith('right'):
                head = curr_arc[-1][1]
                child = curr_arc[-1][2]
                
                dependencies.add_arc(head,child)
                
            #print(dependencies.heads)
            #print(dependencies.left_arc)
            #print(dependencies.right_arc)
            #print('-'*50)
            
        return dependencies
    

    def extract_line_tokens(self,line):
        line_id,word,stem_word,u_pos,x_pos = line[0],line[1],line[2],line[3],line[4]
        feat,head,dep_rel,dep,misc = line[5],line[6],line[7],line[8],line[9]
    
        return int(line_id),word,x_pos,int(head),dep_rel
                
    def build_oracles(self,):
        
        current_tokens = []
        tokens_dict = {}
        POS_tags = {}
        
        tokens_dict[0]= 'root'
        POS_tags[0]= 'root'
        
        for current_line in self.train_treebank:
    
            if len(current_line) < 2 : # useful fields
                
                if len(current_tokens) > 0 : #have found some tokens before
                    
                        
                    if self.check_projectivity(current_tokens) == True: #if current tree is projective
            
                        
                        configuration_state,gold_transitions = self.extract_configs_transitions(current_tokens)
                        
                        tree_dependencies = self.getChildren(configuration_state,gold_transitions)
                        
                        for i in range(len(configuration_state)):
                            #print(f'{configuration_state[i]}------{gold_transitions[i]}')
#                             features , labels = self.get_features_labels(configuration_state[i],gold_transitions[i],
#                                                                          tokens_dict,POS_tags)

                            
                            features , labels = self.extract_features(configuration_state[i],gold_transitions[i],
                                                  tree_dependencies,tokens_dict,POS_tags)

                            
                            self.features.append(features)
                            self.labels.append(labels)
                    

                    
                    current_tokens = []
                    tokens_dict = {}
                    POS_tags = {}
                    
                    tokens_dict[0]= 'root'
                    POS_tags[0]= 'root'
                    
                continue
                
            if current_line[0].startswith('#'):
                continue
            
            tok_id,word,x_pos,head_id,dep_rel = self.extract_line_tokens(current_line)
            current_tokens.append((tok_id,word,x_pos,head_id,dep_rel))
            tokens_dict[tok_id] = word
            POS_tags[tok_id] = x_pos 
            
            
       
        X_train,y_train = self.convert_to_training_data()
        
        print('finished')
        return X_train,y_train,self.features
    
    def get_model(self,model_name):
        
        model = None
        if model_name == 'lr_l2':
            model = LogisticRegression(verbose = 2,n_jobs = 40)
        
        elif model_name == 'lr_el_saga':
            model = LogisticRegression(penalty = 'elasticnet',verbose = 2,n_jobs = 4,solver = 'saga',l1_ratio = 0.25)

        elif model_name == 'svm_rbf':
            model = SVC(kernel = 'rbf', C=1.6)
        
        elif model_name == 'svm_poly':
            model = SVC(kernel = 'poly', C=1.5)
        
        elif model_name == 'sgd_l2':
            model = SGDClassifier(shuffle = False)
        
        elif model_name == 'sgd_el':
            model = SGDClassifier(penalty = 'elasticnet', shuffle=False)
        
        elif model_name == 'rf_gini':
            model = RandomForestClassifier(n_estimators = 552, criterion = 'gini', n_jobs = 4 )
        
        elif model_name == 'rf_ent':
            model = RandomForestClassifier(n_estimators = 552, criterion = 'entropy', n_jobs = 4 )
            
        elif model_name == 'pac':
            model = PassiveAggressiveClassifier(n_jobs = 4,vebose = 2)
        
        elif model_name == 'gnb':
            model = GaussianNB()
        
        elif model_name == 'bnb':
            model = BernoulliNB()
        
        elif model_name == 'lgb':
            model = LGBMClassifier()
            
        return model   

    def train(self,x_train,y_train,model_name = None):
        
        self.classifier = model_name
        
        model = self.get_model(model_name)
        
        model.fit(x_train,y_train)
        
        return model
    
    def action_to_tree(self,test_dependency_tree,prediction_probabs,test_buffer,test_stack,test_arcs):
        
        def is_action_valid(test_stack,test_buffer,action_taken):
            if action_taken == "shift" and len(test_buffer) > 0 :
                return True
            
            elif action_taken.startswith('left') and len(test_stack) > 1 and test_stack[-1] != 0: #0 is root
                return True 
            
            elif action_taken.startswith('right') and len(test_stack) > 1 and test_stack[-2] != 0: #0 is root
                return True 
            
            else:
                return False
            
            
        for label_id_prob in np.argsort(prediction_probabs, kind = 'quicksort')[0]:
            
            action_taken = self.reverse_labels[label_id_prob]
            
            if is_action_valid(test_stack,test_buffer,action_taken):
                
                if action_taken == "shift":
                    test_stack.append(test_buffer.pop())
                
                elif action_taken.startswith('left'):
                    arc_label = action_taken.split('_')[1]
                    test_dependency_tree[test_stack[-2]] = (test_stack[-1], arc_label)
                    test_stack.pop(-2)
                    
                elif action_taken.startswith('right'):
                    arc_label = action_taken.split('_')[1]
                    test_dependency_tree[test_stack[-1]] = (test_stack[-2], arc_label)
                    test_stack.pop()
                return
    
    def evaluate_oracles_and_predictions(self,test_tokens,model):
        
        test_tokens_dict = defaultdict()
        test_POS_tags = defaultdict()
        
        test_tokens_dict[0] = 'root'
        test_POS_tags[0] = 'root'
        
        test_heads = defaultdict()
        test_labels = defaultdict()
        test_heads[0] = 'root'
        
        test_buffer = list()
        
        test_stack = list()
        test_stack.append(0)
        
        test_arcs = list()
        
        
        
        for test_token in reversed(test_tokens):
            
            token_id,token_word,token_tag,token_head,token_label = test_token
            
            test_tokens_dict[token_id] = token_word
            test_POS_tags[token_id] = token_tag
            
            test_heads[token_id] = token_head
            test_labels[token_id] = token_label
            
            test_buffer.append(token_id)
            
        i = 0
        test_config_state,test_gold_transitions = self.extract_configs_transitions(test_tokens)     
        test_tree_dep = self.getChildren(test_config_state,test_gold_transitions)
        
        test_dependency_tree = defaultdict()
                
        while len(test_buffer) > 0 :
            
            if len(test_buffer) == 0  and len(test_stack)==0 : #nothing available to perform an action
                break
                
            if len(test_buffer) == 0  and len(test_stack)==1 and test_stack[0] == 0 : #nothing but root
                break
            
            current_config =  (test_buffer,test_stack,test_arcs)
                        
            test_features,_ = self.extract_features(current_config,None,
                                                  test_tree_dep,test_tokens_dict,test_POS_tags)
            
            x_test = sparse.lil_matrix((1,self.max_features))
            
            for feature in test_features:
                if self.feature_vocabulary.get(feature) is not None:
                    x_test[0,self.feature_vocabulary[feature]] = 1
                    
            prediction_probabs = -1 * model.predict_proba(x_test)
            
            self.action_to_tree(test_dependency_tree,prediction_probabs,test_buffer,test_stack,test_arcs)
            
        unlabelled_attachment_score = 0
        labelled_attachment_score = 0
            
        total_children = 0
        
        for child in test_dependency_tree:
                
    
            token_head,arc_label = test_dependency_tree[child]
        


            if token_head == test_heads[child]:
                unlabelled_attachment_score += 1
                    
                if arc_label == test_labels[child]:
                    labelled_attachment_score += 1
                
            total_children += 1
                
        return [unlabelled_attachment_score,labelled_attachment_score,total_children]
        
    
    def evaluate(self,test_treebank,model):
        
        if self.test_treebank is None:
            self.test_treebank = test_treebank
        
        test_tokens = [] #tokens in current line
        current_all_scores = []
        total_scores = np.zeros(3)
        
        i=0
        
        for current_line in tqdm(self.test_treebank):
            i+=1
            if len(current_line) < 2 :
                
                if len(test_tokens) > 0: # has seen tokens
                    #print(test_tokens)
                    if self.check_projectivity(test_tokens) == True:
                        
                            current_all_scores = self.evaluate_oracles_and_predictions(test_tokens,model)
                            total_scores += current_all_scores
                            
                            print(f'Total lines : {total_scores[2]} \n Unlabelled attachment Score {total_scores[0]/total_scores[2]} \n Labelled attachment Score {total_scores[1]/total_scores[2]}')
                            
                            
                    test_tokens = []
                continue
                
            if current_line[0].startswith('#'):
                continue
            
            tok_id,word,x_pos,head_id,dep_rel = self.extract_line_tokens(current_line)
            test_tokens.append((tok_id,word,x_pos,head_id,dep_rel))
        
        accuracy = f'Total lines : {total_scores[2]} \n Unlabelled attachment Score {total_scores[0]/total_scores[2]} \n Unlabelled attachment Score {total_scores[1]/total_scores[2]}'
        
        return accuracy


In [10]:
train_treebank = read_file(train_file_conllu)
test_treebank = read_file(val_file_conllu)

In [11]:
    

dp = DependencyParser(train_treebank)


# (deprel,i1,i2) --> {left : i1 is head,i2 is child}
x_train,y_train,f = dp.build_oracles()


Building Training Data...


HBox(children=(FloatProgress(value=0.0, max=460986.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=394204.0), HTML(value='')))


building cache


HBox(children=(FloatProgress(value=0.0, max=94661.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=53.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=460986.0), HTML(value='')))


finished


In [12]:
%%time

model3 = dp.train(x_train,y_train,'lr_l2')




[Parallel(n_jobs=40)]: Using backend LokyBackend with 40 concurrent workers.


Wall time: 3min 31s


[Parallel(n_jobs=40)]: Done   1 out of   1 | elapsed:  3.5min finished


In [13]:
x_train

<460986x94661 sparse matrix of type '<class 'numpy.float64'>'
	with 8816042 stored elements in LInked List format>

In [14]:

accuracy3 = dp.evaluate(test_treebank,model3)

HBox(children=(FloatProgress(value=0.0, max=40194.0), HTML(value='')))

Total lines : 22.0 
 Unlabelled attachment Score 0.8181818181818182 
 Labelled attachment Score 0.7727272727272727
Total lines : 35.0 
 Unlabelled attachment Score 0.8285714285714286 
 Labelled attachment Score 0.7714285714285715
Total lines : 52.0 
 Unlabelled attachment Score 0.8269230769230769 
 Labelled attachment Score 0.7884615384615384
Total lines : 77.0 
 Unlabelled attachment Score 0.7792207792207793 
 Labelled attachment Score 0.7272727272727273
Total lines : 87.0 
 Unlabelled attachment Score 0.7931034482758621 
 Labelled attachment Score 0.735632183908046
Total lines : 87.0 
 Unlabelled attachment Score 0.7931034482758621 
 Labelled attachment Score 0.735632183908046
Total lines : 99.0 
 Unlabelled attachment Score 0.8080808080808081 
 Labelled attachment Score 0.7575757575757576
Total lines : 112.0 
 Unlabelled attachment Score 0.8303571428571429 
 Labelled attachment Score 0.7767857142857143
Total lines : 118.0 
 Unlabelled attachment Score 0.8389830508474576 
 Labelled a

Total lines : 864.0 
 Unlabelled attachment Score 0.8275462962962963 
 Labelled attachment Score 0.75
Total lines : 877.0 
 Unlabelled attachment Score 0.8289623717217788 
 Labelled attachment Score 0.750285062713797
Total lines : 885.0 
 Unlabelled attachment Score 0.8305084745762712 
 Labelled attachment Score 0.751412429378531
Total lines : 892.0 
 Unlabelled attachment Score 0.827354260089686 
 Labelled attachment Score 0.7488789237668162
Total lines : 903.0 
 Unlabelled attachment Score 0.8272425249169435 
 Labelled attachment Score 0.7497231450719822
Total lines : 913.0 
 Unlabelled attachment Score 0.8291347207009858 
 Labelled attachment Score 0.7524644030668127
Total lines : 927.0 
 Unlabelled attachment Score 0.8306364617044228 
 Labelled attachment Score 0.7529665587918015
Total lines : 943.0 
 Unlabelled attachment Score 0.8303287380699894 
 Labelled attachment Score 0.750795334040297
Total lines : 947.0 
 Unlabelled attachment Score 0.8310454065469906 
 Labelled attachment

Total lines : 1972.0 
 Unlabelled attachment Score 0.8387423935091278 
 Labelled attachment Score 0.7545638945233266
Total lines : 2001.0 
 Unlabelled attachment Score 0.8390804597701149 
 Labelled attachment Score 0.7551224387806097
Total lines : 2008.0 
 Unlabelled attachment Score 0.8396414342629482 
 Labelled attachment Score 0.7554780876494024
Total lines : 2033.0 
 Unlabelled attachment Score 0.839153959665519 
 Labelled attachment Score 0.7555336940482046
Total lines : 2046.0 
 Unlabelled attachment Score 0.8401759530791789 
 Labelled attachment Score 0.7561094819159335
Total lines : 2066.0 
 Unlabelled attachment Score 0.8402710551790901 
 Labelled attachment Score 0.7560503388189739
Total lines : 2076.0 
 Unlabelled attachment Score 0.8386319845857418 
 Labelled attachment Score 0.7548169556840078
Total lines : 2088.0 
 Unlabelled attachment Score 0.8395593869731801 
 Labelled attachment Score 0.7547892720306514
Total lines : 2100.0 
 Unlabelled attachment Score 0.84 
 Labelle

Total lines : 3285.0 
 Unlabelled attachment Score 0.8429223744292238 
 Labelled attachment Score 0.7552511415525114
Total lines : 3297.0 
 Unlabelled attachment Score 0.843190779496512 
 Labelled attachment Score 0.7555353351531695
Total lines : 3306.0 
 Unlabelled attachment Score 0.8433151845130067 
 Labelled attachment Score 0.7558983666061706
Total lines : 3343.0 
 Unlabelled attachment Score 0.8432545617708644 
 Labelled attachment Score 0.7538139395752318
Total lines : 3357.0 
 Unlabelled attachment Score 0.8430145963658028 
 Labelled attachment Score 0.7530533214179327
Total lines : 3372.0 
 Unlabelled attachment Score 0.8431198102016607 
 Labelled attachment Score 0.7529655990510083
Total lines : 3381.0 
 Unlabelled attachment Score 0.8432416444838805 
 Labelled attachment Score 0.7527358769594794
Total lines : 3390.0 
 Unlabelled attachment Score 0.8433628318584071 
 Labelled attachment Score 0.752802359882006
Total lines : 3404.0 
 Unlabelled attachment Score 0.8440070505287

Total lines : 4614.0 
 Unlabelled attachment Score 0.8508885999133073 
 Labelled attachment Score 0.7622453402687472
Total lines : 4622.0 
 Unlabelled attachment Score 0.850713976633492 
 Labelled attachment Score 0.7622241453916053
Total lines : 4640.0 
 Unlabelled attachment Score 0.850646551724138 
 Labelled attachment Score 0.7616379310344827
Total lines : 4649.0 
 Unlabelled attachment Score 0.8505054850505485 
 Labelled attachment Score 0.7614540761454076
Total lines : 4676.0 
 Unlabelled attachment Score 0.850727117194183 
 Labelled attachment Score 0.7619760479041916
Total lines : 4685.0 
 Unlabelled attachment Score 0.8510138740661686 
 Labelled attachment Score 0.7622198505869797
Total lines : 4708.0 
 Unlabelled attachment Score 0.85089209855565 
 Labelled attachment Score 0.762107051826678
Total lines : 4714.0 
 Unlabelled attachment Score 0.8510818837505303 
 Labelled attachment Score 0.7617734408145949
Total lines : 4745.0 
 Unlabelled attachment Score 0.8512118018967334 

Total lines : 5965.0 
 Unlabelled attachment Score 0.8511316010058676 
 Labelled attachment Score 0.7659681475272423
Total lines : 5976.0 
 Unlabelled attachment Score 0.8510709504685409 
 Labelled attachment Score 0.7658969210174029
Total lines : 6006.0 
 Unlabelled attachment Score 0.8508158508158508 
 Labelled attachment Score 0.7652347652347652
Total lines : 6034.0 
 Unlabelled attachment Score 0.8510109380178986 
 Labelled attachment Score 0.765661252900232
Total lines : 6055.0 
 Unlabelled attachment Score 0.8510322047894302 
 Labelled attachment Score 0.7658133773740711
Total lines : 6069.0 
 Unlabelled attachment Score 0.8512110726643599 
 Labelled attachment Score 0.7658592848904268
Total lines : 6104.0 
 Unlabelled attachment Score 0.8510812581913499 
 Labelled attachment Score 0.7660550458715596
Total lines : 6122.0 
 Unlabelled attachment Score 0.8511924207775237 
 Labelled attachment Score 0.7655994772950017
Total lines : 6134.0 
 Unlabelled attachment Score 0.850831431366

 Labelled attachment Score 0.7645192440336598
Total lines : 7275.0 
 Unlabelled attachment Score 0.8492096219931271 
 Labelled attachment Score 0.7646735395189004
Total lines : 7291.0 
 Unlabelled attachment Score 0.8494033740227678 
 Labelled attachment Score 0.7646413386366754
Total lines : 7302.0 
 Unlabelled attachment Score 0.8493563407285675 
 Labelled attachment Score 0.7647219939742537
Total lines : 7320.0 
 Unlabelled attachment Score 0.8495901639344262 
 Labelled attachment Score 0.7650273224043715
Total lines : 7344.0 
 Unlabelled attachment Score 0.8495370370370371 
 Labelled attachment Score 0.7649782135076253
Total lines : 7381.0 
 Unlabelled attachment Score 0.8490719414713453 
 Labelled attachment Score 0.7646660344126812
Total lines : 7404.0 
 Unlabelled attachment Score 0.8495407887628309 
 Labelled attachment Score 0.7649918962722853
Total lines : 7427.0 
 Unlabelled attachment Score 0.8494681567254612 
 Labelled attachment Score 0.7649118082671335
Total lines : 7461

Total lines : 8704.0 
 Unlabelled attachment Score 0.8536305147058824 
 Labelled attachment Score 0.7668887867647058
Total lines : 8728.0 
 Unlabelled attachment Score 0.8538038496791934 
 Labelled attachment Score 0.7669569202566453
Total lines : 8764.0 
 Unlabelled attachment Score 0.8534915563669557 
 Labelled attachment Score 0.7665449566408032
Total lines : 8786.0 
 Unlabelled attachment Score 0.8535169587980879 
 Labelled attachment Score 0.7661051673116321
Total lines : 8792.0 
 Unlabelled attachment Score 0.853616924476797 
 Labelled attachment Score 0.7662647861692448
Total lines : 8826.0 
 Unlabelled attachment Score 0.8535010197144799 
 Labelled attachment Score 0.7660321776569228
Total lines : 8835.0 
 Unlabelled attachment Score 0.8536502546689304 
 Labelled attachment Score 0.7661573288058857
Total lines : 8880.0 
 Unlabelled attachment Score 0.8537162162162162 
 Labelled attachment Score 0.7664414414414414
Total lines : 8902.0 
 Unlabelled attachment Score 0.853628398112

Total lines : 10084.0 
 Unlabelled attachment Score 0.8549186830622769 
 Labelled attachment Score 0.7673542245140818
Total lines : 10105.0 
 Unlabelled attachment Score 0.8549233052944087 
 Labelled attachment Score 0.7675408213755567
Total lines : 10121.0 
 Unlabelled attachment Score 0.8550538484339492 
 Labelled attachment Score 0.7677107005236636
Total lines : 10141.0 
 Unlabelled attachment Score 0.855241100483187 
 Labelled attachment Score 0.7677743812247313
Total lines : 10168.0 
 Unlabelled attachment Score 0.8554287962234461 
 Labelled attachment Score 0.7676042486231314
Total lines : 10190.0 
 Unlabelled attachment Score 0.8554465161923455 
 Labelled attachment Score 0.7676153091265947
Total lines : 10199.0 
 Unlabelled attachment Score 0.8555740758897931 
 Labelled attachment Score 0.7678203745465242
Total lines : 10215.0 
 Unlabelled attachment Score 0.8558002936857563 
 Labelled attachment Score 0.7680861478218306
Total lines : 10224.0 
 Unlabelled attachment Score 0.855

Total lines : 11447.0 
 Unlabelled attachment Score 0.8580414082292304 
 Labelled attachment Score 0.770769633965231
Total lines : 11455.0 
 Unlabelled attachment Score 0.8580532518550851 
 Labelled attachment Score 0.7708424268878219
Total lines : 11467.0 
 Unlabelled attachment Score 0.858027382924915 
 Labelled attachment Score 0.7707334089125316
Total lines : 11487.0 
 Unlabelled attachment Score 0.8580134064594759 
 Labelled attachment Score 0.7708714198659354
Total lines : 11513.0 
 Unlabelled attachment Score 0.8581603404846695 
 Labelled attachment Score 0.7707808564231738
Total lines : 11531.0 
 Unlabelled attachment Score 0.8579481397970687 
 Labelled attachment Score 0.7707050559361721
Total lines : 11541.0 
 Unlabelled attachment Score 0.8578979291222598 
 Labelled attachment Score 0.7706437916991595
Total lines : 11567.0 
 Unlabelled attachment Score 0.8580444367597475 
 Labelled attachment Score 0.7708135212241722
Total lines : 11582.0 
 Unlabelled attachment Score 0.8581

Total lines : 12819.0 
 Unlabelled attachment Score 0.8597394492550121 
 Labelled attachment Score 0.7716670567126921
Total lines : 12827.0 
 Unlabelled attachment Score 0.8596710064707258 
 Labelled attachment Score 0.7716535433070866
Total lines : 12852.0 
 Unlabelled attachment Score 0.8598661686896981 
 Labelled attachment Score 0.7719421101774043
Total lines : 12870.0 
 Unlabelled attachment Score 0.8599844599844599 
 Labelled attachment Score 0.771950271950272
Total lines : 12885.0 
 Unlabelled attachment Score 0.8599922390376407 
 Labelled attachment Score 0.7717500970120295
Total lines : 12912.0 
 Unlabelled attachment Score 0.8596654275092936 
 Labelled attachment Score 0.7713754646840149
Total lines : 12935.0 
 Unlabelled attachment Score 0.859451101662157 
 Labelled attachment Score 0.7712408194820255
Total lines : 12952.0 
 Unlabelled attachment Score 0.8595583693638048 
 Labelled attachment Score 0.7713094502779494
Total lines : 12963.0 
 Unlabelled attachment Score 0.8596

Total lines : 14193.0 
 Unlabelled attachment Score 0.8588740928626788 
 Labelled attachment Score 0.7710138800817304
Total lines : 14222.0 
 Unlabelled attachment Score 0.8589509211081423 
 Labelled attachment Score 0.7711995499929687
Total lines : 14243.0 
 Unlabelled attachment Score 0.8588780453556133 
 Labelled attachment Score 0.771185845678579
Total lines : 14256.0 
 Unlabelled attachment Score 0.859006734006734 
 Labelled attachment Score 0.7712542087542088
Total lines : 14261.0 
 Unlabelled attachment Score 0.8590561671692027 
 Labelled attachment Score 0.7713344085267513
Total lines : 14281.0 
 Unlabelled attachment Score 0.859113507457461 
 Labelled attachment Score 0.7713745536026889
Total lines : 14294.0 
 Unlabelled attachment Score 0.8589618021547503 
 Labelled attachment Score 0.7712326850426753
Total lines : 14315.0 
 Unlabelled attachment Score 0.8587495633950402 
 Labelled attachment Score 0.7710094306671323
Total lines : 14330.0 
 Unlabelled attachment Score 0.85875

Total lines : 15189.0 
 Unlabelled attachment Score 0.8587793798143393 
 Labelled attachment Score 0.7706234775166239
Total lines : 15201.0 
 Unlabelled attachment Score 0.8588908624432603 
 Labelled attachment Score 0.7708045523320834
Total lines : 15210.0 
 Unlabelled attachment Score 0.8589743589743589 
 Labelled attachment Score 0.770940170940171
Total lines : 15218.0 
 Unlabelled attachment Score 0.858982783545801 
 Labelled attachment Score 0.7709291628334867
Total lines : 15227.0 
 Unlabelled attachment Score 0.8590004597097262 
 Labelled attachment Score 0.7709988835620937
Total lines : 15236.0 
 Unlabelled attachment Score 0.8590181149908113 
 Labelled attachment Score 0.7710685219217642
Total lines : 15246.0 
 Unlabelled attachment Score 0.8591105863833136 
 Labelled attachment Score 0.7712186803095894
Total lines : 15262.0 
 Unlabelled attachment Score 0.8589961997117023 
 Labelled attachment Score 0.7710653911676059
Total lines : 15280.0 
 Unlabelled attachment Score 0.8590

Total lines : 16563.0 
 Unlabelled attachment Score 0.8589023727585582 
 Labelled attachment Score 0.7702107106200567
Total lines : 16590.0 
 Unlabelled attachment Score 0.8590114526823388 
 Labelled attachment Score 0.7704038577456299
Total lines : 16605.0 
 Unlabelled attachment Score 0.8591388136103584 
 Labelled attachment Score 0.7704305931948209
Total lines : 16634.0 
 Unlabelled attachment Score 0.8592641577491884 
 Labelled attachment Score 0.7705903570999159
Total lines : 16654.0 
 Unlabelled attachment Score 0.8594331692085986 
 Labelled attachment Score 0.7708058124174373
Total lines : 16664.0 
 Unlabelled attachment Score 0.8595175228036486 
 Labelled attachment Score 0.7709433509361497
Total lines : 16676.0 
 Unlabelled attachment Score 0.8595586471575918 
 Labelled attachment Score 0.7709882465819141
Total lines : 16689.0 
 Unlabelled attachment Score 0.8596081251123494 
 Labelled attachment Score 0.7711067169992211
Total lines : 16705.0 
 Unlabelled attachment Score 0.85

Total lines : 17845.0 
 Unlabelled attachment Score 0.8614177640795742 
 Labelled attachment Score 0.772093023255814
Total lines : 17861.0 
 Unlabelled attachment Score 0.8613179553216506 
 Labelled attachment Score 0.7720172442752365
Total lines : 17882.0 
 Unlabelled attachment Score 0.8613689743876524 
 Labelled attachment Score 0.7720053685270104
Total lines : 17902.0 
 Unlabelled attachment Score 0.8613004133616355 
 Labelled attachment Score 0.7718690649089487
Total lines : 17912.0 
 Unlabelled attachment Score 0.8613220187583742 
 Labelled attachment Score 0.7718847699866012
Total lines : 17938.0 
 Unlabelled attachment Score 0.8613557810235255 
 Labelled attachment Score 0.7718809231798417
Total lines : 17952.0 
 Unlabelled attachment Score 0.8614081996434938 
 Labelled attachment Score 0.7719474153297683
Total lines : 17975.0 
 Unlabelled attachment Score 0.861307371349096 
 Labelled attachment Score 0.7719610570236439
Total lines : 17998.0 
 Unlabelled attachment Score 0.8612

Total lines : 19302.0 
 Unlabelled attachment Score 0.861620557455186 
 Labelled attachment Score 0.7729250854833696
Total lines : 19334.0 
 Unlabelled attachment Score 0.8615909796213924 
 Labelled attachment Score 0.7730423088859005
Total lines : 19354.0 
 Unlabelled attachment Score 0.8616306706623954 
 Labelled attachment Score 0.7731218352795288
Total lines : 19367.0 
 Unlabelled attachment Score 0.8614653792533691 
 Labelled attachment Score 0.7729643207517943
Total lines : 19384.0 
 Unlabelled attachment Score 0.8614321089558399 
 Labelled attachment Score 0.7729570780024763
Total lines : 19419.0 
 Unlabelled attachment Score 0.8615273701014471 
 Labelled attachment Score 0.7730573150007725
Total lines : 19449.0 
 Unlabelled attachment Score 0.861381047868785 
 Labelled attachment Score 0.7729960409275541
Total lines : 19459.0 
 Unlabelled attachment Score 0.8614008941877794 
 Labelled attachment Score 0.7730099182897374
Total lines : 19478.0 
 Unlabelled attachment Score 0.8614

Total lines : 20392.0 
 Unlabelled attachment Score 0.8623970184386034 
 Labelled attachment Score 0.7735386426049431
Total lines : 20405.0 
 Unlabelled attachment Score 0.8624846851261946 
 Labelled attachment Score 0.7734378828718451
Total lines : 20422.0 
 Unlabelled attachment Score 0.862550190970522 
 Labelled attachment Score 0.7734795808441877
Total lines : 20439.0 
 Unlabelled attachment Score 0.8625666617740594 
 Labelled attachment Score 0.7735212094525172
Total lines : 20466.0 
 Unlabelled attachment Score 0.8625036646144826 
 Labelled attachment Score 0.773331378872276
Total lines : 20480.0 
 Unlabelled attachment Score 0.8625 
 Labelled attachment Score 0.773193359375
Total lines : 20498.0 
 Unlabelled attachment Score 0.862523172992487 
 Labelled attachment Score 0.7732461703580837
Total lines : 20533.0 
 Unlabelled attachment Score 0.862562703939999 
 Labelled attachment Score 0.7731943700384747
Total lines : 20564.0 
 Unlabelled attachment Score 0.862624003112235 
 Labe

Total lines : 21795.0 
 Unlabelled attachment Score 0.8624455150263822 
 Labelled attachment Score 0.77297545308557
Total lines : 21821.0 
 Unlabelled attachment Score 0.8625635855368682 
 Labelled attachment Score 0.773062646074882
Total lines : 21834.0 
 Unlabelled attachment Score 0.862508015022442 
 Labelled attachment Score 0.7730145644407804
Total lines : 21844.0 
 Unlabelled attachment Score 0.862570957700055 
 Labelled attachment Score 0.7729811389855338
Total lines : 21865.0 
 Unlabelled attachment Score 0.8626572147267322 
 Labelled attachment Score 0.7730619711868283
Total lines : 21873.0 
 Unlabelled attachment Score 0.8627074475380606 
 Labelled attachment Score 0.7730992547890093
Total lines : 21888.0 
 Unlabelled attachment Score 0.8628015350877193 
 Labelled attachment Score 0.7732547514619883
Total lines : 21920.0 
 Unlabelled attachment Score 0.8628649635036496 
 Labelled attachment Score 0.7733120437956205
Total lines : 21944.0 
 Unlabelled attachment Score 0.8628782

Total lines : 23003.0 
 Unlabelled attachment Score 0.8647132982654436 
 Labelled attachment Score 0.7746815632743556
Total lines : 23017.0 
 Unlabelled attachment Score 0.8645783551288179 
 Labelled attachment Score 0.7745579354390233
Total lines : 23027.0 
 Unlabelled attachment Score 0.8646371650670952 
 Labelled attachment Score 0.774568984235897
Total lines : 23041.0 
 Unlabelled attachment Score 0.8646760123258539 
 Labelled attachment Score 0.7746191571546374
Total lines : 23054.0 
 Unlabelled attachment Score 0.8647089442179231 
 Labelled attachment Score 0.7746594950984644
Total lines : 23083.0 
 Unlabelled attachment Score 0.8647922713685396 
 Labelled attachment Score 0.7747693107481697
Total lines : 23098.0 
 Unlabelled attachment Score 0.8647501948220625 
 Labelled attachment Score 0.7746558143562213
Total lines : 23141.0 
 Unlabelled attachment Score 0.8646990190570848 
 Labelled attachment Score 0.7747288362646385
Total lines : 23160.0 
 Unlabelled attachment Score 0.864

Total lines : 24381.0 
 Unlabelled attachment Score 0.8653459661211599 
 Labelled attachment Score 0.7747016119109142
Total lines : 24406.0 
 Unlabelled attachment Score 0.8651561091534868 
 Labelled attachment Score 0.774399737769401
Total lines : 24418.0 
 Unlabelled attachment Score 0.8650995167499386 
 Labelled attachment Score 0.7743467933491687
Total lines : 24432.0 
 Unlabelled attachment Score 0.8650949574328749 
 Labelled attachment Score 0.7743533071381794
Total lines : 24471.0 
 Unlabelled attachment Score 0.8650647705447264 
 Labelled attachment Score 0.7743042785337747
Total lines : 24499.0 
 Unlabelled attachment Score 0.8649740805747174 
 Labelled attachment Score 0.7742356830891057
Total lines : 24541.0 
 Unlabelled attachment Score 0.8650829224562976 
 Labelled attachment Score 0.7741738315472068
Total lines : 24557.0 
 Unlabelled attachment Score 0.8651301054689091 
 Labelled attachment Score 0.7741988027853565
Total lines : 24573.0 
 Unlabelled attachment Score 0.864

Total lines : 25741.0 
 Unlabelled attachment Score 0.8638747523406239 
 Labelled attachment Score 0.7727749504681248
Total lines : 25765.0 
 Unlabelled attachment Score 0.8638463031243936 
 Labelled attachment Score 0.7727537356879488
Total lines : 25775.0 
 Unlabelled attachment Score 0.8638991270611057 
 Labelled attachment Score 0.7728031037827352
Total lines : 25788.0 
 Unlabelled attachment Score 0.8639289592058321 
 Labelled attachment Score 0.7728400806576702
Total lines : 25801.0 
 Unlabelled attachment Score 0.8638424867253207 
 Labelled attachment Score 0.7727607457075307
Total lines : 25812.0 
 Unlabelled attachment Score 0.8638617697195103 
 Labelled attachment Score 0.7727801022780102
Total lines : 25832.0 
 Unlabelled attachment Score 0.8639284608237845 
 Labelled attachment Score 0.7728398885103748
Total lines : 25846.0 
 Unlabelled attachment Score 0.863924785266579 
 Labelled attachment Score 0.7728468621837035
Total lines : 25866.0 
 Unlabelled attachment Score 0.863

In [15]:
import pickle

filename = 'dependency_parsing_model.sav'
pickle.dump(model3, open(filename, 'wb'))


In [16]:
import pickle

loaded_model = pickle.load(open(filename, 'rb'))

In [17]:
# %%time

# model2 = dp.train(x_train,y_train,'bnb')


# %%time

# accuracy2 = dp.evaluate(test_treebank,model)


In [18]:
    
#         print(f'{(word_st_top,word_st_2ndtop,word_st_3rdtop)}')
#         print(f'{(POS_st_top,POS_st_2ndtop,POS_st_3rdtop)}')
#         print(f'{( word_buf_top,word_buf_2ndtop,word_buf_3rdtop)}')
#         print(f'{(POS_buf_top,POS_buf_2ndtop,POS_buf_3rdtop)}')
#         print(f'{(bufleft_word_val,bufleft_word_child1,bufleft_word_child2)}')
#         print(f'{(bufleft_tag_val,bufleft_tag_child1,bufleft_tag_child2)}')
#         print(f'{( bufright_word_val,bufright_word_child1,bufright_word_child2)}')
#         print(f'{(bufright_tag_val,bufright_tag_child1,bufright_tag_child2)}')
#         print(f'{(stleft_word_val,stleft_word_child1,stleft_word_child2)}')
#         print(f'{(stleft_tag_val,stleft_tag_child1,stleft_tag_child2)}')
#         print(f'{( stright_word_val,stright_word_child1,stright_word_child2)}')
#         print(f'{(stright_tag_val,stright_tag_child1,stright_tag_child2)}')
#         print('-'*50)

