In [1]:
import os
import sys,re
import numpy as np
from scipy import sparse
from collections import Counter,defaultdict
from tqdm.notebook import tqdm

In [2]:
# import lightgbm as lgb
# from lightgbm import LGBMClassifier 


from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import PassiveAggressiveClassifier


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB,BernoulliNB


from sklearn.model_selection import GridSearchCV
from sklearn import model_selection

In [3]:
train_file_conllu  = r'E:\iiit\Sem4\NLP\dependency parsing\TreeBanks\universal_hindi\hi_hdtb-ud-train.conllu'
train_file_sentences  = r'E:\iiit\Sem4\NLP\dependency parsing\TreeBanks\universal_hindi\hi_hdtb-ud-train.txt'


val_file_conllu  = r'E:\iiit\Sem4\NLP\dependency parsing\TreeBanks\universal_hindi\hi_hdtb-ud-dev.conllu'
val_file_sentences  = r'E:\iiit\Sem4\NLP\dependency parsing\TreeBanks\universal_hindi\hi_hdtb-ud-dev.txt'


test_file_conllu  = r'E:\iiit\Sem4\NLP\dependency parsing\TreeBanks\universal_hindi\hi_hdtb-ud-test.conllu'
test_file_sentences  = r'E:\iiit\Sem4\NLP\dependency parsing\TreeBanks\universal_hindi\hi_hdtb-ud-test.txt'


In [4]:
def read_file(file_name):
    f = open(file_name,encoding = 'utf-8')
    lines = f.readlines()
    for i in range(len(lines)):
        lines[i] = lines[i].rstrip().split('\t')
    f.close()
    return lines

Everything based on the lectures from https://nptel.ac.in/courses/106/105/106105158/

<img src = 'conllu_format.jpg'>

In [5]:
# we'll use the fields id:0, form:1, xpos:4 , head:6, arc_label/dep_rel:7

<img src = 'sample_conllu.jpg' width ="2000" height = "1000" >

In [14]:
class DependencyParser:
    def __init__(self,train_treebank,test_treebank=None):
        self.features = list()
        self.labels = list()
        self.train_treebank = train_treebank
        self.test_treebank = test_treebank
        
        self.feature_vocabulary = defaultdict()
        self.label_vocabulary = defaultdict()
        
        self.reverse_features = list()
        self.reverse_labels = list()
        
        self.classifier = None
        self.max_features = 0
    
    def check_if_tree_is_connected(self,begin,dependent_head,token_heads):     
        current = begin
        while current is not 0:
            current = token_heads[current]
            if current == dependent_head:
                return True
        
        return False
    
    def check_projectivity(self,line_tokens):
        token_heads = defaultdict()
        
        for fields in line_tokens:
            token_id = fields[0]
            token_head = fields[3]
            
            if token_heads.get(token_id) == None: # not seen before
                token_heads[token_id] = token_head
        
        for dependent in token_heads:
            
            dependent_head = token_heads[dependent]
            left = 1 + min(dependent_head,dependent)
            right = max(dependent_head,dependent)
            
            for i in range(left,right):
                
                if self.check_if_tree_is_connected(i,dependent_head,token_heads) == False:
                    return False
        
        return True
    
    def check_condition(self,buffer,stack,dependencies = {},condition=None):
        
        if(condition == '0'):
            return len(buffer) is 0 and len(stack) is 1 and stack[0] == 0
        
        elif(condition == '1'):
            #condition for shift
            return len(stack) < 2 and len(buffer) > 0 #stack has only root and items are still in buffer
    
        elif(condition == '2'): #condition for left arc
            stack_top = stack[-1]
            stack_2nd = stack[-2]
            
            return dependencies.get(stack_top) is not None and (stack_top,stack_2nd) in dependencies[stack_top]
        
        elif(condition == '3'): #condition for right arc
            stack_top = stack[-1]
            stack_2nd = stack[-2]
            
            return dependencies.get(stack_2nd) is not None and (stack_2nd,stack_top) in dependencies[stack_2nd]
        

                
    def tree_to_actions(self,buffer,stack,arcs,dependencies):
        def perform_shift(buffer,stack,arcs,configuration_states,gold_transitions):
                configuration_states.append((list(buffer), list(stack), list(arcs)))
    
                gold_transitions.append("shift")
    
                stack.append(buffer.pop())
        
        def perform_arc(arc_label,buffer,stack,arcs,configuration_states,gold_transitions,arc_type):
            configuration_states.append((list(buffer), list(stack), list(arcs)))
            
            action = f'{arc_type}_{arc_label}'
            
            gold_transitions.append(action)
            
            
            left = stack[-1]
            right = stack[-2]
            
            if arc_type == 'left':
                arcs.append((arc_label,left,right))
                stack.pop(-2)
                
            elif arc_type == 'right':
                arcs.append((arc_label,right,left))
                stack.pop()
        
        configuration_states = list() #[(buf,stk,arc)...]
        gold_transitions = list() #[shift or left_rel or right_rel] actions for each state
        
        seen_dependencies = defaultdict()
        
        while( len(buffer) >= 0 ):
            if self.check_condition(buffer,stack,condition = '0'):
                return configuration_states , gold_transitions
            
            if self.check_condition(buffer,stack,condition = '1'):
                perform_shift(buffer,stack,arcs,configuration_states,gold_transitions)
                continue
                
            stack_top = stack[-1]
            stack_2nd = stack[-2]
            
            if self.check_condition(buffer,stack,dependencies,condition = '2'):
                arc_label = dependencies[stack_top][(stack_top,stack_2nd)]
                
                perform_arc(arc_label, buffer,stack,arcs,
                           configuration_states,gold_transitions,
                           arc_type = 'left')
                
                seen_dependencies[stack_2nd] = 1
                
            elif self.check_condition(buffer,stack,dependencies,condition = '3'):
                if( dependencies.get(stack_top) is not None and any([child not in seen_dependencies for _,child in dependencies[stack_top]]) ):
                    
                    perform_shift(buffer,stack,arcs,configuration_states,gold_transitions)
                    
                else:                 
                    arc_label = dependencies[stack_2nd][(stack_2nd,stack_top)]
                
                    perform_arc(arc_label, buffer,stack,arcs,
                           configuration_states,gold_transitions,
                           arc_type = 'right')
                
                    seen_dependencies[stack_top] = 1

            else:
                perform_shift(buffer,stack,arcs,configuration_states,gold_transitions)
        
    def extract_configs_transitions(self,line_tokens):
        
        stack,buffer,arcs = list(),list(),list()
        dependencies = defaultdict(lambda: defaultdict())
        
        stack.append(0) #add root
        
        for tokens in reversed(line_tokens):
            tok_id,word,x_pos,head,dependecy_relation = tokens
            
            head_tok_id = (head,tok_id)
                
            dependencies[head][head_tok_id] = dependecy_relation
            
            buffer.append(tok_id)
            
        return self.tree_to_actions(buffer,stack,arcs,dependencies)
    
    
    def get_features_labels(self,current_configuration,current_transtion,tokens_dict,POS_tags):
        def get_stack_features(idx,stack,tokens_dict,POS_tags):
            if idx == 'top':
                return tokens_dict[stack[-1]],POS_tags[stack[-1]]
            elif idx == '2nd_top':
                return tokens_dict[stack[-2]],POS_tags[stack[-2]]
        
        def get_buffer_features(buffer,tokens_dict,POS_tags):
                return tokens_dict[buffer[-1]],POS_tags[buffer[-1]]
            
        def get_two_word_features(tokens_dict,stack,POS_tags):
            return tokens_dict[stack[-1]],POS_tags[stack[-1]],POS_tags[stack[-2]]
        
        def insert_to_features(features,feat_i,feat_j,feat_k):
            features[feat_i] = 1
            features[feat_j] = 1
            features[feat_k] = 1
            
        label = current_transtion
        buffer,stack,arcs = current_configuration       
        features = defaultdict()
        
        buffer_len = f'buffer_length_{len(buffer)}'
        stack_len = f'stack_length_{len(stack)}'
        
        if len(stack) > 0:
            top_token,top_pos = get_stack_features('top',stack,tokens_dict,POS_tags)
        
            top_feature_1 = f'stack_top_{top_token}'
            top_feature_2 = f'stack_pos_{top_pos}'
            top_feature_3 = f'stack_top_pos_{top_token}_{top_pos}'
            
            insert_to_features(features,top_feature_1,top_feature_2,top_feature_3)
        
        if len(stack) > 1:
            top2nd_token,top2nd_pos = get_stack_features('2nd_top',stack,tokens_dict,POS_tags)
        
            top2nd_feature_4 = f'stack2nd_top_{top2nd_token}'
            top2nd_feature_5 = f'stack2nd_pos_{top2nd_pos}'
            top2nd_feature_6 = f'stack2nd_top_pos_{top2nd_token}_{top2nd_pos}'
            
            insert_to_features(features,top2nd_feature_4,top2nd_feature_5,top2nd_feature_6)
            
        if len(buffer) > 0:
            buf_token,buf_pos = get_buffer_features(buffer,tokens_dict,POS_tags)
        
            buf_feature_7 = f'buffer_top_{buf_token}'
            buf_feature_8 = f'buffer_pos_{buf_pos}'
            buf_feature_9 = f'buffer_top_pos_{buf_token}_{buf_pos}'
            
            insert_to_features(features,buf_feature_7,buf_feature_8,buf_feature_9)
            
            
        if len(stack) > 1 : 
            stack_top , pos_top , pos_2nd_top = get_two_word_features(tokens_dict,stack,POS_tags)
            
            two_word_feature_10 = f'stack_top_pos_top_pos_2ndtop_{stack_top}_{pos_top}_{pos_2nd_top}'
            features[two_word_feature_10] = 1
            
        #print(f'features {features} \n\n labels {label}')
        
        return features,label
    
    def create_reverse_mappings(self,label_id,feature_id):
        self.reverse_labels = [0] * label_id # empty array of number of labels
        self.reverse_features = [0] * feature_id # empty array of number of features
        
        for feature in tqdm(self.feature_vocabulary):
            self.reverse_features[self.feature_vocabulary[feature]] = feature
        for label in tqdm(self.label_vocabulary):
            self.reverse_labels[self.label_vocabulary[label]] = label

        
    
    def convert_to_training_data(self,):
        
        label_id = 0
        feature_id = 0 
        
        feature_counts = defaultdict(int)
        total_features = len(self.features)
        
        print('Building Training Data...')
        
        for i in tqdm(range(total_features)):
            for feature in self.features[i]:
                feature_counts[feature] += 1
                
            if self.label_vocabulary.get(self.labels[i]) is None:
                self.label_vocabulary[self.labels[i]] = label_id
                label_id += 1
                
        for feature in tqdm(feature_counts):
            if feature_counts[ feature ] > 5 and self.feature_vocabulary.get(feature) is None:
                self.feature_vocabulary[ feature ] = feature_id
                feature_id += 1
        
        print('building cache')
        self.create_reverse_mappings(label_id,feature_id)
        
        self.max_features = feature_id
        x_train = sparse.lil_matrix((total_features,feature_id))
        y_train = list()
        
        for i in tqdm(range(total_features)):
            for feature in self.features[i]:
                
                if self.feature_vocabulary.get(feature) is not None:
                    f_count = self.feature_vocabulary[feature]
                    x_train[i,f_count] = 1
                
            y_train.append(self.label_vocabulary[self.labels[i]])
        
        return x_train,y_train
            

    def extract_line_tokens(self,line):
        line_id,word,stem_word,u_pos,x_pos = line[0],line[1],line[2],line[3],line[4]
        feat,head,dep_rel,dep,misc = line[5],line[6],line[7],line[8],line[9]
    
        return int(line_id),word,x_pos,int(head),dep_rel

    def build_oracles(self,):
        
        current_tokens = []
        tokens_dict = {}
        POS_tags = {}
        
        tokens_dict[0]= 'root'
        POS_tags[0]= 'root'
        
        for current_line in self.train_treebank:
    
            if len(current_line) < 2 : # useful fields
                
                if len(current_tokens) > 0 : #have found some tokens before
                    
                        
                    if self.check_projectivity(current_tokens) == True: #if current tree is projective
                        
                        configuration_state,gold_transitions = self.extract_configs_transitions(current_tokens)
                        
                        
                        for i in range(len(configuration_state)):
                            print(f'{configuration_state[i]}------{gold_transitions[i]}')
                            features , labels = self.get_features_labels(configuration_state[i],gold_transitions[i],
                                                                    tokens_dict,POS_tags)

                            self.features.append(features)
                            self.labels.append(labels)
                        
                    
                    current_tokens = []
                    tokens_dict = {}
                    POS_tags = {}
                    
                    tokens_dict[0]= 'root'
                    POS_tags[0]= 'root'
                    
                continue
                
            if current_line[0].startswith('#'):
                continue
            
            tok_id,word,x_pos,head_id,dep_rel = self.extract_line_tokens(current_line)
            current_tokens.append((tok_id,word,x_pos,head_id,dep_rel))
            tokens_dict[tok_id] = word
            POS_tags[tok_id] = x_pos 
            
        
#         for i in range(len(self.features)):
#             print(self.features[i])
#             print('=====')
#             print(self.labels[i])
       
        X_train,y_train = self.convert_to_training_data()
        
        print('finished')
        return X_train,y_train,self.features
    
    def get_model(self,model_name):
        
        model = None
        if model_name == 'lr_l2':
            model = LogisticRegression(verbose = 2,n_jobs = 40)
        
        elif model_name == 'lr_el_saga':
            model = LogisticRegression(penalty = 'elasticnet',verbose = 2,n_jobs = 4,solver = 'saga',l1_ratio = 0.25)

        elif model_name == 'svm_rbf':
            model = SVC(kernel = 'rbf', C=1.6)
        
        elif model_name == 'svm_poly':
            model = SVC(kernel = 'poly', C=1.5)
        
        elif model_name == 'sgd_l2':
            model = SGDClassifier(shuffle = False)
        
        elif model_name == 'sgd_el':
            model = SGDClassifier(penalty = 'elasticnet', shuffle=False)
        
        elif model_name == 'rf_gini':
            model = RandomForestClassifier(n_estimators = 552, criterion = 'gini', n_jobs = 4 )
        
        elif model_name == 'rf_ent':
            model = RandomForestClassifier(n_estimators = 552, criterion = 'entropy', n_jobs = 4 )
            
        elif model_name == 'pac':
            model = PassiveAggressiveClassifier(n_jobs = 4,vebose = 2)
        
        elif model_name == 'gnb':
            model = GaussianNB()
        
        elif model_name == 'bnb':
            model = BernoulliNB()
        
        elif model_name == 'lgb':
            model = LGBMClassifier()
            
        return model   

    def train(self,x_train,y_train,model_name = None):
        
        self.classifier = model_name
        
        model = self.get_model(model_name)
        
        model.fit(x_train,y_train)
        
        return model
    
    def action_to_tree(self,test_dependency_tree,prediction_probabs,test_buffer,test_stack,test_arcs):
        
        def is_action_valid(test_stack,test_buffer,action_taken):
            if action_taken == "shift" and len(test_buffer) > 0 :
                return True
            
            elif action_taken.startswith('left') and len(test_stack) > 1 and test_stack[-1] != 0: #0 is root
                return True 
            
            elif action_taken.startswith('right') and len(test_stack) > 1 and test_stack[-2] != 0: #0 is root
                return True 
            
            else:
                return False
            
            
        for label_id_prob in np.argsort(prediction_probabs, kind = 'quicksort')[0]:
            
            action_taken = self.reverse_labels[label_id_prob]
            
            if is_action_valid(test_stack,test_buffer,action_taken):
                
                if action_taken == "shift":
                    test_stack.append(test_buffer.pop())
                
                elif action_taken.startswith('left'):
                    arc_label = action_taken.split('_')[1]
                    test_dependency_tree[test_stack[-2]] = (test_stack[-1], arc_label)
                    test_stack.pop(-2)
                    
                elif action_taken.startswith('right'):
                    arc_label = action_taken.split('_')[1]
                    test_dependency_tree[test_stack[-1]] = (test_stack[-2], arc_label)
                    test_stack.pop()
                return
    
    def evaluate_oracles_and_predictions(self,test_tokens,model):
        
        test_tokens_dict = defaultdict()
        test_POS_tags = defaultdict()
        
        test_tokens_dict[0] = 'root'
        test_POS_tags[0] = 'root'
        
        test_heads = defaultdict()
        test_labels = defaultdict()
        
        test_buffer = list()
        
        test_stack = list()
        test_stack.append(0)
        
        test_arcs = list()
        
        
        
        for test_token in reversed(test_tokens):
            
            
            
            token_id,token_word,token_tag,token_head,token_label = test_token
            
            test_tokens_dict[token_id] = token_word
            test_POS_tags[token_id] = token_tag
            
            test_heads[token_id] = token_head
            test_labels[token_id] = token_label
            
            test_buffer.append(token_id)
            
        test_dependency_tree = defaultdict()
        
        while len(test_buffer) > 0:
            
            if len(test_buffer) == 0  and len(test_stack)==0 : #nothing available to perform an action
                break
                
            if len(test_buffer) == 0  and len(test_stack)==1 and test_stack[0] == 0 : #nothing but root
                break
            
            current_config = (test_buffer,test_stack,test_arcs)
            
            test_features, _ = self.get_features_labels(current_config,None,test_tokens_dict,test_POS_tags)
            
            x_test = sparse.lil_matrix((1,self.max_features))
            
            for feature in test_features:
                if self.feature_vocabulary.get(feature) is not None:
                    x_test[0,self.feature_vocabulary[feature]] = 1
                    
            prediction_probabs = -1 * model.predict_proba(x_test)
            
            self.action_to_tree(test_dependency_tree,prediction_probabs,test_buffer,test_stack,test_arcs)
            
        unlabelled_attachment_score = 0
        labelled_attachment_score = 0
            
        total_children = 0
        
            
        for child in test_dependency_tree:
                
            token_head,arc_label = test_dependency_tree[child]
            
            print('----')
            print(child)
            print(test_heads[child])
            print('----')
            
            
            if token_head == test_heads[child]:
                unlabelled_attachment_score += 1
                    
                if arc_label == test_labels[child]:
                    labelled_attachment_score += 1
                
            total_children += 1
                
        return [unlabelled_attachment_score,labelled_attachment_score,total_children]
        
    
    def evaluate(self,test_treebank,model):
        
        if self.test_treebank is None:
            self.test_treebank = test_treebank
        
        test_tokens = [] #tokens in current line
        current_all_scores = []
        total_scores = np.zeros(3)
        
        i=0
        
        for current_line in tqdm(self.test_treebank):
            i+=1
            if len(current_line) < 2 :
                
                if len(test_tokens) > 0: # has seen tokens
                    #print(test_tokens)
                    if self.check_projectivity(test_tokens) == True:

                            current_all_scores = self.evaluate_oracles_and_predictions(test_tokens,model)
                            total_scores += current_all_scores
                            
                            print(f'Total lines : {total_scores[2]} \n Unlabelled attachment Score {total_scores[0]/total_scores[2]} \n Labelled attachment Score {total_scores[1]/total_scores[2]}')
                            
                            
                    test_tokens = []
                continue
                
            if current_line[0].startswith('#'):
                continue
            
            tok_id,word,x_pos,head_id,dep_rel = self.extract_line_tokens(current_line)
            test_tokens.append((tok_id,word,x_pos,head_id,dep_rel))
        
        accuracy = f'Total lines : {total_scores[2]} \n Unlabelled attachment Score {total_scores[0]/total_scores[2]} \n Unlabelled attachment Score {total_scores[1]/total_scores[2]}'
        
        return accuracy

In [7]:
train_treebank = read_file(train_file_conllu)
test_treebank = read_file(val_file_conllu)

# GET ORACLES AND TRAINING DATA

In [15]:
#

dp = DependencyParser(train_treebank)

x_train,y_train,f = dp.build_oracles()


([11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1], [0], [])------shift
([11, 10, 9, 8, 7, 6, 5, 4, 3, 2], [0, 1], [])------shift
([11, 10, 9, 8, 7, 6, 5, 4, 3], [0, 1, 2], [])------left_det
([11, 10, 9, 8, 7, 6, 5, 4, 3], [0, 2], [('det', 2, 1)])------shift
([11, 10, 9, 8, 7, 6, 5, 4], [0, 2, 3], [('det', 2, 1)])------right_case
([11, 10, 9, 8, 7, 6, 5, 4], [0, 2], [('det', 2, 1), ('case', 2, 3)])------shift
([11, 10, 9, 8, 7, 6, 5], [0, 2, 4], [('det', 2, 1), ('case', 2, 3)])------shift
([11, 10, 9, 8, 7, 6], [0, 2, 4, 5], [('det', 2, 1), ('case', 2, 3)])------left_advmod
([11, 10, 9, 8, 7, 6], [0, 2, 5], [('det', 2, 1), ('case', 2, 3), ('advmod', 5, 4)])------shift
([11, 10, 9, 8, 7], [0, 2, 5, 6], [('det', 2, 1), ('case', 2, 3), ('advmod', 5, 4)])------left_amod
([11, 10, 9, 8, 7], [0, 2, 6], [('det', 2, 1), ('case', 2, 3), ('advmod', 5, 4), ('amod', 6, 5)])------left_nmod
([11, 10, 9, 8, 7], [0, 6], [('det', 2, 1), ('case', 2, 3), ('advmod', 5, 4), ('amod', 6, 5), ('nmod', 6, 2)])------shift
([

HBox(children=(FloatProgress(value=0.0, max=96.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=316.0), HTML(value='')))


building cache


HBox(children=(FloatProgress(value=0.0, max=34.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=96.0), HTML(value='')))


finished


In [16]:
x_train

<460986x50532 sparse matrix of type '<class 'numpy.float64'>'
	with 4252205 stored elements in LInked List format>

# MODEL EVALUATION

# Bernoulli NB

In [11]:
%%time

model2 = dp.train(x_train,y_train,'bnb')


Wall time: 14 ms


In [1]:

# %%time

# accuracy2 = dp.evaluate(test_treebank,model2)


# LOGISTIC REGRESSION WITH L2 REG

In [43]:
%%time

model3 = dp.train(x_train,y_train,'lr_l2')


[Parallel(n_jobs=40)]: Using backend LokyBackend with 40 concurrent workers.


Wall time: 3min 3s


[Parallel(n_jobs=40)]: Done   1 out of   1 | elapsed:  3.0min finished


In [15]:

%%time

accuracy3 = dp.evaluate(test_treebank,model3)


HBox(children=(FloatProgress(value=0.0, max=40194.0), HTML(value='')))

Total lines : 22.0 
 Unlabelled attachment Score 0.8181818181818182 
 Labelled attachment Score 0.7727272727272727
Total lines : 35.0 
 Unlabelled attachment Score 0.7714285714285715 
 Labelled attachment Score 0.7142857142857143
Total lines : 52.0 
 Unlabelled attachment Score 0.7884615384615384 
 Labelled attachment Score 0.75
Total lines : 77.0 
 Unlabelled attachment Score 0.7662337662337663 
 Labelled attachment Score 0.7012987012987013
Total lines : 87.0 
 Unlabelled attachment Score 0.7816091954022989 
 Labelled attachment Score 0.7126436781609196
Total lines : 87.0 
 Unlabelled attachment Score 0.7816091954022989 
 Labelled attachment Score 0.7126436781609196
Total lines : 99.0 
 Unlabelled attachment Score 0.7878787878787878 
 Labelled attachment Score 0.7272727272727273
Total lines : 112.0 
 Unlabelled attachment Score 0.8125 
 Labelled attachment Score 0.7410714285714286
Total lines : 118.0 
 Unlabelled attachment Score 0.8050847457627118 
 Labelled attachment Score 0.728813

Total lines : 865.0 
 Unlabelled attachment Score 0.8080924855491329 
 Labelled attachment Score 0.7260115606936416
Total lines : 878.0 
 Unlabelled attachment Score 0.8097949886104784 
 Labelled attachment Score 0.7266514806378133
Total lines : 886.0 
 Unlabelled attachment Score 0.8115124153498872 
 Labelled attachment Score 0.7279909706546276
Total lines : 893.0 
 Unlabelled attachment Score 0.8096304591265397 
 Labelled attachment Score 0.7256438969764838
Total lines : 904.0 
 Unlabelled attachment Score 0.8108407079646017 
 Labelled attachment Score 0.7278761061946902
Total lines : 914.0 
 Unlabelled attachment Score 0.812910284463895 
 Labelled attachment Score 0.7308533916849015
Total lines : 928.0 
 Unlabelled attachment Score 0.8135775862068966 
 Labelled attachment Score 0.7295258620689655
Total lines : 944.0 
 Unlabelled attachment Score 0.8114406779661016 
 Labelled attachment Score 0.7266949152542372
Total lines : 948.0 
 Unlabelled attachment Score 0.8122362869198312 
 La

Total lines : 1973.0 
 Unlabelled attachment Score 0.8226051697921947 
 Labelled attachment Score 0.7364419665484034
Total lines : 2002.0 
 Unlabelled attachment Score 0.8221778221778222 
 Labelled attachment Score 0.7367632367632367
Total lines : 2009.0 
 Unlabelled attachment Score 0.8227974116475859 
 Labelled attachment Score 0.7371826779492284
Total lines : 2034.0 
 Unlabelled attachment Score 0.8230088495575221 
 Labelled attachment Score 0.7379547689282202
Total lines : 2047.0 
 Unlabelled attachment Score 0.8236443575964827 
 Labelled attachment Score 0.7386419149975574
Total lines : 2067.0 
 Unlabelled attachment Score 0.8243831640058055 
 Labelled attachment Score 0.7397194000967586
Total lines : 2077.0 
 Unlabelled attachment Score 0.8228213769860375 
 Labelled attachment Score 0.7385652383245065
Total lines : 2089.0 
 Unlabelled attachment Score 0.8238391574916227 
 Labelled attachment Score 0.7386309238870273
Total lines : 2101.0 
 Unlabelled attachment Score 0.82436934792

Total lines : 3296.0 
 Unlabelled attachment Score 0.8200849514563107 
 Labelled attachment Score 0.7330097087378641
Total lines : 3308.0 
 Unlabelled attachment Score 0.8204353083434099 
 Labelled attachment Score 0.7330713422007256
Total lines : 3317.0 
 Unlabelled attachment Score 0.8206210431112451 
 Labelled attachment Score 0.7334941211938498
Total lines : 3354.0 
 Unlabelled attachment Score 0.8205128205128205 
 Labelled attachment Score 0.7316636851520573
Total lines : 3368.0 
 Unlabelled attachment Score 0.8206650831353919 
 Labelled attachment Score 0.7312945368171021
Total lines : 3383.0 
 Unlabelled attachment Score 0.8205734555128584 
 Labelled attachment Score 0.7313035767070647
Total lines : 3392.0 
 Unlabelled attachment Score 0.8198702830188679 
 Labelled attachment Score 0.730247641509434
Total lines : 3401.0 
 Unlabelled attachment Score 0.8200529256101147 
 Labelled attachment Score 0.7303734195824757
Total lines : 3415.0 
 Unlabelled attachment Score 0.820790629575

Total lines : 4635.0 
 Unlabelled attachment Score 0.8284789644012945 
 Labelled attachment Score 0.7411003236245954
Total lines : 4653.0 
 Unlabelled attachment Score 0.8284977433913604 
 Labelled attachment Score 0.7408123791102514
Total lines : 4662.0 
 Unlabelled attachment Score 0.8286143286143286 
 Labelled attachment Score 0.7408837408837409
Total lines : 4689.0 
 Unlabelled attachment Score 0.8287481339304755 
 Labelled attachment Score 0.7410961825549157
Total lines : 4698.0 
 Unlabelled attachment Score 0.8288633461047255 
 Labelled attachment Score 0.7413793103448276
Total lines : 4721.0 
 Unlabelled attachment Score 0.8280025418343572 
 Labelled attachment Score 0.7407328955729718
Total lines : 4727.0 
 Unlabelled attachment Score 0.8282208588957055 
 Labelled attachment Score 0.7404273323460969
Total lines : 4758.0 
 Unlabelled attachment Score 0.8280790248003362 
 Labelled attachment Score 0.7406473308112652
Total lines : 4777.0 
 Unlabelled attachment Score 0.82834414904

Total lines : 5989.0 
 Unlabelled attachment Score 0.8253464685256303 
 Labelled attachment Score 0.7381866755718818
Total lines : 6019.0 
 Unlabelled attachment Score 0.825220136235255 
 Labelled attachment Score 0.7376640637979731
Total lines : 6047.0 
 Unlabelled attachment Score 0.8252025797916323 
 Labelled attachment Score 0.7380519265751613
Total lines : 6068.0 
 Unlabelled attachment Score 0.8253131179960448 
 Labelled attachment Score 0.7382992748846408
Total lines : 6082.0 
 Unlabelled attachment Score 0.825386386057218 
 Labelled attachment Score 0.7382439986846432
Total lines : 6117.0 
 Unlabelled attachment Score 0.8252411312735001 
 Labelled attachment Score 0.7384338728134706
Total lines : 6135.0 
 Unlabelled attachment Score 0.8251018744906276 
 Labelled attachment Score 0.7380603096984515
Total lines : 6147.0 
 Unlabelled attachment Score 0.8243045387994143 
 Labelled attachment Score 0.7374328940946804
Total lines : 6173.0 
 Unlabelled attachment Score 0.8242345699011

Total lines : 7288.0 
 Unlabelled attachment Score 0.8229967069154775 
 Labelled attachment Score 0.7366904500548848
Total lines : 7304.0 
 Unlabelled attachment Score 0.8232475355969332 
 Labelled attachment Score 0.7367196056955093
Total lines : 7315.0 
 Unlabelled attachment Score 0.8228298017771702 
 Labelled attachment Score 0.736431989063568
Total lines : 7333.0 
 Unlabelled attachment Score 0.8229919541797355 
 Labelled attachment Score 0.7365334787944906
Total lines : 7357.0 
 Unlabelled attachment Score 0.8232975397580535 
 Labelled attachment Score 0.7367133342394998
Total lines : 7394.0 
 Unlabelled attachment Score 0.8226940762780633 
 Labelled attachment Score 0.7362726535028401
Total lines : 7417.0 
 Unlabelled attachment Score 0.8231090737494944 
 Labelled attachment Score 0.7366859916408252
Total lines : 7440.0 
 Unlabelled attachment Score 0.8233870967741935 
 Labelled attachment Score 0.7368279569892473
Total lines : 7474.0 
 Unlabelled attachment Score 0.823789135670

Total lines : 8741.0 
 Unlabelled attachment Score 0.8297677611257294 
 Labelled attachment Score 0.7413339434847271
Total lines : 8777.0 
 Unlabelled attachment Score 0.8294405833428279 
 Labelled attachment Score 0.7410276859974935
Total lines : 8799.0 
 Unlabelled attachment Score 0.8295260825093761 
 Labelled attachment Score 0.7408796454142517
Total lines : 8805.0 
 Unlabelled attachment Score 0.8296422487223168 
 Labelled attachment Score 0.7409426462237365
Total lines : 8839.0 
 Unlabelled attachment Score 0.8297318701210544 
 Labelled attachment Score 0.741147188595995
Total lines : 8848.0 
 Unlabelled attachment Score 0.8299050632911392 
 Labelled attachment Score 0.7411844484629295
Total lines : 8893.0 
 Unlabelled attachment Score 0.8297537388957608 
 Labelled attachment Score 0.7412571685595412
Total lines : 8915.0 
 Unlabelled attachment Score 0.8297251822770612 
 Labelled attachment Score 0.7411104879416713
Total lines : 8929.0 
 Unlabelled attachment Score 0.829544181879

Total lines : 10118.0 
 Unlabelled attachment Score 0.8312907689266653 
 Labelled attachment Score 0.7427357185214469
Total lines : 10134.0 
 Unlabelled attachment Score 0.8312611012433393 
 Labelled attachment Score 0.7428458654035919
Total lines : 10154.0 
 Unlabelled attachment Score 0.8312980106362025 
 Labelled attachment Score 0.7428599566673232
Total lines : 10181.0 
 Unlabelled attachment Score 0.8315489637560161 
 Labelled attachment Score 0.7427561143306158
Total lines : 10203.0 
 Unlabelled attachment Score 0.8315201411349603 
 Labelled attachment Score 0.7429187493874351
Total lines : 10212.0 
 Unlabelled attachment Score 0.831668625146886 
 Labelled attachment Score 0.7431453192322758
Total lines : 10228.0 
 Unlabelled attachment Score 0.831834180680485 
 Labelled attachment Score 0.743351583887368
Total lines : 10237.0 
 Unlabelled attachment Score 0.8318843411155612 
 Labelled attachment Score 0.7433818501514116
Total lines : 10248.0 
 Unlabelled attachment Score 0.83206

Total lines : 11480.0 
 Unlabelled attachment Score 0.8349303135888502 
 Labelled attachment Score 0.747822299651568
Total lines : 11500.0 
 Unlabelled attachment Score 0.8348695652173913 
 Labelled attachment Score 0.7479130434782608
Total lines : 11528.0 
 Unlabelled attachment Score 0.8351839000693962 
 Labelled attachment Score 0.7479181124219292
Total lines : 11546.0 
 Unlabelled attachment Score 0.8350944049887407 
 Labelled attachment Score 0.7479646630867833
Total lines : 11556.0 
 Unlabelled attachment Score 0.8348909657320872 
 Labelled attachment Score 0.7477500865351333
Total lines : 11582.0 
 Unlabelled attachment Score 0.835002590226213 
 Labelled attachment Score 0.7479709894664134
Total lines : 11597.0 
 Unlabelled attachment Score 0.8351297749417953 
 Labelled attachment Score 0.7479520565663533
Total lines : 11627.0 
 Unlabelled attachment Score 0.8350391330523781 
 Labelled attachment Score 0.7479143373183108
Total lines : 11637.0 
 Unlabelled attachment Score 0.8351

Total lines : 14262.0 
 Unlabelled attachment Score 0.8351563595568644 
 Labelled attachment Score 0.7477913336137989
Total lines : 14275.0 
 Unlabelled attachment Score 0.8353064798598949 
 Labelled attachment Score 0.7478809106830122
Total lines : 14280.0 
 Unlabelled attachment Score 0.8352941176470589 
 Labelled attachment Score 0.7478991596638656
Total lines : 14300.0 
 Unlabelled attachment Score 0.8353146853146853 
 Labelled attachment Score 0.7479020979020979
Total lines : 14313.0 
 Unlabelled attachment Score 0.8351149304827779 
 Labelled attachment Score 0.7477118703276742
Total lines : 14334.0 
 Unlabelled attachment Score 0.8350774382586856 
 Labelled attachment Score 0.747662899400028
Total lines : 14349.0 
 Unlabelled attachment Score 0.8351104606592794 
 Labelled attachment Score 0.7476479197156596
Total lines : 14357.0 
 Unlabelled attachment Score 0.8352023403217943 
 Labelled attachment Score 0.7476492303406004
Total lines : 14366.0 
 Unlabelled attachment Score 0.835

Total lines : 15234.0 
 Unlabelled attachment Score 0.8353026125771301 
 Labelled attachment Score 0.7474071156623342
Total lines : 15243.0 
 Unlabelled attachment Score 0.8352686479039559 
 Labelled attachment Score 0.7474250475628157
Total lines : 15252.0 
 Unlabelled attachment Score 0.8353002884867559 
 Labelled attachment Score 0.7474429583005507
Total lines : 15262.0 
 Unlabelled attachment Score 0.8354082033809461 
 Labelled attachment Score 0.7476084392609095
Total lines : 15278.0 
 Unlabelled attachment Score 0.8352533054064668 
 Labelled attachment Score 0.74741458306061
Total lines : 15296.0 
 Unlabelled attachment Score 0.8353817991631799 
 Labelled attachment Score 0.7475156903765691
Total lines : 15306.0 
 Unlabelled attachment Score 0.8354240167254672 
 Labelled attachment Score 0.7474846465438391
Total lines : 15329.0 
 Unlabelled attachment Score 0.8355404788309739 
 Labelled attachment Score 0.7475373475112532
Total lines : 15350.0 
 Unlabelled attachment Score 0.8355

Total lines : 16654.0 
 Unlabelled attachment Score 0.8367959649333494 
 Labelled attachment Score 0.7488891557583763
Total lines : 16674.0 
 Unlabelled attachment Score 0.8369917236415977 
 Labelled attachment Score 0.7491903562432529
Total lines : 16684.0 
 Unlabelled attachment Score 0.8370294893310957 
 Labelled attachment Score 0.749280748022057
Total lines : 16696.0 
 Unlabelled attachment Score 0.8370867273598467 
 Labelled attachment Score 0.7493411595591758
Total lines : 16709.0 
 Unlabelled attachment Score 0.8371536297803579 
 Labelled attachment Score 0.7494763301214914
Total lines : 16725.0 
 Unlabelled attachment Score 0.8371898355754858 
 Labelled attachment Score 0.7495366218236174
Total lines : 16770.0 
 Unlabelled attachment Score 0.8372093023255814 
 Labelled attachment Score 0.7496124031007751
Total lines : 16783.0 
 Unlabelled attachment Score 0.8371566466066853 
 Labelled attachment Score 0.749508431150569
Total lines : 16805.0 
 Unlabelled attachment Score 0.8371

Total lines : 17926.0 
 Unlabelled attachment Score 0.8390605823942876 
 Labelled attachment Score 0.7502510320205289
Total lines : 17936.0 
 Unlabelled attachment Score 0.8390945584299733 
 Labelled attachment Score 0.7502230151650312
Total lines : 17962.0 
 Unlabelled attachment Score 0.8390491036632892 
 Labelled attachment Score 0.7501948558067031
Total lines : 17976.0 
 Unlabelled attachment Score 0.8391744548286605 
 Labelled attachment Score 0.7503337783711616
Total lines : 17999.0 
 Unlabelled attachment Score 0.8392132896272015 
 Labelled attachment Score 0.7504305794766376
Total lines : 18022.0 
 Unlabelled attachment Score 0.8392520253024082 
 Labelled attachment Score 0.7504716457662857
Total lines : 18037.0 
 Unlabelled attachment Score 0.8392748239729445 
 Labelled attachment Score 0.7504573931363309
Total lines : 18060.0 
 Unlabelled attachment Score 0.8392580287929126 
 Labelled attachment Score 0.7503322259136213
Total lines : 18076.0 
 Unlabelled attachment Score 0.83

Total lines : 19399.0 
 Unlabelled attachment Score 0.8389092221248518 
 Labelled attachment Score 0.7512758389607712
Total lines : 19416.0 
 Unlabelled attachment Score 0.8389472599917593 
 Labelled attachment Score 0.7513391017717347
Total lines : 19451.0 
 Unlabelled attachment Score 0.8390828235052182 
 Labelled attachment Score 0.7514780731067812
Total lines : 19481.0 
 Unlabelled attachment Score 0.8389713053744674 
 Labelled attachment Score 0.7513474667624865
Total lines : 19491.0 
 Unlabelled attachment Score 0.8390026165922734 
 Labelled attachment Score 0.7513724283002411
Total lines : 19510.0 
 Unlabelled attachment Score 0.8390568939005638 
 Labelled attachment Score 0.7514607893388006
Total lines : 19521.0 
 Unlabelled attachment Score 0.839096357768557 
 Labelled attachment Score 0.7514983863531581
Total lines : 19539.0 
 Unlabelled attachment Score 0.8391422283637853 
 Labelled attachment Score 0.7515225958339731
Total lines : 19557.0 
 Unlabelled attachment Score 0.839

Total lines : 20471.0 
 Unlabelled attachment Score 0.84011528503737 
 Labelled attachment Score 0.7514532753651507
Total lines : 20498.0 
 Unlabelled attachment Score 0.8401795297102156 
 Labelled attachment Score 0.7514391647965655
Total lines : 20512.0 
 Unlabelled attachment Score 0.8401911076443058 
 Labelled attachment Score 0.7513163026521061
Total lines : 20530.0 
 Unlabelled attachment Score 0.8402338041889917 
 Labelled attachment Score 0.7513882123721384
Total lines : 20565.0 
 Unlabelled attachment Score 0.8402625820568927 
 Labelled attachment Score 0.7513250668611718
Total lines : 20596.0 
 Unlabelled attachment Score 0.8403087978248204 
 Labelled attachment Score 0.7513109341619733
Total lines : 20617.0 
 Unlabelled attachment Score 0.8403259446088179 
 Labelled attachment Score 0.7513702284522481
Total lines : 20641.0 
 Unlabelled attachment Score 0.8402693667942445 
 Labelled attachment Score 0.7513686352405406
Total lines : 20652.0 
 Unlabelled attachment Score 0.8403

Total lines : 21878.0 
 Unlabelled attachment Score 0.839930523813877 
 Labelled attachment Score 0.7509827223695036
Total lines : 21899.0 
 Unlabelled attachment Score 0.8400383579158865 
 Labelled attachment Score 0.7511301885930864
Total lines : 21907.0 
 Unlabelled attachment Score 0.8400967727210481 
 Labelled attachment Score 0.7511754233806546
Total lines : 21922.0 
 Unlabelled attachment Score 0.8402061855670103 
 Labelled attachment Score 0.7513456801386735
Total lines : 21954.0 
 Unlabelled attachment Score 0.8403024505784823 
 Labelled attachment Score 0.7513437186845222
Total lines : 21978.0 
 Unlabelled attachment Score 0.8402493402493403 
 Labelled attachment Score 0.7512967512967513
Total lines : 21995.0 
 Unlabelled attachment Score 0.8403273471243464 
 Labelled attachment Score 0.7513980450102296
Total lines : 22004.0 
 Unlabelled attachment Score 0.8403017633157608 
 Labelled attachment Score 0.7513633884748228
Total lines : 22024.0 
 Unlabelled attachment Score 0.840

Total lines : 23075.0 
 Unlabelled attachment Score 0.8421668472372698 
 Labelled attachment Score 0.7527627302275189
Total lines : 23088.0 
 Unlabelled attachment Score 0.8422124047124047 
 Labelled attachment Score 0.7528153153153153
Total lines : 23117.0 
 Unlabelled attachment Score 0.8422805727386772 
 Labelled attachment Score 0.7529523727127222
Total lines : 23132.0 
 Unlabelled attachment Score 0.8422963859588449 
 Labelled attachment Score 0.7529828808576863
Total lines : 23178.0 
 Unlabelled attachment Score 0.8420484942618 
 Labelled attachment Score 0.7528691000086288
Total lines : 23197.0 
 Unlabelled attachment Score 0.8418761046687072 
 Labelled attachment Score 0.7526835366642238
Total lines : 23207.0 
 Unlabelled attachment Score 0.8417287887275391 
 Labelled attachment Score 0.7525315637523161
Total lines : 23229.0 
 Unlabelled attachment Score 0.8417925868526411 
 Labelled attachment Score 0.7525076413104309
Total lines : 23257.0 
 Unlabelled attachment Score 0.84185

Total lines : 24476.0 
 Unlabelled attachment Score 0.8424579179604511 
 Labelled attachment Score 0.7527782317372119
Total lines : 24515.0 
 Unlabelled attachment Score 0.8425045890271262 
 Labelled attachment Score 0.7527636141138079
Total lines : 24543.0 
 Unlabelled attachment Score 0.8423990547202869 
 Labelled attachment Score 0.7526382267856415
Total lines : 24585.0 
 Unlabelled attachment Score 0.8422615415904007 
 Labelled attachment Score 0.7524506813097417
Total lines : 24601.0 
 Unlabelled attachment Score 0.8423234827852526 
 Labelled attachment Score 0.7525303849437015
Total lines : 24617.0 
 Unlabelled attachment Score 0.842222854125198 
 Labelled attachment Score 0.7524474956330991
Total lines : 24630.0 
 Unlabelled attachment Score 0.8422655298416565 
 Labelled attachment Score 0.7524157531465693
Total lines : 24643.0 
 Unlabelled attachment Score 0.8421864221076979 
 Labelled attachment Score 0.7523028852006655
Total lines : 24652.0 
 Unlabelled attachment Score 0.842

Total lines : 25832.0 
 Unlabelled attachment Score 0.8414369773923815 
 Labelled attachment Score 0.7510839269123568
Total lines : 25845.0 
 Unlabelled attachment Score 0.8413619655639388 
 Labelled attachment Score 0.751015670342426
Total lines : 25856.0 
 Unlabelled attachment Score 0.8413521039603961 
 Labelled attachment Score 0.7510055693069307
Total lines : 25876.0 
 Unlabelled attachment Score 0.8414360797650332 
 Labelled attachment Score 0.7510820837842016
Total lines : 25890.0 
 Unlabelled attachment Score 0.8414059482425646 
 Labelled attachment Score 0.7510621861722673
Total lines : 25910.0 
 Unlabelled attachment Score 0.8414125820146662 
 Labelled attachment Score 0.751099961404863
Total lines : 25936.0 
 Unlabelled attachment Score 0.8414173349784084 
 Labelled attachment Score 0.7511181369524984
Total lines : 25948.0 
 Unlabelled attachment Score 0.8414135964236165 
 Labelled attachment Score 0.7511561584707878
Total lines : 25961.0 
 Unlabelled attachment Score 0.8414

# LOGISTIC REGRESSION WITH ELASTICNET REG

In [None]:
%%time

model4 = dp.train(x_train,y_train,'lr_el_saga')


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


In [None]:

%%time

accuracy4 = dp.evaluate(test_treebank,model4)
