In [1]:
import networkx as nx
import pickle
import random
import os

These are 6 representative files 
which have correct ground truth ids but **have conflicting edges between them**.


Of 4k sentences,

177 fail the complete graph test.

This issue accounts for 74% of those failures (~130 sentences).

29088
<br>
122173
<br>
190932
<br>
45474
<br>
74231
<br>
27284


In [2]:
# Change only SENTENCE_ID to switch to another sentence (e.g. one of the ids
# listed above). Both file names are derived from it, so the graph and the
# pickle can never end up pointing at different sentences.

SENTENCE_ID = '29088'

graphml_file = SENTENCE_ID + '.graphml'
pickle_file = SENTENCE_ID + '.p'

In [3]:

# Both input files are looked up in <current working directory>/data/conflict.
BASE_DIR = os.getcwd()
_CONFLICT_DIR = os.path.join(BASE_DIR, 'data', 'conflict')

GRAPHML_FILE_NAME = os.path.join(_CONFLICT_DIR, graphml_file)
PICKLE_DCS_FILE_NAME = os.path.join(_CONFLICT_DIR, pickle_file)

In [4]:
# defining class for pickle file 

class DCS:
    """Record for one ground-truth sentence read from a pickle file.

    Attributes (meanings as described by ``pickle_data_give``):
        sent_id: sentence id
        sentence: sentence text
        dcs_chunks: chunks
        lemmas: lemmas
        cng: morphological class

    NOTE(review): presumably the pickled objects are instances of this class,
    so the class name and attribute names must stay as they are -- confirm.
    """

    def __init__(self, sent_id, sentence):
        self.sent_id, self.sentence = sent_id, sentence
        # The annotation lists start out empty; each instance gets its own lists.
        self.dcs_chunks, self.lemmas, self.cng = [], [], []


# Human-readable labels, in the same order as the tuple returned by
# pickle_data_give (used when printing the DCS record).
dcs_class_params = [
    'sentence id',
    'sentence ',
    'chunks',
    'lemmas',
    'morph class',
]

In [5]:

# Double characters mapping to single characters
_dbl = dict({
    'ai' : 'E',
    'au' : 'O',
    'kh' : 'K',
    'gh' : 'G',
    'ch' : 'C',
    'jh' : 'J',
    'ṭh' : 'W',
    'ḍh' : 'Q',
    'th' : 'T',
    'dh' : 'D',
    'ph' : 'P',
    'bh' : 'B'})

# One to one mapping
_oth = dict({
    'ā' : 'A',
    'ī' : 'I',
    'ū' : 'U',
    'ṛ' : 'f',
    'ṝ' : 'F',
    'ḷ' : 'x',
    'ḹ' : 'X',
    'ṃ' : 'M',
    'ḥ' : 'H',
    'ṁ' : '~',
    'ṅ' : 'N',
    'ñ' : 'Y',
    'ṭ' : 'w',
    'ḍ' : 'q',
    'ṇ' : 'R',
    'ś' : 'S',
    'ṣ' : 'z'})


In [6]:
##IAST to SLP1 transliteration


def iast2slp(src):
    '''
    Transliterate a string from IAST (International Alphabet of Sanskrit
    Transliteration) to SLP1 (Sanskrit Library Phonetic Basic).

    The string is scanned left to right. At each position the two-character
    sequence is looked up in ``_dbl`` first; if that fails, the single
    character is looked up in ``_oth``; anything found in neither table is
    copied through unchanged.
    '''
    pieces = []
    pos = 0
    total = len(src)
    while pos < total:
        # At the last character this slice is just that one character.
        pair = src[pos:pos + 2]
        if pair in _dbl:
            pieces.append(_dbl[pair])
            pos += 2
        else:
            single = src[pos]
            pieces.append(_oth.get(single, single))
            pos += 1
    return ''.join(pieces)




In [7]:
# Read the pickle file containing ground truth data





def pickle_data_give(filename):
    '''Returns the attributes of the DCS pickle file

    Args:
    filename (str) = name of file

    Returns:
    output_load.sent_id (int): sentence id
    output_load.sentence (str): sentence
    output_load.dcs_chunks (list): chunks
    output_load.lemmas (list): lemmas
    output_load.cng (list): morphological class

    Raises:
    OSError: if the file cannot be opened
    AttributeError: if the unpickled object lacks one of the attributes above
    '''

    # SECURITY NOTE: unpickling can execute arbitrary code -- only load pickle
    # files that come from a trusted source.
    # The context manager closes the file handle even if unpickling fails.
    with open(filename, "rb") as pickle_handle:
        output_load = pickle.load(pickle_handle, encoding='utf-8')

    return (
        output_load.sent_id,
        output_load.sentence,
        output_load.dcs_chunks,
        output_load.lemmas,
        output_load.cng,
    )
    

In [8]:
# Read the GraphML file for the selected sentence into a networkx graph.
# Later cells read the node attributes 'lemma', 'cng' and 'word' and the edge
# attribute 'key' from this graph, and write the node attribute 'ground_truth_id'.

graph_file = nx.read_graphml(GRAPHML_FILE_NAME)


In [9]:
# Recursively break and simplify a nested list

def simplify_nested_list(nested_list):
    """Flatten a nested list into the global ``all_single_elements_list``.

    Every string found directly inside a list is appended to the global
    accumulator wrapped in a one-element list (``[string]``). Any other leaf
    (a non-list value, or an empty list) is appended unchanged.

    The accumulator is NOT reset here: the caller must bind
    ``all_single_elements_list`` to a fresh list before each top-level call.

    Args:
        nested_list: the (possibly nested) list to flatten, or a single leaf.

    Returns:
        None when ``nested_list`` is a non-empty list; otherwise
        ``nested_list`` itself (the leaf that was just appended).
    """
    # Check the type before the length: leaves such as ints have no len(),
    # so testing len() first would raise TypeError instead of treating them
    # as leaves.
    if type(nested_list) is list and len(nested_list) >= 1:
        for ele in nested_list:
            if type(ele) is str:
                all_single_elements_list.append([ele])
            else:
                simplify_nested_list(ele)
        return None

    all_single_elements_list.append(nested_list)
    return nested_list
        


In [10]:
# algorithm to find the cng+lemma combinations

# Load the ground truth; only the sentence text, the lemmas and the
# morphological classes (cng) are used in this cell.
_, dcs_sentence, _, lemmas, cng = pickle_data_give(PICKLE_DCS_FILE_NAME)

graph_ground_truth_ids = []
print('==========================================================================')


def _flattened(nested):
    """Flatten ``nested`` with simplify_nested_list using a fresh global accumulator."""
    global all_single_elements_list
    all_single_elements_list = []
    simplify_nested_list(nested)
    return all_single_elements_list


# 1-based chunk number for every lemma: a chunk holding several lemmas
# contributes its number once per lemma.
flattened_chunk_no = _flattened([
    [str(chunk_pos)] * len(chunk_lemmas) if len(chunk_lemmas) > 1 else str(chunk_pos)
    for chunk_pos, chunk_lemmas in enumerate(lemmas, start=1)
])

# Flatten lemmas and cngs so that each entry is a one-element list.
lemmas = _flattened(lemmas)
cng = _flattened(cng)

print('\n\nlemmas:\t%s \ncngs:\t%s' %(lemmas, cng))

# One-shot iterator of (lemma, cng) pairs consumed by the matching loop.
lemma_cng = zip(lemmas, cng)

number_of_chunks = len(dcs_sentence.strip().split(' '))


# For every ground-truth (lemma, cng) pair, collect the ids of all graph nodes
# whose 'lemma' and 'cng' attributes both match.
#
# NOTE(review): matching uses lemma and cng only. flattened_chunk_no is computed
# above but never consulted, so a pair that occurs in several places in the
# graph yields several candidate nodes for one ground-truth entry.
for dcs_lemma, dcs_cng in lemma_cng:
    print('\n\n******************************')
    print('\n\nCurrent lemma: %s \tCurrent cng:%s' %(dcs_lemma, dcs_cng))

    # Convert the lemma to SLP1 once per lemma instead of once per graph node.
    # The conversion is still written back in place, so `lemmas` holds SLP1
    # strings after this loop, exactly as before.
    dcs_lemma[0] = iast2slp(dcs_lemma[0])
    target_lemma = dcs_lemma[0]
    target_cng = int(dcs_cng[0])

    same_ground_truth_ids = []

    for di in graph_file.nodes():

        # Every node starts out as "not ground truth"; the chosen nodes are
        # flagged with ground_truth_id=1 later by pick_one_cng_lemma.
        graph_file.add_node(str(di), ground_truth_id=0)

        temp_dict = graph_file.nodes[di]

        print('checking node %s' %(di))
        print('\t\tLemma:%s\tCng:%s\n' %(temp_dict['lemma'], temp_dict['cng']))

        # A node is a candidate only if both the cng and the lemma match.
        if temp_dict['cng'] == target_cng and temp_dict['lemma'] == target_lemma:
            print('\t\t\t\t\t|$| - Here in node %s' %(di))
            same_ground_truth_ids.append(di)

    graph_ground_truth_ids.append(same_ground_truth_ids)
                
                #print('#', di, '\n')
    

#return len(graph_ground_truth_ids)/number_of_chunks



lemmas:	[['upās'], ['tapas'], ['niṣṭhā'], ['haṃsa'], ['mad'], ['muc'], ['kilbiṣa']] 
cngs:	[['-19'], ['3'], ['39'], ['69'], ['72'], ['-190'], ['39']]


******************************


Current lemma: ['upās'] 	Current cng:['-19']
checking node 6
		Lemma:nizWA	Cng:80

checking node 12
		Lemma:nizWA	Cng:3

checking node 11
		Lemma:nizWa	Cng:40

checking node 18
		Lemma:mad	Cng:72

checking node 15
		Lemma:haMsa	Cng:69

checking node 8
		Lemma:nizWA	Cng:39

checking node 17
		Lemma:haMsa	Cng:31

checking node 20
		Lemma:muc	Cng:-190

checking node 10
		Lemma:nizWa	Cng:80

checking node 1
		Lemma:upAs	Cng:-19

					|$| - Here in node 1
checking node 3
		Lemma:tapas	Cng:71

checking node 4
		Lemma:tapas	Cng:31

checking node 14
		Lemma:nizWa	Cng:30

checking node 19
		Lemma:mukta	Cng:3

checking node 23
		Lemma:kilbiza	Cng:39

checking node 16
		Lemma:haMsa	Cng:71

checking node 5
		Lemma:tapa	Cng:29

checking node 21
		Lemma:kilbiza	Cng:80

checking node 7
		Lemma:nizWA	Cng:40

checking 

In [11]:
# this function is used to pick one cng+lemma combination if redundant units are present.

def pick_one_cng_lemma(graph_ground_truth_ids):
    """Choose one node id from each non-empty candidate list and flag it.

    Empty (falsy) candidate lists are skipped. When a list holds several ids,
    one is picked at random with ``random.choice``; a single id is taken as is.
    Each chosen node is marked in the global ``graph_file`` with
    ``ground_truth_id=1``.

    Args:
        graph_ground_truth_ids: iterable of candidate node-id lists.

    Returns:
        list: the chosen node ids, in input order.
    """
    picked_ids = []
    for candidates in filter(None, graph_ground_truth_ids):
        # Only call the RNG when there is an actual choice to make.
        picked = random.choice(candidates) if len(candidates) > 1 else candidates[0]
        graph_file.add_node(picked, ground_truth_id=1)
        picked_ids.append(picked)
    return picked_ids
    
# NOTE: this rebinds graph_ground_truth_ids from a list of candidate lists to a
# flat list of chosen node ids, so this cell must not be run twice without
# re-running the matching cell first. Ties are broken with random.choice and no
# seed is set, so the chosen ids can differ between runs.
graph_ground_truth_ids = pick_one_cng_lemma(graph_ground_truth_ids)
#print(graph_ground_truth_ids)

In [13]:
# check if the set of ground truth id's form a complete graph or not


        
def check_complete_graph(graph_file, graph_ground_truth_ids):
    """Check the edges between consecutive ground-truth nodes for conflicts.

    For each consecutive pair of ids the edge attribute ``'key'`` is read:
    ``2`` marks a conflicting edge (the check stops and returns False),
    ``1`` marks a non-conflicting edge (details are printed and the status
    becomes True). A missing edge, or a missing ``'key'``/``'word'``
    attribute, is printed and that pair is skipped.

    NOTE(review): only consecutive pairs (ids[i-1], ids[i]) are inspected, not
    every pair, and a skipped pair does not by itself make the result False --
    confirm that this is the intended "complete graph" test.

    Args:
        graph_file: graph offering ``get_edge_data``, ``has_edge`` and ``nodes``.
        graph_ground_truth_ids: sequence of node ids to check, in order.

    Returns:
        bool: True if at least one non-conflicting edge was seen and no
        conflicting edge was hit; False otherwise (including fewer than two ids).
    """
    graph_status = False

    for idx in range(1, len(graph_ground_truth_ids)):
        prev_id = graph_ground_truth_ids[idx-1]
        curr_id = graph_ground_truth_ids[idx]

        try:
            # get_edge_data returns None for a missing edge -> TypeError on ['key'];
            # an edge without a 'key' attribute -> KeyError. Both are handled below.
            edge_key = graph_file.get_edge_data(prev_id, curr_id)['key']

            if edge_key == 2:
                print('conflicting edge!!  \t\t\t', graph_file.has_edge(prev_id, curr_id))
                graph_status = False
                break
            if edge_key == 1:
                print('not a conflicting edge \t\t\t', graph_file.has_edge(prev_id, curr_id))
                print(graph_file.nodes[prev_id]['word'], graph_file.nodes[curr_id]['word'])
                print(prev_id, curr_id)
                print('\n')
                graph_status = True
        # A tuple is required here: `except KeyError and TypeError` evaluates
        # to `except TypeError` and silently lets KeyError escape.
        except (KeyError, TypeError) as key_type_issue:
            print(key_type_issue)

    return graph_status

#### The ground truth ids are listed below; we check whether the edges between them are non-conflicting.

In [14]:
# Graph ground truth ids: the node ids returned by pick_one_cng_lemma
# (one chosen node per non-empty candidate list).

graph_ground_truth_ids


['1', '2', '8', '15', '18', '20', '23']

The line below checks whether two nodes have an edge between them.

In [15]:

#graph_file.has_edge('83', '86')
# uncomment the lines

The line below checks whether the edge between two nodes is a conflict:

if the edge's `key` is 2, it is a conflicting edge

if the edge's `key` is 1, it is not a conflicting edge

In [16]:
#graph_file.get_edge_data('1', '12')

In [17]:
# snippet to get the ground truth sentence from the graphML file

# Report: the chosen node ids, the words they spell out, the DCS ground-truth
# record, and finally the result of the edge check.
print('\n\nGraph ground truth nodes:')
# Optional ordering by chunk (assumes nodes carry a 'chunk_no' attribute -- TODO confirm):
#graph_ground_truth_ids = sorted(graph_ground_truth_ids, key = lambda x : graph_file.nodes[x]['chunk_no'])
print(graph_ground_truth_ids)


# Words of the chosen nodes, printed on a single line.
for node_id in map(str, graph_ground_truth_ids):
    print(graph_file.nodes[node_id]['word'], end=' ')


print('\n\n')
print('DCS sentence below:\n')

# Pair every field of the pickled record with its human-readable label.
for label, value in zip(dcs_class_params, pickle_data_give(PICKLE_DCS_FILE_NAME)):
    print('%s: \t%s' %(label, value))

print('\n\n\n========================')
graph_complete_or_not = check_complete_graph(graph_file, graph_ground_truth_ids)
print('\n\n\nGraph status: ',graph_complete_or_not)



Graph ground truth nodes:
['1', '2', '8', '15', '18', '20', '23']
upAsate tapaH nizWAH haMsam mAm mukta kilbizAH 


DCS sentence below:

sentence id: 	29088
sentence : 	upAsate taponizWA haMsaM mAM muktakilbizAH    
chunks: 	['upās', 'tapas', 'niṣṭhā', 'haṃsa', 'mad', 'muc', 'kilbiṣa']
lemmas: 	[['upās'], ['tapas', 'niṣṭhā'], ['haṃsa'], ['mad'], ['muc', 'kilbiṣa']]
morph class: 	[['-19'], ['3', '39'], ['69'], ['72'], ['-190', '39']]



not a conflicting edge 			 True
upAsate tapaH
1 2


conflicting edge!!  			 True



Graph status:  False


### Possible solutions from here

When there is a conflicting edge between ground truth nodes, we can:

- either leave the edges as they are and only update the ground truths (no overwriting of previous attributes)

- or overwrite 
    - the previous attributes and create non-conflicting edges wherever the existing edge is conflicting.
    
Again, we would prefer the overwrite option because
 - the ground truth ids give the correct solution
 - it yields correct features for learning