### **Imported Libraries**

In [929]:
import re
import os
import pandas as pd
from functools import reduce

### **Processing Methods**

In [930]:
# Define regular expressions that match each contact type
# r'[^/][a-z]?(\d{4})(?! is )  ?-?[\u0621-\u064AA-Za-z]+(?!. Tous|’)\b'
# \b(?:Tel|T\s?:?\s?(?:\(\+216\)|\+216)?)?\s?(\d{2} \d{3} \d{3}|\d{2} \d{2} \d{2} \d{2}|\d{2} \d{6})\b
contact_re = {
    'mails': re.compile(r'\b(?:(?!mailto)\w+)\.?\w+@\w+\.\w+\.?[a-z]*\b'),
    'code': re.compile(r"[^/][a-z]?(\d{4})(?! is )(?: - \w{2,3} \d+|  ?-?[\u0621-\u064AA-Za-z]+)(?!. Tous|’|. All)\b"),
    'tels': re.compile(r'\b(?:Tel|T\s?:?\s?(?:\(\+216\)|\+216)?)?\s?(\d{2} \d{3} \d{3}|\d{2} \d{2} \d{2} \d{2}|\d{2} \d{6})\b')
}

In [931]:
def list_to_dict_of_counts(type_matches):
    counts = dict()
    for m in type_matches:
        counts[m] = counts.get(m, 0) + 1
    return counts

In [932]:
def edit_phone_number_format(matches):
    pattern = re.compile(r'\b(?:Tel|T\s?:?\s?(?:\(\+216\)|\+216)?)?\s?(\d{2})\s?(\d{2})\s?(\d{2})\s?(\d{2})\b')
    if matches:
        for m in range(len(matches)):
            matches[m] = re.sub(pattern, r'(+216) \1 \2 \3 \4', ''.join(matches[m].split()))
    return matches

In [933]:
def fetch_matches_from_sys_file(file_url):
    '''
    This function ser
    '''
    contact_matches = {'mails':[], 'code':[], 'tels':[]}
    with open(file=file_url) as f:
        lines = f.readlines()
        for line in lines:
            for type, pattern in contact_re.items():
                matches = pattern.findall(line)
                if type == 'tels': 
                    matches = edit_phone_number_format(matches)
                contact_matches[type].extend(matches)
    
    sys_matches = {}
    for type in contact_matches:
        sys_matches[type] = list_to_dict_of_counts(contact_matches[type])
    sys_matches = {file_url.split('.')[1][1:]: sys_matches}
    return sys_matches

In [934]:
def get_htm_files(url='./'):
    files = []
    for f in os.listdir(url):
        match = re.search(r'(.htm)$', f)
        if match:
            url_f = os.path.join(url, f)
            files.append(url_f)
        else: continue
    return files

In [935]:
def merge_dictionaries(dict1, dict2):
    merged_dict = dict1.copy()
    merged_dict.update(dict2)
    return merged_dict

In [936]:
dict_of_matches = [fetch_matches_from_sys_file(f) for f in get_htm_files()]
sys_contacts = reduce(merge_dictionaries, dict_of_matches)
print(sys_contacts)

{"ENET'Com": {'mails': {'contact@enetcom.usf.tn': 3}, 'code': {'3018': 2}, 'tels': {'(+216) 74 86 30 47': 2, '(+216) 74 86 25 00': 2, '(+216) 74 86 30 37': 2}}, 'ENIM': {'mails': {'enim@enim.rnu.tn': 3}, 'code': {'5019': 2}, 'tels': {'(+216) 73 50 05 11': 2, '(+216) 73 50 05 14': 4}}, 'ENIS': {'mails': {'webmaster@enis.tn': 4}, 'code': {'3038': 2}, 'tels': {'(+216) 70 25 85 20': 2, '(+216) 74 27 55 95': 1}}, 'FSEGMA': {'mails': {'fsegma@fsegma.rnu.tn': 2}, 'code': {'5111': 1}, 'tels': {'(+216) 73 68 31 91': 1, '(+216) 73 68 31 92': 1}}, 'FSEGS': {'mails': {'contact@fsegs.rnu.tn': 1}, 'code': {'3018': 1}, 'tels': {'(+216) 74 27 87 77': 1, '(+216) 74 27 91 39': 1}}, 'FSM': {'mails': {'fsm@fsm.rnu.tn': 1}, 'code': {'5019': 2}, 'tels': {'(+216) 73 50 02 76': 1, '(+216) 73 50 02 78': 1}}, 'FSS': {'mails': {'contact@fss.rnu.tn': 3}, 'code': {'3000': 2}, 'tels': {'(+216) 74 27 64 00': 2, '(+216) 74 27 67 63': 2, '(+216) 74 27 44 37': 2}}, 'ISGIS': {'mails': {'direction.isgis@isgis.usf.tn': 2}

In [937]:
def trait_ref_file(url='./ref.txt'):
    ref = pd.read_csv(url, delimiter='\t', names=['File', 'Type', 'Value', 'Counts'])
    ref['Counts'] = ref['Counts'].astype('int')
    ref_matches = {}
    for _, row in ref.iterrows():
        if row['File'] not in ref_matches: ref_matches[row['File']] = {'mails' : {}, 'code' : {}, 'tels' : {}}
        if row['Type'] == 'fax':
            if row['Value'] not in ref_matches[row['File']]['tels']:
                ref_matches[row['File']]['tels'][row['Value']] = row['Counts']
            else: ref_matches[row['File']]['tels'][row['Value']] += row['Counts']
        else:
            ref_matches[row['File']][row['Type']][row['Value']] = row['Counts']
    return ref_matches

In [938]:
print(trait_ref_file())

{'ENIS': {'mails': {'webmaster@enis.tn': 4}, 'code': {'3038': 2}, 'tels': {'(+216) 70 25 85 20': 2, '(+216) 74 27 55 95': 1}}, "ENET'Com": {'mails': {'contact@enetcom.usf.tn': 3}, 'code': {'3018': 2}, 'tels': {'(+216) 74 86 30 47': 2, '(+216) 74 86 25 00': 2, '(+216) 74 86 30 37': 2}}, 'ENIM': {'mails': {'enim@enim.rnu.tn': 3}, 'code': {'5019': 2}, 'tels': {'(+216) 73 50 05 11': 2, '(+216) 73 50 05 14': 4}}, 'ISIMa': {'mails': {'isima@isima.rnu.tn': 1}, 'code': {'5111': 1}, 'tels': {'(+216) 73 68 31 00': 1, '(+216) 73 68 31 20': 1}}, 'ISGIS': {'mails': {'direction.isgis@isgis.usf.tn': 2}, 'code': {'3021': 4}, 'tels': {'(+216) 74 86 30 90': 1, '(+216) 74 86 30 92': 1}}, 'FSS': {'mails': {'contact@fss.rnu.tn': 3}, 'code': {'3000': 2}, 'tels': {'(+216) 74 27 64 00': 2, '(+216) 74 27 67 63': 2, '(+216) 74 27 44 37': 2}}, 'FSM': {'mails': {'fsm@fsm.rnu.tn': 1}, 'code': {'5019': 2}, 'tels': {'(+216) 73 50 02 76': 1, '(+216) 73 50 02 78': 1}}, 'FSEGS': {'mails': {'contact@fsegs.rnu.tn': 1}, '

In [939]:
def comparer(sys_contacts, ref_contacts):
    print(ref_contacts)
    print(sys_contacts)
    INT = 0 # taille ref intersection sys
    SYS = 0 # taille sys
    REF = 0 # taille ref
    resultat = {}
    for fichier in ref_contacts:
        resultat[fichier] = {}

        ref_types = ref_contacts[fichier]
        if fichier in sys_contacts:
            sys_types = sys_contacts[fichier]
        else:
            sys_types = None

        for type in ['mails', 'code', 'tels']:
            res_elements = resultat[fichier][type] = {}
            for element in ref_types[type]:
                ref_nbr = ref_types[type][element]
                sys_nbr = 0
                if (sys_types != None) and (element in sys_types[type]):
                    sys_nbr = sys_types[type][element]
                res_elements[element] = 'sys(' + str(sys_nbr) + '), ref(' + str(ref_nbr) + ')'
                SYS += sys_nbr
                REF += ref_nbr
                INT += min(sys_nbr, ref_nbr)
            if (sys_types):
                for element in sys_types[type]:
                    if not element in res_elements:
                        sys_nbr = sys_types[type][element]
                        res_elements[element] = 'sys(' + str(sys_nbr) + '), ref(0)'
                        SYS += sys_types[type][element]
    R = INT / REF
    P = 0.0 if SYS == 0 else INT / SYS
    F1 = 0.0 if R + P == 0 else 2 * P * R / (P + R)
    return resultat, R, P, F1

In [940]:
# Affichage des statistiques
def affichage(contacts):
    for fichier in contacts:
        print('========== ', fichier, ' ==========')
        stats = contacts[fichier]
        for type in stats:
            print('------> ', type)
            stats_type = stats[type]
            for element in stats_type:
                print('         ', element, ' : ', stats_type[element])

In [941]:
ref_contacts = trait_ref_file()
comp, R, P, F1 = comparer(sys_contacts, ref_contacts)
affichage(comp)
print('---------------------------------------------------')
print('R =', R, ', P =', P, ', F1 =', F1)

{'ENIS': {'mails': {'webmaster@enis.tn': 4}, 'code': {'3038': 2}, 'tels': {'(+216) 70 25 85 20': 2, '(+216) 74 27 55 95': 1}}, "ENET'Com": {'mails': {'contact@enetcom.usf.tn': 3}, 'code': {'3018': 2}, 'tels': {'(+216) 74 86 30 47': 2, '(+216) 74 86 25 00': 2, '(+216) 74 86 30 37': 2}}, 'ENIM': {'mails': {'enim@enim.rnu.tn': 3}, 'code': {'5019': 2}, 'tels': {'(+216) 73 50 05 11': 2, '(+216) 73 50 05 14': 4}}, 'ISIMa': {'mails': {'isima@isima.rnu.tn': 1}, 'code': {'5111': 1}, 'tels': {'(+216) 73 68 31 00': 1, '(+216) 73 68 31 20': 1}}, 'ISGIS': {'mails': {'direction.isgis@isgis.usf.tn': 2}, 'code': {'3021': 4}, 'tels': {'(+216) 74 86 30 90': 1, '(+216) 74 86 30 92': 1}}, 'FSS': {'mails': {'contact@fss.rnu.tn': 3}, 'code': {'3000': 2}, 'tels': {'(+216) 74 27 64 00': 2, '(+216) 74 27 67 63': 2, '(+216) 74 27 44 37': 2}}, 'FSM': {'mails': {'fsm@fsm.rnu.tn': 1}, 'code': {'5019': 2}, 'tels': {'(+216) 73 50 02 76': 1, '(+216) 73 50 02 78': 1}}, 'FSEGS': {'mails': {'contact@fsegs.rnu.tn': 1}, '