In [2]:
import pandas as pd
import numpy as np

In [9]:
# Relax pandas' display limits so the frames printed below are not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "no limit on column width". The former -1 sentinel was deprecated
# in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [25]:
# Load the pickled CRF test results; the cells below use it as a DataFrame
# with (at least) `pred` and `filename` columns.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
results = pd.read_pickle("crf_test_results.pkl")

In [4]:
# Keep only the rows whose `pred` value is "signature".
predicted = results.loc[results["pred"] == "signature"]

In [11]:
# Show, file by file, the text of each predicted signature line next to its
# per-line feature columns, with a blank line between files.
shown_columns = ["line", "person", "org", "email", "url", "phone"]
for file_name, file_rows in predicted.groupby("filename"):
    print(file_rows[shown_columns], end="\n\n")

       line  person  org  email  url  phone
4  Ka'Tina-  0       0    0      0    0    

     line  person  org  email  url  phone
11  Jenny  0       1    0      0    0    

                         line  person  org  email  url  phone
21  Many Thanks,               0       0    0      0    0    
22                             0       0    0      0    0    
23  Liz Taylor                 1       0    0      0    0    
24  Assistant to Greg Whalley  0       0    0      0    0    
25  713.853.1935 office        0       0    0      0    1    
26  713.853.1838 fax           0       0    0      0    1    
27  713.854.3056 mobile        0       0    0      0    1    

    line  person  org  email  url  phone
30  Lisa  0       0    0      0    0    

          line  person  org  email  url  phone
37  Thanks,     0       0    0      0    0    
39  Danielle    1       0    0      0    0    

   line  person  org  email  url  phone
47  GT   0       0    0      0    0    

         line  person  

In [12]:
import spacy
# Load the spaCy pipeline named "en_core_web_md"; the next cell inspects the
# named entities it produces.
nlp = spacy.load("en_core_web_md")

In [20]:
# Probe: list the (entity, label) pairs the pipeline finds in a bare job title.
# The recorded output below is an empty list, i.e. no entity was detected.
doc = nlp("Vice President")
[(ent, ent.label_) for ent in doc.ents]

[]

==============  ==========================================================================================
Feature Name    Description
==============  ==========================================================================================
low             Checks if the token is lower case.
upper           Checks if the token is upper case.
title           Checks if the token starts with an uppercase character and all remaining characters are
                lowercased.
digit           Checks if the token contains just digits.
prefix5         Take the first five characters of the token.
prefix2         Take the first two characters of the token.
suffix5         Take the last five characters of the token.
suffix3         Take the last three characters of the token.
suffix2         Take the last two characters of the token.
suffix1         Take the last character of the token.
pos             Take the Part-of-Speech tag of the token (``SpacyTokenizer`` required).
pos2            Take the first two characters of the Part-of-Speech tag of the token
                (``SpacyTokenizer`` required).
pattern         Take the patterns defined by ``RegexFeaturizer``.
bias            Add an additional "bias" feature to the list of features.
==============  ==========================================================================================

In [3]:
# Load the pickled master table; the cells below use it as a DataFrame with
# (at least) `entity`, `blank`, `filename` and `line` columns.
# NOTE(review): unpickling executes arbitrary code - only load trusted files.
master = pd.read_pickle("enron_random_clean1.pkl")

In [8]:
# Keep the rows labelled as signature whose `blank` flag is 0.
# Both conditions are combined into one mask built from `master`: filtering the
# already-reduced frame with a mask computed on the full frame forces pandas to
# reindex the mask and emits a "Boolean Series key will be reindexed" warning.
sigs = master[(master.entity == 'signature') & (master.blank == 0)]

  


In [10]:
# Maps filename -> the file's signature lines joined into one newline-separated string.
signature_blocks = {}

In [11]:
# Collapse each file's signature rows into a single multi-line string.
for file_name, file_rows in sigs.groupby("filename"):
    joined_lines = "\n".join(file_rows["line"])
    signature_blocks[file_name] = joined_lines

In [13]:
# Number of files for which a signature block was collected.
len(signature_blocks)

116

In [24]:
import os
import spacy
import re
from collections import defaultdict

In [18]:
# Load the spaCy pipeline used by the annotation loop further down.
# NOTE(review): the same model is already loaded in an earlier cell; this
# second load looks redundant - confirm before removing.
nlp = spacy.load("en_core_web_md")

In [25]:
def separate_entities(entities):
    """Bucket PERSON and ORG entities by label, ignoring every other label.

    Parameters
    ----------
    entities : iterable
        Objects exposing a ``label_`` attribute (for example ``doc.ents``).

    Returns
    -------
    collections.defaultdict
        Maps ``"PERSON"`` and/or ``"ORG"`` to the matching entities in input
        order. A key is only present once an entity with that label was seen.
    """
    wanted_labels = ("PERSON", "ORG")
    by_label = defaultdict(list)
    for entity in entities:
        label = entity.label_
        if label in wanted_labels:
            by_label[label].append(entity)
    return by_label

In [27]:
# E-mail address: local part, "@", then either a dotted host name or a
# bracketed address literal. The character classes only list lower-case
# letters, so the pattern is compiled case-insensitively; without the flag an
# address such as "John.Doe@Enron.com" is not matched at all.
RE_EMAIL = re.compile(
    r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])",
    re.IGNORECASE)
# URL: optional scheme, optional literal "www." prefix, "name.tld", optional
# path segments. The dot after "www" is escaped so that it only matches a real
# dot; unescaped it matched any character (e.g. the space in "www abc.com").
RE_URL = re.compile(r"((https?|ftp|smtp):\/\/)?(www\.)?[a-z0-9]+\.[a-z]+(\/[a-zA-Z0-9#]+\/?)*", re.IGNORECASE)
# Deliberately loose phone matcher: two or more groups of 2-3 digits, each
# optionally wrapped in parentheses and followed by up to three arbitrary
# separator characters.
RE_RELAX_PHONE = re.compile(r'(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')

In [65]:
# Directory that receives one annotated text file per signature block.
# NOTE(review): machine-specific absolute path - consider making it configurable.
SIGNATURE_OUTPUT_DIR = r"F:\Documents\stopansko\masters\thesis\sig-detect\data\clean\enron_random_clean_signatures"
os.makedirs(SIGNATURE_OUTPUT_DIR, exist_ok=True)

for filename, text in signature_blocks.items():
    print(filename)
    path = os.path.join(SIGNATURE_OUTPUT_DIR, filename)
    doc = nlp(text)
    separated_ents = separate_entities(doc.ents)

    # Flatten the per-type buckets into (start, end, label) spans and apply
    # them from the end of the text towards the start. Wrapping a span then
    # never moves the character offsets of spans that are still to be
    # processed, so no running offset is needed. (The previous running offset
    # was overwritten instead of accumulated and assumed document order, while
    # the entities are visited grouped by type.)
    spans = sorted(
        (
            (ent.start_char, ent.end_char, entity_type)
            for entity_type, ents in separated_ents.items()
            for ent in ents
        ),
        key=lambda span: span[0],
        reverse=True,
    )
    new_text = text
    for start, end, entity_type in spans:
        new_text = new_text[:start] + f"[{new_text[start:end]}]({entity_type})" + new_text[end:]

    # Lazy e-mail / phone matches over the annotated text.
    # A compiled pattern's finditer() takes (string, pos, endpos): flags must
    # be given at compile time, and passing re.I here was read as pos=2.
    # TODO: match_map is not consumed yet.
    match_map = {
        "EMAIL": RE_EMAIL.finditer(new_text),
        "PHONE": RE_RELAX_PHONE.finditer(new_text),
    }

    # Write the annotated text followed by the untouched original for comparison.
    with open(path, "w", encoding="utf-8") as f:
        f.write(new_text+"\n\n\n\n\nOriginal:"+text)

allen-p_all_documents_113.txt
allen-p_all_documents_171.txt
allen-p_all_documents_496.txt
allen-p_all_documents_505.txt
allen-p_all_documents_595.txt
allen-p_all_documents_599.txt
allen-p_all_documents_90.txt
arnold-j_all_documents_1027.txt
arnold-j_all_documents_454.txt
arnold-j_all_documents_687.txt
arora-h_all_documents_44.txt
arora-h_all_documents_58.txt
arora-h_all_documents_64.txt
badeer-r_all_documents_109.txt
badeer-r_all_documents_119.txt
badeer-r_all_documents_128.txt
badeer-r_all_documents_141.txt
badeer-r_all_documents_257.txt
badeer-r_all_documents_274.txt
badeer-r_all_documents_99.txt
bailey-s_all_documents_14.txt
bailey-s_all_documents_8.txt
bass-e_all_documents_1290.txt
bass-e_all_documents_1290_a.txt
bass-e_all_documents_1485.txt
bass-e_all_documents_1485_a.txt
bass-e_all_documents_1888.txt
bass-e_all_documents_713.txt
bass-e_all_documents_990.txt
baughman-e_all_documents_131.txt
baughman-e_all_documents_255.txt
baughman-e_all_documents_268.txt
baughman-e_all_documents