In [3]:
%matplotlib inline
import os
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

In [4]:
import spacy
nlp = spacy.load("en_core_web_md")

In [5]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

In [4]:
RE_EMAIL = re.compile(
    r"(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*)@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])")
RE_URL = re.compile(r"((https?|ftp|smtp):\/\/)?(www.)?[a-z0-9]+\.[a-z]+(\/[a-zA-Z0-9#]+\/?)*", re.IGNORECASE)
RE_RELAX_PHONE = re.compile(r'(\(? ?[\d]{2,3} ?\)?.{,3}?){2,}')
# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line matches the regular expression "^[\s]*---*[\s]*$".
RE_SEPARATOR = re.compile(r'^[\s]*---*[\s]*')
RE_REPLY = re.compile(r'^\>')
RE_REPLY_PUNCT = re.compile('^[^A-Za-z0-9]{1,2}\>')
RE_TAB = re.compile(r'\t')
RE_WROTE = re.compile(r'\bwr[oi]tes?:\b', re.IGNORECASE)

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line has a sequence of 10 or more special characters.
RE_SPECIAL_CHARS = re.compile(('^[\s]*([\*]|#|[\+]|[\^]|-|[\~]|[\&]|[\$]|_|[\!]|'
                               '[\/]|[\%]|[\:]|[\=]){10,}[\s]*$'))

keywords = [r"thank",r"regards",r"wishes", r"^sent from", r"\bBR\b", r"sincerely", r"corporation", r"\bcorp\b", r"\bLLC\b", r"group", r"fax", r"e?mail", r"phone", r"www"]
RE_SIGNATURE_WORDS = re.compile("|".join([f"({w})" for w in keywords]), re.IGNORECASE)

# Taken from:
# http://www.cs.cmu.edu/~vitor/papers/sigFilePaper_finalversion.pdf
# Line contains a pattern like Vitor R. Carvalho or William W. Cohen.
RE_NAME = re.compile(r'[A-Z][a-z]+\s\s?[A-Z][\.]?\s\s?[A-Z][a-z]+')

RE_HEADER_WORDS = re.compile(r'from|\bto\b|subject|[Cc]c|[Bb]cc:|[Ff]orwarded', re.IGNORECASE)


def punct_percent(line):
    if len(line) == 0:
        return 0
    punct = [c for c in line if c in string.punctuation]
    return len(punct) / len(line)


def alphanum_percent(line):
    if len(line) == 0:
        return 0
    punct = [c for c in line if c.isalnum()]
    return len(punct) / len(line)

def num_percent(line):
    line = line.strip()
    if len(line) == 0:
        return 0
    punct = [c for c in line if c.isdigit()]
    return len(punct) / len(line)

In [1]:
def email_to_name(text):
    import email
    import os
    import re

    if "@" not in text:
        return text
    
    name, mail = email.utils.parseaddr(text)
    if name:
        return name
    
    name, domain = text.split("@")
    name_parts = [x.strip() for x in re.split(r"@|\.|\W|\_", name) if x.strip()]
    return " ".join(name_parts)

In [5]:
from nltk.tokenize.casual import TweetTokenizer
import nltk
tw = TweetTokenizer()

In [6]:
def percent_title(line):
    clean = "".join([x for x in line])
    tokens = tw.tokenize(clean)
    if not tokens:
        return 0
    titles = [t for t in tokens if t.istitle() and len(t) > 2]
    return len(titles)/len(tokens) 

In [7]:
def is_person(spacy_doc):
    if not spacy_doc.text.strip():
        return False
    labels = []
    sizes = []
    for ent in spacy_doc.ents:
        labels.append(ent.label_)
        sizes.append(ent.end_char-ent.start_char)
    ratio = sum(sizes)/len(spacy_doc.text)
    return 'PERSON' in labels and ratio > 0.7 and not spacy_doc.text.startswith("<")

In [8]:
def is_org(spacy_doc):
    if not spacy_doc.text.strip():
        return False
    labels = []
    sizes = []
    for ent in spacy_doc.ents:
        labels.append(ent.label_)
        sizes.append(ent.end_char-ent.start_char)
    ratio = sum(sizes)/len(spacy_doc.text)
    return 'ORG' in labels and ratio > 0.7

In [9]:
def named_entity(row):
    for x in ['blank', 'email', 'url', 'phone', 'person', 'org']:
        if row[x] == 1:
            return x
    return "no_entity"

In [10]:
feature_dict = {
        'blank': lambda doc: 1 if len(doc[0].strip()) == 0 else 0,
        'email': lambda doc: 1 if RE_EMAIL.search(doc[0]) else 0,
        'url': lambda doc: 1 if RE_URL.search(doc[0]) else 0,
        'phone': lambda doc: 1 if RE_RELAX_PHONE.search(doc[0]) else 0,
        'sigdelimiter': lambda doc: 1 if RE_SEPARATOR.match(doc[0]) else 0,
        'special': lambda doc: 1 if RE_SPECIAL_CHARS.search(doc[0]) else 0,
        'words': lambda doc: 1 if RE_SIGNATURE_WORDS.search(doc[0]) else 0,
        'header': lambda doc: 1 if RE_HEADER_WORDS.search(doc[0]) else 0,
        'name': lambda doc: 1 if RE_NAME.search(doc[0]) else 0,
        'endquote': lambda doc: 1 if doc[0].endswith("\"") else 0,
        'tabs1': lambda doc: 1 if len(RE_TAB.findall(doc[0])) == 1 else 0,
        'tabs2': lambda doc: 1 if len(RE_TAB.findall(doc[0])) == 2 else 0,
        'tabs3': lambda doc: 1 if len(RE_TAB.findall(doc[0])) >= 3 else 0,
        'punct20': lambda doc: 1 if punct_percent(doc[0]) >= 0.2 else 0,
        'punct50': lambda doc: 1 if punct_percent(doc[0]) >= 0.5 else 0,
        'punct90': lambda doc: 1 if punct_percent(doc[0]) >= 0.9 else 0,
        'reply': lambda doc: 1 if RE_REPLY.match(doc[0]) else 0,
        'startpunct': lambda doc: 1 if doc[0].startswith(tuple(p for p in string.punctuation)) else 0,
        'firstchar': lambda doc: doc[0][0] if len(doc[0]) > 0 else "",
        'replypunct': lambda doc: 1 if RE_REPLY_PUNCT.match(doc[0]) else 0,
        'wrote': lambda doc: 1 if RE_WROTE.search(doc[0]) else 0,
        'alphanum90': lambda doc: 1 if alphanum_percent(doc[0]) < 0.9 else 0, # i.e. more than 10% special symbols
        'alphanum50': lambda doc: 1 if alphanum_percent(doc[0]) < 0.5 else 0,
        'alphanum10': lambda doc: 1 if alphanum_percent(doc[0]) < 0.1 else 0,
        'num90': lambda doc: 1 if num_percent(doc[0]) >= 0.9 else 0,  # i.e. numbers are 90% of the line
        'num50': lambda doc: 1 if num_percent(doc[0]) >= 0.5 else 0,
        'num10': lambda doc: 1 if num_percent(doc[0]) >= 0.1 else 0,
        'title': lambda doc: 1 if doc[0].strip().istitle() else 0,
        'many_titles': lambda doc: 1 if percent_title(doc[0]) >= 0.5 else 0,
        'person': lambda doc: 1 if is_person(doc[1]) else 0,
        'org': lambda doc: 1 if is_org(doc[1]) else 0
}

In [11]:
def has_sender(line, sender):
    low_line = line.lower()
    name, sep, domain = sender.partition("@")
    full_name = " ".join(name.split("."))
    return sender.lower() in low_line or full_name.lower() in low_line or name in low_line

In [12]:
ENTITY_PATTERN = "^#sig#"
def line_to_entity(line, filename, i):
    # sender = get_sender(filename)
    m = re.match(ENTITY_PATTERN, line)
    if m:
        spacy_doc = nlp(line[5:])
        e = {
            "line": line[5:],
            "filename": filename,
            "entity": "signature",
            "len": len(line[5:]),
            "lineNo": i+1
        }
    else:
        spacy_doc = nlp(line)
        e = {"line": line, "filename": filename, "entity": "no_entity", "len": len(line), "lineNo": i+1}    
    doc = (e["line"], spacy_doc)
    for feature, fn in feature_dict.items():            
        e[feature] = fn(doc)    
    return e

In [13]:
def remove_blanks(lines):    
    return [line for line in lines if len(line.strip()) > 0]

In [14]:
def strip_blank_lines(lines, leading=True, trailing=True):
    leading_blank = 0
    trailing_blank = len(lines)
    lines_it = iter(lines)
    next_line = next(lines_it, None)
    while next_line is not None and len(next_line.strip()) == 0:
        leading_blank += 1
        next_line = next(lines_it, None)

    if trailing:
        it_reversed = iter(reversed(lines))
        next_line = next(it_reversed, None)
        while next_line is not None and len(next_line.strip()) == 0:
            trailing_blank -= 1
            next_line = next(it_reversed, None)
    return lines[leading_blank:trailing_blank]

In [15]:
def get_signature_length(ents):
    """ Signature length in number of lines """
    return sum(1 for e in ents if e["entity"] == "signature")

In [16]:
# check if it has multiple signatures
def detect_signature_positions(lines):
    """Returns list of lists, each with lineNo of the signatures"""
    sigs = [i for i, line in enumerate(lines) if re.match(ENTITY_PATTERN, line)]
    sig_groups = []
    cur_group = []
    for n in sorted(sigs):
        last_n = cur_group[-1] if cur_group else None
        if not last_n:
            cur_group.append(n)
            continue
        if n-last_n == 1:
            cur_group.append(n)
        else:
            sig_groups.append(cur_group)
            cur_group = []
    if cur_group:
        sig_groups.append(cur_group)
    return sig_groups

In [17]:
def get_sender(filename):
    sender = senders[filename]
    return sender if sender != "None" else "None@enron.com"

In [2]:
senders = {}
for filename in filenames:
    with open(os.path.join(files_path, filename+"_sender"), mode="r", encoding="utf-8") as f:
        senders[filename] = f.read()

NameError: name 'filenames' is not defined

In [18]:
files_path = r"F:\Documents\stopansko\masters\thesis\sig-detect\data\clean\enron_random_clean"
#files_path = r"F:\Documents\stopansko\masters\thesis\sig-detect\data\clean\enron_random"
#files_path = r"F:\Documents\stopansko\masters\thesis\sig-detect\data\clean\sigPlusReply"
filenames = [f for f in os.listdir(files_path) if os.path.isfile(os.path.join(files_path, f)) and "_sender" not in f]

In [19]:
files = list()
entities = list()
for filename in filenames:
    with open(os.path.join(files_path, filename), mode="r", encoding="utf-8") as f:        
        lines = f.read().splitlines()
        lines = strip_blank_lines(lines)
        blocks = detect_signature_positions(lines)
        non_blanks = remove_blanks(lines)
        if len(non_blanks) > 0:
            lengths = [len(line) for line in lines]
            file_entities = [line_to_entity(line, filename, i) for i, line in enumerate(lines)]
            entities.extend(file_entities)
            files.append({
                "filename": filename,
                "nlines": len(lines),
                "len_avg": np.ceil(np.mean(lengths)),
                "len_min": min(lengths),
                "len_max": max(lengths),
                "nBlanks": len(lines) - len(non_blanks),
                "nNonBlanks": len(non_blanks),
                "nSig": get_signature_length(file_entities), # lengh of the signature in lines
                "nSigBlocks": len(blocks)
            })
        
df_files = pd.DataFrame(files) 
entities = pd.DataFrame(entities)

In [20]:
master = entities.merge(df_files, on="filename")

In [21]:
master["posFromEnd"] = master.nlines - master.lineNo
master["last"] = master.posFromEnd.apply(lambda x: 1 if x == 0 else 0)
master["prevlast"] = master.posFromEnd.apply(lambda x: 1 if x == 1 else 0)
master["last5"] = master.posFromEnd.apply(lambda x: 1 if x < 5 else 0)
master["last11"] = master.posFromEnd.apply(lambda x: 1 if x < 11 else 0)
master["posRatio"] = master.lineNo / master.nlines # 1 = last
master["posRatioFromEnd"] = master.posFromEnd / master.nlines
master["posRatioNB"] = master.lineNo / master.nNonBlanks # 1 = last
master["lenRatio"] = master.len / master.len_avg
master["lenRatioMax"] = master.len / master.len_max

master["less_avg_len"] = master.apply(lambda row: 1 if row.len <= row.len_avg else 0, axis=1)
master["more_avg_len"] = master.apply(lambda row: 1 if row.len > row.len_avg else 0, axis=1)
master["less_avg_len75"] = master.apply(lambda row: 1 if row.len <= (row.len_avg*.75) else 0, axis=1)
master["less_avg_len50"] = master.apply(lambda row: 1 if row.len <= (row.len_avg*.5) else 0, axis=1)

master["named_entity"] = master.apply(lambda row: named_entity(row), axis=1)

In [22]:
master["pred_file"] = master.filename.shift(1)
master["next_file"] = master.filename.shift(-1)
master["pred_named_entity"] = master.named_entity.shift(1)
master["next_named_entity"] = master.named_entity.shift(-1)
master["prev_same_entity"] = master.apply(lambda row: 1 if row.named_entity == row.pred_named_entity else 0, axis=1)
master["next_same_entity"] = master.apply(lambda row: 1 if row.named_entity == row.next_named_entity else 0, axis=1)

In [144]:
signatures = master[master.entity=="signature"]
negative = master[master.entity!="signature"]

In [125]:
from collections import Counter
entity_order = ['no_entity', 'person', 'org', 'blank', 'email', 'url', 'phone']

In [126]:
ln = len(negative)
c = Counter(negative.named_entity)
for ent in entity_order:
    print(f"{ent} = {(c[ent]/ln)*100:.2f}% of negatives")

no_entity = 47.61% of negatives
person = 2.75% of negatives
org = 1.69% of negatives
blank = 31.99% of negatives
email = 0.95% of negatives
url = 6.39% of negatives
phone = 8.63% of negatives


In [127]:
ln = len(signatures)
c = Counter(signatures.named_entity)
for ent in entity_order:
    print(f"{ent} = {(c[ent]/ln)*100:.2f}% of signaturess")

no_entity = 44.65% of signaturess
person = 13.27% of signaturess
org = 9.42% of signaturess
blank = 3.99% of signaturess
email = 5.14% of signaturess
url = 3.71% of signaturess
phone = 19.83% of signaturess


In [25]:
master.to_pickle("enron_random_clean1.pkl")