In [1]:
# from transformers import AutoFeatureExtractor, AutoModelForImageClassification
# import torch
from PIL import Image
from pathlib import Path
from tqdm import tqdm
import pickle
import time

import sys
sys.path.append('../src')

from utils_tiramisu import *

# This is the same TIRAMISU_PATH as shown in start_here.ipynb.
# FIXME: set this to the local root of the Tiramisu store before running.
# The assignment previously had no right-hand side, which is a SyntaxError
# and prevented the whole cell (including every import) from executing.
TIRAMISU_PATH = ""


# import datasets
import pandas as pd
import json
# from pytesseract import Output
# import pytesseract
# import cv2
import re
from math import ceil, floor
from unidecode import unidecode

import numpy as np
from collections import Counter, defaultdict
import itertools
# Device string handed to `.to(device)` by the inference helpers in this notebook.
device = 'cuda:0'

In [2]:
def runner(path):
    """Classify a single page image.

    Opens the image at ``path`` with PIL, converts it to RGB, runs it through
    the module-level ``feature_extractor`` and ``model`` and returns
    ``(path, label)`` where ``label`` comes from ``model.config.id2label``.

    NOTE(review): ``feature_extractor`` and ``model`` are not created anywhere
    in the visible notebook (the transformers/torch imports are commented
    out), so they must already exist in the kernel.
    """
    rgb_image = Image.open(path).convert('RGB')
    model_inputs = feature_extractor(images=rgb_image, return_tensors="pt").to(device)

    # model predicts one of the 16 RVL-CDIP classes
    best_class_idx = model(**model_inputs).logits.argmax(-1).item()

    return (path, model.config.id2label[best_class_idx])

We first need to detect all email documents and get recipients/senders from them. All emails are from scanned PDFs.

In [3]:
# One row per PDF page: the page's File node, its parent Document, the page
# number and the path of the PNG rendering of that page.
pdf_documents = return_from_neo4j("""
match (n:Folder) - [:CONTAINS] -> (e:File) - [:SPLIT_INTO] -> (c:File) - [:PART_OF] -> (d:Document) 
where e.fileExtension = 'pdf' 
WITH c, d 
match (c) - [:CONVERT_TO] -> (f:File) 
where f.fileExtension = 'png' 
return c.nodeID as nodeID, d.nodeID as documentID, c.page as page, f.tiramisuPath as tiramisuPath 
""")


def _to_local_path(tiramisu_path):
    """Re-root a '/tiramisu/...' path from Neo4j under the local TIRAMISU_PATH."""
    return (Path(TIRAMISU_PATH) / Path(tiramisu_path).relative_to('/tiramisu/')).as_posix()


# Rewrite the paths once on the full frame and only then take the first page
# of every document. The previous version assigned into the result of
# `groupby(...).head(1)` (a slice of `pdf_documents`), which triggers pandas'
# SettingWithCopyWarning; the resulting values are the same.
# NOTE: this cell is not idempotent -- re-running it on already rewritten
# paths makes `relative_to('/tiramisu/')` raise.
pdf_documents['tiramisuPath'] = pdf_documents['tiramisuPath'].apply(_to_local_path)
first_pages = pdf_documents.sort_values(['documentID', 'page']).groupby('documentID').head(1)

We start by finding the category of each document. Upon request, we can provide `../cache/categories_of_first_pages_071023.parquet`, which is the output of a [Document Image Transformer](https://arxiv.org/abs/2203.02378) model trained on the 16 classes of the [RVL-CDIP](https://adamharley.com/rvl-cdip/) dataset.

In [4]:

# Cached classifier output: one predicted class per first-page image path.
page_categories = pd.read_parquet('../cache/categories_of_first_pages_071023.parquet')

# Attach the predicted class to the PDF pages whose image path appears in the
# cache, ordered by document and page. Despite its name, `emails` still
# contains pages of every class at this point.
emails = (
    pdf_documents
    .merge(page_categories[['path', 'class']], left_on='tiramisuPath', right_on='path')
    .sort_values(['documentID', 'page'])
)

# First retained page of each document; its class is used as the document class.
documents = emails.groupby('documentID').head(1)

# Uncomment the following lines to export the email pages for the labeling interface
# emails.loc[emails.documentID.isin(documents.loc[documents['class'] == 'email'].documentID)]\
# [['documentID', 'nodeID', 'page', 'path']].\
# to_parquet("../models/email_parsing/emails_to_inference_240414.parquet")

We sent only the emails to a LayoutLMv3 model fine-tuned to detect email fields. As in the page stream segmentation and entity recognition tasks, we used Label Studio for labeling. Upon request, we can provide the detected email fields.

In [5]:
# Documents whose first page was classified as an email, and the image path
# of every page that belongs to one of them.
email_document_ids = documents.loc[documents['class'] == 'email'].documentID
email_page_paths = emails.loc[emails.documentID.isin(email_document_ids), ['nodeID', 'path']]

# Field-detection output (one row per page after de-duplication on nodeID),
# left-joined with the page image path.
all_emails_inferenced = (
    pd.read_parquet("../models/email_parsing/inference_result_on_240414.parquet")
    .drop_duplicates(subset="nodeID")
    .merge(email_page_paths, on='nodeID', how='left')
)

In [6]:
# Concatenate the per-page label lists of every document into one flat list ...
only_labels = (
    all_emails_inferenced
    .groupby(["documentID"])
    .agg({"labels": lambda page_labels: list(itertools.chain.from_iterable(page_labels))})
    .reset_index()
)
# ... and count how many of those labels are something other than "O".
only_labels['length'] = only_labels['labels'].apply(
    lambda doc_labels: sum(1 for tag in doc_labels if tag != "O")
)

In [7]:
def unnormalize_box(bbox, width, height):
    """Rescale a box given in thousandths of the page size.

    ``bbox`` is indexed as ``[x0, y0, x1, y1]``; x values are multiplied by
    ``width / 1000`` and y values by ``height / 1000``. Returns a new list.
    """
    x0, y0, x1, y1 = bbox[0], bbox[1], bbox[2], bbox[3]
    return [
        width * (x0 / 1000),
        height * (y0 / 1000),
        width * (x1 / 1000),
        height * (y1 / 1000),
    ]
    
def find_text(box, df, threshold):
    """Return the rows of ``df`` whose (left, top) corner lies inside ``box``.

    ``box`` is ``[x0, y0, x1, y1]``; it is grown by ``threshold`` on every
    side before the (inclusive) containment test on ``df.left`` / ``df.top``.
    """
    x_min, y_min = box[0] - threshold, box[1] - threshold
    x_max, y_max = box[2] + threshold, box[3] + threshold
    inside_x = (df.left >= x_min) & (df.left <= x_max)
    inside_y = (df.top >= y_min) & (df.top <= y_max)
    return df.loc[inside_x & inside_y]

def translate(outputs, encoding, img):
	"""Turn token-level model outputs into label names and pixel-space boxes.

	Args:
		outputs: model output exposing ``.logits``.
		encoding: processor encoding exposing ``.bbox``.
		img: image array; only ``img.shape[0]`` (height) and ``img.shape[1]``
			(width) are read.

	Returns:
		``(true_predictions, true_boxes)``: label names (via ``id2label``) and
		boxes rescaled with ``unnormalize_box``, for tokens not flagged as
		sub-word continuations.

	NOTE(review): ``offset_mapping`` and ``id2label`` are read from the
	enclosing (global) scope, not from the arguments; neither is defined in
	the visible part of the notebook -- confirm where they come from.
	"""
	# Arg-max class id per token and the matching token boxes, as plain lists.
	predictions = outputs.logits.cpu().argmax(-1).squeeze().tolist()
	token_boxes = encoding.bbox.cpu().squeeze().tolist()

	height, width = img.shape[0], img.shape[1]

	# presumably a single 512-token window squeezes to a flat list; wrap it so
	# the loop below always iterates over windows -- TODO confirm
	if len(predictions) == 512:
		predictions = [predictions]
		token_boxes = [token_boxes]

	true_predictions =[]
	true_boxes = []
	STRIDE_COUNT = 128
	for i, (pred, box, mapped) in enumerate(zip(predictions, token_boxes, offset_mapping)):
		# A token whose offset start is non-zero is treated as a sub-word piece.
		is_subword = np.array(mapped.squeeze().tolist())[:,0] != 0
		if i == 0:
			true_predictions += [id2label[pred_] for idx, pred_ in enumerate(pred) if (not is_subword[idx])]
			true_boxes += [unnormalize_box(box_, width, height) for idx, box_ in enumerate(box) if not is_subword[idx]]
		else:
			# For later windows, skip the kept tokens that fall within the first
			# STRIDE_COUNT + 1 positions (presumably the overlap with the
			# previous window -- TODO confirm).
			true_predictions += [id2label[pred_] for idx, pred_ in enumerate(pred) if (not is_subword[idx])][STRIDE_COUNT + 1 - sum(is_subword[:STRIDE_COUNT + 1]):]
			true_boxes += [unnormalize_box(box_, width, height) for idx, box_ in enumerate(box) if not is_subword[idx]][STRIDE_COUNT + 1 - sum(is_subword[:STRIDE_COUNT + 1]):]

	return true_predictions, true_boxes

def prepare_input(path):
    """Read a page image and encode it with the module-level ``processor``.

    Returns ``(encoding, img, offset_mapping, overflow_to_sample_mapping)``.
    ``offset_mapping`` and ``overflow_to_sample_mapping`` are popped out of
    the encoding; every remaining entry is moved to ``device``.

    NOTE(review): relies on ``cv2``, ``torch`` and ``processor`` being
    available in the kernel; the cv2/torch imports are commented out in the
    first cell.
    """
    img = cv2.imread(path, cv2.IMREAD_COLOR)

    encoding = processor(
        img,
        return_tensors="pt",
        truncation=True,
        stride = 128,
        padding = 'max_length',
        max_length = 512,
        return_overflowing_tokens=True,
        return_offsets_mapping = True,
    )

    # Stack the per-window pixel tensors into a single tensor.
    window_pixels = [encoding['pixel_values'][idx] for idx in range(len(encoding['pixel_values']))]
    encoding['pixel_values'] = torch.stack(window_pixels)

    offset_mapping = encoding.pop('offset_mapping')
    overflow_to_sample_mapping = encoding.pop("overflow_to_sample_mapping")

    for key, value in encoding.items():
        encoding[key] = value.to(device)

    return encoding, img, offset_mapping, overflow_to_sample_mapping
def _ocr_box_text(img, box):
    """OCR the crop of ``img`` covered by ``box`` ([x0, y0, x1, y1], pixels), clamped to the image."""
    height, width = img.shape[0], img.shape[1]
    top, bottom = max(floor(box[1]), 0), min(ceil(box[3]), height)
    left, right = max(floor(box[0]), 0), min(ceil(box[2]), width)
    return pytesseract.image_to_string(img[top:bottom, left:right]).strip()


def get_entities_from_email(row):
    """Recover the text of every labelled box on one email page.

    Args:
        row: mapping with ``'path'`` (page image), ``'boxes'`` and ``'labels'``
            (parallel sequences produced by the field detector) and ``'nodeID'``.

    Returns:
        ``(entities, labels, tokens, nodeID, boxes, box_labels)`` where

        * ``entities`` is a list of word lists, one per detected field run;
        * ``labels`` holds the field name of each entry in ``entities``
          (``CC`` is folded into ``TO``);
        * ``tokens`` is the page in reading order: ``(field, words)`` for each
          field run and ``("TEXT", word)`` for every ``"O"`` box;
        * the last three items are passed through from ``row`` unchanged.

    Changes from the previous version (the returned values are the same):
    the call to ``prepare_input`` was dropped because none of its results
    were used (the image was re-read right after), which saved a full
    processor/OCR pass per page; the two identical OCR branches were merged;
    and the never-read ``bboxes`` list was removed.

    NOTE(review): ``cv2``, ``pytesseract`` and ``Output`` must be importable
    in the kernel -- their imports are commented out in the first cell.
    """
    img = cv2.imread(row['path'], cv2.IMREAD_COLOR)

    # Word-level OCR of the whole page; rows without text are dropped.
    ocr_words = pytesseract.image_to_data(row['path'], output_type=Output.DATAFRAME)
    ocr_words = ocr_words.loc[ocr_words.text.notna()]

    entities = []
    labels = []
    tokens = []
    current_entity = None  # field of the most recent labelled box (never reset by "O" boxes)

    for box, label in zip(row['boxes'], row['labels']):
        text_df = find_text(box, ocr_words, 0)

        if label == "O":
            # Unlabelled box: keep the OCR word only when the match is unambiguous.
            word = text_df.iloc[0]['text'] if text_df.shape[0] == 1 else ""
            tokens.append(("TEXT", word))
            continue

        # Labels look like "B-TO" / "I-FROM"; CC recipients are treated as TO.
        entity = label.split('-')[1].replace("CC", "TO").strip()

        if box[0] < box[2] and box[1] < box[3]:
            if text_df.shape[0] == 1:
                text = text_df.iloc[0]['text']
            else:
                # Zero or several OCR words fall in the box: OCR the crop itself.
                text = _ocr_box_text(img, box)
        else:
            # Degenerate (empty) box.
            text = ""

        if current_entity == entity:
            # Same field continues; skip a word identical to the previous one.
            if text != entities[-1][-1]:
                entities[-1].append(text)
        else:
            # First labelled box, or the field changed: start a new run.
            entities.append([text])
            current_entity = entity
            labels.append(current_entity)
            # The token shares the list object, so later appends show up here too.
            tokens.append((current_entity, entities[-1]))

    return (entities, labels, tokens, row['nodeID'], row['boxes'], row['labels'])
        

In [None]:
from transformers import AutoProcessor

# we'll use the Auto API here - it will load LayoutLMv3Processor behind the scenes,
# based on the checkpoint we provide from the hub.
# `processor` is the module-level object that `prepare_input` calls; it is
# created with apply_ocr=True.
processor = AutoProcessor.from_pretrained("microsoft/layoutlmv3-base", apply_ocr=True)

In [8]:
# Keep only the pages of documents in which at least one non-"O" label was detected.
documents_with_fields = only_labels.loc[only_labels.length != 0].documentID
all_emails_inferenced = all_emails_inferenced.loc[
    all_emails_inferenced.documentID.isin(documents_with_fields)
]

In [None]:
# (rows, columns) remaining after dropping documents without any detected field.
all_emails_inferenced.shape

We now get the different email fields such as `To`, `From`, `CC`, etc.

In [None]:
# Run the OCR-based field extraction on every retained page (slow).
all_entities_collected = []
email_pages = tqdm(all_emails_inferenced.iterrows(), total = len(all_emails_inferenced))
for _, email_page in email_pages:
    all_entities_collected.append(get_entities_from_email(email_page))

In [11]:
# Load the cached entity table instead of recomputing it.
# NOTE(review): presumably this is the tabulated form of `all_entities_collected`
# built above -- the conversion is not shown in this notebook; confirm.
# SECURITY: pickle.load can execute arbitrary code; only load files you created
# yourself or obtained from a trusted source.
with open("../models/email_parsing/all_entities_df_240415.pkl", "rb") as f:
    all_entities_df = pickle.load(f)

In [12]:
# Treat CC recipients as TO, both in the per-field labels and in the token stream.
all_entities_df['label'] = all_entities_df['label'].apply(
    lambda field_labels: ["TO" if tag == "CC" else tag for tag in field_labels]
)
all_entities_df['tokens'] = all_entities_df['tokens'].apply(
    lambda page_tokens: [("TO", tok[1]) if tok[0] == "CC" else tok for tok in page_tokens]
)

# Split every (label, words) token into parallel columns; bare words are
# wrapped in a one-element list so `tokens_word` always holds lists.
all_entities_df['tokens_label'] = all_entities_df['tokens'].apply(
    lambda page_tokens: [tok[0] for tok in page_tokens]
)
all_entities_df['tokens_word'] = all_entities_df['tokens'].apply(
    lambda page_tokens: [tok[1] if isinstance(tok[1], list) else [tok[1]] for tok in page_tokens]
)

In [13]:

# Attach document id and page number to every page of extracted entities,
# keeping a single row per page node.
merged = (
    all_entities_df
    .merge(emails[["nodeID", "documentID", "page"]], left_on = "nodeID", right_on = "nodeID", how = 'inner')
    .drop_duplicates('nodeID')
)

In [15]:
merged = merged.sort_values(['documentID', 'page'])

# Parallel per-page accumulators; they are turned into a DataFrame in the next cell.
documentIDs, nodeIDs = [], []
tokens, name, boxes = [], [], []
label, labels = [], []

last = None      # documentID of the previously visited page
lasttoken = 0    # vertical offset accumulated over earlier pages of the same document
for _, page in merged.iterrows():
    if page['documentID'] != last:
        # New document: page offsets start from zero again.
        lasttoken = 0

    documentIDs.append(page['documentID'])
    nodeIDs.append(page['nodeID'])
    tokens.append([(tok[0], tok[1]) for tok in page['tokens']])
    name.append(list(page['name']))
    # Shift the y coordinates so that boxes of later pages sit below earlier pages.
    boxes.append([(b[0], b[1] + lasttoken, b[2], b[3] + lasttoken) for b in page['boxes']])
    label.append(page['label'])
    labels.append(page['labels'])

    last = page['documentID']
    # Advance the offset by the largest bottom coordinate on this page.
    lasttoken += max(b[-1] for b in page['boxes'])

In [16]:
# One row per page, built from the accumulators of the previous cell ...
page_level = pd.DataFrame({
    "documentID": documentIDs,
    "name": name,
    "tokens": tokens,
    "boxes": boxes,
    'label': label,
    "labels": labels,
    "nodeID": nodeIDs
})

# ... collapsed to one row per document. 'sum' concatenates the per-page
# lists, nodeID becomes the list of page ids, and `labels` is flattened.
per_doc = (
    page_level
    .groupby(["documentID"])
    .agg({
        'label': 'sum',
        "nodeID": lambda node_ids: list(node_ids),
        "name": "sum",
        "tokens": "sum",
        "boxes": "sum",
        "labels" : lambda page_labels: list(itertools.chain.from_iterable(page_labels)),
    })
    .reset_index()
)

In [17]:
def qualifier(x):
    """Return True when ``x`` contains at least two of "TO", "FROM" and "HEADER"."""
    present = ("TO" in x, "FROM" in x, "HEADER" in x)
    return sum(present) >= 2
def combine(row):
    """Join the words of every TO / FROM / HEADER field of ``row``.

    Returns a list of ``(joined_text, label)`` tuples in field order, where
    ``joined_text`` is ``row['name'][i]`` joined with single spaces.
    """
    address_labels = ("TO", "FROM", "HEADER")
    return [
        (" ".join(row['name'][position]), field_label)
        for position, field_label in enumerate(row['label'])
        if field_label in address_labels
    ]

In [None]:
# Keep only documents with at least two of TO / FROM / HEADER. The explicit
# .copy() makes the filtered frame independent, so adding a column below does
# not raise pandas' SettingWithCopyWarning.
per_doc = per_doc[per_doc['label'].apply(qualifier)].copy()
# (joined text, label) for every TO / FROM / HEADER field of the document.
per_doc["names_normalized"] = per_doc.apply(combine, axis = 1)
per_doc.shape

In [19]:
def find_consecutive_O(tokens):
    """Find maximal runs of tokens whose label is neither "FROM" nor "TO".

    Args:
        tokens: sequence of ``(label, ...)`` items; only ``token[0]`` is read.

    Returns:
        List of ``(start_index, end_index, length)`` tuples with an inclusive
        ``end_index``, sorted by ``length`` in descending order (ties keep
        their original order).

    Fix: a run that reaches the end of the sequence now ends at
    ``len(tokens) - 1``. The previous code used ``len(labels) - 1``, where
    ``labels`` was an unrelated global (or undefined), so the last run got a
    wrong end index. The duplicated FROM/TO comparisons were also collapsed.
    """
    runs = []
    run_start = None  # index where the current run began, or None outside a run

    for position, token in enumerate(tokens):
        if token[0] not in ("FROM", "TO"):
            if run_start is None:
                run_start = position
        elif run_start is not None:
            # A FROM/TO token closes the run that ended on the previous token.
            runs.append((run_start, position - 1, position - run_start))
            run_start = None

    # Close a run that extends to the end of the sequence.
    if run_start is not None:
        runs.append((run_start, len(tokens) - 1, len(tokens) - run_start))

    return sorted(runs, key = lambda run: run[-1], reverse = True)

def get_conversation(tokens):
    """Build candidate message bodies from the runs found by ``find_consecutive_O``.

    Returns a list of ``(text, start_index)`` tuples, in the order the runs
    are returned (longest first). ``text`` joins the last element of each
    token in ``tokens[start + 1:end]`` with single spaces.

    A run whose slice contains a non-string item makes ``" ".join`` raise
    TypeError; such runs are skipped.

    NOTE(review): the slice leaves out both the first and the last token of
    the run (``end`` is an inclusive index) -- confirm this is intended.
    """
    conversations = []
    for run in find_consecutive_O(tokens):
        start, end = run[0], run[1]
        try:
            body = " ".join([tok[-1] for tok in tokens[start + 1:end]])
        except TypeError:
            continue
        conversations.append((body, start))
    return conversations

In [20]:
# Candidate message bodies (text, start index) for every document.
per_doc['conversations'] = per_doc['tokens'].apply(get_conversation)

We use a simple name-splitting model, since each bounding box captured by the layout model contains all of the names in a field.

In [None]:
import spacy
# Ask spaCy to use a GPU when one is available.
spacy.prefer_gpu()

# Name-splitting pipeline loaded from a local model directory; its `doc.ents`
# are used below as the individual names inside each address field.
nlp = spacy.load("../models/entity_splitting/nhgri_version/model-best/")


In [None]:
# For every document, split each TO / FROM / HEADER field into individual
# names: one list of entity texts per field, one list of fields per document.
results = []
for _, doc_row in tqdm(per_doc.iterrows(), total = per_doc.shape[0]):
    results.append([
        [ent.text for ent in nlp(field[0]).ents]
        for field in doc_row['names_normalized']
    ])

In [26]:
# `results` was built by iterating over per_doc row by row, so it lines up with per_doc.
per_doc['split_names'] = results

In [27]:
from string import punctuation

# One or more whitespace characters.
whitespace_regex = re.compile(r"\s+", re.MULTILINE)
# Email-like tokens: local part, "@", domain containing at least one dot.
# NOTE(review): inside the first character class, `+-—` is parsed as a RANGE
# from "+" to the em dash, which covers far more characters than a literal
# hyphen would. It looks accidental, but downstream caches may depend on the
# current matches -- confirm before changing.
email_regex = re.compile(r'[%\w.+-—:]+@[\w-]+\.[\w.-]+')
# parentheses_regex = re.compile(r"\([^)]*\)")
# A [...] or (...) group, brackets included.
parentheses_regex = re.compile(r'\[(?:[^\]]*)\]|\((?:[^)]*)\)')
def separate_institutions(text):
    """Strip bracketed/parenthesised groups from ``text``.

    Returns ``(remaining_text, matches)`` where ``matches`` lists every group
    found by ``parentheses_regex`` in the original text, brackets included.
    """
    matches = parentheses_regex.findall(text)
    for bracketed in matches:
        text = text.replace(bracketed, "")
    return text, matches

def remove_emails(text):
    """Strip everything matched by ``email_regex`` from ``text``.

    Returns ``(remaining_text, matches)`` with the matches in order of appearance.
    """
    matches = email_regex.findall(text)
    for address in matches:
        text = text.replace(address, "")
    return text, matches

def remove_whitespace(text):
    """Delete every run of whitespace from ``text`` (runs are removed, not collapsed)."""
    stripped = whitespace_regex.sub("", text)
    return stripped

def remove_keywords(text):
    """Remove a fixed list of boilerplate fragments from ``text``.

    The fragments are removed one after another, in the order listed, using
    plain substring replacement (so e.g. "phd" is also removed inside words).
    """
    fragments = (
        "[mailto:]", "<mailto:>", "[mailto: ]",
        "[c]", "[e]",
        "phd", "ph.d",
        "@ internet", "@internet",
    )
    for fragment in fragments:
        text = text.replace(fragment, "")
    return text
def reorder_names(text):
    """Rewrite "last, first" as "first last".

    The swap happens only when ``text`` contains exactly one comma; both
    halves are stripped and joined with a single space. Text that both starts
    and ends with a comma, or that has zero or several commas, is returned
    unchanged.

    Fix: an empty string is now returned as-is instead of raising IndexError
    on ``text[0]``.
    """
    if not text:
        return text
    if text[0] == "," and text[-1] == ",":
        return text

    parts = text.split(',')
    if len(parts) != 2:
        return text
    return parts[-1].strip() + " " + parts[0].strip()
    

def remove_trailing_char(text):
    """Trim ``text`` to the span between its first and last alphanumeric character.

    When ``text`` contains no alphanumeric character at all, the whole string
    is kept and only leading/trailing ASCII punctuation is stripped.
    """
    alnum_positions = [idx for idx, char in enumerate(text) if char.isalnum()]
    if alnum_positions:
        start_index, end_index = alnum_positions[0], alnum_positions[-1] + 1
    else:
        start_index, end_index = 0, len(text)

    # Remove trailing and leading punctuation
    return text[start_index:end_index].strip(punctuation)


def normalize_names(row, return_extracted=False):
    """Clean every split name of one document.

    For each name in ``row['split_names']`` (a list of groups of names) the
    text is lower-cased, emails are removed, boilerplate keywords are
    removed, bracketed groups are removed, surrounding non-alphanumerics are
    trimmed, and "last, first" is reordered. If fewer than three characters
    remain, the first extracted email (when there is one) stands in for the
    name.

    Args:
        row: mapping with a ``'split_names'`` entry.
        return_extracted: when True, also return the bracketed groups and the
            email addresses that were removed along the way. Defaults to
            False, which keeps the original return value.

    Returns:
        The cleaned names, grouped like the input; or the tuple
        ``(cleaned_names, orgs, emails)`` when ``return_extracted`` is True.

    Changes: the result list no longer shadows the function's own name, and
    the collected orgs/emails -- previously built and then discarded -- are
    available through the opt-in flag.
    """
    normalized_groups = []
    orgs = []
    extracted_emails = []

    for group in row['split_names']:
        cleaned_group = []
        for name in group:
            text, found_emails = remove_emails(name.strip().lower())
            extracted_emails.extend(found_emails)

            text = remove_keywords(text.strip()).strip()

            text, found_orgs = separate_institutions(text)
            orgs.extend(found_orgs)

            text = remove_trailing_char(text.strip()).strip()

            if len(text) < 3:
                # Too short to be a name: fall back to the email address, if any.
                cleaned_group.append(found_emails[0] if len(found_emails) > 0 else text)
            else:
                cleaned_group.append(reorder_names(text).strip())
        normalized_groups.append(cleaned_group)

    if return_extracted:
        return normalized_groups, orgs, extracted_emails
    return normalized_groups

# Titles and suffixes (lower-case) that normalize_name removes from a name.
prefixes_suffixes = ["mr.", "mr", "mrs", "mrs.", "dr", "dr.", "phd", "ph.d", "ms", "ms.", "mister", "miss", "doctor", "jr.", "jr", "frs"]

# Alternation of the escaped entries above, wrapped in word boundaries.
prefix_suffix_pattern = r'\b(?:' + "|".join(map(re.escape, prefixes_suffixes)) + r')\b'
def normalize_name(name):
    """Return a simplified, ASCII, lower-case form of ``name``.

    Names longer than 40 characters are returned untouched. Otherwise titles
    and suffixes are removed, leading/trailing non-alphanumerics are trimmed,
    and every character other than ASCII letters, digits, whitespace, "," and
    "@" is dropped. If the result has more than one comma, or one side of a
    single comma is exactly one character long, the ORIGINAL ``name`` is
    returned instead.
    """
    if len(name) > 40:
        return name

    # remove all prefix and suffix
    cleaned_name = re.sub(prefix_suffix_pattern, '', unidecode(name.lower()))
    # remove leading and trailing non-alphanumeric characters
    cleaned_name = re.sub(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9]+$', '', cleaned_name)
    # keep only letters, digits, whitespace, commas and "@"
    cleaned_name = re.sub(r"[^a-zA-Z\s,@0-9]" ,'', cleaned_name)

    if "," not in cleaned_name:
        return cleaned_name

    # with a comma there should be exactly one, as in "lastname, firstname"
    parts = cleaned_name.split(",")
    if len(parts) > 2:
        return name
    if len(parts[0]) == 1 or len(parts[-1]) == 1:
        return name
    return cleaned_name

In [28]:
# NOTE(review): all three columns receive the output of normalize_names, i.e.
# the cleaned names. 'orgs' and 'emails' therefore do NOT contain the
# extracted institutions / addresses -- confirm whether that is intended.
per_doc['orgs'] = per_doc.apply(normalize_names, axis = 1)
per_doc['emails'] = per_doc.apply(normalize_names, axis = 1)
per_doc['normalized_names'] = per_doc.apply(normalize_names, axis = 1)


We now encode all names into identifiers.

This cached file maps disambiguated names to identifiers.

In [44]:
# Mapping from normalised name to identifier, read from the JSON cache.
name_to_ID = json.loads(Path("../cache/name_to_ID_email_network_240502.json").read_text())

In [69]:
def map_name_to_ID(x):
    """Replace names by their identifier from the module-level ``name_to_ID``.

    ``x`` is a list of groups of names; the grouping is preserved. A name
    without an entry in ``name_to_ID`` is kept unchanged.
    """
    return [
        [name_to_ID.get(name, name) for name in names]
        for names in x
    ]

In [70]:
# Same grouping as normalized_names, with known names replaced by their
# identifier and unknown names kept as they are.
per_doc['IDs'] = per_doc['normalized_names'].apply(map_name_to_ID)

In [71]:
def sort_indices_for_paired_pattern(lst):
    """Interleave the positions of 'FROM' and 'TO' entries of ``lst``.

    The k-th 'FROM' position is paired with the k-th 'TO' position and the
    result is ``[from_0, to_0, from_1, to_1, ...]``. Surplus entries of either
    kind, and every other value, are ignored.
    """
    from_positions = [pos for pos, val in enumerate(lst) if val == 'FROM']
    to_positions = [pos for pos, val in enumerate(lst) if val == 'TO']
    return [pos for pair in zip(from_positions, to_positions) for pos in pair]


def get_pairs_of_emails(row):
    """Build (sender, recipient) pairs for every conversation of one document.

    Reads ``row['label']`` (field labels) and ``row['IDs']`` (one list of
    identifiers/names per TO / FROM / HEADER field). Only the HEADER, TO and
    FROM labels are considered; ``row['IDs']`` is walked in parallel with
    that filtered label list.

    Three cases are handled:

    1. No FROM but a HEADER: the first HEADER field supplies the senders and
       each TO field yields one conversation.
    2. No TO but a HEADER: the first HEADER field supplies the recipients and
       each FROM field yields one conversation.
    3. Otherwise: the k-th FROM field is matched with the k-th TO field via
       ``sort_indices_for_paired_pattern``.

    Returns:
        ``(conversations, number_of_conversations)`` where ``conversations``
        is a list with one list of ``(sender, recipient)`` tuples per
        conversation.

    NOTE(review): several ``else`` branches iterate over ``to_entity`` /
    ``from_entity`` on the last index while that variable may still be None,
    which would raise TypeError; in case 3 the loop index ``i`` runs over
    ``sorted_indices`` but is compared with ``len(filtered_label)``. These
    branches look unreachable or unintended -- confirm before relying on them.
    """
    number_of_conversations = 0
    conversations = []
    # Labels of the address-like fields only, in document order.
    filtered_label = [i for i in row['label'] if (i == 'HEADER') or (i == 'TO') or (i == 'FROM')]
    pairs = []
    if 'FROM' not in filtered_label and 'HEADER' in filtered_label:
        # Case 1: the HEADER field stands in for the missing FROM field.
        from_entity = row['IDs'][filtered_label.index('HEADER')]
        to_entity = None
        for i, (label, field) in enumerate(zip(filtered_label, row['IDs'])):
            if label == 'TO':
                if from_entity is not None:
                    # Every sender is paired with every recipient of this TO field.
                    for ent in field:
                        for ent_ in from_entity:
                            pairs.append((ent_, ent, ))
                    conversations.append(pairs)
                    pairs = []
                    number_of_conversations += 1
                else:
                    if i == len(filtered_label) -1:
                        for ent in field:
                            for ent_ in to_entity:
                                pairs.append((ent, ent_))
                        conversations.append(pairs)
                        pairs = []
                        number_of_conversations += 1
                    else:
                        # from_entity = None
                        to_entity = field
            if label == 'FROM':
                if to_entity is not None:
                    # print(to_entity)
                    for ent in field:
                        for ent_ in to_entity:
                            pairs.append((ent, ent_))
                    conversations.append(pairs)
                    pairs = []
                    number_of_conversations += 1
                else:
                    if i == len(filtered_label) -1:
                        for ent in from_entity:
                            for ent_ in field:
                                pairs.append((ent, ent_))
                        conversations.append(pairs)
                        pairs = []
                        number_of_conversations += 1
                    else:
                        from_entity = field
    elif 'TO' not in row['label'] and 'HEADER' in filtered_label:
        # Case 2: the HEADER field stands in for the missing TO field.
        to_entity = row['IDs'][filtered_label.index('HEADER')]
        from_entity = None
        for i, (label, field) in enumerate(zip(filtered_label, row['IDs'])):
            if label == 'TO':
                if from_entity is not None:
                    for ent in field:
                        for ent_ in from_entity:
                            pairs.append((ent_, ent))
                    number_of_conversations += 1
                    conversations.append(pairs)
                    pairs = []
                else:
                    if i == len(filtered_label) -1:
                        for ent in field:
                            for ent_ in to_entity:
                                pairs.append((ent, ent_))
                        number_of_conversations += 1
                        conversations.append(pairs)
                        pairs = []
                    else:
                        # from_entity = None
                        to_entity = field
            if label == 'FROM':
                if to_entity is not None:
                    # print(to_entity)
                    # Every sender of this FROM field is paired with every recipient.
                    for ent in field:
                        for ent_ in to_entity:
                            pairs.append((ent, ent_))
                    number_of_conversations += 1
                    conversations.append(pairs)
                    pairs = []
                else:
                    if i == len(filtered_label) -1:
                        for ent in from_entity:
                            for ent_ in field:
                                pairs.append((ent, ent_))
                        number_of_conversations += 1
                        conversations.append(pairs)
                        pairs = []
                    else:
                        from_entity = field

    else:
        # Case 3: walk the FROM/TO fields as interleaved (FROM, TO) pairs.
        to_entity = None
        from_entity = None

        sorted_indices = sort_indices_for_paired_pattern(filtered_label)
        for i, index in enumerate(sorted_indices):
        # for i, (label, field) in enumerate(zip(filtered_label, row['name_ID'])):
            if filtered_label[index] == 'TO':
                if from_entity is not None:
                    for ent in row['IDs'][index]:
                        for ent_ in from_entity:
                            pairs.append((ent_, ent))
                    number_of_conversations += 1
                    conversations.append(pairs)
                    pairs = []
                else:
                    if i == len(filtered_label) -1:
                        for ent in row['IDs'][index]:
                            for ent_ in to_entity:
                                pairs.append((ent, ent_))
                        number_of_conversations += 1
                        conversations.append(pairs)
                        pairs = []
                    else:
                        # from_entity = None
                        to_entity = row['IDs'][index]
            if filtered_label[index] == 'FROM':
                if to_entity is not None:
                    # print(to_entity)
                    for ent in row['IDs'][index]:
                        for ent_ in to_entity:
                            pairs.append((ent, ent_))
                    number_of_conversations += 1
                    conversations.append(pairs)
                    pairs = []
                else:
                    if i == len(filtered_label) -1:
                        for ent in from_entity:
                            for ent_ in row['IDs'][index]:
                                pairs.append((ent, ent_))
                        number_of_conversations += 1
                        conversations.append(pairs)
                        pairs = []
                    else:
                        # Remember the senders until the matching TO field arrives.
                        from_entity = row['IDs'][index]
                    # to_entity = None
        
    return conversations, number_of_conversations

In [72]:
# get_pairs_of_emails returns (conversations, count) per document; compute it
# once and split the two parts into their own columns.
pair_results = per_doc.apply(get_pairs_of_emails, axis =1)
per_doc['pairs'] = pair_results.apply(lambda result: result[0])
per_doc['num_conversations'] = pair_results.apply(lambda result: result[-1])

In [73]:
import datefinder
from datetime import datetime

We manually fix frequently missed dates.

In [None]:
# Exact OCR'd DATE strings that the date parser is known to miss or misread,
# mapped to the intended calendar date.
# Fix: "9/15/94 ..." previously mapped to 1997; the string itself says 1994.
MANUAL_DATE_FIXES = {
    "4/15/94 4:11PM (2058 bytes: 41 In)": datetime(1994, 4, 15),
    "Wednesday, Apri! 28, 2004 8:26 PM": datetime(2004, 4, 28),
    "4 Wednesday, December 20, 2000 6:40 PM": datetime(2000, 12, 20),
    "Thursday, July 14,2005 3:49PM .": datetime(2005, 7, 14),
    "9/15/94 9:05PM (1973 bytes:": datetime(1994, 9, 15),
    "Monday, Apri! 28, 2003 11:20 AM": datetime(2003, 4, 28),
    "12/01/98 TUE 13:35 FAX 301 SIP™837": datetime(1998, 12, 1),
    "10/2/92 1:15PM (1019 bytes: 11 ln)": datetime(1992, 10, 2),
    "4/11/95 1:51PM (5693 bytes: 86": datetime(1995, 4, 11),
    "9/19/94 10:28AM (684 bytes: 9 ln)": datetime(1994, 9, 19),
    "8/5/94 1:29PM (1619 bytes: 31 1n)": datetime(1994, 8, 5),
    "7/29/94 3:01PM (2168 bytes: 36 ln)": datetime(1994, 7, 29),
    "Friday, November 08; 2002 5:39 PM": datetime(2002, 11, 8),
    "11/17/93 10:44AM (520 bytes: 10 ln)": datetime(1993, 11, 17),
    "Monday, Apri! 25, 2005 6:42 PM": datetime(2005, 4, 25),
    " December 08, 1999 6)08 AM": datetime(1999, 12, 8),
}

all_dates = []
counter = 0      # documents for which date extraction failed altogether
counter_1 = 0    # out-of-range dates that equal an already accepted date
# Raw string: same pattern as before, without the invalid "\d" escape in a
# normal string literal. (Not used in this cell; kept for later cells.)
date_reg_exp = re.compile(r'(\d{1,2}/\d{1,2}/\d{2,4})')
for i, row in per_doc.iterrows():
    temp = []
    try:
        # Text of the first DATE field; raises ValueError when there is none.
        datestring = " ".join(list(row['name'])[list(row['label']).index('DATE')])
        dates = datefinder.find_dates(datestring)

        # A manual correction goes first so that it is the date picked later.
        if datestring in MANUAL_DATE_FIXES:
            temp.append(MANUAL_DATE_FIXES[datestring])

        for date in dates:
            if date.year > 1980 and date.year < 2016:
                # Keep the calendar day only.
                temp.append(datetime(date.year, date.month, date.day))
            else:
                if date in temp:
                    # Previously this print called .tolist() on a plain list,
                    # which raised and discarded the dates already found.
                    print(date, datestring)
                    counter_1 += 1

        # De-duplicate while preserving order. list(set(...)) gave no
        # guaranteed order, so the `[0]` taken in the next cell could differ
        # between runs when several dates were found.
        all_dates.append(list(dict.fromkeys(temp)))
    except Exception:
        # Best effort: a document without a usable DATE field gets no date.
        # (`except Exception` instead of a bare except, so Ctrl-C still works.)
        counter += 1
#         traceback.print_exc()
        all_dates.append([None])

In [75]:
# First extracted date of every document (None when nothing was found);
# documents without a date are dropped.
per_doc['date'] = [found[0] if len(found) > 0 else None for found in all_dates]
per_doc = per_doc[per_doc['date'].notna()]

In [76]:
# Keep the `num_conversations` longest candidate bodies (the list is ordered
# longest first) and put them back into document order by start index.
per_doc['conversations'] = per_doc.apply(
    lambda doc: sorted(
        doc['conversations'][:doc['num_conversations']],
        key = lambda candidate: candidate[-1],
    ),
    axis = 1,
)

In [168]:
# One record per conversation: (body text, list of pairs, date, documentID).
all_email_pairs = []

for _, doc in per_doc.iterrows():
    doc_date, doc_id = doc['date'], doc['documentID']
    for conversation, pair in zip(doc['conversations'], doc['pairs']):
        if pair is None:
            print(doc)
        all_email_pairs.append((conversation[0], pair, doc_date, doc_id))

In [169]:
# One row per (sender, recipient) pair, duplicates removed.
all_pairs = (
    pd.DataFrame(all_email_pairs, columns = ["conversation", "pair", "date", "documentID"])
    .explode('pair')
    .drop_duplicates()
)

In [170]:
# Drop rows without a pair (conversations whose pair list was empty).
all_pairs = all_pairs[all_pairs['pair'].notna()]

In [171]:
# Same filter again, followed by another de-duplication pass.
all_pairs = all_pairs[all_pairs['pair'].notna()].drop_duplicates()

In [172]:
# Every sender/recipient mention, flattened out of the (sender, recipient) pairs.
all_names_from_email = list(itertools.chain.from_iterable(all_pairs['pair'].to_list()))
# Mentions that were resolved to an identifier (numeric entries).
all_identifiers = [mention for mention in all_names_from_email if str(mention).isnumeric()]
counter_pre_disambiguated = Counter(all_names_from_email)
counter_all_ids = Counter(all_identifiers)


def _coverage_table(counts):
    """Rank the keys of ``counts`` by frequency and add the uncovered share.

    Columns: ``count``, ``ID`` and ``cumsum`` = 1 - (cumulative count / total).
    A synthetic first row ``[0, 'origin', 1]`` with index -1 is added.
    """
    table = pd.DataFrame(
        [(count, key) for key, count in counts.most_common()],
        columns = ["count", "ID"],
    )
    table['cumsum'] = table['count'].cumsum() / sum(counts.values())
    table['cumsum'] = 1 - table['cumsum']
    table.loc[-1] = [0, 'origin', 1]
    return table.sort_index()


counter_names_pre_disambiguated = _coverage_table(counter_pre_disambiguated)
counter_names = _coverage_table(counter_all_ids)

In [83]:
import pandas as pd
import re
from unidecode import unidecode
import json
from tqdm import tqdm

from collections import defaultdict
from collections import Counter

import numpy as np

import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt

# Figure styling: white seaborn theme with bottom/left ticks, 7 pt text
# everywhere, Helvetica, Type 42 fonts in PDF output, and short thin ticks.
sns.set_style('white', rc={'xtick.bottom': True, 'ytick.left': True})

matplotlib.rcParams.update({
    key: 7
    for key in ("axes.labelsize", "xtick.labelsize", "ytick.labelsize",
                "legend.fontsize", "font.size")
})

matplotlib.rc('font', family='Helvetica')
matplotlib.rc('pdf', fonttype=42)
matplotlib.rc('text', usetex='false')
matplotlib.rcParams['axes.unicode_minus'] = False

# Major and minor ticks on both axes share the same length and width.
matplotlib.rcParams.update({
    "{}.{}.{}".format(axis, level, prop): value
    for axis in ("xtick", "ytick")
    for level in ("major", "minor")
    for prop, value in (("size", 2), ("width", 0.5))
})

In [None]:
# Figure: percentage of mentions still uncovered as individuals are added in
# order of decreasing mention count, before and after disambiguation.
# (`cumsum` in both tables is 1 - cumulative share, so the curves fall from 100.)
TOP_N_INDIVIDUALS = 500  # rank cut-off that is marked and shaded

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(3, 2), dpi=300)
ax.spines['right'].set_linewidth(0)
ax.spines['top'].set_linewidth(0)

counter_names['percentage'] = counter_names['cumsum'] * 100
counter_names_pre_disambiguated['percentage'] = counter_names_pre_disambiguated['cumsum'] * 100

sns.lineplot(data=counter_names.reset_index(), x='index',
             y='percentage', linewidth=1, color='k', ax=ax, label="Disambiguated emails")

sns.lineplot(data=counter_names_pre_disambiguated.reset_index(), x='index',
             y='percentage', linewidth=1, color='k', ax=ax, alpha=0.4, label="Pre-disambiguated emails")

# ax.axhline(xmin = 0, xmax = 5000, y = 100 * ((counter_all_names.total() - counter_all_ids.total()) / counter_all_names.total()), \
#            c = 'k', linewidth = 0.5, linestyle = '--')

# ax.axhline(xmin = 0, xmax = 500, y = 100 * ((counter_all_names.total() - counter_all_ids_valid.total()) / counter_all_names.total()), \
#            c = 'darkred', linewidth = 0.5, linestyle = '--')

# Dashed vertical marker at the cut-off rank.
ax.axvline(ymin=0.05, ymax=0.97, x=TOP_N_INDIVIDUALS, c='darkred', linewidth=0.5, linestyle='--')
# ax.axhline(xmin=0, xmax=1, y = 100 * (counter_names.iloc[500]['cumsum']), c = 'darkred', linewidth = 0.5, linestyle = '--')

ax.tick_params(axis='x', which='minor', bottom=True)
# ax.xaxis.set_minor_locator(AutoMinorLocator(4))

# ax.text(550, 12, "{:.1f}%".format(100 * ( 1- counter_names.iloc[500]['cumsum'])), c = 'darkred')

# Shade the area under the disambiguated curve up to the cut-off. The x/y
# lengths follow the rows actually available, so a table with fewer than
# TOP_N_INDIVIDUALS rows no longer triggers a shape mismatch in fill_between.
# NOTE(review): the shaded x starts at 0 while the line's x ('index') starts
# at -1 for the 'origin' row, so the two are offset by one position - confirm
# whether that is intended.
shaded = counter_names.iloc[:TOP_N_INDIVIDUALS]['cumsum'] * 100
ax.fill_between(np.arange(len(shaded)), np.zeros(len(shaded)), shaded, color='grey')
ax.text(TOP_N_INDIVIDUALS + 50, 55,
        "{} most mentioned email sender/recipients".format(TOP_N_INDIVIDUALS), c='darkred')

# ax.text(400, -2, "{:.1f}%".format(100 * (1 - ((counter_all_names.total() - counter_all_ids.total()) / \
#                                               counter_all_names.total()))), c = 'k')

# ax.text(1500, -2, "disambiguation", c = 'k')

ax.legend(frameon=False)
ax.set_ylabel("Cumulative % of mentions\nof individuals")
ax.set_xlabel("Individual rank")
plt.show()
# plt.savefig("../figures/SI_email_disambiguation_pareto.pdf", dpi = 300, bbox_inches = "tight")

In [None]:
# Mentions of the 500 most frequent identifiers as a fraction of all mentions
# (identifier or not).
sum(count for _identifier, count in counter_all_ids.most_common(500)) / sum(counter_pre_disambiguated.values())

We also cleaned the frequently mentioned domains and institutions. The resulting workbook (`../models/email_clean_manual/cleaned_domains.xlsx`) is available upon request.

In [90]:
# Manually cleaned workbook: the `domains_dict` sheet links a domain to an
# entity name, the `entities_dict` sheet describes each entity.
cleaned_domains_xlsx = "../models/email_clean_manual/cleaned_domains.xlsx"

domains_dict = pd.read_excel(cleaned_domains_xlsx, sheet_name="domains_dict")
domains_dict = domains_dict[['domain', 'name']]

entities_dict = pd.read_excel(cleaned_domains_xlsx, sheet_name="entities_dict")
entities_dict = entities_dict[["name", "fullname", "country", "city", "category", "NIH"]]

# Strict boolean flag: True only where the cell equals 1, False otherwise
# (including missing cells).
entities_dict['NIH'] = entities_dict['NIH'] == 1

# Attach the entity description to every domain that has a matching name.
domains_total = domains_dict.merge(entities_dict, on='name')

# The domain list is taken before the fillna below, so missing values in it
# are not replaced by "".
domain_list_total = domains_total['domain'].tolist()

domains_total = domains_total.fillna("")

In [91]:
# Affiliation table for the most mentioned individuals; a later cell reads its
# `ID` and `groundtruth` columns.
affiliations_500 = pd.read_csv(
    filepath_or_buffer="../models/email_clean_manual/top_500_affiliations_completed.csv"
)

In [None]:
# Display the 500 most frequent identifiers with their mention counts.
counter_all_ids.most_common(n=500)

In [175]:
# Collect one record per conversation of every document:
# (first element of the conversation, its pair entry, document date, document ID).
all_email_pairs = []

for _row_idx, doc_row in per_doc.iterrows():
    conversations_and_pairs = zip(doc_row['conversations'], doc_row['pairs'])
    for thread, pair_entry in conversations_and_pairs:
        # Print the whole document row when a conversation has no pair entry;
        # the record is still appended.
        if pair_entry is None:
            print(doc_row)
        record = (thread[0], pair_entry, doc_row['date'], doc_row['documentID'])
        all_email_pairs.append(record)

In [176]:
# Turn the collected records into a frame, explode the list-valued `pair`
# entries into one row each, and drop exact duplicate rows.
# Note: this rebinds `all_email_pairs` from a list to a DataFrame.
email_pair_columns = ["conversation", "pair", "date", "documentID"]
all_email_pairs = (
    pd.DataFrame(data=all_email_pairs, columns=email_pair_columns)
    .explode(column='pair')
    .drop_duplicates()
)

In [177]:
# Keep rows that have a pair and split it into string-typed endpoint columns:
# `From` is the first element of the pair, `To` the last.
# `.assign` builds the columns on the filtered result directly, instead of
# writing into a `.loc` slice (the pattern behind SettingWithCopyWarning).
# all_email_pairs['pair'] =all_email_pairs['pair'].apply(lambda x:  correct(x))
all_email_pairs = (
    all_email_pairs
    .loc[all_email_pairs['pair'].notna()]
    .assign(
        To=lambda pairs: pairs['pair'].str[-1].astype(str),
        From=lambda pairs: pairs['pair'].str[0].astype(str),
    )
)


In [178]:
# Remove pairs whose sender and recipient are the same string.
is_self_addressed = all_email_pairs['To'] == all_email_pairs['From']
all_email_pairs = all_email_pairs.loc[~is_self_addressed]

In [179]:
# 1) Remove pairs whose sender equals the recipient (same filter as the
#    previous cell).
all_email_pairs = all_email_pairs.loc[all_email_pairs['To'].ne(all_email_pairs['From'])]

# 2) Keep pairs where both endpoints are purely numeric strings.
both_numeric = all_email_pairs['To'].str.isnumeric() & all_email_pairs['From'].str.isnumeric()
all_email_pairs = all_email_pairs.loc[both_numeric]

# 3) Keep pairs where both endpoints are among the most frequent identifiers.
# NOTE(review): the cut-off is 507 here while other cells use 500; the reason
# is not visible in this notebook section - confirm.
top_ids = [str(identifier) for identifier, _count in counter_all_ids.most_common(507)]
both_in_top = all_email_pairs['To'].isin(top_ids) & all_email_pairs['From'].isin(top_ids)
all_email_pairs = all_email_pairs.loc[both_in_top]

In [None]:
# Number of distinct integer IDs that appear as sender or recipient.
recipient_ids = all_email_pairs['To'].astype(int).to_list()
sender_ids = all_email_pairs['From'].astype(int).to_list()
len(set(recipient_ids).union(sender_ids))

In [145]:
# ID -> `groundtruth` affiliation lookup; IDs absent from the table map to "other".
map_affiliation_from_ID = affiliations_500.set_index("ID")['groundtruth'].to_dict()

all_email_pairs['To_org'] = all_email_pairs['To'].apply(
    lambda identifier: map_affiliation_from_ID.get(int(identifier), "other")
)
all_email_pairs['From_org'] = all_email_pairs['From'].apply(
    lambda identifier: map_affiliation_from_ID.get(int(identifier), "other")
)

In [146]:
# Entity name -> category lookup built from the entities sheet.
categories_dict = dict(zip(entities_dict["name"], entities_dict['category']))

In [147]:
def category(x):
    """Return the category label for an organisation name.

    "nih" always maps to "nih" without consulting `categories_dict`; any other
    name is looked up in `categories_dict`, and names missing from it map to
    "other".
    """
    if x == "nih":
        return "nih"
    return categories_dict.get(x, "other")
# Category of the recipient's and the sender's organisation.
all_email_pairs['To_category'] = all_email_pairs["To_org"].apply(category)
all_email_pairs['From_category'] = all_email_pairs["From_org"].apply(category)

In [None]:
# Number of distinct individuals appearing as sender or recipient.
len(set(all_email_pairs['To']) | set(all_email_pairs['From']))

In [None]:
# Number of distinct individuals whose own side of a pair falls in one of the
# three listed categories.
core_categories = ["nih", "academic", "private-nonprofit"]
core_recipients = all_email_pairs.loc[all_email_pairs['To_category'].isin(core_categories), 'To']
core_senders = all_email_pairs.loc[all_email_pairs['From_category'].isin(core_categories), 'From']
len(set(core_recipients) | set(core_senders))

In [150]:
# Remove the raw `pair` column in place; `To`/`From` already carry its endpoints.
# NOTE(review): presumably dropped so the frame can be written to parquet - confirm.
all_email_pairs.drop(columns=['pair'], inplace=True)

In [151]:
# Cache the filtered, annotated pair table.
all_email_pairs.to_parquet(path="../cache/all_email_pairs_240520.parquet")

In [None]:
# Export only the columns needed for the network model input.
# NOTE(review): the file name says "top500" while the filter above keeps the
# 507 most frequent identifiers - confirm which is intended.
network_columns = ['conversation', 'date', 'documentID', 'To', 'From']
all_email_pairs[network_columns].to_parquet(
    path='../models/sbm/emails_network_pairs_top500_240416.parquet'
)