In [None]:
from codecs import xmlcharrefreplace_errors
!pip install Faker
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install openai
!pip install datasets

In [None]:
from faker import Faker 
import itertools
import spacy
import random
import pandas as pd
import math
import openai 
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict, Features, Sequence, Value, ClassLabel, load_from_disk
import os
from dotenv import load_dotenv

In [None]:
# Load variables from a local .env file so that os.getenv() in the configuration cell can read them.
load_dotenv()

In [None]:
# When True, the validation/test signatures are regenerated and re-tagged;
# when False, an existing DatasetDict is loaded from disk and only its 'train' split is replaced.
update_val_test = False
# Name of the output directory (under ner_research/) used when the final DatasetDict is saved.
TRAIN_DATASET = 'filtered_dataset_train_modif_2'
# NOTE(review): the environment variable is spelled 'OPEN_API_KEY' (no "AI") — confirm it matches the name in .env.
OPENAI_API_KEY = os.getenv('OPEN_API_KEY')
openai.api_key = OPENAI_API_KEY

## Load and Prepare the Data for the Templates

In [None]:
# Read the Parquet file with the data for the templates and convert it to a pandas DataFrame.
# NOTE(review): `spark` is not created anywhere in this notebook — presumably the runtime injects a SparkSession; confirm.
train_df = spark.read.parquet('ner_research/sample').toPandas()

In [None]:
# Drop every leading '+' from the phone numbers; the templates decide on their own
# whether a '+' is shown, which gives more variety in how phones are presented.
train_df['phone'] = train_df['phone'].map(lambda number: number.lstrip('+'))

In [None]:
# Create a new 'webpage' column by removing spaces from the organization name and lower-casing it,
# so that webpages can be added to the templates.
train_df['webpage'] = train_df['organization'].apply(lambda x: x.replace(" ", "").lower())

In [None]:
# Use the lower-cased part of the e-mail address before the first '@' as the social-network handle.
train_df['social_networks'] = train_df['email'].apply(lambda address: address.partition("@")[0].lower())

In [None]:
# Initialize a Faker object to generate fake data
fake = Faker()

In [None]:
def remove_spaces_conditionally(x, index, condition):
    """
    Strip all space characters from a string when its index satisfies a predicate.

    Args:
    x (str): The string to process.
    index (int): The index associated with the string; passed to `condition`.
    condition (callable): Predicate called with `index`; a truthy result triggers the removal.

    Returns:
    str: `x` without any ' ' characters if `condition(index)` is truthy, otherwise `x` unchanged.
    """
    return x.replace(" ", "") if condition(index) else x

def change_location_conditionally(x, index, condition):
    """
    Swap a location for a Faker-generated address when its index satisfies a predicate.

    Args:
    x (str): The original location string.
    index (int): The index associated with the location; passed to `condition`.
    condition (callable): Predicate called with `index`; a truthy result triggers the swap.

    Returns:
    str: The result of `fake.address()` if `condition(index)` is truthy, otherwise `x` unchanged.
    """
    # Guard clause: leave the location alone unless the predicate selects this index.
    if not condition(index):
        return x
    return fake.address()
    
def condition(index):
    """
    Tell whether an index is even.

    Args:
    index (int): The index to test.

    Returns:
    bool: True when `index` leaves no remainder on division by 2, False otherwise.
    """
    _, remainder = divmod(index, 2)
    return remainder == 0

In [None]:
# Remove the spaces from the phone number of every row whose index label satisfies `condition`
# (even labels), so the templates see phones both with and without spaces.
# `row.name` is the row's index label, which is what `condition` is evaluated on.
train_df['phone'] = train_df.apply(lambda row: remove_spaces_conditionally(row['phone'], row.name, condition), axis=1)

In [None]:
# Replace the location of every row whose index label satisfies `condition` (even labels) with a
# Faker-generated address, so the templates see a wider variety of location formats.
# `row.name` is the row's index label, which is what `condition` is evaluated on.
train_df['location'] = train_df.apply(lambda row: change_location_conditionally(row['location'], row.name, condition), axis=1)

In [None]:
# Show only the first rows; the full frame may contain personal data and bloats the saved notebook.
train_df.head()

In [None]:
# Number of rows in the validation/test DataFrame
NUM_SAMPLES = 1000

# Build each column in turn by calling its Faker provider NUM_SAMPLES times
data = {
    column: [make_value() for _ in range(NUM_SAMPLES)]
    for column, make_value in (
        ('name', fake.name),
        ('job_title', fake.job),
        ('organization', fake.company),
        ('location', fake.address),
    )
}

# Wrap the generated columns in a pandas DataFrame
val_test_df = pd.DataFrame(data)

## Generate the templates for the training Set

In [None]:
# Ask the model for NUM_CLOSINGS free-text messages that typically follow an email signature.
NUM_CLOSINGS = 15
response = openai.ChatCompletion.create(
    model='gpt-4',
    messages=[{"role": "user", "content": f'Generate {NUM_CLOSINGS} examples of messages that typically appear after an email signature. Include elements such as disclaimers, reminders, postscripts (P.S.), inspirational messages, important dates, and other types of notices.'}],
    max_tokens=4000,
    n=1,
    stop=None,
    temperature=1
)

# Get the response content; the completions are returned under the 'choices' key.
closing_text = response['choices'][0]['message']['content']

In [None]:
# Split the numbered list on "<blank line><number>." boundaries, trim each piece, drop the empty
# ones and remove the "<number>. " prefix that survives at the start of the first item.
closings = [
    re.sub(r'^\d+\.\s+', '', stripped)
    for stripped in (piece.strip() for piece in re.split(r"\n\n\d+\.", closing_text))
    if stripped
]

In [None]:
# Number of spam-style signatures requested from the model. The number actually returned is whatever
# the model writes, so downstream code should not assume it equals NUM_SPAMS.
NUM_SPAMS = 40
response = openai.ChatCompletion.create(
    model='gpt-4',
    messages=[{"role": "user", "content": f'Generate a list of {NUM_SPAMS} creative and diverse spam email signatures from various fields and interests. Incorporating elements commonly found in spam emails, such as promotions, URLs, dollar amounts, dates, questions, subscriptions, unsubscriptions, access denials, daily reviews, Linkedin job suggestions, and more, using fictitious information. Start some of the messages with the organization name.'}],
    # NOTE(review): 8000 completion tokens plus the prompt is close to the context limit of the base gpt-4 model — confirm.
    max_tokens=8000,
    n=1,
    stop=None,
    temperature=1.1
)

# Text of the first (and only, n=1) completion: a numbered list of spam signatures.
spam_text = response['choices'][0]['message']['content']

In [None]:
# Split the numbered list on "<newline><number>." boundaries, trim each piece, drop the empty ones
# and remove the "<number>. " prefix that survives at the start of the first item.
spams = [
    re.sub(r'^\d+\.\s+', '', stripped)
    for stripped in (piece.strip() for piece in re.split(r"\n\d+\.", spam_text))
    if stripped
]

In [None]:
def print_sometimes(attribute, threshold, new_line=True):
    """
    Return `attribute` at random, or an empty string.

    A number is drawn with `random.random()`; the attribute is kept only when the draw is strictly
    greater than `threshold`, so a higher threshold means the attribute appears less often.

    Args:
    attribute: The value to emit (normally a template fragment).
    threshold (float): Value the random draw must exceed for the attribute to be emitted.
    new_line (bool): When True the attribute is formatted into a string followed by a newline;
        when False the attribute is returned as-is.

    Returns:
    `f'{attribute}\\n'`, `attribute`, or `""` when the draw does not exceed the threshold.
    """
    if not random.random() > threshold:
        return ""
    return f'{attribute}\n' if new_line else attribute

# Building blocks of the signature templates. Text in single braces such as "{name}" or "{phone}" is a
# placeholder filled later with str.format(**row); everything produced by print_sometimes/fake here is
# evaluated once, when this cell runs, and is then baked into the fragment.

# Lines that may open a signature.
openings = ["Best regards,",
            "Kind regards,",
            "Sincerely,",
            "Best,",
            "Warm regards,",
            "Personal contact information"]

# Called once per template so that every template can get a different Faker prefix.
name_prefix = lambda: f"{fake.prefix()} "

# Single name fragment: an occasional 'Name:' label followed (almost always) by the {name} placeholder.
name = [f"{print_sometimes('Name:', 0.9, False)}"+print_sometimes('{name}', 0.05, False)]

# Either a fixed pronoun suffix or a callable that yields a Faker suffix when invoked.
name_suffix = [" (she/her)", " (he/him)", lambda: f" {fake.suffix()}"]

# NOTE(review): 'Department' is appended straight after {job_title} with no separator — confirm this is intended.
title_section = f"{print_sometimes('Job Title: ', 0.9, False)}"+"{job_title}"+f"{print_sometimes('Department', 0.9, False)}"

organization_section = "{organization}"

# The country is passed as a plain string. Wrapping it in braces inside the f-string expression would
# build a set literal, whose "{'...'}" repr then breaks str.format() on the finished template.
location_section = f"{print_sometimes('Address: ', 0.9, False)}"+"{location}"+f"{print_sometimes(', ', 0.5, False)}"+f"{print_sometimes(fake.country(), 0.5, False)}"

email_section = ["Email: {email}",
                 "{email}"]

# One fragment per label, each with an optional ':' and an optional '+' before the number,
# plus a final label-less variant.
phone_section = [
    f"{label}{print_sometimes(':', 0.5, False)} {print_sometimes('+', 0.5, False)}"+"{phone}"
    for label in ("Phone", "Telephone", "Contact", "Direct", "Reception", "Office", "Call", "Mobile")
]+[f"{print_sometimes('+', 0.5, False)}"+"{phone}"]

webpage_section = ["{webpage}/"+f"{fake.dga()}"+"fkedpem0m-2f-2fbglbdaiofac0jfz",
                   "www.{webpage}/"+f"{fake.dga()}"+"byb21",
                   "http://{webpage}/"+f"{fake.dga()}",
                   "https://{webpage}/"+f"{fake.dga()}"]

# Fixed closings are put in front of the model-generated `closings` parsed in an earlier cell.
# Re-running this cell without re-running that one prepends the fixed entries again.
closings = ['sent from outlook for ios',
            f"work days: {fake.day_of_week()}-{fake.day_of_week()} {fake.time('%H:%M')}am-{fake.time('%H:%M')}pm",
            'all rights reserved',
            f"registered in {fake.country()}"]+closings

social_networks = [f"{print_sometimes('LinkedIn: ', 0.5, False)}"+"linkedin.com/in/{social_networks}",
                   f"{print_sometimes('Twitter: ', 0.5, False)}"+"twitter.com/{social_networks}",
                   f"{print_sometimes('Skype live: ', 0.5, False)}"+"{social_networks}"]

# Sections that are permuted and joined with each separator to form the attribute part of a template.
attribute_sections = [title_section,
                      organization_section,
                      location_section]

separators = ["\n", ", ", " - ", " | ", " "]

# Generate combinations of attributes (title, organization, location) and combinations of email and phone sections
# Every subset of the attribute sections (including the empty one), in every order, joined with every
# separator; set() removes the duplicates that arise when a subset has fewer than two members.
attribute_combinations = list(set([
    sep.join(perm)
    for n in range(len(attribute_sections) + 1)
    for subset in itertools.combinations(attribute_sections, n)
    for perm in itertools.permutations(subset)
    for sep in separators
]))

# Every email fragment paired with every phone fragment through every separator, plus one empty
# entry so that some templates carry no email/phone line at all.
email_phone_combinations = list(set([
    email + (sep if email and phone else "") + phone
    for email, phone in itertools.product(email_section, phone_section)
    for sep in separators
]))+[""]

# Generate various email signature templates using combinations of the different sections
# (opening, name, attributes, email/phone, webpage, name suffix, social network and closing).
# The adjacent f-strings below are concatenated into one template string per combination.
# print_sometimes('', 0.9) occasionally inserts a blank line; print_sometimes('', 0) inserts a newline
# whenever the random draw is greater than 0. Prefix, comma and suffix are added at random after/before the name.
# The random draws happen per combination, and set() drops templates that end up identical.
# The full cartesian product is evaluated, so this cell can be slow and memory-hungry.
normal_templates = list(set([
    f"{print_sometimes(opening, 0.1)}"
    f"{print_sometimes('', 0.9)}"
    f"{(print_sometimes(name_prefix(), 0.6, False) if name else '')}"
    f"{name}"
    f"{(print_sometimes(',', 0.3, False) if name else '')}"
    f"{(print_sometimes(name_suf() if callable(name_suf) else name_suf, 0.6, False) if name else '')}"
    f"{(print_sometimes('', 0) if name else '')}"
    f"{attributes}"
    f"{(print_sometimes('', 0) if attributes else '')}"
    f"{email_phone}"
    f"{(print_sometimes('', 0) if email_phone else '')}"
    f"{print_sometimes(webpage, 0.1)}" 
    f"{print_sometimes(social_network, 0.1)}"
    f"{print_sometimes (closing,0.1, False)}"
    for opening,name, attributes,email_phone, webpage, name_suf, social_network, closing 
    in itertools.product(openings, name, attribute_combinations, email_phone_combinations, webpage_section, name_suffix, social_networks, closings)
]))

In [None]:
# Total number of available templates: the de-duplicated normal ones plus the spam ones
len(normal_templates) + len(spams)

In [None]:
# Draw one distinct normal template per training row, then append every spam template.
select_normal = random.sample(normal_templates, train_df.shape[0])
templates = select_normal+spams
# Size the frame from the templates that actually exist. The model is not guaranteed to return exactly
# NUM_SPAMS items, and any mismatch would make the column assignment below raise a length error.
num_templates = len(templates)
# Rows added for the spam templates have no entity data; every column is filled with the string 'nan'.
train = train_df.reindex(range(num_templates), fill_value='nan')
train['template'] = templates

In [None]:
# Print a random sample of up to 50 email signature templates for demonstration purposes
for template in train.sample(min(50, len(train)))['template']:
    # Interpolate the template itself; '(template)' would print that literal text.
    print(f'{template}\n')

In [None]:
# Number of spam signatures to request for the validation/test data, as a fraction of NUM_SAMPLES
# (truncated towards zero by int()).
PROPORTION_SPAMS = 0.05
num_spam = int(PROPORTION_SPAMS*NUM_SAMPLES)

def generate_email_signature(batch_size):
    """
    Generate validation/test email signatures and spam signatures with gpt-4.

    The rows of the module-level `val_test_df` are sent to the model in batches of `batch_size`;
    one request is made per batch and the "Template N:" sections of every answer are collected.
    After all batches, a single extra request asks for `num_spam` spam signatures.

    Args:
    batch_size (int): Number of rows (and requested templates) per request. Must be at least 1.

    Returns:
    list[str]: All parsed signatures in batch order, followed by the parsed spam signatures.
        The length depends on what the model returns and is not guaranteed to be
        NUM_SAMPLES + num_spam.

    Raises:
    ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    def _chat(prompt, max_tokens):
        """Send one user prompt to gpt-4 and return the text of the first completion."""
        response = openai.ChatCompletion.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=0.9
        )
        return response['choices'][0]['message']['content']

    def _split_templates(text):
        """Split a model answer on its 'Template N:' headers and drop empty pieces."""
        return [template.strip() for template in re.split(r"Template\s+\d+:", text) if template.strip()]

    name = list(val_test_df['name'])
    job_title = list(val_test_df['job_title'])
    organization = list(val_test_df['organization'])
    location = list(val_test_df['location'])
    num_batches = math.ceil(NUM_SAMPLES / batch_size)
    signatures = []

    for i in tqdm(range(num_batches), desc="Generating signatures"):
        start_index = i * batch_size
        end_index = min((i + 1) * batch_size, NUM_SAMPLES)
        batch_names = name[start_index:end_index]
        batch_job_title = job_title[start_index:end_index]
        batch_organization = organization[start_index:end_index]
        batch_location = location[start_index:end_index]

        # Ask for as many templates as there are rows in this batch (the last batch may be shorter).
        # Each instruction line ends with a newline so the bullets are not fused into one sentence.
        prompt = (
            f'Generate {len(batch_names)} diverse and realistic text email signature templates. For each template, use the corresponding elements from the {batch_names} (for names), {batch_job_title} (for job titles), {batch_organization} (for company names), and {batch_location} (for addresses) lists based on their indices (e.g., use the first elements from each list for the first template, the second elements for the second template, and so on). It is imperative to use the elements from {batch_names}, {batch_job_title}, {batch_organization}, and {batch_location} precisely as they are given, without any alteration. Do not modify, adjust, or correct the elements in any way, including not capitalizing, not changing spacing, not changing or adding any type of punctuation, word order, or formatting. Preserve the original state of each element unconditionally. For example, if the given company name is "Ballard, Williams and Smith," use it as is, without changing it to "Ballard, Williams, and Smith." If the job title is "Diagnostic radiographer," leave it like that and do not change it to "Diagnostic Radiographer." Please respect this for all templates. In the templates, vary the order of elements in a realistic manner. Ensure that all variations make sense and are consistent with real-world email signatures. In addition to the provided elements, incorporate a variety of common email signature elements, generating fictional information for each, in no particular order, including but not limited to:\n'
            '-Occasionally, a message before the signature\n'
            '-Name followed by titles, pronouns, certifications, or awards\n'
            '-Department names\n'
            '-Multiple phone numbers and contact details, including office, mobile, and fax numbers\n'
            '-Email addresses\n'
            '-Social media links (LinkedIn, Twitter, etc.)\n'
            '-Company websites and personal websites or blogs\n'
            '-Disclaimers and confidentiality statements\n'
            '-Legal or regulatory information\n'
            '-Slogans, quotes, or inspirational messages!\n'
            'Include region-specific signatures, and represent QR codes as URLs.\n'
            'Vary the presentation of the additional elements, sometimes including only a subset of them or using different formats and arrangements for a more diverse and realistic set of templates.\n'
            'Present the templates consistently, separated as follows:\n'
            'Template 1:\n'
            'Template 2:\n'
            'And so on, ensuring clear separation between templates.'
        )

        # The request has to happen inside the loop; otherwise only the last batch is ever sent.
        signatures.extend(_split_templates(_chat(prompt, max_tokens=4500)))

    # The spam signatures do not depend on the batches, so they are requested once after the loop.
    spam_prompt = (
        f"Generate a list of {num_spam} diverse spam email signatures without including name, job title, company name, or address. Incorporate elements typically found in spam, such as promotions and URLs, with fake information.\n"
        "Present the templates consistently, separated as follows:\n"
        "Template 1:\n"
        "Template 2:\n"
        "And so on, ensuring clear separation between templates."
    )
    spam_signatures = _split_templates(_chat(spam_prompt, max_tokens=1500))

    return signatures+spam_signatures

In [None]:
if update_val_test:
    signatures = generate_email_signature(20)
    # Size the frame from the signatures actually returned instead of a hard-coded guess; rows beyond
    # the generated people (the spam signatures) get the string 'nan' in every other column.
    # NOTE(review): row i is assumed to describe signature i, which only holds if the model returned
    # exactly one template per requested person — verify before trusting the tags.
    num_signatures = len(signatures)
    val_test_df = val_test_df.reindex(range(num_signatures), fill_value='nan')
    val_test_df['signature'] = signatures

    val_test_data = val_test_df

## Create Dataset objects for Training, Validation and Test Set

In [None]:
# Load the spaCy "en_core_web_sm" model; it is used by process_email_signature to split text into tokens.
# The tagger, parser, NER, lemmatizer and attribute ruler components are disabled at load time.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner", "lemmatizer", "attribute_ruler"])

def process_email_signature(row, train):
    """
    Tokenize one email signature and label its tokens with BIO tags.

    Args:
    row (dict): One record. Must contain 'name', 'job_title', 'organization' and 'location';
        additionally 'template' (filled with str.format(**row)) when `train` is truthy,
        or 'signature' (used verbatim) otherwise.
    train (bool): Selects whether the signature is rendered from the template or read directly.

    Returns:
    tuple[list[str], list[str]]: The lower-cased tokens and one tag per token, each tag being 'O'
        or 'B-'/'I-' followed by PER, JOB, ORG or LOC.
    """
    if train:
        signature = row['template'].format(**row)
    else:
        signature = row['signature']

    # Matching is case-insensitive: both the signature and the entities are lower-cased.
    doc = nlp(signature.lower())
    tokens = [token.text for token in doc]
    ner_tags = ['O'] * len(tokens)

    def assign_tags(entity, tag):
        """Tag every occurrence of the tokenized entity in `tokens` with B-/I- labels."""
        # 'nan' is the filler used for padded rows that carry no entity; tagging it would label
        # any literal "nan" token in the text.
        if entity == 'nan':
            return
        entity_tokens = [token.text for token in nlp(entity)]
        n = len(entity_tokens)
        if n == 0:
            return

        for i in range(len(tokens) - n + 1):
            if tokens[i:i+n] == entity_tokens:
                ner_tags[i] = f'B-{tag}'
                for j in range(1, n):
                    ner_tags[i + j] = f'I-{tag}'

    # Later calls overwrite earlier tags where entities overlap.
    assign_tags(row['name'].lower(), 'PER')
    assign_tags(row['job_title'].lower(), 'JOB')
    assign_tags(row['organization'].lower(), 'ORG')
    assign_tags(row['location'].lower(), 'LOC')

    return tokens, ner_tags

# Define a function to create a dataset for NER training and evaluation from the input data
# The function processes email signatures, tokenizes the text and assigns NER tags (PER, JOB, ORG, LOC) to the relevant sections
# It then builds the train split and, depending on `update_val_test`, either builds or loads the validation and test splits

def create_dataset(train, val_test):
    """
    Build the DatasetDict used for NER training and evaluation.

    Args:
    train (pandas.DataFrame): Training rows; each must be usable by
        process_email_signature(row, train=True).
    val_test (pandas.DataFrame): Validation/test rows, each with a 'signature' column. It is only
        read when the module-level flag `update_val_test` is truthy, so any placeholder may be
        passed otherwise.

    Returns:
    DatasetDict: With 'train', 'validation' and 'test' splits when `update_val_test` is truthy
        (the validation/test rows are split 50/50 with random_state=42). Otherwise the DatasetDict
        stored at 'ner_research/filtered_dataset' with its 'train' split replaced.
    """
    # Single source of truth for the label set: the integer ids are the positions in this list,
    # which is also what ClassLabel uses.
    label_names = ['O', 'B-PER', 'I-PER', 'B-JOB', 'I-JOB', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC']
    tag_mapping = {label: label_id for label_id, label in enumerate(label_names)}

    features = Features({
        'tokens': Sequence(Value(dtype='string', id=None)),
        'ner_tags': Sequence(ClassLabel(names=label_names, id=None))
    })

    def build_records(frame, is_train):
        """Tokenize and tag every row of `frame`, returning one {'tokens', 'ner_tags'} dict per row."""
        records = []
        for row in tqdm(frame.to_dict('records'), desc="Processing email signatures"):
            tokens, tags = process_email_signature(row, train=is_train)
            records.append({'tokens': tokens, 'ner_tags': [tag_mapping[tag] for tag in tags]})
        return records

    def to_dataset(records):
        """Turn a list of records into a Dataset; an empty list yields an empty Dataset."""
        return Dataset.from_dict(
            {
                'tokens': [record['tokens'] for record in records],
                'ner_tags': [record['ner_tags'] for record in records],
            },
            features=features,
        )

    train_dataset = to_dataset(build_records(train, is_train=True))

    if update_val_test:
        val_test_data = build_records(val_test, is_train=False)
        val_data, test_data = train_test_split(val_test_data, test_size=0.5, random_state=42)

        # Create a DatasetDict containing the train, validation, and test Datasets
        # Each dataset in the dictionary consists of tokens and NER tags, which will be used for model training and evaluation
        dataset_dict = DatasetDict({
            'train': train_dataset,
            'validation': to_dataset(val_data),
            'test': to_dataset(test_data),
        })
    else:
        # Reuse the validation/test splits saved earlier and only swap in the new train split.
        dataset_dict = load_from_disk('ner_research/filtered_dataset')
        dataset_dict['train'] = train_dataset

    return dataset_dict

In [None]:
# `val_test_data` only exists when the validation/test signatures were regenerated in this session;
# otherwise create_dataset ignores its second argument and a placeholder is passed.
if update_val_test:
    dataset_dict = create_dataset(train, val_test_data)
else:
    dataset_dict = create_dataset(train, '')

In [None]:
def _keeps_example(tags):
    """Keep an example only if it contains all nine tag ids (0-8) or nothing but 'O' (id 0)."""
    present = set(tags)
    return present == set(range(9)) or present == set(range(1))

# Read the tag column once per split instead of fetching every row twice through Dataset.__getitem__.
filtered_validation = dataset_dict['validation'].select(
    [i for i, tags in enumerate(dataset_dict['validation']['ner_tags']) if _keeps_example(tags)]
)

filtered_test = dataset_dict['test'].select(
    [i for i, tags in enumerate(dataset_dict['test']['ner_tags']) if _keeps_example(tags)]
)

# Update the 'validation' and 'test' keys in dataset_dict with the filtered datasets
dataset_dict['validation'] = filtered_validation
dataset_dict['test'] = filtered_test

In [None]:
# Second name for the same DatasetDict object (no copy is made); it is the one written to disk.
filtered_dataset_dict = dataset_dict

In [None]:
# Persist all splits under ner_research/<TRAIN_DATASET>/ so they can be reloaded with load_from_disk.
filtered_dataset_dict.save_to_disk(f"ner_research/{TRAIN_DATASET}/")