In [92]:
import re
import json
from bs4 import BeautifulSoup
from collections import defaultdict
import emailProcessing
with open('RawEmails.json', 'r') as file:
    datas = json.load(file)
    

In [93]:
def clean_text(message):
    output = re.sub(r'[\u200b\u200c\u200d\u200e\u200f\ufeff\r\xa0\ud83d\ude80\u202f\u2019\u2014\u2605\u2022\u2023\u2024\u034f]', '', BeautifulSoup(message, "lxml").text)
    return outputs]

In [95]:
import base64
def emailProcessing(emails):
    """
    Take a list of raw emails, and process them into a dictionary where sender email is the key and a list of tuples containing the email id, labelIds, snippet, and internalDate is the value.
    """
    senders = defaultdict(list)
    
    for sender in emails:
        for s_sender in sender["payload"]["headers"]:
            if "From" in s_sender["name"]:
                headers = sender.get('payload', {}).get('headers', [])
                for header in headers:
                    if header.get('name', '').lower() == 'subject':
                        title = header.get('value', 'No title found')
                if "data" in sender["payload"]["body"]:
                    decoder = base64.urlsafe_b64decode(sender["payload"]["body"]["data"].encode("ASCII")).decode("utf-8")
                    senders[s_sender["value"]].append((sender["id"], sender["labelIds"], title, decoder, sender["internalDate"]))
                elif "data" in sender["payload"]["parts"][0]["body"]:
                    decoder = base64.urlsafe_b64decode(sender["payload"]["parts"][0]["body"]["data"].encode("ASCII")).decode("utf-8")
                    senders[s_sender["value"]].append((sender["id"], sender["labelIds"], title, decoder, sender["internalDate"]))
    
    return senders
sll = emailProcessing(datas)

In [96]:
import datetime

from transformers import pipeline
from collections import defaultdict
from subprocess import list2cmdline
import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_lg')
model_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)

def org_name(extracted_text):
    # Extract the complete text in the resume
    classifier = token_classifier(extracted_text)
    entity_name = None
    max_score = 0
    for s in classifier:
        if s['entity_group'] == 'ORG':
            if s['score'] > max_score:
                entity_name = s['word']
                max_score = s['score']
    return entity_name


def application_categorizer(content):
    """
    Take a dictionary of emails grouped by sender, with id, labelIds, content, internalDate as the value.
    Return a dictionary with two keys: "app_focused" and "non_app_focused".
    "app_focused" key will point to a dictionary of emails that are application related, with company emails as the key and a list of tuples containing the email id, labelIds, snippet, and internalDate as the value.
    "non_app_focused" key will point to a dictionary of emails that are not application related, with company emails as the key and a list of tuples containing the email id, labelIds, snippet, and internalDate as the value.
    """
    application = defaultdict(list)
    non_application = defaultdict(list)
    combine_list = defaultdict(list)
    for company, email_data in content.items():
        for mailId, mailCategories, mailSubject, mailContent, mailTime in email_data:
            newMailContent = clean_text(mailContent)   
            newMailTime = datetime.datetime.fromtimestamp(int(mailTime)/1e3)
            if ("CATEGORY_PERSONAL" in mailCategories or "CATEGORY_UPDATES" in mailCategories or "IMPORTANT" in mailCategories) and ("great fit" not in mailSubject.lower() or 
                                                                                                        "apply now" not in mailSubject.lower()):
                if any(keyword in mailSubject.lower() or keyword in newMailContent.lower() for keyword in 
                       ["application", "applications", "assessment", "assessments", 
                        "next step", "submission", "submissions", "recruiting"]):
                    application[company].append([mailId, mailCategories, mailSubject, newMailContent, str(newMailTime)])
            else:
                non_application[company].append([mailId, mailCategories, mailSubject, newMailContent, str(newMailTime)])

    
    #Move unnecessary emails from application list non-application list
    temp_list_non_app = [company for company in non_application.keys()]
    for i in temp_list_non_app:
        if i in application.keys():
            non_application[i] = application[i]
            del application[i]
    combine_list["app_focused"] = application
    combine_list["non_app_focused"] = non_application
    
    return combine_list

def gimmeAFunctionName(file):
    app_focused = file['app_focused']
    
    the_fix = defaultdict(list)
    
    for email, content in app_focused.items():
        for mailId, mailCategories, mailSubject, mailContent, mailTime in content:
            mailCorporationName = org_name(mailContent)
            
            # If no company name is found, use the original email address
            if mailCorporationName is None:
                mailCorporationName = email
            
            # Add the email data under the company name
            the_fix[mailCorporationName].append([mailId, mailCategories, mailSubject, mailContent, mailTime])
    # Replace the original 'app_focused' content with the merged version
    file['app_focused'] = the_fix
    
    return file
    

ssssss = gimmeAFunctionName(application_categorizer(sll))

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [97]:
with open('resulttt2.json', 'w') as f:
    json.dump(ssssss, f, indent=4)

In [75]:
x = "\n\n\n \n\n\n\n\n\n\n\nDear Tri Cuong,\nThank you for the time and energy you invested in applying for the position Intern: Data Analyst (Summer 2024) (5165).\n\n\t\t\t\t\t\tUnfortunately, we have to inform you that this time we have decided to move on with other candidates.\n\n\t\t\t\t\t\tWe appreciate your interest in our company and are grateful for the opportunity to get to know you and your aspirations.\n\n\t\t\t\t\t\tWe greatly value your interest for shaping the future of transport solutions together with us.\n\t\t\t\t\t\tWe hope you will keep us in mind and apply again for future positions within your areas of expertise.\n\n\t\t\t\t\t\tUntil then, we wish you all the best in your future career. Once again, thank you for your interest in working with Volvo Group.\n\n\n\t\t\t\t\t\tBest regards,\n\t\t\t\t\t\tVolvo Group Talent Acquisition\n\t\t\t\t\t\t\n\n\n\n\n\nABOUT US | YOUR CAREER AT VOLVO GROUP | MEET OUR EMPLOYEES | FOLLOW US ON SOCIAL MEDIA\n\n\n\n\n\n\n\n\n"

if "great fit" in x:
    print(1)
else:
    print(0)

0


In [58]:
contents = []
for a,b in ssssss['non_app_focused'].items():
    for i in b:
        contents.append(i[3])

In [63]:
with open('resulttt2.json', 'w') as f:
    json.dump(ssssss, f, indent=4)

with open('content_emails_non_app.json', 'w') as f:
    json.dump(contents, f, indent=4)

In [64]:
f = ['this is my text', 'what', 'the', 'heck']

for i in f:
    if 'is' in i:
        print(1)

1


In [65]:
with open('categorizer3.json', 'r') as file:
    datas = json.load(file)

FileNotFoundError: [Errno 2] No such file or directory: 'categorizer3.json'

In [66]:
import pandas as pd
key_list = defaultdict()
daaaa = datas["non_app_focused"]
countdaaa = 0
for i, email in daaaa.items():
    countdaaa += len(email)
countdaaa


TypeError: list indices must be integers or slices, not str

In [67]:
countdaaa

NameError: name 'countdaaa' is not defined

In [68]:
falsePositiveList = ['The Untapped Team <marketing@untapped.io>', 'Google <no-reply@accounts.google.com>', 'Paisly by JetBlue <jetblueairways@email.jetblue.com>', 
                 'Capital One <donotreply@bankmessage.capitalone.com>', 'JetBlue <jetblueairways@email.jetblue.com>', 'Progressive <customerservice@e.progressive.com>',
                 'Eric <eric@jobrightai.com>', '\"Abby F.\" <info@bb3.wayup.com>', 'qorvousinc-jobnotification@noreply.jobs2web.com', 'Temu <email@market.temuemail.com>',
                 'Shopee <info@mail.shopee.vn>', 'Indeed <donotreply@indeed.com>', 'support@referralhub.dev', 'Tata Consultancy Services via WayUp <info@wayup.com>',
                 'Indeed <no-reply@indeed.com>', 'LinkedIn <jobs-noreply@linkedin.com>', '\"Peter Mattis @ Cockroach Labs\" <news@mail.cockroachlabs.com>',
                 'Recruiter via WayUp <info@bb3.wayup.com>', 'Spotify <no-reply@spotify.com>', 'Twilio <no-reply@twilio.com>', 'Sapna B <sapna.b@brilliantinfotech.com>',
                 '\"Freelancer.com\" <noreply@notifications.freelancer.com>', 'Michael Yan <michael@hey.simplify.jobs>', 'Simplify Team <noreply@simplify.jobs>', 'Apple <no_reply@email.apple.com>',
                 'Hiring at Intuit <hiring@intuit.com>', 'Spirit Airlines <booking@fly.spirit-airlines.com>', 'Glassdoor Community <info@glassdoor.com>', 'Free Spirit <FreeSpirit@fly.spirit-airlines.com>',
                 'China Airlines <calmarketing@email-china-airlines.com>', 'Recruiter via WayUp <recruiter.2.2131160214@messages.wayup.com>', 'True', 'Zety <info@tr.zety.com>', 'Hosted', '\"Phil @ ZipRecruiter\" <phil@ziprecruiter.com>',
                 'ZipRecruiter <support@ziprecruiter.com>', 'membership@governmentjobs.com', 'jobnotification@avaturecrm.com', 'Coursera <no-reply@m.mail.coursera.org>']
truePositive = 386
falseNegative = 141
trueNegative = 517
falsePositive = 0

Accuracy = (truePositive + trueNegative)/(truePositive + trueNegative + falsePositive + falseNegative)
Accuracy *= 100
Accuracy

86.49425287356321

In [49]:
count = 0
for i, email in daaaa.items():
    if i in falsePositiveList:
        count += len(email)
count

0

In [37]:
from transformers import pipeline
from subprocess import list2cmdline
from pdfminer.high_level import extract_text
import docx2txt
import spacy
from spacy.matcher import Matcher
import time
start = time.time()
nlp = spacy.load('en_core_web_lg')
model_checkpoint = "xlm-roberta-large-finetuned-conll03-english"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)




# Organisation names extraction
def org_name(file):
    # Extract the complete text in the resume
    extracted_text = file
    classifier = token_classifier(extracted_text)
    entity_name = None
    max_score = 0
    # for s in classifier:
    #     if s['entity_group'] == 'ORG':
    #         if s['score'] > max_score:
    #             entity_name = s['word']
    #             max_score = s['score']
    print(classifier)

       
org_name("Hi Tri Cuong,Thank you for your interest in Akuna Capital! We have received your application for the Software Engineer Intern - Python, Summer 2025 role and we look forward to reviewing your application. Please note that due to the large quantity of applicants we've received for this role, there have been significant lags in the process. We will still be accepting applications, but wanted to inform you of any potential delays. If your qualifications align with our needs for this particular role, a member of our recruitment team will be in touch with you to coordinate next steps.Please be sure to keep an eye on your spam/junk folder for any communications from us or our testing partners. If you move forward in the interview process with us, please know that all interview questions are confidential and not to be shared externally.In the meantime, please check out some of the links below to get to know us better:* Meet a few members of our team and tour our Chicago headquarters through our virtual reality ( https://vr.akunacapital.com/registration/ ) experience.* Explore ( https://youtu.be/oAS29_yxVcc ) the world of options market making and take our Options 101 ( https://akunacapital.teachable.com/ ) course.* Learn more about our hiring process ( https://akunacapital.com/careers#hire-dev ) and FAQs ( https://akunacapital.com/careers#general ) from candidates.Kind regards,Akuna Capital Recruitment Teamwww.akunacapital.com ( http://www.akunacapital.com )**Please note: Do not reply to this email. This email is sent from an unattended mailbox. Replies will not be read.")
end = time.time()

print("The time of execution of above program is :", round((end - start), 2))

Some weights of the model checkpoint at xlm-roberta-large-finetuned-conll03-english were not used when initializing XLMRobertaForTokenClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity_group': 'PER', 'score': 0.9998582, 'word': 'Tri Cuong', 'start': 3, 'end': 12}, {'entity_group': 'ORG', 'score': 0.9999771, 'word': 'Akuna Capital', 'start': 44, 'end': 57}, {'entity_group': 'MISC', 'score': 0.9988606, 'word': 'Python', 'start': 128, 'end': 134}, {'entity_group': 'LOC', 'score': 0.9999567, 'word': 'Chicago', 'start': 969, 'end': 976}, {'entity_group': 'ORG', 'score': 0.9943621, 'word': 'akunacapital', 'start': 1031, 'end': 1043}, {'entity_group': 'MISC', 'score': 0.99302113, 'word': 'Options 101', 'start': 1166, 'end': 1177}, {'entity_group': 'ORG', 'score': 0.97176725, 'word': 'akunacapital', 'start': 1188, 'end': 1200}, {'entity_group': 'ORG', 'score': 0.9873828, 'word': 'akunacapital', 'start': 1273, 'end': 1285}, {'entity_group': 'ORG', 'score': 0.9762569, 'word': 'akunacapital', 'start': 1328, 'end': 1340}, {'entity_group': 'ORG', 'score': 0.87209845, 'word': 'Akuna Capital Recruitment Team', 'start': 1392, 'end': 1422}, {'entity_group': 'ORG', 'score': 

In [131]:
from transformers import pipeline
import pandas as pd

text = "Kw0KDQorDQoNCkNvbXBsZXRlIHlvdXIgYXBwbGljYXRpb24gd2l0aCBXaGF0bm90Lg0KDQpIaSBUcmks4oCo4oCoDQrCoA0KQXMgcGFydCBvZiB5b3VyIGFwcGxpY2F0aW9uIGZvciBTb2Z0d2FyZSBFbmdpbmVlciBJbnRlcm4sIFN1bW1lciAyMDI1IHdpdGggV2hhdG5vdCwgV2hhdG5vdCBoYXMgcGFydG5lcmVkIHdpdGggVW50YXBwZWQgdG8gbGVhcm4gbW9yZSBhYm91dCB3aGF0IG1ha2VzIHlvdSB1bmlxdWUu4oCo4oCoDQrCoMKgDQpZb3UgY2FuIGNvbXBsZXRlIHlvdXIgYXBwbGljYXRpb24gdG8gV2hhdG5vdCBieSBjcmVhdGluZyBhIHByb2ZpbGUgYW5kIGdhaW4gYWNjZXNzIHRvIHRoZSBtYW55IG90aGVyIGpvYiBvcHBvcnR1bml0aWVzIFVudGFwcGVkIGhhcyB0byBvZmZlci4NCg0KSmVubmEgVCwgUmVjcnVpdGVyIGZyb20gV2hhdG5vdCBzYXlzOg0KDQpIaSBUcmksDQoNClRoYW5rIHlvdSBmb3IgYXBwbHlpbmcgdG8gV2hhdG5vdCEgQXMgdGhlIG5leHQgc3RlcCBpbiBvdXIgaGlyaW5nIHByb2Nlc3MsIHBsZWFzZSBjb21wbGV0ZSB5b3VyIHByb2ZpbGUgb24gVW50YXBwZWQuDQoNCldlIHBhcnRuZXJlZCB3aXRoIFVudGFwcGVkIGJlY2F1c2Ugb3VyIHRlYW0gYXQgV2hhdG5vdCB2YWx1ZXMgYmVpbmcgYWJsZSB0byBzZWUgYW5kIHVuZGVyc3RhbmQgb3VyIGNhbmRpZGF0ZXMgYmV5b25kIHRoZWlyIHJlc3VtZXMuDQoNClRoYW5rcyBpbiBhZHZhbmNlIQ0KDQpXaGF0bm90IEVhcmx5IFRhbGVudCBUZWFtDQoNCkhpIFRyaSwNCg0KVGhhbmsgeW91IGZvciBhcHBseWluZyB0byBXaGF0bm90ISBBcyB0aGUgbmV4dCBzdGVwIGluIG91ciBoaXJpbmcgcHJvY2VzcywgcGxlYXNlIGNvbXBsZXRlIHlvdXIgcHJvZmlsZSBvbiBVbnRhcHBlZC4NCg0KV2UgcGFydG5lcmVkIHdpdGggVW50YXBwZWQgYmVjYXVzZSBvdXIgdGVhbSBhdCBXaGF0bm90IHZhbHVlcyBiZWluZyBhYmxlIHRvIHNlZSBhbmQgdW5kZXJzdGFuZCBvdXIgY2FuZGlkYXRlcyBiZXlvbmQgdGhlaXIgcmVzdW1lcy4NCg0KVGhhbmtzIGluIGFkdmFuY2UhDQoNCldoYXRub3QgRWFybHkgVGFsZW50IFRlYW0NCg0KSW1wb3J0YW50IG5vdGljZTogVG8gYm9vc3QgeW91ciBhcHBsaWNhdGlvbiBhdCBXaGF0bm90LCBjb21wbGV0ZSB5b3VyIHByb2ZpbGUgYnkgT2N0b2JlciAxMywgMjAyNA0KDQpJbXBvcnRhbnQgbm90aWNlOiBUbyBib29zdCB5b3VyIGFwcGxpY2F0aW9uIGF0IFdoYXRub3QsIGNvbXBsZXRlIHlvdXIgcHJvZmlsZSBieSBPY3RvYmVyIDEzLCAyMDI0DQoNCltDb21wbGV0ZSB5b3VyIGFwcGxpY2F0aW9uIG9uIFVudGFwcGVkXShodHRwczovL3VudGFwcGVkLmlvL2FwcC9qb2JzLzM2YjE1NzQwLThlZTItNGZiMi05MGZiLTc1NWU5YTBjMGI0MC9hcHBseT9yZWY9ZW1haWwmdXRtX3NvdXJjZT1XaGF0bm90JnV0bV9tZWRpdW09ZW1haWwmdXRtX2NhbXBhaWduPWF0c19pbnZpdGVfZnJvbV9vcmdhbml6YXRpb24mdXRtX2NvbnRlbnQ9Y3RhYnV0dG9uJmVtYWlsPXN0ZXBoZW5sdW9uZzI0JTQwZ21haWwuY29tJmF0c19hY2NvdW50X2xpbmtfdG9rZW49OWNTYlJBMmYwVHZRd21oeE5VV0FjYkRMUThDb1haVG4mc3R1ZGVudF9mdWxsX25hbWU9VHJpK0N1b25nK0x1b25nJmxvZ29fdXJsPWh0dHBzJTNBJTJGJTJGanVtcHN0YXJ0LXN0YXRpYy5zMy5hbWF6b25hd3MuY29tJTJGYmFja2VuZCUyRl9fc2l6ZWRfXyUyRm9yZ2FuaXphdGlvbnMlMkZvcmdhbml6YXRpb24lMkZ6XzB2N2I4SFMzcXk0cjFjZUxEZUVRLXRodW1ibmFpbC0yMDB4MjAwLnBuZyZvcmdhbml6YXRpb25faWQ9d2hhdG5vdCZvcmdhbml6YXRpb25fbmFtZT1XaGF0bm90JnJvbGVfaWQ9MzZiMTU3NDAtOGVlMi00ZmIyLTkwZmItNzU1ZTlhMGMwYjQwJnJvbGVfdHlwZT1KT0Imcm9sZV90aXRsZT1Tb2Z0d2FyZStFbmdpbmVlcitJbnRlcm4lMkMrU3VtbWVyKzIwMjUpDQoNCltDb21wbGV0ZSBhcHBsaWNhdGlvbiBvbiBVbnRhcHBlZF0oaHR0cHM6Ly91bnRhcHBlZC5pby9hcHAvam9icy8zNmIxNTc0MC04ZWUyLTRmYjItOTBmYi03NTVlOWEwYzBiNDAvYXBwbHk_cmVmPWVtYWlsJnV0bV9zb3VyY2U9V2hhdG5vdCZ1dG1fbWVkaXVtPWVtYWlsJnV0bV9jYW1wYWlnbj1hdHNfaW52aXRlX2Zyb21fb3JnYW5pemF0aW9uJnV0bV9jb250ZW50PWN0YWJ1dHRvbiZlbWFpbD1zdGVwaGVubHVvbmcyNCU0MGdtYWlsLmNvbSZhdHNfYWNjb3VudF9saW5rX3Rva2VuPTljU2JSQTJmMFR2UXdtaHhOVVdBY2JETFE4Q29YWlRuJnN0dWRlbnRfZnVsbF9uYW1lPVRyaStDdW9uZytMdW9uZyZsb2dvX3VybD1odHRwcyUzQSUyRiUyRmp1bXBzdGFydC1zdGF0aWMuczMuYW1hem9uYXdzLmNvbSUyRmJhY2tlbmQlMkZfX3NpemVkX18lMkZvcmdhbml6YXRpb25zJTJGb3JnYW5pemF0aW9uJTJGel8wdjdiOEhTM3F5NHIxY2VMRGVFUS10aHVtYm5haWwtMjAweDIwMC5wbmcmb3JnYW5pemF0aW9uX2lkPXdoYXRub3Qmb3JnYW5pemF0aW9uX25hbWU9V2hhdG5vdCZyb2xlX2lkPTM2YjE1NzQwLThlZTItNGZiMi05MGZiLTc1NWU5YTBjMGI0MCZyb2xlX3R5cGU9Sk9CJnJvbGVfdGl0bGU9U29mdHdhcmUrRW5naW5lZXIrSW50ZXJuJTJDK1N1bW1lcisyMDI1KQ0KDQpGQVE6IFdoYXQgZWxzZSBkbyBJIGdldCBmcm9tIFVudGFwcGVkP8KgDQoNCvCfmoAgQm9vc3RlZCBXaGF0bm90IGFwcGxpY2F0aW9uLiBBZnRlciBzaWduaW5nIHVwLCB5b3VyIGFwcGxpY2F0aW9uIHdpbGwgc2hvdyB1cCBmaXJzdCBmb3IgV2hhdG5vdC7CoA0KDQrCoPCfpJcgVGhvdXNhbmRzIG9mIG90aGVyIG9wcG9ydHVuaXRpZXMuIE91ciBwYXJ0bmVycyBpbmNsdWRlIEx5ZnQsIERvb3JEYXNoIGFuZCBEZWxvaXR0ZSwgYW5kIHJhbmdlIGZyb20gc21hbGwgc3RhcnR1cHMgdG8gRm9ydHVuZSA1MDBzLg0KDQrwn5OpIEFwcGxpY2F0aW9ucyBjb21lIHRvIHlvdS4gT3ZlciA1MDBrIHJlY3J1aXRlciBtZXNzYWdlcyBoYXZlIGJlZW4gc2VudCB0byBqb2Igc2Vla2VycyBpbiB0aGUgcGFzdCB5ZWFyLg0KDQpZb3UgYXJlIHJlY2VpdmluZyBlbWFpbCBub3RpZmljYXRpb25zIGZyb20gVW50YXBwZWQuIFtVbnN1YnNjcmliZV0oaHR0cHM6Ly9tYW5hZ2Uua21haWwtbGlzdHMuY29tL3N1YnNjcmlwdGlvbnMvdW5zdWJzY3JpYmU_YT1VYmlycWomYz0wMUo5SDgyVjRYMk41UEZFM0dDREZGMzVBRCZrPTAzYzE3NmRkNDdjNTg1ZjYxZmI3MWE1ZGI2Nzk3MTExJm09UzI2OERIJnI9NjJuQUt3SykuICBJZiB5b3UgbmVlZCBhc3Npc3RhbmNlIG9yIGhhdmUgcXVlc3Rpb25zLCBwbGVhc2UgYXNrIHVzIGF0IFtpbmZvQHVudGFwcGVkLmlvXShtYWlsdG86aW5mb0B1bnRhcHBlZC5pbykNCsKgDQrCqSAyMDIyIFVudGFwcGVkIExhYnMsIEluYy4gODYwNSBTYW50YSBNb25pY2EgQmx2ZC4gU3VpdGUgODQ1NjEgV2VzdCBIb2xseXdvb2QsIENBIDkwMDY5"
named_ents = tagger(text)
maxda = max(named_ents, key=lambda x: x['score'])
print(maxda['word'])
print(named_ents)

ValueError: max() iterable argument is empty

In [78]:
# from transformers import pipeline

# tagger = pipeline(task='ner', aggregation_strategy='max')
# def get_word_with_max_score(data):
#     if not data:  # Check if the list is empty
#         return None
#     named_ents = tagger(data)  
#     max_item = max(named_ents, key=lambda x: x['score'])
#     return max_item['word']

# new_json = defaultdict(list)
# temp_list = []
# for app_status, data in datas.items():
#     for email, email_data in data.items():
#         for mailId, mailCategories, mailSubject, mailContent, mailTime in email_data:
#             mailCompany = get_word_with_max_score(mailContent)
        
#             new_json[email].append([mailId, mailCategories, mailCompany, mailSubject, mailContent, mailTime])
# print(new_json)

from transformers import pipeline
from collections import defaultdict

# Load the NER model once
tagger = pipeline(task='ner', aggregation_strategy='max')

def get_word_with_max_score(contents):
    """
    This function processes a batch of email contents and returns a list of the
    company name (or the word with the highest NER score) for each email.
    """
    if not contents:
        return None
    
    named_ents = tagger(contents)  # Process all email contents in batch
    result = []
    
    # Each email will have a list of named entities. Extract max score word for each
    for ents in named_ents:
        if ents:  # Check if there are named entities for this email
            max_item = max(ents, key=lambda x: x['score'])
            result.append(max_item['word'])
        else:
            result.append(None)  # No named entities found
    return result

new_json = defaultdict(list)
temp_list = []

for app_status, data in datas['app_focused'].items():
    email_contents = [mailContent for _, _, _, mailContent, _ in data]
    
    # Process all email contents in a batch
    company_names = get_word_with_max_score(email_contents)

    i = 0
    for mailId, mailCategories, mailSubject, mailContent, mailTime in email_data:
        mailCompany = company_names[i]
        i += 1
        new_json[email].append([mailId, mailCategories, mailCompany, mailSubject, mailContent, mailTime])

print(new_json)



No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision 4c53496 (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is

KeyboardInterrupt: 

TypeError: string indices must be integers, not 'str'

In [140]:
import re

def find_company_with_regex(content):
    # A simple regex that looks for capitalized words or company suffixes
    pattern = r'\b[A-Z][a-z]*\s(?:Inc|Ltd|Corp|LLC|Group|Technologies)\b'
    match = re.search(pattern, content)
    if match:
        return match.group(0)
    return None
print(find_company_with_regex("Hi Tri Cuong Luong,Thank you for applying to the Data Science Internship - Summer 2025 at Klaviyo.We received an overwhelming number of qualified applicants and are honored that so many talented candidates (like you!) are interested in working with us. Although your background is impressive, we've decided to move forward with other candidates at this time.We know how hard the job search can be and we thank you for investing the time in applying to Klaviyo. We do hope you'll keep us in mind for future opportunities.Curious to learn more about Klaviyo? Check us out onLinkedIn ( https://www.linkedin.com/company/klaviyo/ ),BuiltIn Boston ( https://www.builtinboston.com/company/klaviyo ) ,Comparably ( https://www.comparably.com/companies/klaviyo ),Glassdoor, ( https://www.glassdoor.com/Overview/Working-at-Klaviyo-EI_IE1169266.11,18.htm ) Instagram ( https://www.instagram.com/lifeatklaviyo/ ) and ourEngineering Blog ( https://klaviyo.tech/ ).Best of luck with your search,Klaviyo Campus Recruiting Team")


None
