In [67]:
import os
import torch
import pandas as pd
from transformers import AutoTokenizer, DistilBertForSequenceClassification
from sklearn.model_selection import train_test_split
from datasets import Dataset
import regex as re
import spacy

In [68]:
# Load spaCy's small English pipeline; `nlp` is called by clean_text_with_linkedin_url below.
nlp = spacy.load('en_core_web_sm')

In [69]:
# Data directory. Defaults to the original local folder, but can be overridden with the
# GMAIL_RAG_DATA_DIR environment variable so the notebook can run on another machine.
path = os.environ.get("GMAIL_RAG_DATA_DIR", r"C:/Hackathon/Gmail RAG/Data")
# The working directory is still switched because a later cell writes its CSV
# using a relative filename.
os.chdir(path)

# Raw email dataset; the cells below read its "text" column.
data = pd.read_csv(r"balanced_emails.csv")

In [70]:
# Preview the first rows of the raw email text column.
data["text"].head()

0    Subject : Meeting Confirmation\nBody : Hi, I w...
1    Subject : Feedback Request\nBody : Hello, I ho...
2    Subject : Project Update Needed\nBody : Hi, Co...
3    Subject : Invoice Approval\nBody : Dear [Name]...
4    Subject : Event RSVP\nBody : Hello, We are org...
Name: text, dtype: object

In [71]:
def has_linkedin_url(text):
    """Flag whether ``text`` contains a LinkedIn URL.

    The original pattern targeted URLs written with spaces around the colon
    (``https : //www.linkedin.com/...``), matching the ``Subject : ...``
    spacing seen in this dataset. The pattern now treats that whitespace as
    optional and also accepts plain ``http``, so conventionally written URLs
    (``https://linkedin.com/...``) are detected as well.

    Parameters
    ----------
    text : str
        Email text to scan. Non-string values (for example a NaN coming from
        an empty cell) are treated as containing no URL instead of raising.

    Returns
    -------
    int
        1 if a LinkedIn URL is found, otherwise 0.
    """
    # re.search raises TypeError on non-string input, so guard first.
    if not isinstance(text, str):
        return 0

    url_pattern = r"https?\s*:\s*\/\/(?:www\.)?linkedin\.com[\w\-_\/.?&=]*"
    return 1 if re.search(url_pattern, text) else 0

In [72]:
def clean_text_with_linkedin_url(df):
    """Clean the text of a row that is flagged as containing a LinkedIn URL.

    Parameters
    ----------
    df : row-like mapping
        Must provide the keys ``"has_linkedin_url"`` and ``"text"``. In this
        notebook it is called once per row via ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    The value of ``df["text"]`` unchanged when the flag is not 1. Otherwise a
    string of lowercase tokens joined by single spaces, keeping only tokens
    that are purely alphabetic, are not spaCy stop words and are not in the
    boilerplate list below.
    """
    # Unflagged rows are passed through untouched.
    if not (df["has_linkedin_url"] == 1):
        return df["text"]

    # Replace every non-alphanumeric character with a space before tokenizing.
    alnum_only = re.sub(r'[^A-Za-z0-9]', ' ', df["text"])

    # URL / e-mail boilerplate words that should never survive cleaning.
    not_needed = ["https", "linkedin", "null", "www", "com", "email"]

    kept_tokens = [
        tok.text.lower()
        for tok in nlp(alnum_only)
        if " " not in tok.text.lower()
        and tok.text.lower() not in not_needed
        and not tok.is_stop
        and tok.is_alpha
    ]
    return " ".join(kept_tokens)



In [73]:
# Flag every email that contains a LinkedIn URL. The flag column must exist
# before the row-wise cleaner runs, because the cleaner reads it.
linkedin_flags = data["text"].apply(has_linkedin_url)
data["has_linkedin_url"] = linkedin_flags

# Build the cleaned-text column one row at a time.
cleaned_column = data.apply(clean_text_with_linkedin_url, axis=1)
data["cleaned_text"] = cleaned_column

In [74]:
# Write the frame (with the has_linkedin_url and cleaned_text columns added above)
# to a file in the current working directory.
# NOTE(review): the DataFrame index is written too (to_csv default) — confirm that is wanted.
data.to_csv("cleaned_urls.csv")    

In [8]:
# NOTE(review): `cleaned_text` is not assigned at notebook level by any cell above this one
# (inside clean_text_with_linkedin_url it is only a local), so on a fresh kernel this
# presumably raises NameError — confirm where the value is meant to come from.
print(cleaned_text)

subject   application Data Engineer   Startup Meddon Talent body   update Meddon Talent                                                             email intended Divyaprakash Rathinasabapathy   AI Engineer   Learn included   https     www linkedin com help linkedin answer 4788   lang en   lipi urn   3Ali   3Apage   3Aemail email jobs application rejected 01   3B86PtA0uZR3K6PEPicgB4dw   3D   3D   midToken AQEOOGEuxIqB A   midSig 2hX9tZ6RpwRXA1   trk eml email jobs application rejected 01 SecurityHelp 0 textfooterglimmer   trkEmail eml email jobs application rejected 01 SecurityHelp 0 textfooterglimmer null ddvsp4 m659l110 z0 null null   eid ddvsp4 m659l110 z0   otpToken MWEwNTE3ZTMxNTI2YzFjNmIwMjQwNGVkNDUxOWU0YmQ4ZmM5ZDY0MTllYWY4ODYxNzljNTAwNjY0YjVmNWFmMWY0ZDVkZmEwNDFmMGNmZDIwM2JmZDViYjNjNzM5NmFmMzg1OWQ5MzE3ZTQzNDljNzI5NjBlNywxLDE   3D receiving LinkedIn notification emails   Unsubscribe   https     www linkedin com comm psettings email unsubscribe   lipi urn   3Ali   3Apage   3Aemail 

In [9]:
import spacy

In [10]:
# NOTE(review): repeats the pipeline load already done near the top of the notebook.
nlp = spacy.load('en_core_web_sm')

In [12]:
# Keep the original text of every token whose lowercase form is found in the pipeline vocabulary.
# NOTE(review): `doc` is only assigned at notebook level in a later cell, so this cell relies on
# out-of-order execution — confirm the intended cell order.
# NOTE(review): membership in `nlp.vocab` presumably reflects the strings the pipeline has
# stored rather than an English dictionary — verify this is the intended filter.
filtered_tokens = [
    tok.text
    for tok in doc
    if tok.text.lower() in nlp.vocab
]


In [13]:
# Join the kept tokens back into one space-separated string.
filtered_text = ' '.join(filtered_tokens)

In [14]:
# Display the filtered string as the cell output.
filtered_text

'subject    application    body    update                                                              email intended    AI    included    https      www linkedin com help linkedin answer 4788    lang en    lipi urn          email jobs application rejected 01             A       trk eml email jobs application rejected 01 0 textfooterglimmer    eml email jobs application rejected 01 0 textfooterglimmer null ddvsp4 m659l110 z0 null null    eid ddvsp4 m659l110 z0       receiving LinkedIn notification emails    Unsubscribe    https      www linkedin com comm psettings email unsubscribe    lipi urn          email jobs application rejected 01             A       trk eml email jobs application rejected 01 unsubscribe 0 textfooterglimmer    eml email jobs application rejected 01 unsubscribe 0 textfooterglimmer null ddvsp4 m659l110 z0 null null    eid ddvsp4 m659l110 z0    loid Help    https      www linkedin com help linkedin answer 67    lang en    lipi urn          email jobs application rej

In [None]:



# Scratch cell: run the token filter over whatever `cleaned_text` currently holds and
# print each kept token with its part-of-speech tag for inspection.
nlp = spacy.load('en_core_web_sm')
text = cleaned_text

# URL / e-mail boilerplate words that should be dropped.
not_needed = ["https", "linkedin", "null", "www", "com", "email"]

doc = nlp(text)
kept_tokens = []

for token in doc:
    lowered = token.text.lower()

    # Drop whitespace-bearing tokens and boilerplate words.
    if " " in lowered or lowered in not_needed:
        continue
    # Drop stop words and anything that is not purely alphabetic.
    if token.is_stop or not token.is_alpha:
        continue

    kept_tokens.append(lowered)
    print(token.text, " ", token.pos_)

# Rebind `cleaned_text` to the space-joined lowercase tokens.
cleaned_text = " ".join(kept_tokens)



subject   ADJ
application   NOUN
data   NOUN
engineer   NOUN
startup   NOUN
meddon   VERB
talent   NOUN
body   NOUN
update   NOUN
meddon   PROPN
talent   NOUN
intended   VERB
divyaprakash   NOUN
rathinasabapathy   NOUN
ai   NOUN
engineer   NOUN
learn   VERB
included   VERB
help   AUX
answer   VERB
lang   PROPN
en   X
lipi   ADJ
urn   NOUN
jobs   NOUN
application   NOUN
rejected   VERB
midtoken   VERB
aqeoogeuxiqb   NOUN
midsig   NOUN
trk   VERB
eml   ADJ
jobs   NOUN
application   NOUN
rejected   VERB
securityhelp   NOUN
textfooterglimmer   PROPN
trkemail   NOUN
eml   ADJ
jobs   NOUN
application   NOUN
rejected   VERB
securityhelp   NOUN
textfooterglimmer   PROPN
eid   PROPN
otptoken   VERB
receiving   VERB
notification   NOUN
emails   NOUN
unsubscribe   ADJ
comm   NOUN
psettings   NOUN
unsubscribe   VERB
lipi   VERB
urn   NOUN
jobs   NOUN
application   NOUN
rejected   VERB
midtoken   VERB
aqeoogeuxiqb   NOUN
midsig   NOUN
trk   VERB
eml   ADJ
jobs   NOUN
application   NOUN
rejected   V