In [80]:
import httpx
import pandas as pd
import os
import re

In [89]:
# Directory that holds the e-mail CSV files.
dir_path = '../emails'

# Read every CSV file in the directory.
# The earlier version assigned each file to `df` in turn, so only the last file
# returned by os.listdir() (whose order is arbitrary) was kept, and `df` stayed
# undefined when the directory contained no CSV at all.
csv_frames = []
for file_name in sorted(os.listdir(dir_path)):  # sorted -> reproducible row order
    if file_name.endswith('.csv'):
        file_path = os.path.join(dir_path, file_name)
        csv_frames.append(pd.read_csv(file_path))

# Fail here with a clear message instead of a NameError in a later cell.
if not csv_frames:
    raise FileNotFoundError(f"No CSV files found in {dir_path!r}")

# Stack all files into a single frame with a fresh 0..n-1 index
# (with exactly one CSV this is the same frame read_csv returned).
df = pd.concat(csv_frames, ignore_index=True)

In [90]:
# Expand cells that hold several comma-separated values into one row per value.

# Columns that may contain a comma-separated list.
multi_value_cols = ['names_of_the_contact_person', 'emails']

# 1) Turn each comma-separated string into a Python list.
for col in multi_value_cols:
    df[col] = df[col].str.split(',')

# 2) Give every list element its own row, renumbering the index each time.
#    The columns are exploded one after the other, so a row with several names
#    AND several e-mails yields every name/e-mail combination.
for col in multi_value_cols:
    df = df.explode(col, ignore_index=True)

# 3) Remove the whitespace left around the values by the split.
for col in multi_value_cols:
    df[col] = df[col].str.strip()

In [91]:
# Fix emails - First round
# Remove spaces and undo bracketed/delimited obfuscations such as "[at]" or "(dot)".
#
# regex=False is passed explicitly: pandas < 2.0 defaulted to regex=True, where
# '[at]' / '[dot]' are character classes and '(at)' / '(dot)' are groups, so the
# patterns would not match the literal text. Literal matching behaves the same
# on every pandas version.
email_deobfuscation = [
    (' ', ''),
    ('[at]', '@'),
    ('[AT]', '@'),
    ('{at}', '@'),
    ('(at)', '@'),
    ('_at_', '@'),
    ('-at-', '@'),
    ('[dot]', '.'),
    ('(dot)', '.'),
    ('_dot_', '.'),
]

# Apply the substitutions in the listed order (spaces are stripped first).
emails_fixed = df['emails']
for obfuscated, plain in email_deobfuscation:
    emails_fixed = emails_fixed.str.replace(obfuscated, plain, regex=False)
df['emails'] = emails_fixed

In [96]:
# Check if email is valid first round

# Shape check only: "<text>@<text>.<text>" with no '@' inside the text parts.
# re.match anchors at the start of the string but not at the end.
email_shape = re.compile(r"[^@]+@[^@]+\.[^@]+")

# Flag every row in one vectorised pass instead of iterrows() + per-cell .loc
# writes; the result is a proper boolean column.
# str(value) turns a missing e-mail (NaN) into 'nan', which fails the check.
df["email_is_valid"] = df["emails"].map(
    lambda value: email_shape.match(str(value)) is not None
)

# Derive both counters from the flag column.
count_of_valid_emails = int(df["email_is_valid"].sum())
count_of_invalid_emails = len(df) - count_of_valid_emails

print(f"Num of Valid Emails = {count_of_valid_emails}")
print(f"Num of Invalid Emails = {count_of_invalid_emails}")

Num of Valid Emails = 268
Num of Invalid Emails = 18


In [106]:
# Fix emails - Second round

# Rows that the previous validity check flagged as invalid.
still_invalid = df["email_is_valid"] == False

# Replace the bare words "at"/"dot" (lower- and upper-case) with '@' and '.'.
# The substitutions run over the whole column, in this order.
repaired_emails = df["emails"]
for word, symbol in (('at', '@'), ('AT', '@'), ('dot', '.'), ('DOT', '.')):
    repaired_emails = repaired_emails.str.replace(word, symbol)

# Only the flagged rows receive the rewritten value; the assignment aligns on
# the index, so rows already marked valid keep their current e-mail.
df.loc[still_invalid, "emails"] = repaired_emails

In [108]:
# Check if email is valid the second time

# Shape check only: "<text>@<text>.<text>" with no '@' inside the text parts.
# re.match anchors at the start of the string but not at the end.
email_shape = re.compile(r"[^@]+@[^@]+\.[^@]+")

# Re-flag every row in one vectorised pass instead of iterrows() + per-cell
# .loc writes; the result is a proper boolean column.
# str(value) turns a missing e-mail (NaN) into 'nan', which fails the check.
df["email_is_valid"] = df["emails"].map(
    lambda value: email_shape.match(str(value)) is not None
)

# Derive both counters from the flag column.
count_of_valid_emails = int(df["email_is_valid"].sum())
count_of_invalid_emails = len(df) - count_of_valid_emails

print(f"Num of Valid Emails = {count_of_valid_emails}")
print(f"Num of Invalid Emails = {count_of_invalid_emails}")

Num of Valid Emails = 280
Num of Invalid Emails = 6


In [109]:
# Display the frame after the second validation pass (a bare last expression
# is rendered as the cell output).
df

Unnamed: 0,names_of_the_contact_person,emails,name,company_homepage_link,email_is_valid
0,,first_name.last_name@brightspec.com,BrightSpec,https://www.brightspec.com/,True
1,,careers@brightspec.com,BrightSpec,https://www.brightspec.com/,True
2,,info@takeone.video,TakeOne,https://www.takeone.video/,True
3,,jsaewitz@masslight.com,MassLight,https://masslight.com,True
4,Aleksi,aleksi@,ClimateAligned,,False
...,...,...,...,...,...
281,,praful@sarama.app,Sarama,,True
282,,jobs@bytro.com,Bytro,https://bytro.com/,True
283,Joniel E.,joniel@modash.io,Modash.io,,True
284,Stevie Buckley,stevie.buckley@xapien.com,Xapien AI,,True


In [111]:
# Display the frame again.
# NOTE(review): this appears to repeat the previous display cell with the same
# output - confirm whether an intermediate cell was removed or this cell is redundant.
df

Unnamed: 0,names_of_the_contact_person,emails,name,company_homepage_link,email_is_valid
0,,first_name.last_name@brightspec.com,BrightSpec,https://www.brightspec.com/,True
1,,careers@brightspec.com,BrightSpec,https://www.brightspec.com/,True
2,,info@takeone.video,TakeOne,https://www.takeone.video/,True
3,,jsaewitz@masslight.com,MassLight,https://masslight.com,True
4,Aleksi,aleksi@,ClimateAligned,,False
...,...,...,...,...,...
281,,praful@sarama.app,Sarama,,True
282,,jobs@bytro.com,Bytro,https://bytro.com/,True
283,Joniel E.,joniel@modash.io,Modash.io,,True
284,Stevie Buckley,stevie.buckley@xapien.com,Xapien AI,,True
