In [1]:
import pandas as pd 
import re
from Levenshtein import distance as levenshtein_distance
from jellyfish import jaro_winkler_similarity
from Levenshtein import ratio
import Levenshtein
import jellyfish

In [2]:
import pandas as pd
import re

# ----------------------------
# Step 1: Define Validation Logic
# ----------------------------
def is_valid_local_part_only(local_part):
    """Check the part of an email address that comes before the '@'.

    The local part is accepted only when all of the following hold:
      * its length is between 6 and 30 characters (inclusive);
      * it consists of ASCII letters and digits, optionally split by a single
        dot that has at least one letter/digit on each side;
      * it contains at least one ASCII letter (so digit-only values fail).

    Args:
        local_part: The text before the '@' sign, as a str.

    Returns:
        True if every rule is satisfied, otherwise False.
    """
    # Length rule: 6..30 characters inclusive.
    if not 6 <= len(local_part) <= 30:
        return False

    # Shape rule: alphanumerics with at most one interior dot.
    well_formed = re.fullmatch(r'[A-Za-z0-9]+(\.[A-Za-z0-9]+)?', local_part) is not None

    # Letter rule: requiring one letter also rules out digit-only values.
    has_letter = re.search(r'[A-Za-z]', local_part) is not None

    return well_formed and has_letter

def is_valid_email(email):
    """Decide whether an email string passes this notebook's validation.

    The address must contain exactly one '@'. Only the text before the '@'
    is then checked (via is_valid_local_part_only); the domain part is not
    inspected.

    Args:
        email: The full email address as a str.

    Returns:
        True if there is exactly one '@' and the local part is valid,
        otherwise False.
    """
    # Anything other than exactly one '@' is rejected outright.
    if email.count('@') != 1:
        return False

    # Everything before the '@' is the local part; the rest is ignored.
    local_part, _, _ = email.partition('@')
    return is_valid_local_part_only(local_part)

# ----------------------------
# Step 2: Read Excel File
# ----------------------------
# Raw string: the Windows path contains backslashes (\L, \T, \E) that Python
# would otherwise try to read as escape sequences and warn about.
file_path = r"G:\LG_Task_internship\Task_3\Excel_Task3.xlsx" # <-- Change this to your actual file name
email_column = "Email"           # <-- Make sure this matches the column name in your file

df = pd.read_excel(file_path)

# Clean extra spaces in emails. Blank cells stay missing instead of being
# turned into the literal text "nan" by astype(str), which would otherwise be
# listed as an invalid email address.
has_email = df[email_column].notna()
df[email_column] = df[email_column].astype(str).str.strip().where(has_email)

# ----------------------------
# Step 3: Check Validity
# ----------------------------
# Missing entries are not strings, so they are marked False without being
# handed to the validator (which calls str methods on its argument).
df['IsValid'] = pd.Series(
    [isinstance(email, str) and is_valid_email(email) for email in df[email_column]],
    index=df.index,
    dtype=bool,
)

# ----------------------------
# Step 4: Split Valid and Invalid Emails
# ----------------------------
valid_emails = df.loc[df['IsValid'], email_column].reset_index(drop=True)
# Only rows that actually contain an address count as invalid emails.
invalid_emails = df.loc[has_email & ~df['IsValid'], email_column].reset_index(drop=True)

# Make sure both columns have same length (the shorter one is padded with NaN)
max_len = max(len(valid_emails), len(invalid_emails))
valid_emails = valid_emails.reindex(range(max_len))
invalid_emails = invalid_emails.reindex(range(max_len))

# ----------------------------
# Step 5: Assemble Result Table
# ----------------------------
# The table is only built in memory here; nothing is written to disk.
result_df = pd.DataFrame({
    'Valid Emails': valid_emails,
    'Invalid Emails': invalid_emails
})




In [3]:
# Both Series were padded with NaN to a common length when the result table
# was prepared, so len() would print the same number twice. count() skips the
# NaN padding and reports how many addresses each Series really holds.
print(invalid_emails.count())
print(valid_emails.count())

56
56


In [4]:
# Extract only local parts (before @)
# ----------------------------
def _local_parts(emails):
    """Return the text before the first '@' of every non-missing entry, as a list."""
    return emails.str.split('@').str.get(0).dropna().tolist()


valid_locals = _local_parts(valid_emails)
invalid_locals = _local_parts(invalid_emails)

# Compare Using Levenshtein and Jaro-Winkler
# ----------------------------
# For each invalid local part, keep the highest similarity it reaches against
# any valid local part. The leading 0 is the floor used when there is nothing
# to compare against.
lev_scores = [
    max([0, *(Levenshtein.ratio(candidate, reference) for reference in valid_locals)])
    for candidate in invalid_locals
]
jw_scores = [
    max([0, *(jellyfish.jaro_winkler_similarity(candidate, reference) for reference in valid_locals)])
    for candidate in invalid_locals
]

# ----------------------------
# Print and Compare Average Similarity
# ----------------------------
# Both lists hold one score per invalid local part, so they are empty together.
if lev_scores:
    avg_lev = sum(lev_scores) / len(lev_scores)
    avg_jw = sum(jw_scores) / len(jw_scores)
else:
    avg_lev = avg_jw = 0

print("🔍 Average Levenshtein Similarity:", round(avg_lev, 4))
print("🔍 Average Jaro-Winkler Similarity:", round(avg_jw, 4))

# NOTE(review): the two metrics are computed differently, so a higher average
# score may reflect the metric's scale rather than its accuracy; confirm that
# this is the intended criterion.
verdict = (
    "✅ Levenshtein is more accurate for me."
    if avg_lev > avg_jw
    else "✅ Jaro-Winkler is more accurate for me."
)
print(verdict)


🔍 Average Levenshtein Similarity: 0.4127
🔍 Average Jaro-Winkler Similarity: 0.6022
✅ Jaro-Winkler is more accurate for me.
