# This file contain code for email valid and invalid python code and their avg accuracy using different method such as 

# levesthein , jaro walker, cosine similarity


# This code is for checking  email validity or invalidity

In [6]:
import pandas as pd
import re

# Set of explicitly allowed domains
allowed_domains = {"gmail.com", "yahoo.com", "outlook.com"}

# Define the email validation function
def is_valid_email(email):
    # Basic email structure
    if not re.match(r"^[^@]+@[^@]+\.[^@]+$", email):
        return False

    try:
        local_part, domain = email.split("@", 1)
    except ValueError:
        return False

    # LOCAL PART VALIDATION
    if len(local_part) < 6 or len(local_part) > 340:
        return False
    if local_part.startswith('.') or local_part.endswith('.'):
        return False
    if '..' in local_part:
        return False
    if not re.fullmatch(r'[a-zA-Z0-9._]+', local_part):  # no hyphens allowed
        return False
    if "_" in local_part:
        return False

    # DOMAIN VALIDATION
    if '..' in domain:
        return False
    if domain != domain.lower():  # all lowercase only
        return False

    # Valid endings check
    if not (domain.endswith(".com") or domain.endswith(".in") or domain.endswith(".ac.in")):
        return False

    # Accept if it's in allowed_domains
    if domain in allowed_domains:
        return True

    # Accept if it's a valid .ac.in domain like "du.ac.in"
    if domain.endswith(".ac.in"):
        if re.fullmatch(r'[a-z0-9.-]+\.ac\.in', domain):
            return True

    return False

# Load the Excel file

df = pd.read_excel(r"G:\Python_Practice\shuffled_emails.xlsx")

# Assume emails are in "Email" column
emails = df["Email"]

# Apply the validator
validation_results = emails.apply(is_valid_email)

# Separate valid and invalid emails
valid_emails = emails[validation_results]
invalid_emails = emails[~validation_results]

# Print results
print("✅ Valid Emails:")
print(valid_emails)

print("\n❌ Invalid Emails:")
print(invalid_emails)



✅ Valid Emails:
1           paperboat61@gmail.com
2            redriver88@gmail.com
6         coolcucumber3@gmail.com
12         marblecake27@gmail.com
13        sweetdreams90@gmail.com
14        forestqueen87@gmail.com
16        crazybanana60@gmail.com
20         funnybunny87@gmail.com
23      littlemountain8@gmail.com
25          spicytaco91@gmail.com
30         funnybunny17@gmail.com
33             rocky007@gmail.com
35            bluesky43@gmail.com
37       brightlantern5@gmail.com
38          wildflower9@gmail.com
41          dustyroad18@gmail.com
45        silverarrow32@gmail.com
48      twinklingstar95@gmail.com
58        midnightowl84@gmail.com
62        silverstone73@gmail.com
63          snowflake78@gmail.com
64       mysticforest22@gmail.com
65            lilybee92@gmail.com
67      goldensparrow77@gmail.com
69          magicrain35@gmail.com
70         greengrass45@gmail.com
71          happyfeet85@gmail.com
72         fluffybear29@gmail.com
74         shadowwolf99@gmail.co

In [None]:
print(len(valid_emails))  # count of valid emails


43


In [None]:
print(len(invalid_emails)) # count of invalid emails

59


# This code is for accuracy

#  For Cosine Similarity

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load Excel file
file_path = r"G:\Python_Practice\validated_emails_output1.xlsx" # Replace with your actual file path
df = pd.read_excel(file_path)

# Step 2: Replace with your actual column names if different
valid_emails = df["Valid Emails"].dropna().astype(str).tolist()
invalid_emails = df["Invalid Emails"].dropna().astype(str).tolist()
# Step 3: Combine for vectorization
all_emails = valid_emails + invalid_emails

# Step 4: Vectorize using character-level n-grams
vectorizer = CountVectorizer(analyzer='char_wb', ngram_range=(2, 4))
vectorizer.fit(all_emails)

# Step 5: Transform valid and invalid emails
valid_vectors = vectorizer.transform(valid_emails)
invalid_vectors = vectorizer.transform(invalid_emails)

# Step 6: Calculate cosine similarity and find closest matches
results = []
for idx, inv_vec in enumerate(invalid_vectors):
    similarities = cosine_similarity(inv_vec, valid_vectors).flatten()
    best_match_index = similarities.argmax()
    results.append({
        "Invalid Email": invalid_emails[idx],
        "Closest Valid Email": valid_emails[best_match_index],
        "Cosine Similarity": round(similarities[best_match_index], 4)
    })

# Step 7: Create and display result DataFrame
result_df = pd.DataFrame(results)
print(result_df)




               Invalid Email       Closest Valid Email  Cosine Similarity
0        john..123@gmail.com        sunny123@gmail.com             0.6489
1          .mike45@gmail.com    greengrass45@gmail.com             0.5604
2          amy_@56@gmail.com  cottonclouds56@gmail.com             0.5534
3       robert#123@gmail.com        sunny123@gmail.com             0.6325
4          sara$56@gmail.com  cottonclouds56@gmail.com             0.5534
5          anil%99@gmail.com    shadowwolf99@gmail.com             0.5749
6          tim@@45@gmail.com    greengrass45@gmail.com             0.5604
7          kate-23@gmail.com        sunny123@gmail.com             0.6288
8        peter*&89@gmail.com     wildflower9@gmail.com             0.5257
9         lisa.12.@gmail.com       lilybee92@gmail.com             0.5492
10         mark!56@gmail.com    marblecake27@gmail.com             0.5830
11          ron&33@gmail.com        sunny123@gmail.com             0.5893
12          bob:45@gmail.com    greeng

# To Calculate Average Cosine Similarity

In [11]:
import pandas as pd

# Load the result file
file_path = r"G:\Python_Practice\email_similarity_results.xlsx"
result_df = pd.read_excel(file_path)

# Calculate the average cosine similarity (assuming the column name is correct)
average_cosine_similarity = result_df["Cosine Similarity"].mean()

# Print the result
print(f"Average Cosine Similarity: {round(average_cosine_similarity, 4)}")

Average Cosine Similarity: 0.5607


# To calaculate Jaro winkler 

In [12]:
import pandas as pd
from Levenshtein import distance as levenshtein_distance
from jellyfish import jaro_winkler_similarity

# Load the Excel file (update your path)
file_path = r"G:\Python_Practice\validated_emails_output1.xlsx"  # Replace with actual path
df = pd.read_excel(file_path)

# Use the correct column names with space
valid_emails = df["Valid Emails"].dropna().astype(str).tolist()
invalid_emails = df["Invalid Emails"].dropna().astype(str).tolist()

# Store results
results = []

for invalid in invalid_emails:
    best_levenshtein_score = float('inf')  # Lower is better
    best_jaro_score = 0  # Higher is better

    best_levenshtein_match = None
    best_jaro_match = None

    for valid in valid_emails:
        lev_dist = levenshtein_distance(invalid, valid)
        jaro_sim = jaro_winkler_similarity(invalid, valid)

        # Track best Levenshtein match (smallest distance)
        if lev_dist < best_levenshtein_score:
            best_levenshtein_score = lev_dist
            best_levenshtein_match = valid

        # Track best Jaro-Winkler match (highest similarity)
        if jaro_sim > best_jaro_score:
            best_jaro_score = jaro_sim
            best_jaro_match = valid

    results.append({
        "Invalid Email": invalid,
        "Best Match (Levenshtein)": best_levenshtein_match,
        "Levenshtein Distance": best_levenshtein_score,
        "Best Match (Jaro-Winkler)": best_jaro_match,
        "Jaro-Winkler Similarity": round(best_jaro_score, 4)
    })

# Convert to DataFrame and export
result_df = pd.DataFrame(results)
print(result_df)




               Invalid Email Best Match (Levenshtein)  Levenshtein Distance  \
0        john..123@gmail.com       sunny123@gmail.com                     5   
1          .mike45@gmail.com       rocky007@gmail.com                     7   
2          amy_@56@gmail.com         alex99@gmial.com                     8   
3       robert#123@gmail.com       sunny123@gmail.com                     7   
4          sara$56@gmail.com       sunny123@gmail.com                     7   
5          anil%99@gmail.com         alex99@gmial.com                     6   
6          tim@@45@gmail.com       sunny123@gmail.com                     8   
7          kate-23@gmail.com       sunny123@gmail.com                     6   
8        peter*&89@gmail.com     speedyfox1@gmail.com                     8   
9         lisa.12.@gmail.com       sunny123@gmail.com                     6   
10         mark!56@gmail.com       rocky007@gmail.com                     7   
11          ron&33@gmail.com       sunny123@gmail.co

# To Calculate the accuracy of jaro winkler

In [15]:
average_jaro_similarity = result_df["Jaro-Winkler Similarity"].mean()
print(f"\n📊 Average Jaro-Winkler Similarity: {round(average_jaro_similarity, 4)}")


📊 Average Jaro-Winkler Similarity: 0.8027


# To Calculate Levesthein method 

In [16]:
import pandas as pd

# Load the result file (ensure the path is correct)
file_path = r"G:\Python_Practice\email_similarity_jaro_levenshtein.xlsx"
result_df = pd.read_excel(file_path)

# Display the column names to confirm structure
print(result_df.columns)

# Function to calculate Levenshtein Similarity based on the 'Levenshtein Distance'
def calculate_levenshtein_similarity(row):
    # Extract the relevant values
    levenshtein_distance = row["Levenshtein Distance"]
    invalid_email = row["Invalid Email"]
    best_match_email = row["Best Match (Levenshtein)"]  # The column with the best match
    
    # Calculate Levenshtein similarity
    levenshtein_similarity = 1 - (levenshtein_distance / max(len(invalid_email), len(best_match_email)))
    
    return levenshtein_similarity

# Apply the function to calculate Levenshtein Similarity for each row
result_df["Levenshtein Similarity"] = result_df.apply(calculate_levenshtein_similarity, axis=1)

# Display the updated DataFrame
print(result_df)




Index(['Invalid Email', 'Best Match (Levenshtein)', 'Levenshtein Distance',
       'Best Match (Jaro-Winkler)', 'Jaro-Winkler Similarity'],
      dtype='object')
               Invalid Email Best Match (Levenshtein)  Levenshtein Distance  \
0        john..123@gmail.com       sunny123@gmail.com                     5   
1          .mike45@gmail.com       rocky007@gmail.com                     7   
2          amy_@56@gmail.com         alex99@gmial.com                     8   
3       robert#123@gmail.com       sunny123@gmail.com                     7   
4          sara$56@gmail.com       sunny123@gmail.com                     7   
5          anil%99@gmail.com         alex99@gmial.com                     6   
6          tim@@45@gmail.com       sunny123@gmail.com                     8   
7          kate-23@gmail.com       sunny123@gmail.com                     6   
8        peter*&89@gmail.com     speedyfox1@gmail.com                     8   
9         lisa.12.@gmail.com       sunny123@gmai

# To Calculate the accuracy of Levesthein Method

In [17]:
# Calculate the average Levenshtein similarity
average_levenshtein = result_df["Levenshtein Similarity"].mean()

# Print the average Levenshtein similarity
print(f"Average Levenshtein Similarity: {round(average_levenshtein, 4)}")


Average Levenshtein Similarity: 0.631


# Therefore Highest accuracy is by (1) Jaro winkler which is 80% (2) Leveshtein Method(63%) (3) cosine simialrity(56%)

# To build Ml Model

In [19]:
import pandas as pd

# Load the Excel file
file_path = r"G:\Python_Practice\validated_emails_output1.xlsx"
df = pd.read_excel(file_path)

# Display the first few rows of the DataFrame to understand its structure
print(df.head(10))

# Assume that the columns are named 'Invalid Email', 'Best Match (Levenshtein)', 'Levenshtein Distance', 
# 'Best Match (Jaro-Winkler)', and 'Jaro-Winkler Similarity'


             Valid Emails        Invalid Emails
0        alex99@gmial.com   john..123@gmail.com
1      sunny123@gmail.com     .mike45@gmail.com
2     lilybee92@gmail.com     amy_@56@gmail.com
3      rocky007@gmail.com  robert#123@gmail.com
4   snowflake78@gmail.com     sara$56@gmail.com
5   tinytiger23@gmail.com     anil%99@gmail.com
6  shadowwolf99@gmail.com     tim@@45@gmail.com
7    speedyfox1@gmail.com     kate-23@gmail.com
8  greengrass45@gmail.com   peter*&89@gmail.com
9    redriver88@gmail.com    lisa.12.@gmail.com


In [23]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from fuzzywuzzy import fuzz

# Step 1: Load the Excel file containing the valid and invalid emails
file_path = r"G:\Python_Practice\validated_emails_output1.xlsx"
df = pd.read_excel(file_path)

# Step 2: Preprocess data
valid_emails = df["Valid Emails"].dropna().astype(str).tolist()
invalid_emails = df["Invalid Emails"].dropna().astype(str).tolist()

# Step 3: Calculate Jaro-Winkler Similarity
def jaro_winkler_similarity(str1, str2):
    return fuzz.jaro_winkler(str1, str2)

# Step 4: Create Features (Jaro-Winkler Similarity) and Labels
X = []
y = []

# Generate pairs of valid and invalid emails
for valid_email in valid_emails:
    for invalid_email in invalid_emails:
        jw_sim = jaro_winkler_similarity(valid_email, invalid_email)
        X.append([jw_sim])
        y.append(1)  # Label 1 for valid emails

for invalid_email in invalid_emails:
    for valid_email in valid_emails:
        jw_sim = jaro_winkler_similarity(valid_email, invalid_email)
        X.append([jw_sim])
        y.append(0)  # Label 0 for invalid emails

# Convert to numpy arrays
X = np.array(X)
y = np.array(y)

# Step 5: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Train the model
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, y_train)

# Step 7: Predictions and Evaluation
y_pred = model.predict(X_test)

# Step 8: Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Step 9: Print the results
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Step 10: Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Step 11: Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))


AttributeError: module 'fuzzywuzzy.fuzz' has no attribute 'jaro_winkler'

In [22]:
from jaro_winkler import jaro_winkler

# Jaro-Winkler Similarity using jaro-winkler package
similarity = jaro_winkler("email1@example.com", "email2@example.com")
print(similarity)


ModuleNotFoundError: No module named 'jaro_winkler'

In [24]:
import jellyfish
import pandas as pd

# Load your Excel file
df = pd.read_excel(r"G:\Python_Practice\validated_emails_output1.xlsx")

# Drop NaN and convert to string
valid_emails = df["Valid Emails"].dropna().astype(str).tolist()
invalid_emails = df["Invalid Emails"].dropna().astype(str).tolist()

# Find best Jaro-Winkler matches
results = []
for invalid in invalid_emails:
    best_score = 0
    best_match = ""
    for valid in valid_emails:
        score = jellyfish.jaro_winkler_similarity(invalid, valid)
        if score > best_score:
            best_score = score
            best_match = valid
    results.append({
        "Invalid Email": invalid,
        "Best Match (Jaro-Winkler)": best_match,
        "Jaro-Winkler Similarity": round(best_score, 4)
    })

# Save or display the result
result_df = pd.DataFrame(results)
print(result_df)



# Average similarity
average_score = result_df["Jaro-Winkler Similarity"].mean()
print(f"\n📊 Average Jaro-Winkler Similarity: {round(average_score, 4)}")


               Invalid Email Best Match (Jaro-Winkler)  \
0        john..123@gmail.com        sunny123@gmail.com   
1          .mike45@gmail.com       bluesky43@gmail.com   
2          amy_@56@gmail.com     happyfeet55@gmail.com   
3       robert#123@gmail.com        rocky007@gmail.com   
4          sara$56@gmail.com     happyfeet55@gmail.com   
5          anil%99@gmail.com          alex99@gmial.com   
6          tim@@45@gmail.com     tinytiger23@gmail.com   
7          kate-23@gmail.com          alex99@gmial.com   
8        peter*&89@gmail.com     happyfeet85@gmail.com   
9         lisa.12.@gmail.com     dustyroad18@gmail.com   
10         mark!56@gmail.com    marblecake27@gmail.com   
11          ron&33@gmail.com        rocky007@gmail.com   
12          bob:45@gmail.com       bluesky43@gmail.com   
13        nancy/78@gmail.com    funnybunny87@gmail.com   
14        julia=22@gmail.com    fluffybear29@gmail.com   
15        kelly?99@gmail.com       lilybee92@gmail.com   
16       vicky

In [25]:
import pandas as pd
import jellyfish

# Load data
df = pd.read_excel(r"G:\Python_Practice\validated_emails_output1.xlsx")
valid_emails = df["Valid Emails"].dropna().astype(str).tolist()
invalid_emails = df["Invalid Emails"].dropna().astype(str).tolist()

# Create training data: (invalid, closest valid, similarity score)
training_data = []
for invalid in invalid_emails:
    best_score = 0
    best_match = ""
    for valid in valid_emails:
        score = jellyfish.jaro_winkler_similarity(invalid, valid)
        if score > best_score:
            best_score = score
            best_match = valid
    # Let's define "correctable" as similarity >= 0.85
    label = 1 if best_score >= 0.85 else 0
    training_data.append({
        "Invalid Email": invalid,
        "Closest Valid Email": best_match,
        "Jaro-Winkler Similarity": best_score,
        "Correctable": label
    })

# Convert to DataFrame
train_df = pd.DataFrame(training_data)
train_df.to_excel("email_training_data.xlsx", index=False)


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Feature and target
X = train_df[["Jaro-Winkler Similarity"]]
y = train_df["Correctable"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"🔍 ML Model Accuracy: {round(accuracy * 100, 2)}%")


🔍 ML Model Accuracy: 100.0%


In [27]:
import jellyfish

def predict_email_correction(invalid_email, valid_emails, model, threshold=0.85):
    best_score = 0
    best_match = ""
    
    # Find best Jaro-Winkler match
    for valid in valid_emails:
        score = jellyfish.jaro_winkler_similarity(invalid_email, valid)
        if score > best_score:
            best_score = score
            best_match = valid
    
    # Use ML model to predict
    prediction = model.predict([[best_score]])[0]

    if prediction == 1:
        return {
            "Invalid Email": invalid_email,
            "Best Match": best_match,
            "Jaro-Winkler Similarity": round(best_score, 4),
            "Prediction": "Correctable",
            "Corrected Email": best_match
        }
    else:
        return {
            "Invalid Email": invalid_email,
            "Best Match": best_match,
            "Jaro-Winkler Similarity": round(best_score, 4),
            "Prediction": "Unfixable",
            "Corrected Email": None
        }

# Example usage
new_invalid_email = "jhon.doe@gmil.com"  # Example
result = predict_email_correction(new_invalid_email, valid_emails, model)
print(result)


{'Invalid Email': 'jhon.doe@gmil.com', 'Best Match': 'speedyfox1@gmail.com', 'Jaro-Winkler Similarity': 0.7408, 'Prediction': 'Unfixable', 'Corrected Email': None}




In [28]:
test_emails = [
    "jhon.doe@gmil.com", "sarah99@gnail.com", "mike_tyson@yahoo.com", "elena.f@outlok.com", "jack_23@icloud.com",
    "kevin.smith@hotmail.con", "amy_lee@yhaoo.com", "peter@company", "lucy@mailinator.com", "noah.w@rediffmail.com",
    "emma@gnail.com", "william-007@protonmail.com", "ava@icloud.con", "liam.j@ymail.com", "sophia@outlook",
    "james.bond@spycom", "olivia@gmaill.com", "ben_10@cartoon.net", "harry.potter@hogwards.com", "ron_w@outlook.com",
    "hermione@gmial.com", "tony.stark@ironmn.com", "bruce.wayne@batman.net", "clark.kent@superman.co", "steve@roger.com",
    "natasha.r@shield.org", "thor@asgard.com", "loki@trickster.co", "wanda@maxmoff.com", "vision@ai.com",
    "peter.parker@webslinger.net", "dr.strange@sorcerer.com", "nick.fury@eye.com", "hulk@green.com", "tchalla@waknda.com",
    "shuri@techqueen.com", "okoye@guard.com", "sam.wilson@falcon.com", "bucky.barnes@winter.com", "rocket@raccoon.net",
    "groot@iam.com", "drax@destroyer.net", "gamora@greenmail.com", "nebula@blue.net", "mantis@empath.com",
    "peter.quill@starlord.com", "mj@zendaya.com", "ned@techguy.com", "may.parker@aunt.com", "flash@bully.com",
    "jake.g@ghoulmail.com", "light.yagami@deathnote.com", "l.lawliet@detective.org", "ryuk@shinigami.jp", "misa@misa.com",
    "near@genius.com", "mello@crazy.com", "soichiro@chief.jp", "rem@notebook.jp", "teru.mikami@fanatic.jp",
    "gon@hunter.com", "killua@zoldyck.com", "kurapika@chains.com", "leorio@doc.net", "hisoka@joker.net",
    "naruto@hokage.com", "sasuke@uchiha.net", "sakura@medic.com", "kakashi@copy.com", "hinata@byakugan.com",
    "neji@genius.net", "shikamaru@lazy.com", "ino@flower.com", "choji@foodie.com", "asuma@sensei.com",
    "minato@flash.com", "kushina@redhair.com", "jiraiya@legend.com", "tsunade@slugs.com", "orochimaru@snake.net",
    "pain@akatsuki.org", "itachi@crow.net", "kisame@shark.com", "deidara@boom.com", "tobi@mask.com",
    "madara@uchiha.com", "obito@konoha.net", "konan@paper.org", "zetsu@weird.com", "hidan@immortal.net",
    "kakuzu@money.com", "yamato@wood.com", "sai@root.com", "temari@wind.com", "gaara@sand.com",
    "kankuro@puppet.com", "rocklee@brows.net", "tenten@weapons.com", "guy@power.com", "iruka@teacher.net",
    "shino@bugs.com", "kiba@akamaru.com", "karin@sensor.net", "suigetsu@water.net", "juugo@rage.com"
]


In [29]:
for email in test_emails:
    result = predict_email_correction(email, valid_emails, model)
    print(result)


{'Invalid Email': 'jhon.doe@gmil.com', 'Best Match': 'speedyfox1@gmail.com', 'Jaro-Winkler Similarity': 0.7408, 'Prediction': 'Unfixable', 'Corrected Email': None}
{'Invalid Email': 'sarah99@gnail.com', 'Best Match': 'alex99@gmial.com', 'Jaro-Winkler Similarity': 0.7631, 'Prediction': 'Unfixable', 'Corrected Email': None}
{'Invalid Email': 'mike_tyson@yahoo.com', 'Best Match': 'silentstorm76@gmail.com', 'Jaro-Winkler Similarity': 0.6615, 'Prediction': 'Unfixable', 'Corrected Email': None}
{'Invalid Email': 'elena.f@outlok.com', 'Best Match': 'snowflake78@gmail.com', 'Jaro-Winkler Similarity': 0.6522, 'Prediction': 'Unfixable', 'Corrected Email': None}
{'Invalid Email': 'jack_23@icloud.com', 'Best Match': 'sunny123@gmail.com', 'Jaro-Winkler Similarity': 0.5926, 'Prediction': 'Unfixable', 'Corrected Email': None}
{'Invalid Email': 'kevin.smith@hotmail.con', 'Best Match': 'silentstorm76@gmail.com', 'Jaro-Winkler Similarity': 0.6929, 'Prediction': 'Unfixable', 'Corrected Email': None}
{'In



In [31]:
import pandas as pd

# Define test dataset
test_data = {
    "Invalid Email": [
        "jhon.doe@gmil.com",
        "alex99@gmial.com",
        "emma@gnail.com",
        "xyz@outlok.com",
        "wrongemail@abcd.com"  # suppose this one is unfixable
    ],
    "Expected Correction": [
        "jhon.doe@gmail.com",
        "alex99@gmail.com",
        "emma@gmail.com",
        "xyz@outlook.com",
        None  # None if it cannot be corrected
    ]
}

# Create DataFrame
test_df = pd.DataFrame(test_data)



In [32]:
results = []

for index, row in test_df.iterrows():
    invalid = row['Invalid Email']
    expected = row['Expected Correction']
    
    predicted, status = correct_email_function(invalid)  # replace with your function
    
    results.append({
        "Invalid Email": invalid,
        "Expected": expected,
        "Predicted": predicted,
        "Match": expected == predicted
    })

result_df = pd.DataFrame(results)

# Show results
print(result_df)

# Accuracy
accuracy = result_df['Match'].mean()
print(f"\nExact Match Accuracy: {accuracy:.2%}")


NameError: name 'correct_email_function' is not defined

In [36]:
import pandas as pd

# Load the data from Excel
df = pd.read_excel(r"G:\Python_Practice\validated_emails_output1.xlsx")

# Check the column names to ensure they are correct
print(df.columns)


Index(['Valid Emails', 'Invalid Emails'], dtype='object')


In [37]:
# Combine valid and invalid emails into a single list
valid_emails = df['Valid Emails'].dropna()  # Drop NaN values if any
invalid_emails = df['Invalid Emails'].dropna()  # Drop NaN values if any

# Create a new DataFrame with email and label (1 for valid, 0 for invalid)
emails = pd.concat([valid_emails, invalid_emails], ignore_index=True)
labels = [1] * len(valid_emails) + [0] * len(invalid_emails)

# Create a final DataFrame
data = pd.DataFrame({'email': emails, 'label': labels})

# Display the first few rows of the final DataFrame
print(data.head())


                   email  label
0       alex99@gmial.com      1
1     sunny123@gmail.com      1
2    lilybee92@gmail.com      1
3     rocky007@gmail.com      1
4  snowflake78@gmail.com      1


In [38]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['email'], data['label'], test_size=0.2, random_state=42)

# Vectorize the emails
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train a Logistic Regression model
model = LogisticRegression()
model.fit(X_train_vect, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_vect)

# Evaluate the model
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.50      1.00      0.67        10
           1       0.00      0.00      0.00        10

    accuracy                           0.50        20
   macro avg       0.25      0.50      0.33        20
weighted avg       0.25      0.50      0.33        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [39]:
new_emails = ['newvalid@email.com', 'invalidemail@domain', 'example@domain.com']
new_emails_vect = vectorizer.transform(new_emails)
predictions = model.predict(new_emails_vect)

for email, prediction in zip(new_emails, predictions):
    print(f"Email: {email} - {'Valid' if prediction == 1 else 'Invalid'}")


Email: newvalid@email.com - Invalid
Email: invalidemail@domain - Invalid
Email: example@domain.com - Invalid
