Skip to content

Commit

Permalink
feat: add email address normalisation (#1206)
Browse files Browse the repository at this point in the history
* feat: add email address normalisation

- normalise Gmail addresses:
  - remove subaddresses.
  - remove dots/periods.
  - lower-case usernames.
  - standardise on `gmail.com`.
- include this in `syntax.normalized_email` in the output.

relates #952

* test: add check for idempotency

verify that normalisation is idempotent (i.e. normalising an
already-normalised email results in no further changes.)

* fix: normalize by username/domain

update the `normalize_email()` signature to accept `username` and
`domain` separately: these have been split in an earlier stage.
  • Loading branch information
PsypherPunk committed Oct 26, 2022
1 parent fcec5e7 commit f8ec348
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 7 deletions.
4 changes: 2 additions & 2 deletions backend/tests/check_email.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ use reacher_backend::routes::{check_email::post::EndpointRequest, create_routes}
use warp::http::StatusCode;
use warp::test::request;

const FOO_BAR_RESPONSE: &str = r#"{"input":"foo@bar","is_reachable":"invalid","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"can_connect_smtp":false,"has_full_inbox":false,"is_catch_all":false,"is_deliverable":false,"is_disabled":false},"syntax":{"address":null,"domain":"","is_valid_syntax":false,"username":"","suggestion":null}}"#;
const FOO_BAR_BAZ_RESPONSE: &str = r#"{"input":"foo@bar.baz","is_reachable":"invalid","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"can_connect_smtp":false,"has_full_inbox":false,"is_catch_all":false,"is_deliverable":false,"is_disabled":false},"syntax":{"address":"foo@bar.baz","domain":"bar.baz","is_valid_syntax":true,"username":"foo","suggestion":null}}"#;
const FOO_BAR_RESPONSE: &str = r#"{"input":"foo@bar","is_reachable":"invalid","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"can_connect_smtp":false,"has_full_inbox":false,"is_catch_all":false,"is_deliverable":false,"is_disabled":false},"syntax":{"address":null,"domain":"","is_valid_syntax":false,"username":"","normalized_email":null,"suggestion":null}}"#;
const FOO_BAR_BAZ_RESPONSE: &str = r#"{"input":"foo@bar.baz","is_reachable":"invalid","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"can_connect_smtp":false,"has_full_inbox":false,"is_catch_all":false,"is_deliverable":false,"is_disabled":false},"syntax":{"address":"foo@bar.baz","domain":"bar.baz","is_valid_syntax":true,"username":"foo","normalized_email":"foo@bar.baz","suggestion":null}}"#;

#[tokio::test]
async fn test_input_foo_bar() {
Expand Down
1 change: 1 addition & 0 deletions core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
pub mod gravatar;
pub mod misc;
pub mod mx;
mod normalize;
pub mod smtp;
pub mod syntax;
mod util;
Expand Down
70 changes: 70 additions & 0 deletions core/src/normalize.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
/// Normalize an email address that has already been split into its
/// `username` and `domain` parts.
///
/// Addresses on a Gmail domain (`gmail.com` or `googlemail.com`) are
/// rewritten by `normalize_gmail`; every other address is re-joined
/// unchanged as `username@domain`.
///
/// The Gmail domain check ignores ASCII case, so `GMAIL.COM` and
/// `GoogleMail.com` are recognised as Gmail as well.
pub fn normalize_email(username: &str, domain: &str) -> String {
    const GMAIL_DOMAINS: [&str; 2] = ["gmail.com", "googlemail.com"];

    // Domain names are case-insensitive, so an exact string match would let
    // e.g. `foo@Gmail.com` slip through un-normalized.
    let is_gmail = GMAIL_DOMAINS
        .iter()
        .any(|gmail| domain.eq_ignore_ascii_case(gmail));

    if is_gmail {
        normalize_gmail(username)
    } else {
        format!("{}@{}", username, domain)
    }
}

/// Produce the canonical `@gmail.com` form of a Gmail username.
///
/// See Gmail's username
/// [restrictions](https://support.google.com/mail/answer/9211434?hl=en-GB).
///
/// The following rewrites are applied:
///
/// - the
///   [sub-address](https://support.google.com/a/users/answer/9282734?hl=en#zippy=%2Clearn-how)
///   (the first `+` and everything after it) is dropped.
/// - [dots](https://support.google.com/mail/answer/7436150) are removed.
/// - ASCII letters are lower-cased.
/// - the domain is always emitted as `gmail.com`, which also covers the
///   [`googlemail.com`](https://support.google.com/mail/answer/10313?hl=en-GB#zippy=%2Cgetting-messages-sent-to-an-googlemailcom-address)
///   alias.
fn normalize_gmail(username: &str) -> String {
    const SUFFIX: &str = "@gmail.com";

    // `split` always yields at least one piece, so the fallback is never
    // actually taken; the first piece is the part before any `+`.
    let local_part = username.split('+').next().unwrap_or(username);

    let mut normalized = String::with_capacity(local_part.len() + SUFFIX.len());
    for c in local_part.chars() {
        if c != '.' {
            normalized.push(c.to_ascii_lowercase());
        }
    }
    normalized.push_str(SUFFIX);

    normalized
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Dots inside a Gmail username are stripped.
    #[test]
    fn test_gmail_removes_periods() {
        let actual = normalize_email("a.b.c", "gmail.com");
        assert_eq!(actual, "abc@gmail.com");
    }

    /// Everything from the `+` onwards is stripped from a Gmail username.
    #[test]
    fn test_gmail_removes_subaddress() {
        let actual = normalize_email("abc+123", "gmail.com");
        assert_eq!(actual, "abc@gmail.com");
    }

    /// `googlemail.com` is rewritten to `gmail.com`.
    #[test]
    fn test_gmail_uses_gmail_com() {
        let actual = normalize_email("abc", "googlemail.com");
        assert_eq!(actual, "abc@gmail.com");
    }

    /// All Gmail rules applied together: case, dots, sub-address and domain.
    #[test]
    fn test_gmail() {
        let actual = normalize_email("A.B.C+123", "googlemail.com");
        assert_eq!(actual, "abc@gmail.com");
    }

    /// Normalising an already-normalised address changes nothing.
    #[test]
    fn test_gmail_idempotent() {
        let first_pass = normalize_email("A.B.C+123", "googlemail.com");

        let (username, domain) = first_pass.rsplit_once('@').unwrap();
        let second_pass = normalize_email(username, domain);

        assert_eq!(second_pass, first_pass);
    }
}
18 changes: 16 additions & 2 deletions core/src/syntax.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ use levenshtein::levenshtein;
use serde::{Deserialize, Serialize};
use std::str::FromStr;

use crate::normalize::normalize_email;

/// Syntax information after parsing an email address
#[derive(Debug, Eq, PartialEq, Deserialize, Serialize)]
pub struct SyntaxDetails {
Expand All @@ -33,6 +35,9 @@ pub struct SyntaxDetails {
/// The username, before "@". It will be the empty string if the email
/// address is ill-formed.
pub username: String,
/// The normalized form of `address`. It will be `None` if the email
/// address is ill-formed.
pub normalized_email: Option<String>,
pub suggestion: Option<String>,
}

Expand All @@ -43,6 +48,7 @@ impl Default for SyntaxDetails {
domain: "".into(),
is_valid_syntax: false,
username: "".into(),
normalized_email: None,
suggestion: None,
}
}
Expand All @@ -61,6 +67,7 @@ pub fn check_syntax(email_address: &str) -> SyntaxDetails {
domain: "".into(),
is_valid_syntax: false,
username: "".into(),
normalized_email: None,
suggestion: None,
};
}
Expand All @@ -71,27 +78,30 @@ pub fn check_syntax(email_address: &str) -> SyntaxDetails {
domain: "".into(),
is_valid_syntax: false,
username: "".into(),
normalized_email: None,
suggestion: None,
}
}
};

let iter: &str = email_address.as_ref();
let mut iter = iter.split('@');
let username = iter
let username: String = iter
.next()
.expect("We checked above that email is valid. qed.")
.into();
let domain = iter
let domain: String = iter
.next()
.expect("We checked above that email is valid. qed.")
.into();
let normalized_email = normalize_email(&username, &domain);

SyntaxDetails {
address: Some(email_address),
domain,
is_valid_syntax: true,
username,
normalized_email: Some(normalized_email),
suggestion: None,
}
}
Expand Down Expand Up @@ -136,6 +146,7 @@ mod tests {
domain: "".into(),
is_valid_syntax: false,
username: "".into(),
normalized_email: None,
suggestion: None,
}
);
Expand All @@ -150,6 +161,7 @@ mod tests {
domain: "".into(),
is_valid_syntax: false,
username: "".into(),
normalized_email: None,
suggestion: None,
}
);
Expand All @@ -164,6 +176,7 @@ mod tests {
domain: "bar.com".into(),
is_valid_syntax: true,
username: "foo".into(),
normalized_email: Some("foo@bar.com".into()),
suggestion: None,
}
);
Expand All @@ -176,6 +189,7 @@ mod tests {
domain: "gmali.com".into(),
is_valid_syntax: true,
username: "test".into(),
normalized_email: Some("test@gmali.com".into()),
suggestion: None,
};
get_similar_mail_provider(&mut syntax);
Expand Down
6 changes: 3 additions & 3 deletions core/src/util/input_output.rs
Original file line number Diff line number Diff line change
Expand Up @@ -418,20 +418,20 @@ mod tests {
let res = dummy_response_with_message("blacklist");
let actual = serde_json::to_string(&res).unwrap();
// Make sure the `description` is present with IpBlacklisted.
let expected = r#"{"input":"foo","is_reachable":"unknown","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"error":{"type":"SmtpError","message":"transient: blacklist"},"description":"IpBlacklisted"},"syntax":{"address":null,"domain":"","is_valid_syntax":false,"username":"","suggestion":null}}"#;
let expected = r#"{"input":"foo","is_reachable":"unknown","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"error":{"type":"SmtpError","message":"transient: blacklist"},"description":"IpBlacklisted"},"syntax":{"address":null,"domain":"","is_valid_syntax":false,"username":"","normalized_email":null,"suggestion":null}}"#;
assert_eq!(expected, actual);

let res =
dummy_response_with_message("Client host rejected: cannot find your reverse hostname");
let actual = serde_json::to_string(&res).unwrap();
// Make sure the `description` is present with NeedsRDNS.
let expected = r#"{"input":"foo","is_reachable":"unknown","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"error":{"type":"SmtpError","message":"transient: Client host rejected: cannot find your reverse hostname"},"description":"NeedsRDNS"},"syntax":{"address":null,"domain":"","is_valid_syntax":false,"username":"","suggestion":null}}"#;
let expected = r#"{"input":"foo","is_reachable":"unknown","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"error":{"type":"SmtpError","message":"transient: Client host rejected: cannot find your reverse hostname"},"description":"NeedsRDNS"},"syntax":{"address":null,"domain":"","is_valid_syntax":false,"username":"","normalized_email":null,"suggestion":null}}"#;
assert_eq!(expected, actual);

let res = dummy_response_with_message("foobar");
let actual = serde_json::to_string(&res).unwrap();
// Make sure the `description` is NOT present.
let expected = r#"{"input":"foo","is_reachable":"unknown","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"error":{"type":"SmtpError","message":"transient: foobar"}},"syntax":{"address":null,"domain":"","is_valid_syntax":false,"username":"","suggestion":null}}"#;
let expected = r#"{"input":"foo","is_reachable":"unknown","misc":{"is_disposable":false,"is_role_account":false,"gravatar_url":null},"mx":{"accepts_mail":false,"records":[]},"smtp":{"error":{"type":"SmtpError","message":"transient: foobar"}},"syntax":{"address":null,"domain":"","is_valid_syntax":false,"username":"","normalized_email":null,"suggestion":null}}"#;
assert_eq!(expected, actual);
}
}

0 comments on commit f8ec348

Please sign in to comment.