diff --git a/README.md b/README.md
index 0845d02..e6daed2 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,14 @@ iex(3)> PublicSuffix.public_suffix("mysite.foo.bar.com")
 "com"
 iex(4)> PublicSuffix.public_suffix("mysite.foo.bar.co.uk")
 "co.uk"
+iex(5)> PublicSuffix.prevailing_rule("mysite.foo.bar.com")
+"com"
+iex(6)> PublicSuffix.prevailing_rule("mysite.example")
+"*"
+iex(7)> PublicSuffix.matches_explicit_rule?("mysite.foo.bar.com")
+true
+iex(8)> PublicSuffix.matches_explicit_rule?("mysite.example")
+false
 ```
 
 The publicsuffix.org data file contains both official ICANN records
diff --git a/lib/public_suffix.ex b/lib/public_suffix.ex
index b0f3ce4..2fc843c 100644
--- a/lib/public_suffix.ex
+++ b/lib/public_suffix.ex
@@ -26,7 +26,6 @@ defmodule PublicSuffix do
       iex> public_suffix("foo.github.io")
       "github.io"
   """
-  @spec public_suffix(String.t) :: nil | String.t
   @spec public_suffix(String.t, options) :: nil | String.t
   def public_suffix(domain, options \\ []) when is_binary(domain) do
     parse_domain(domain, options, 0)
@@ -55,13 +54,73 @@ defmodule PublicSuffix do
       iex> registrable_domain("foo.github.io")
       "foo.github.io"
   """
-  @spec registrable_domain(String.t) :: nil | String.t
   @spec registrable_domain(String.t, options) :: nil | String.t
   def registrable_domain(domain, options \\ []) when is_binary(domain) do
     # "The registered or registrable domain is the public suffix plus one additional label."
     parse_domain(domain, options, 1)
   end
 
+  @doc """
+  Parses the provided domain and returns the prevailing rule based on the
+  publicsuffix.org rules. If no rules match, the prevailing rule is "*",
+  unless the provided domain has a leading dot, in which case the input is
+  invalid and the function returns `nil`.
+
+  ## Examples
+
+      iex> prevailing_rule("foo.bar.com")
+      "com"
+      iex> prevailing_rule("co.uk")
+      "co.uk"
+      iex> prevailing_rule("foo.ck")
+      "*.ck"
+      iex> prevailing_rule("foobar.example")
+      "*"
+
+  You can use the `ignore_private` keyword to exclude private (non-ICANN) domains.
+
+      iex> prevailing_rule("foo.github.io", ignore_private: false)
+      "github.io"
+      iex> prevailing_rule("foo.github.io", ignore_private: true)
+      "io"
+      iex> prevailing_rule("foo.github.io")
+      "github.io"
+  """
+  @spec prevailing_rule(String.t, options) :: nil | String.t
+  def prevailing_rule(domain, options \\ [])
+  def prevailing_rule("." <> _domain, _), do: nil
+  def prevailing_rule(domain, options) when is_binary(domain) do
+    domain
+    |> String.downcase
+    |> String.split(".")
+    |> find_prevailing_rule(options)
+    |> case do
+      {:exception, rule} -> "!" <> Enum.join(rule, ".")
+      {_, rule} -> Enum.join(rule, ".")
+    end
+  end
+
+  @doc """
+  Checks whether the provided domain matches an explicit rule in the
+  publicsuffix.org rules. Returns `false` for `nil` and for leading-dot input.
+
+  ## Examples
+
+      iex> matches_explicit_rule?("foo.bar.com")
+      true
+      iex> matches_explicit_rule?("com")
+      true
+      iex> matches_explicit_rule?("foobar.example")
+      false
+
+  You can use the `ignore_private` keyword to exclude private (non-ICANN) domains.
+  """
+  @spec matches_explicit_rule?(String.t | nil, options) :: boolean
+  def matches_explicit_rule?(domain, options \\ [])
+  def matches_explicit_rule?(nil, _options), do: false
+  def matches_explicit_rule?(domain, options) when is_binary(domain),
+    do: !(prevailing_rule(domain, options) in [nil, "*"])
+
   # Inputs with a leading dot should be treated as a special case.
   # see https://github.com/publicsuffix/list/issues/208
   defp parse_domain("." <> _domain, _, _), do: nil
@@ -82,6 +141,11 @@ defmodule PublicSuffix do
     num_labels =
       labels
       |> find_prevailing_rule(options)
+      |> case do
+        # "If the prevailing rule is a exception rule, modify it by removing the leftmost label."
+        {:exception, labels} -> tl(labels)
+        {_, labels} -> labels
+      end
       |> length
       |> +(extra_label_parts)
 
@@ -99,7 +163,7 @@ defmodule PublicSuffix do
     find_prevailing_exception_rule(labels, allowed_rule_types) ||
     find_prevailing_normal_rule(labels, allowed_rule_types) ||
     # "If no rules match, the prevailing rule is "*"."
-    ["*"]
+    {:normal, ["*"]}
   end
 
   data_file = Path.expand("../data/public_suffix_list.dat", __DIR__)
@@ -128,8 +192,7 @@ defmodule PublicSuffix do
   defp find_prevailing_exception_rule([], _allowed_rule_types), do: nil
   defp find_prevailing_exception_rule([_ | suffix] = domain_labels, allowed_rule_types) do
     if @exception_rules[domain_labels] in allowed_rule_types do
-      # "If the prevailing rule is a exception rule, modify it by removing the leftmost label."
-      suffix
+      {:exception, domain_labels}
     else
       find_prevailing_exception_rule(suffix, allowed_rule_types)
     end
@@ -140,9 +203,9 @@ defmodule PublicSuffix do
   defp find_prevailing_normal_rule([], _allowed_rule_types), do: nil
   defp find_prevailing_normal_rule([_ | suffix] = domain_labels, allowed_rule_types) do
     cond do
-      @exact_match_rules[domain_labels] in allowed_rule_types -> domain_labels
+      @exact_match_rules[domain_labels] in allowed_rule_types -> {:normal, domain_labels}
       # TODO: "Wildcards are not restricted to appear only in the leftmost position"
-      @wild_card_rules[["*" | suffix]] in allowed_rule_types -> domain_labels
+      @wild_card_rules[["*" | suffix]] in allowed_rule_types -> {:normal, ["*" | suffix]}
       true -> find_prevailing_normal_rule(suffix, allowed_rule_types)
     end
   end
diff --git a/test/public_suffix_test.exs b/test/public_suffix_test.exs
index 6d14c9c..e7ae02e 100644
--- a/test/public_suffix_test.exs
+++ b/test/public_suffix_test.exs
@@ -28,6 +28,87 @@ defmodule PublicSuffix.PublicSuffixTest do
     end
   end
 
+  test_cases_prevailing_private = [
+    {"exact match", "foo.github.io", "github.io", "io"},
+    {"wildcard", "foo.bar.api.githubcloud.com", "*.api.githubcloud.com", "com"},
+  ]
+
+  for {rule_type, input, expected_with_private, expected_without_private} <- test_cases_prevailing_private do
+    @input input
+    @expected_with_private expected_with_private
+    @expected_without_private expected_without_private
+
+    test "`prevailing_rule` includes private domains by default (#{rule_type})" do
+      assert prevailing_rule(@input) == @expected_with_private
+    end
+
+    test "`prevailing_rule` includes private domains if passed `ignore_private: false` (#{rule_type})" do
+      assert prevailing_rule(@input, ignore_private: false) == @expected_with_private
+    end
+
+    test "`prevailing_rule` excludes private domains if passed `ignore_private: true` (#{rule_type})" do
+      assert prevailing_rule(@input, ignore_private: true) == @expected_without_private
+    end
+  end
+
+  test_cases_prevailing = [
+    {"leading dot", ".com", nil},
+    {"unlisted TLD", "example", "*"},
+    {"unlisted TLD", "example.example", "*"},
+    {"TLD with only 1 rule", "biz", "biz"},
+    {"TLD with only 1 rule", "domain.biz", "biz"},
+    {"TLD with some 2-level rules", "uk.com", "uk.com"},
+    {"TLD with some 2-level rules", "example.uk.com", "uk.com"},
+    {"TLD with only 1 (wildcard) rule", "mm", "*"},
+    {"TLD with only 1 (wildcard) rule", "c.mm", "*.mm"},
+    {"TLD with only 1 (wildcard) rule", "b.c.mm", "*.mm"},
+    {"more complex TLD", "kyoto.jp", "kyoto.jp"},
+    {"more complex TLD", "test.kyoto.jp", "kyoto.jp"},
+    {"more complex TLD", "ide.kyoto.jp", "ide.kyoto.jp"},
+    {"more complex TLD", "b.ide.kyoto.jp", "ide.kyoto.jp"},
+    {"more complex TLD", "a.b.ide.kyoto.jp", "ide.kyoto.jp"},
+    {"more complex TLD", "c.kobe.jp", "*.kobe.jp"},
+    {"more complex TLD", "b.c.kobe.jp", "*.kobe.jp"},
+    {"more complex TLD", "city.kobe.jp", "!city.kobe.jp"},
+    {"more complex TLD", "www.city.kobe.jp", "!city.kobe.jp"},
+    {"TLD with a wildcard rule and exceptions", "ck", "*"},
+    {"TLD with a wildcard rule and exceptions", "test.ck", "*.ck"},
+    {"TLD with a wildcard rule and exceptions", "b.test.ck", "*.ck"},
+    {"TLD with a wildcard rule and exceptions", "www.ck", "!www.ck"},
+    {"TLD with a wildcard rule and exceptions", "www.www.ck", "!www.ck"},
+  ]
+
+  for {rule_type, input, expected_output} <- test_cases_prevailing do
+    @input input
+    @expected_output expected_output
+
+    test "`prevailing_rule` returns `#{inspect(expected_output)}` if passed `#{input}` (#{rule_type})" do
+      assert prevailing_rule(@input) == @expected_output
+    end
+  end
+
+  test_cases_matches_explicit = [
+    {"listed TLD only", "com", true},
+    {"TLD with only 1 (wildcard) rule", "mm", false},
+    {"TLD with only 1 (wildcard) rule", "b.mm", true},
+    {"TLD with a wildcard rule and exceptions", "ck", false},
+    {"TLD with a wildcard rule and exceptions", "b.ck", true},
+    {"TLD with a wildcard rule and exceptions", "www.ck", true},
+    {"domain with leading dot", ".com", false},
+    {"unlisted TLD", "example", false},
+    {"empty string", "", false},
+    {"nil", nil, false},
+  ]
+
+  for {rule_type, input, expected_output} <- test_cases_matches_explicit do
+    @input input
+    @expected_output expected_output
+
+    test "`matches_explicit_rule?` returns `#{inspect(expected_output)}` if passed `#{inspect(input)}` (#{rule_type})" do
+      assert matches_explicit_rule?(@input) == @expected_output
+    end
+  end
+
   test "unicode domains are correctly NFKC normalized when punycoding them" do
     # Both of these strings are different unicode forms of "ábc.co.uk".
     # The example came from: