In [1]:
from pregex.classes import AnyLetter, AnyDigit, AnyFrom
from pregex.quantifiers import Optional, AtLeastAtMost
from pregex.operators import Either
from pregex.groups import Capture
from pregex.pre import Pregex

# Building blocks of the URL / IP-address pattern.

# Optional scheme: "http://" or "https://".
http_protocol = Optional("http" + Optional("s") + "://")

# Optional "www." prefix.
www = Optional("www.")

# A single letter or digit.
any_alphanum = AnyLetter() | AnyDigit()

# Domain name: begins and ends with an alphanumeric character, with
# 1 to 61 alphanumerics, hyphens or dots in between.
domain_name = (
    any_alphanum
    + AtLeastAtMost(any_alphanum | AnyFrom("-", "."), min=1, max=61)
    + any_alphanum
)

# Top-level domain: ".com" or ".org".
tld = "." + Either("com", "org")

# One IP octet: one to three digits (the numeric range is not checked).
ip_octet = AtLeastAtMost(AnyDigit(), min=1, max=3)

# Port number: exactly four digits.
port_number = 4 * AnyDigit()

# Final pattern: an optional protocol followed by either a (captured)
# domain name with its TLD, or a dotted four-octet address with a port.
# NOTE(review): the pattern recorded in this notebook shows the alternation
# without an enclosing group, so the optional protocol prefix appears to
# bind only to the domain alternative -- confirm whether that is intended.
pre: Pregex = http_protocol + Either(
    www + Capture(domain_name) + tld,
    3 * (ip_octet + ".") + ip_octet + ":" + port_number,
)

In [3]:
# Retrieve the pattern built above as a plain regex string.
regex = pre.get_pattern()

In [5]:
# Show the generated pattern; the cell output is the string's repr, so each
# backslash appears doubled.
regex

'(?:https?:\\/\\/)?(?:www\\.)?([A-Za-z\\d][\\d\\-a-z.A-Z]{1,61}[A-Za-z\\d])\\.(?:com|org)|(?:\\d{1,3}\\.){3}\\d{1,3}:\\d{4}'

In [6]:

# Sample input: one IP:port address and two URLs; the two URLs are adjacent
# with no separator between them.
text = "text--192.168.1.1:8000--text--http://www.wikipedia.orghttps://youtube.com--text"

# Collect the matches of `pre` found in the sample text.
matches = pre.get_matches(text)


In [7]:
# Show the matches; the recorded output lists whole matched substrings, not
# just the captured domain name.
matches

['192.168.1.1:8000', 'http://www.wikipedia.org', 'https://youtube.com']