# Regex Problems

In [1]:
# Import module
import re

## 1. *Efficiently* get a list of all complete URLs that use the https protocol

In [2]:
# Sample URLs used by the exercises below: a mix of http, https and ftp schemes
urls = [
    'http://www.domain.com',
    'https://somedomain.com',
    'http://my-domain-123.net',
    'https://google.com',
    'http://www.foo.com',
    'https://bar-baz3.com',
    'ftp://domain2.com',
]

In [3]:
# Compile once so the same pattern object is reused for every URL.
# Pattern: literal "https://", a host made of lowercase letters, digits,
# dots and hyphens, then a dot followed by exactly three word characters.
pattern = re.compile(r"https://[a-z0-9.-]+\.\w{3}")

# Keep the findall() result (itself a list) for every URL that matched
complete_https = [hits for hits in map(pattern.findall, urls) if hits]
complete_https

[['https://somedomain.com'], ['https://google.com'], ['https://bar-baz3.com']]

## 2. Get the domains (without the protocol, but including the extension, e.g. .com) of URLs that use either the http or the https protocol.

In [4]:
# The optional "s" accepts both http and https. The parentheses capture only
# the host part, so findall() returns the domain without the protocol prefix.
pattern = re.compile(r"https?://([a-z0-9.-]+\.\w{3})")

# Keep the captured-domain list of every URL that matched
domains = [found for found in (pattern.findall(url) for url in urls) if found]
domains

[['www.domain.com'],
 ['somedomain.com'],
 ['my-domain-123.net'],
 ['google.com'],
 ['www.foo.com'],
 ['bar-baz3.com']]

## 3. Below is a list of language codes. Determine how many are some form of English

* English codes will start with En, en, or EN

In [5]:
# Language codes in mixed letter case, some with a "-" or "_" region suffix
languages = [
    'Ar', 'It', 'it',
    'En', 'En_gb',
    'jp',
    'en_GB', 'EN_IE', 'en-NZ', 'en',
    'es', 'ES-es',
]

In [6]:
# Compile pattern describing an English language code: "en" in any letter
# case, optionally followed by "-" or "_" and a two-letter region.
pattern = re.compile(r"en([-_][A-Za-z]{2})?", flags=re.IGNORECASE)

# Keep the codes that START with "en". match() anchors at the beginning of the
# string, whereas findall() scans the whole string and would also count a code
# that merely contains "en" somewhere (e.g. 'ben'). Storing the code itself
# (rather than findall()'s capture-group fragments) keeps `english` meaningful.
english = [language for language in languages if pattern.match(language)]
len(english)

6

## 4. Fix all language codes so that they are formatted as follows:
* first two letters are lower case
* codes with region endings use hyphen and not underscore
* the region endings are upper case

In [7]:
# Pattern for a language code: two letters (the language), an optional "-" or
# "_" separator, and an optional two-letter region.
pattern = re.compile(r"([A-Za-z]{2})[-_]?([A-Za-z]{2})?")

# Normalise each code: lower-case language, and when a region is present,
# a hyphen followed by the upper-case region.
formatted = []
for code in languages:
    found = pattern.search(code)
    if not found:
        # Codes the pattern cannot match are left out of the result
        continue
    lang, region = found.groups()
    if region:
        formatted.append(lang.lower() + '-' + region.upper())
    else:
        formatted.append(lang.lower())
formatted

['ar',
 'it',
 'it',
 'en',
 'en-GB',
 'jp',
 'en-GB',
 'en-IE',
 'en-NZ',
 'en',
 'es',
 'es-ES']